Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck...
authorIngo Molnar <mingo@kernel.org>
Wed, 16 Mar 2016 07:24:34 +0000 (08:24 +0100)
committerIngo Molnar <mingo@kernel.org>
Wed, 16 Mar 2016 07:24:34 +0000 (08:24 +0100)
Pull memory barriers documentation updates from Paul E. McKenney.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
2291 files changed:
.mailmap
Documentation/DocBook/media/v4l/media-types.xml
Documentation/Intel-IOMMU.txt
Documentation/cgroup-v2.txt
Documentation/devicetree/bindings/arm/omap/omap.txt
Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt
Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt
Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/net/brcm,bcmgenet.txt
Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt
Documentation/devicetree/bindings/net/hisilicon-hns-nic.txt
Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt
Documentation/devicetree/bindings/net/mdio-mux-gpio.txt
Documentation/devicetree/bindings/net/mdio-mux.txt
Documentation/devicetree/bindings/net/phy.txt
Documentation/devicetree/bindings/net/renesas,ravb.txt
Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt
Documentation/devicetree/bindings/pci/rcar-pci.txt
Documentation/devicetree/bindings/regulator/tps65217.txt
Documentation/devicetree/bindings/rtc/s3c-rtc.txt
Documentation/devicetree/bindings/serial/fsl-imx-uart.txt
Documentation/devicetree/bindings/sound/fsl-asoc-card.txt
Documentation/devicetree/bindings/thermal/rcar-thermal.txt
Documentation/filesystems/efivarfs.txt
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
Documentation/networking/ip-sysctl.txt
Documentation/ptp/testptp.c
Documentation/sysctl/kernel.txt
Documentation/timers/hpet.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/mmu.txt
Documentation/watchdog/watchdog-parameters.txt
Documentation/x86/early-microcode.txt
Documentation/x86/exception-tables.txt
Documentation/x86/x86_64/boot-options.txt
MAINTAINERS
Makefile
arch/alpha/kernel/smp.c
arch/arc/Kconfig
arch/arc/Makefile
arch/arc/configs/axs101_defconfig
arch/arc/configs/axs103_defconfig
arch/arc/configs/axs103_smp_defconfig
arch/arc/configs/nsim_700_defconfig
arch/arc/configs/nsim_hs_defconfig
arch/arc/configs/nsim_hs_smp_defconfig
arch/arc/configs/nsimosci_defconfig
arch/arc/configs/nsimosci_hs_defconfig
arch/arc/configs/nsimosci_hs_smp_defconfig
arch/arc/configs/tb10x_defconfig
arch/arc/configs/vdk_hs38_smp_defconfig
arch/arc/include/asm/arcregs.h
arch/arc/include/asm/irq.h
arch/arc/include/asm/irqflags-arcv2.h
arch/arc/include/asm/mcip.h
arch/arc/include/asm/pgtable.h
arch/arc/kernel/entry-arcv2.S
arch/arc/kernel/intc-arcv2.c
arch/arc/kernel/intc-compact.c
arch/arc/kernel/mcip.c
arch/arc/kernel/setup.c
arch/arc/kernel/smp.c
arch/arc/kernel/time.c
arch/arm/boot/compressed/Makefile
arch/arm/boot/dts/am335x-bone-common.dtsi
arch/arm/boot/dts/am335x-chilisom.dtsi
arch/arm/boot/dts/am335x-nano.dts
arch/arm/boot/dts/am335x-pepper.dts
arch/arm/boot/dts/am335x-shc.dts
arch/arm/boot/dts/am335x-sl50.dts
arch/arm/boot/dts/am33xx.dtsi
arch/arm/boot/dts/am4372.dtsi
arch/arm/boot/dts/am437x-gp-evm.dts
arch/arm/boot/dts/am43x-epos-evm.dts
arch/arm/boot/dts/am57xx-beagle-x15.dts
arch/arm/boot/dts/am57xx-cl-som-am57x.dts
arch/arm/boot/dts/am57xx-sbc-am57x.dts
arch/arm/boot/dts/armada-xp-axpwifiap.dts
arch/arm/boot/dts/armada-xp-db.dts
arch/arm/boot/dts/armada-xp-gp.dts
arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts
arch/arm/boot/dts/armada-xp-linksys-mamba.dts
arch/arm/boot/dts/armada-xp-matrix.dts
arch/arm/boot/dts/armada-xp-netgear-rn2120.dts
arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts
arch/arm/boot/dts/armada-xp-synology-ds414.dts
arch/arm/boot/dts/at91-sama5d2_xplained.dts
arch/arm/boot/dts/at91-sama5d4_xplained.dts
arch/arm/boot/dts/at91-sama5d4ek.dts
arch/arm/boot/dts/at91sam9n12ek.dts
arch/arm/boot/dts/dra7.dtsi
arch/arm/boot/dts/imx6qdl.dtsi
arch/arm/boot/dts/kirkwood-ds112.dts
arch/arm/boot/dts/kirkwood-lswvl.dts
arch/arm/boot/dts/kirkwood-lswxl.dts
arch/arm/boot/dts/kirkwood-pogoplug-series-4.dts
arch/arm/boot/dts/logicpd-torpedo-som.dtsi
arch/arm/boot/dts/omap5-board-common.dtsi
arch/arm/boot/dts/orion5x-linkstation-lswtgl.dts
arch/arm/boot/dts/r8a7791-porter.dts
arch/arm/boot/dts/sama5d2-pinfunc.h
arch/arm/boot/dts/sama5d4.dtsi
arch/arm/boot/dts/ste-nomadik-stn8815.dtsi
arch/arm/boot/dts/tps65217.dtsi [new file with mode: 0644]
arch/arm/common/icst.c
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/omap2plus_defconfig
arch/arm/crypto/aes-ce-glue.c
arch/arm/include/asm/arch_gicv3.h
arch/arm/include/asm/cacheflush.h
arch/arm/include/asm/xen/page-coherent.h
arch/arm/include/uapi/asm/unistd.h
arch/arm/kernel/Makefile
arch/arm/kernel/calls.S
arch/arm/kernel/setup.c
arch/arm/kernel/smp.c
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm/kvm/mmio.c
arch/arm/kvm/psci.c
arch/arm/mach-lpc32xx/phy3250.c
arch/arm/mach-mvebu/Kconfig
arch/arm/mach-netx/fb.c
arch/arm/mach-nspire/clcd.c
arch/arm/mach-omap2/board-generic.c
arch/arm/mach-omap2/devices.c
arch/arm/mach-omap2/gpmc-onenand.c
arch/arm/mach-omap2/omap_device.c
arch/arm/mach-omap2/omap_hwmod.c
arch/arm/mach-omap2/omap_hwmod.h
arch/arm/mach-omap2/pdata-quirks.c
arch/arm/mach-omap2/sleep34xx.S
arch/arm/mach-omap2/sleep44xx.S
arch/arm/mach-realview/Kconfig
arch/arm/mach-realview/platsmp-dt.c
arch/arm/mach-shmobile/common.h
arch/arm/mach-shmobile/headsmp-scu.S
arch/arm/mach-shmobile/headsmp.S
arch/arm/mach-shmobile/platsmp-apmu.c
arch/arm/mach-shmobile/platsmp-scu.c
arch/arm/mach-shmobile/smp-r8a7779.c
arch/arm/mach-tango/Kconfig
arch/arm/mach-tango/platsmp.c
arch/arm/mm/mmap.c
arch/arm/mm/pageattr.c
arch/arm/plat-samsung/pm-check.c
arch/arm/vdso/vdso.S
arch/arm64/Makefile
arch/arm64/boot/Makefile
arch/arm64/boot/dts/arm/juno-base.dtsi
arch/arm64/boot/dts/hisilicon/hip05_hns.dtsi
arch/arm64/boot/dts/nvidia/tegra132-norrin.dts
arch/arm64/boot/install.sh
arch/arm64/configs/defconfig
arch/arm64/crypto/aes-glue.c
arch/arm64/include/asm/arch_gicv3.h
arch/arm64/include/asm/cacheflush.h
arch/arm64/include/asm/futex.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/include/asm/page.h
arch/arm64/include/asm/pgtable.h
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/head.S
arch/arm64/kernel/image.h
arch/arm64/kernel/setup.c
arch/arm64/kernel/sleep.S
arch/arm64/kernel/smp.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kernel/traps.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp-init.S
arch/arm64/kvm/hyp/switch.c
arch/arm64/kvm/hyp/vgic-v3-sr.c
arch/arm64/kvm/inject_fault.c
arch/arm64/kvm/sys_regs.c
arch/arm64/lib/strnlen.S
arch/arm64/mm/dma-mapping.c
arch/arm64/mm/dump.c
arch/arm64/mm/fault.c
arch/arm64/mm/hugetlbpage.c
arch/arm64/mm/init.c
arch/arm64/mm/kasan_init.c
arch/arm64/mm/mmap.c
arch/arm64/mm/pageattr.c
arch/arm64/mm/proc-macros.S
arch/arm64/mm/proc.S
arch/avr32/kernel/setup.c
arch/blackfin/mach-common/smp.c
arch/c6x/kernel/setup.c
arch/hexagon/kernel/smp.c
arch/ia64/kernel/efi.c
arch/ia64/kernel/setup.c
arch/ia64/kernel/smpboot.c
arch/m32r/Kconfig
arch/m32r/kernel/setup.c
arch/m32r/kernel/smpboot.c
arch/m68k/configs/amiga_defconfig
arch/m68k/configs/apollo_defconfig
arch/m68k/configs/atari_defconfig
arch/m68k/configs/bvme6000_defconfig
arch/m68k/configs/hp300_defconfig
arch/m68k/configs/mac_defconfig
arch/m68k/configs/multi_defconfig
arch/m68k/configs/mvme147_defconfig
arch/m68k/configs/mvme16x_defconfig
arch/m68k/configs/q40_defconfig
arch/m68k/configs/sun3_defconfig
arch/m68k/configs/sun3x_defconfig
arch/m68k/include/asm/unistd.h
arch/m68k/include/uapi/asm/unistd.h
arch/m68k/kernel/syscalltable.S
arch/metag/kernel/smp.c
arch/microblaze/kernel/setup.c
arch/mips/Kconfig
arch/mips/ath79/irq.c
arch/mips/bcm63xx/nvram.c
arch/mips/bmips/irq.c
arch/mips/boot/compressed/uart-16550.c
arch/mips/boot/dts/brcm/bcm6328.dtsi
arch/mips/boot/dts/brcm/bcm7125.dtsi
arch/mips/boot/dts/brcm/bcm7346.dtsi
arch/mips/boot/dts/brcm/bcm7358.dtsi
arch/mips/boot/dts/brcm/bcm7360.dtsi
arch/mips/boot/dts/brcm/bcm7362.dtsi
arch/mips/boot/dts/brcm/bcm7420.dtsi
arch/mips/boot/dts/brcm/bcm7425.dtsi
arch/mips/boot/dts/brcm/bcm7435.dtsi
arch/mips/include/asm/elf.h
arch/mips/include/asm/fpu.h
arch/mips/include/asm/mach-ath79/ath79.h
arch/mips/include/asm/mach-bcm63xx/bcm63xx_nvram.h
arch/mips/include/asm/octeon/octeon-feature.h
arch/mips/include/asm/processor.h
arch/mips/include/asm/smp-ops.h
arch/mips/include/asm/stackframe.h
arch/mips/include/asm/syscall.h
arch/mips/include/uapi/asm/unistd.h
arch/mips/jz4740/gpio.c
arch/mips/kernel/Makefile
arch/mips/kernel/binfmt_elfn32.c
arch/mips/kernel/binfmt_elfo32.c
arch/mips/kernel/process.c
arch/mips/kernel/r2300_fpu.S
arch/mips/kernel/r4k_fpu.S
arch/mips/kernel/scall32-o32.S
arch/mips/kernel/scall64-64.S
arch/mips/kernel/scall64-n32.S
arch/mips/kernel/scall64-o32.S
arch/mips/kernel/setup.c
arch/mips/kernel/smp-cmp.c
arch/mips/kernel/smp-cps.c
arch/mips/kernel/smp-mt.c
arch/mips/kernel/smp.c
arch/mips/kernel/traps.c
arch/mips/kvm/mips.c
arch/mips/mm/mmap.c
arch/mips/mm/sc-mips.c
arch/mips/mti-malta/malta-init.c
arch/mips/pci/pci-mt7620.c
arch/mn10300/kernel/smp.c
arch/parisc/include/asm/cache.h
arch/parisc/include/asm/cacheflush.h
arch/parisc/include/asm/floppy.h
arch/parisc/include/uapi/asm/unistd.h
arch/parisc/kernel/ptrace.c
arch/parisc/kernel/smp.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/syscall_table.S
arch/parisc/mm/init.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/book3s/64/hash.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/eeh.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/systbl.h
arch/powerpc/include/asm/trace.h
arch/powerpc/include/asm/unistd.h
arch/powerpc/include/uapi/asm/unistd.h
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/kernel/eeh_pe.c
arch/powerpc/kernel/hw_breakpoint.c
arch/powerpc/kernel/misc_64.S
arch/powerpc/kernel/module_64.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/powerpc.c
arch/powerpc/mm/hash64_64k.c
arch/powerpc/mm/hugepage-hash64.c
arch/powerpc/mm/hugetlbpage-book3e.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/mmap.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/perf/power8-pmu.c
arch/powerpc/platforms/powernv/eeh-powernv.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
arch/s390/include/asm/fpu/internal.h
arch/s390/include/asm/irqflags.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/livepatch.h
arch/s390/include/asm/mmu_context.h
arch/s390/include/asm/pci_io.h
arch/s390/include/asm/pgalloc.h
arch/s390/include/asm/processor.h
arch/s390/include/asm/ptrace.h
arch/s390/include/uapi/asm/unistd.h
arch/s390/kernel/compat_signal.c
arch/s390/kernel/compat_wrapper.c
arch/s390/kernel/crash_dump.c
arch/s390/kernel/debug.c
arch/s390/kernel/dumpstack.c
arch/s390/kernel/early.c
arch/s390/kernel/ftrace.c
arch/s390/kernel/head64.S
arch/s390/kernel/ipl.c
arch/s390/kernel/kprobes.c
arch/s390/kernel/perf_event.c
arch/s390/kernel/process.c
arch/s390/kernel/ptrace.c
arch/s390/kernel/setup.c
arch/s390/kernel/signal.c
arch/s390/kernel/smp.c
arch/s390/kernel/stacktrace.c
arch/s390/kernel/syscalls.S
arch/s390/kernel/trace.c
arch/s390/kernel/traps.c
arch/s390/kvm/Kconfig
arch/s390/kvm/Makefile
arch/s390/kvm/guestdbg.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/mm/fault.c
arch/s390/mm/init.c
arch/s390/mm/maccess.c
arch/s390/mm/mmap.c
arch/s390/mm/pgtable.c
arch/s390/numa/numa.c
arch/s390/oprofile/backtrace.c
arch/s390/pci/pci.c
arch/s390/pci/pci_event.c
arch/score/kernel/setup.c
arch/sh/include/asm/barrier.h
arch/sh/kernel/setup.c
arch/sh/kernel/smp.c
arch/sparc/Makefile
arch/sparc/include/uapi/asm/unistd.h
arch/sparc/kernel/entry.S
arch/sparc/kernel/head_64.S
arch/sparc/kernel/hvcalls.S
arch/sparc/kernel/signal_64.c
arch/sparc/kernel/smp_32.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sparc_ksyms_64.c
arch/sparc/kernel/sys_sparc_64.c
arch/sparc/kernel/syscalls.S
arch/sparc/kernel/systbls_32.S
arch/sparc/kernel/systbls_64.S
arch/sparc/mm/init_64.c
arch/tile/kernel/setup.c
arch/tile/kernel/smpboot.c
arch/um/include/asm/page.h
arch/um/kernel/reboot.c
arch/um/kernel/signal.c
arch/unicore32/kernel/setup.c
arch/x86/Kbuild
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/boot/cpuflags.h
arch/x86/boot/mkcpustr.c
arch/x86/boot/tools/build.c
arch/x86/configs/i386_defconfig
arch/x86/crypto/chacha20-ssse3-x86_64.S
arch/x86/crypto/crc32-pclmul_glue.c
arch/x86/crypto/crc32c-intel_glue.c
arch/x86/crypto/crct10dif-pclmul_glue.c
arch/x86/entry/calling.h
arch/x86/entry/common.c
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/entry/entry_64_compat.S
arch/x86/entry/syscall_32.c
arch/x86/entry/syscall_64.c
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/entry/syscalls/syscalltbl.sh
arch/x86/entry/vdso/vdso2c.h
arch/x86/entry/vdso/vdso32-setup.c
arch/x86/entry/vdso/vdso32/system_call.S
arch/x86/entry/vdso/vma.c
arch/x86/entry/vsyscall/vsyscall_gtod.c
arch/x86/events/Makefile [new file with mode: 0644]
arch/x86/events/amd/core.c [new file with mode: 0644]
arch/x86/events/amd/ibs.c [new file with mode: 0644]
arch/x86/events/amd/iommu.c [new file with mode: 0644]
arch/x86/events/amd/iommu.h [new file with mode: 0644]
arch/x86/events/amd/uncore.c [new file with mode: 0644]
arch/x86/events/core.c [new file with mode: 0644]
arch/x86/events/intel/bts.c [new file with mode: 0644]
arch/x86/events/intel/core.c [new file with mode: 0644]
arch/x86/events/intel/cqm.c [new file with mode: 0644]
arch/x86/events/intel/cstate.c [new file with mode: 0644]
arch/x86/events/intel/ds.c [new file with mode: 0644]
arch/x86/events/intel/knc.c [new file with mode: 0644]
arch/x86/events/intel/lbr.c [new file with mode: 0644]
arch/x86/events/intel/p4.c [new file with mode: 0644]
arch/x86/events/intel/p6.c [new file with mode: 0644]
arch/x86/events/intel/pt.c [new file with mode: 0644]
arch/x86/events/intel/pt.h [new file with mode: 0644]
arch/x86/events/intel/rapl.c [new file with mode: 0644]
arch/x86/events/intel/uncore.c [new file with mode: 0644]
arch/x86/events/intel/uncore.h [new file with mode: 0644]
arch/x86/events/intel/uncore_nhmex.c [new file with mode: 0644]
arch/x86/events/intel/uncore_snb.c [new file with mode: 0644]
arch/x86/events/intel/uncore_snbep.c [new file with mode: 0644]
arch/x86/events/msr.c [new file with mode: 0644]
arch/x86/events/perf_event.h [new file with mode: 0644]
arch/x86/include/asm/alternative.h
arch/x86/include/asm/amd_nb.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/arch_hweight.h
arch/x86/include/asm/asm.h
arch/x86/include/asm/barrier.h
arch/x86/include/asm/bitops.h
arch/x86/include/asm/cacheflush.h
arch/x86/include/asm/clocksource.h
arch/x86/include/asm/cmpxchg.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/cpufeatures.h [new file with mode: 0644]
arch/x86/include/asm/desc_defs.h
arch/x86/include/asm/dmi.h
arch/x86/include/asm/elf.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/fpu/internal.h
arch/x86/include/asm/fpu/xstate.h
arch/x86/include/asm/frame.h
arch/x86/include/asm/imr.h
arch/x86/include/asm/ipi.h
arch/x86/include/asm/irq.h
arch/x86/include/asm/irq_work.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/livepatch.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/microcode.h
arch/x86/include/asm/microcode_intel.h
arch/x86/include/asm/mmu.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/mwait.h
arch/x86/include/asm/pci_x86.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/proto.h
arch/x86/include/asm/sections.h
arch/x86/include/asm/sighandling.h
arch/x86/include/asm/smap.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/string_64.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/tsc.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uaccess_32.h
arch/x86/include/asm/uaccess_64.h
arch/x86/include/asm/vdso.h
arch/x86/include/asm/vgtod.h
arch/x86/include/asm/xen/pci.h
arch/x86/include/uapi/asm/sigcontext.h
arch/x86/include/uapi/asm/ucontext.h
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/apic_numachip.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/ipi.c
arch/x86/kernel/apic/vector.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs_64.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cyrix.c
arch/x86/kernel/cpu/hypervisor.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/intel_pt.h [deleted file]
arch/x86/kernel/cpu/match.c
arch/x86/kernel/cpu/mcheck/mce-inject.c
arch/x86/kernel/cpu/mcheck/mce-severity.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/p5.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/cpu/mcheck/winchip.c
arch/x86/kernel/cpu/microcode/amd.c
arch/x86/kernel/cpu/microcode/core.c
arch/x86/kernel/cpu/microcode/intel.c
arch/x86/kernel/cpu/microcode/intel_lib.c
arch/x86/kernel/cpu/mkcapflags.sh
arch/x86/kernel/cpu/mshyperv.c
arch/x86/kernel/cpu/mtrr/centaur.c
arch/x86/kernel/cpu/mtrr/cleanup.c
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/perf_event.c [deleted file]
arch/x86/kernel/cpu/perf_event.h [deleted file]
arch/x86/kernel/cpu/perf_event_amd.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_ibs.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_iommu.c [deleted file]
arch/x86/kernel/cpu/perf_event_amd_iommu.h [deleted file]
arch/x86/kernel/cpu/perf_event_amd_uncore.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_bts.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_cqm.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_cstate.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_ds.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_lbr.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_pt.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_rapl.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore.h [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c [deleted file]
arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c [deleted file]
arch/x86/kernel/cpu/perf_event_knc.c [deleted file]
arch/x86/kernel/cpu/perf_event_msr.c [deleted file]
arch/x86/kernel/cpu/perf_event_p4.c [deleted file]
arch/x86/kernel/cpu/perf_event_p6.c [deleted file]
arch/x86/kernel/cpu/rdrand.c
arch/x86/kernel/cpu/topology.c
arch/x86/kernel/cpu/transmeta.c
arch/x86/kernel/cpu/vmware.c
arch/x86/kernel/crash.c
arch/x86/kernel/e820.c
arch/x86/kernel/fpu/core.c
arch/x86/kernel/fpu/init.c
arch/x86/kernel/fpu/xstate.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/hpet.c
arch/x86/kernel/irq.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/mcount_64.S
arch/x86/kernel/mpparse.c
arch/x86/kernel/msr.c
arch/x86/kernel/nmi.c
arch/x86/kernel/pmem.c
arch/x86/kernel/process.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/test_nx.c
arch/x86/kernel/test_rodata.c
arch/x86/kernel/traps.c
arch/x86/kernel/tsc.c
arch/x86/kernel/verify_cpu.S
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/kvm/emulate.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/lguest/boot.c
arch/x86/lib/clear_page_64.S
arch/x86/lib/cmdline.c
arch/x86/lib/copy_page_64.S
arch/x86/lib/copy_user_64.S
arch/x86/lib/delay.c
arch/x86/lib/memcpy_64.S
arch/x86/lib/memmove_64.S
arch/x86/lib/memset_64.S
arch/x86/mm/dump_pagetables.c
arch/x86/mm/extable.c
arch/x86/mm/fault.c
arch/x86/mm/gup.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/kmmio.c
arch/x86/mm/mmap.c
arch/x86/mm/mpx.c
arch/x86/mm/numa.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/setup_nx.c
arch/x86/oprofile/op_model_amd.c
arch/x86/pci/common.c
arch/x86/pci/intel_mid_pci.c
arch/x86/pci/irq.c
arch/x86/pci/xen.c
arch/x86/platform/efi/quirks.c
arch/x86/platform/geode/alix.c
arch/x86/platform/geode/geos.c
arch/x86/platform/geode/net5501.c
arch/x86/platform/intel-mid/intel-mid.c
arch/x86/platform/intel-mid/mfld.c
arch/x86/platform/intel-mid/mrfl.c
arch/x86/platform/intel-quark/imr.c
arch/x86/platform/intel-quark/imr_selftest.c
arch/x86/um/asm/barrier.h
arch/x86/um/os-Linux/task_size.c
arch/x86/um/sys_call_table_32.c
arch/x86/um/sys_call_table_64.c
arch/x86/um/user-offsets.c
arch/x86/xen/enlighten.c
arch/x86/xen/pmu.c
arch/x86/xen/smp.c
arch/xtensa/kernel/smp.c
block/Kconfig
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-map.c
block/blk-merge.c
block/blk-mq.c
block/blk-settings.c
block/blk-sysfs.c
block/deadline-iosched.c
block/ioctl.c
block/partition-generic.c
crypto/Kconfig
crypto/algif_hash.c
crypto/algif_skcipher.c
crypto/asymmetric_keys/pkcs7_parser.c
crypto/crypto_user.c
crypto/shash.c
drivers/acpi/acpi_lpss.c
drivers/acpi/acpi_platform.c
drivers/acpi/acpica/psargs.c
drivers/acpi/apei/einj.c
drivers/acpi/nfit.c
drivers/acpi/pci_irq.c
drivers/acpi/pci_link.c
drivers/acpi/video_detect.c
drivers/android/binder.c
drivers/ata/ahci.c
drivers/ata/ahci.h
drivers/ata/ahci_brcmstb.c
drivers/ata/ahci_xgene.c
drivers/ata/libahci.c
drivers/ata/libata-core.c
drivers/ata/libata-scsi.c
drivers/ata/libata-sff.c
drivers/ata/pata_rb532_cf.c
drivers/base/component.c
drivers/base/platform-msi.c
drivers/base/platform.c
drivers/base/power/common.c
drivers/base/power/domain.c
drivers/base/property.c
drivers/base/regmap/regmap-mmio.c
drivers/block/floppy.c
drivers/block/null_blk.c
drivers/block/xen-blkfront.c
drivers/bus/Kconfig
drivers/bus/vexpress-config.c
drivers/char/hpet.c
drivers/char/hw_random/Kconfig
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/random.c
drivers/clk/Makefile
drivers/clk/clk-gpio.c
drivers/clk/clk-scpi.c
drivers/clk/mvebu/dove-divider.c
drivers/clk/qcom/gcc-apq8084.c
drivers/clk/qcom/gcc-ipq806x.c
drivers/clk/qcom/gcc-msm8660.c
drivers/clk/qcom/gcc-msm8916.c
drivers/clk/qcom/gcc-msm8960.c
drivers/clk/qcom/gcc-msm8974.c
drivers/clk/qcom/lcc-ipq806x.c
drivers/clk/qcom/lcc-msm8960.c
drivers/clk/qcom/mmcc-apq8084.c
drivers/clk/qcom/mmcc-msm8960.c
drivers/clk/qcom/mmcc-msm8974.c
drivers/clk/rockchip/clk-rk3036.c
drivers/clk/rockchip/clk-rk3368.c
drivers/clk/tegra/clk-emc.c
drivers/clk/tegra/clk-id.h
drivers/clk/tegra/clk-pll.c
drivers/clk/tegra/clk-tegra-periph.c
drivers/clk/tegra/clk-tegra-super-gen4.c
drivers/clk/tegra/clk-tegra210.c
drivers/clk/ti/dpll3xxx.c
drivers/clk/versatile/clk-icst.c
drivers/clocksource/Kconfig
drivers/clocksource/arm_arch_timer.c
drivers/clocksource/arm_global_timer.c
drivers/clocksource/exynos_mct.c
drivers/clocksource/rockchip_timer.c
drivers/clocksource/tcb_clksrc.c
drivers/clocksource/time-lpc32xx.c
drivers/cpufreq/Kconfig
drivers/cpufreq/Kconfig.arm
drivers/cpufreq/cpufreq-dt.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/cpufreq_governor.c
drivers/cpufreq/intel_pstate.c
drivers/cpufreq/mt8173-cpufreq.c
drivers/cpufreq/pxa2xx-cpufreq.c
drivers/cpuidle/coupled.c
drivers/cpuidle/cpuidle.c
drivers/crypto/atmel-aes.c
drivers/crypto/atmel-sha.c
drivers/crypto/caam/ctrl.c
drivers/crypto/marvell/cesa.c
drivers/devfreq/tegra-devfreq.c
drivers/dma/at_xdmac.c
drivers/dma/dw/core.c
drivers/dma/dw/pci.c
drivers/dma/edma.c
drivers/dma/fsldma.c
drivers/dma/ioat/dma.c
drivers/dma/iop-adma.c
drivers/dma/mv_xor.c
drivers/dma/pxa_dma.c
drivers/dma/qcom_bam_dma.c
drivers/edac/mce_amd.c
drivers/edac/sb_edac.c
drivers/firmware/efi/efivars.c
drivers/firmware/efi/vars.c
drivers/gpio/gpio-altera.c
drivers/gpio/gpio-davinci.c
drivers/gpio/gpio-rcar.c
drivers/gpu/drm/amd/amdgpu/Makefile
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/atombios_dp.c
drivers/gpu/drm/amd/amdgpu/ci_dpm.c
drivers/gpu/drm/amd/amdgpu/cik.c
drivers/gpu/drm/amd/amdgpu/cik_sdma.c
drivers/gpu/drm/amd/amdgpu/cz_dpm.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
drivers/gpu/drm/amd/amdgpu/iceland_smc.c
drivers/gpu/drm/amd/amdgpu/kv_dpm.c
drivers/gpu/drm/amd/amdgpu/tonga_dpm.c
drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
drivers/gpu/drm/amd/amdgpu/vi.c
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/include/amd_shared.h
drivers/gpu/drm/amd/include/cgs_common.h
drivers/gpu/drm/amd/powerplay/amd_powerplay.c
drivers/gpu/drm/amd/powerplay/eventmgr/eventactionchains.c
drivers/gpu/drm/amd/powerplay/eventmgr/eventmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/cz_clockpowergating.c
drivers/gpu/drm/amd/powerplay/hwmgr/cz_hwmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/tonga_hwmgr.c
drivers/gpu/drm/amd/powerplay/smumgr/cz_smumgr.c
drivers/gpu/drm/ast/ast_main.c
drivers/gpu/drm/drm_atomic.c
drivers/gpu/drm/drm_atomic_helper.c
drivers/gpu/drm/drm_crtc.c
drivers/gpu/drm/drm_dp_mst_topology.c
drivers/gpu/drm/drm_gem_cma_helper.c
drivers/gpu/drm/drm_irq.c
drivers/gpu/drm/etnaviv/common.xml.h
drivers/gpu/drm/etnaviv/etnaviv_drv.c
drivers/gpu/drm/etnaviv/etnaviv_drv.h
drivers/gpu/drm/etnaviv/etnaviv_dump.c
drivers/gpu/drm/etnaviv/etnaviv_gem.c
drivers/gpu/drm/etnaviv/etnaviv_gem.h
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
drivers/gpu/drm/etnaviv/etnaviv_gpu.c
drivers/gpu/drm/etnaviv/etnaviv_gpu.h
drivers/gpu/drm/etnaviv/state_hi.xml.h
drivers/gpu/drm/exynos/Kconfig
drivers/gpu/drm/exynos/exynos5433_drm_decon.c
drivers/gpu/drm/exynos/exynos_dp_core.c
drivers/gpu/drm/exynos/exynos_drm_dsi.c
drivers/gpu/drm/exynos/exynos_drm_fbdev.c
drivers/gpu/drm/exynos/exynos_drm_fimc.c
drivers/gpu/drm/exynos/exynos_drm_g2d.c
drivers/gpu/drm/exynos/exynos_drm_gem.c
drivers/gpu/drm/exynos/exynos_drm_gsc.c
drivers/gpu/drm/exynos/exynos_drm_ipp.c
drivers/gpu/drm/exynos/exynos_drm_mic.c
drivers/gpu/drm/exynos/exynos_drm_rotator.c
drivers/gpu/drm/exynos/exynos_drm_vidi.c
drivers/gpu/drm/exynos/exynos_mixer.c
drivers/gpu/drm/i2c/adv7511.c
drivers/gpu/drm/i2c/adv7511.h
drivers/gpu/drm/i2c/tda998x_drv.c
drivers/gpu/drm/i915/Kconfig
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_userptr.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_suspend.c
drivers/gpu/drm/i915/intel_audio.c
drivers/gpu/drm/i915/intel_crt.c
drivers/gpu/drm/i915/intel_csr.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp.c
drivers/gpu/drm/i915/intel_dp_link_training.c
drivers/gpu/drm/i915/intel_drv.h
drivers/gpu/drm/i915/intel_dsi.c
drivers/gpu/drm/i915/intel_dsi_panel_vbt.c
drivers/gpu/drm/i915/intel_hdmi.c
drivers/gpu/drm/i915/intel_i2c.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_lvds.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_runtime_pm.c
drivers/gpu/drm/imx/ipuv3-crtc.c
drivers/gpu/drm/imx/ipuv3-plane.c
drivers/gpu/drm/nouveau/nouveau_bo.c
drivers/gpu/drm/nouveau/nouveau_display.c
drivers/gpu/drm/nouveau/nouveau_platform.c
drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
drivers/gpu/drm/nouveau/nvkm/engine/disp/dport.c
drivers/gpu/drm/nouveau/nvkm/engine/disp/dport.h
drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
drivers/gpu/drm/omapdrm/omap_gem.c
drivers/gpu/drm/qxl/qxl_ioctl.c
drivers/gpu/drm/qxl/qxl_prime.c
drivers/gpu/drm/radeon/atombios_dp.c
drivers/gpu/drm/radeon/dce6_afmt.c
drivers/gpu/drm/radeon/evergreen_hdmi.c
drivers/gpu/drm/radeon/evergreend.h
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_atombios.c
drivers/gpu/drm/radeon/radeon_audio.c
drivers/gpu/drm/radeon/radeon_audio.h
drivers/gpu/drm/radeon/radeon_device.c
drivers/gpu/drm/radeon/radeon_display.c
drivers/gpu/drm/radeon/radeon_gem.c
drivers/gpu/drm/radeon/radeon_object.c
drivers/gpu/drm/radeon/radeon_pm.c
drivers/gpu/drm/radeon/radeon_sa.c
drivers/gpu/drm/radeon/radeon_ttm.c
drivers/gpu/drm/radeon/vce_v1_0.c
drivers/gpu/drm/rockchip/Makefile
drivers/gpu/drm/rockchip/dw-mipi-dsi.c
drivers/gpu/drm/rockchip/rockchip_drm_drv.c
drivers/gpu/drm/rockchip/rockchip_drm_fb.c
drivers/gpu/drm/rockchip/rockchip_drm_fbdev.h
drivers/gpu/drm/rockchip/rockchip_drm_gem.c
drivers/gpu/drm/rockchip/rockchip_drm_vop.c
drivers/gpu/drm/sti/sti_cursor.c
drivers/gpu/drm/sti/sti_gdp.c
drivers/gpu/drm/sti/sti_hqvdp.c
drivers/gpu/drm/tegra/gem.c
drivers/gpu/drm/vc4/vc4_bo.c
drivers/gpu/drm/vc4/vc4_drv.h
drivers/gpu/drm/vc4/vc4_gem.c
drivers/gpu/drm/vc4/vc4_irq.c
drivers/gpu/drm/vc4/vc4_render_cl.c
drivers/gpu/drm/vc4/vc4_v3d.c
drivers/gpu/drm/vc4/vc4_validate.c
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c
drivers/gpu/host1x/bus.c
drivers/gpu/host1x/cdma.c
drivers/gpu/host1x/dev.c
drivers/gpu/host1x/dev.h
drivers/gpu/host1x/job.c
drivers/gpu/ipu-v3/ipu-common.c
drivers/hwmon/ads1015.c
drivers/hwmon/dell-smm-hwmon.c
drivers/hwmon/fam15h_power.c
drivers/hwmon/gpio-fan.c
drivers/hwspinlock/hwspinlock_core.c
drivers/i2c/busses/i2c-brcmstb.c
drivers/i2c/busses/i2c-designware-core.c
drivers/i2c/busses/i2c-i801.c
drivers/i2c/busses/i2c-omap.c
drivers/i2c/busses/i2c-piix4.c
drivers/i2c/busses/i2c-uniphier-f.c
drivers/i2c/busses/i2c-uniphier.c
drivers/iio/accel/Kconfig
drivers/iio/adc/Kconfig
drivers/iio/adc/ti_am335x_adc.c
drivers/iio/dac/mcp4725.c
drivers/iio/humidity/dht11.c
drivers/iio/imu/adis_buffer.c
drivers/iio/imu/inv_mpu6050/Kconfig
drivers/iio/inkern.c
drivers/iio/light/acpi-als.c
drivers/iio/light/ltr501.c
drivers/iio/pressure/mpl115.c
drivers/iio/proximity/pulsedlight-lidar-lite-v2.c
drivers/infiniband/core/device.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ud_header.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/srq.c
drivers/infiniband/hw/ocrdma/ocrdma.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_stats.c
drivers/infiniband/hw/ocrdma/ocrdma_stats.h
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/input/joystick/xpad.c
drivers/input/keyboard/adp5589-keys.c
drivers/input/keyboard/cap11xx.c
drivers/input/misc/Kconfig
drivers/input/misc/sirfsoc-onkey.c
drivers/input/mouse/vmmouse.c
drivers/input/serio/serio.c
drivers/input/touchscreen/colibri-vf50-ts.c
drivers/input/touchscreen/edt-ft5x06.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/dmar.c
drivers/iommu/intel-iommu.c
drivers/iommu/intel-svm.c
drivers/iommu/intel_irq_remapping.c
drivers/iommu/io-pgtable-arm.c
drivers/irqchip/Kconfig
drivers/irqchip/Makefile
drivers/irqchip/irq-alpine-msi.c [new file with mode: 0644]
drivers/irqchip/irq-armada-370-xp.c
drivers/irqchip/irq-ath79-cpu.c [new file with mode: 0644]
drivers/irqchip/irq-ath79-misc.c [new file with mode: 0644]
drivers/irqchip/irq-atmel-aic-common.c
drivers/irqchip/irq-atmel-aic-common.h
drivers/irqchip/irq-atmel-aic.c
drivers/irqchip/irq-atmel-aic5.c
drivers/irqchip/irq-bcm2836.c
drivers/irqchip/irq-bcm6345-l1.c [new file with mode: 0644]
drivers/irqchip/irq-gic-realview.c
drivers/irqchip/irq-gic-v2m.c
drivers/irqchip/irq-gic-v3-its.c
drivers/irqchip/irq-gic-v3.c
drivers/irqchip/irq-gic.c
drivers/irqchip/irq-mips-gic.c
drivers/irqchip/irq-mvebu-odmi.c [new file with mode: 0644]
drivers/irqchip/irq-mxs.c
drivers/irqchip/irq-s3c24xx.c
drivers/irqchip/irq-sun4i.c
drivers/irqchip/irq-sunxi-nmi.c
drivers/irqchip/irq-tango.c [new file with mode: 0644]
drivers/irqchip/irq-ts4800.c
drivers/isdn/gigaset/ser-gigaset.c
drivers/isdn/hardware/mISDN/netjet.c
drivers/lightnvm/core.c
drivers/lightnvm/rrpc.c
drivers/lightnvm/rrpc.h
drivers/mailbox/Kconfig
drivers/mailbox/pcc.c
drivers/md/bitmap.c
drivers/md/dm.c
drivers/md/faulty.c
drivers/md/md-cluster.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/media/dvb-frontends/tda1004x.c
drivers/media/i2c/adp1653.c
drivers/media/i2c/adv7604.c
drivers/media/i2c/ir-kbd-i2c.c
drivers/media/i2c/s5k6a3.c
drivers/media/media-device.c
drivers/media/pci/saa7134/saa7134-alsa.c
drivers/media/platform/Kconfig
drivers/media/platform/coda/coda-bit.c
drivers/media/platform/exynos4-is/Kconfig
drivers/media/platform/exynos4-is/fimc-is.c
drivers/media/platform/exynos4-is/fimc-isp-video.c
drivers/media/platform/exynos4-is/media-dev.c
drivers/media/platform/soc_camera/atmel-isi.c
drivers/media/platform/soc_camera/soc_camera.c
drivers/media/platform/vsp1/vsp1_drv.c
drivers/media/platform/vsp1/vsp1_video.c
drivers/media/usb/au0828/au0828-video.c
drivers/media/v4l2-core/videobuf2-core.c
drivers/media/v4l2-core/videobuf2-v4l2.c
drivers/mfd/db8500-prcmu.c
drivers/misc/cxl/pci.c
drivers/misc/lkdtm.c
drivers/misc/mei/main.c
drivers/mmc/card/block.c
drivers/mmc/host/mmc_spi.c
drivers/mmc/host/omap_hsmmc.c
drivers/mmc/host/pxamci.c
drivers/mmc/host/sdhci-acpi.c
drivers/mmc/host/sdhci-of-at91.c
drivers/mmc/host/sdhci-pci-core.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sdhci.h
drivers/mmc/host/sh_mmcif.c
drivers/mtd/tests/mtd_nandecctest.c
drivers/mtd/ubi/upd.c
drivers/net/bonding/bond_main.c
drivers/net/can/spi/mcp251x.c
drivers/net/can/usb/ems_usb.c
drivers/net/can/usb/gs_usb.c
drivers/net/dsa/mv88e6352.c
drivers/net/dsa/mv88e6xxx.c
drivers/net/ethernet/3com/3c59x.c
drivers/net/ethernet/8390/pcnet_cs.c
drivers/net/ethernet/agere/et131x.c
drivers/net/ethernet/altera/altera_tse_main.c
drivers/net/ethernet/amd/am79c961a.c
drivers/net/ethernet/amd/lance.c
drivers/net/ethernet/apm/xgene/xgene_enet_main.c
drivers/net/ethernet/apm/xgene/xgene_enet_main.h
drivers/net/ethernet/arc/emac_main.c
drivers/net/ethernet/aurora/nb8800.c
drivers/net/ethernet/broadcom/Kconfig
drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_reg.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/bnxt/bnxt.h
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/broadcom/genet/bcmmii.c
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/brocade/bna/bna_tx_rx.c
drivers/net/ethernet/cadence/macb.c
drivers/net/ethernet/cavium/liquidio/lio_main.c
drivers/net/ethernet/cavium/liquidio/octeon_droq.c
drivers/net/ethernet/cavium/thunder/nic.h
drivers/net/ethernet/cavium/thunder/nic_main.c
drivers/net/ethernet/cavium/thunder/nic_reg.h
drivers/net/ethernet/cavium/thunder/nicvf_main.c
drivers/net/ethernet/cavium/thunder/nicvf_queues.c
drivers/net/ethernet/cavium/thunder/nicvf_queues.h
drivers/net/ethernet/chelsio/cxgb3/t3_hw.c
drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
drivers/net/ethernet/cisco/enic/enic.h
drivers/net/ethernet/cisco/enic/vnic_dev.c
drivers/net/ethernet/davicom/dm9000.c
drivers/net/ethernet/emulex/benet/be.h
drivers/net/ethernet/emulex/benet/be_cmds.h
drivers/net/ethernet/emulex/benet/be_main.c
drivers/net/ethernet/ethoc.c
drivers/net/ethernet/ezchip/Kconfig
drivers/net/ethernet/freescale/Makefile
drivers/net/ethernet/freescale/fec.h
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/freescale/fman/fman.c
drivers/net/ethernet/freescale/fs_enet/mac-fcc.c
drivers/net/ethernet/freescale/gianfar.c
drivers/net/ethernet/fujitsu/fmvj18x_cs.c
drivers/net/ethernet/hisilicon/Kconfig
drivers/net/ethernet/hisilicon/hns/hnae.c
drivers/net/ethernet/hisilicon/hns/hnae.h
drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h
drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h
drivers/net/ethernet/hisilicon/hns/hns_enet.c
drivers/net/ethernet/hisilicon/hns/hns_enet.h
drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
drivers/net/ethernet/hp/hp100.c
drivers/net/ethernet/ibm/ibmveth.c
drivers/net/ethernet/ibm/ibmvnic.c
drivers/net/ethernet/ibm/ibmvnic.h
drivers/net/ethernet/intel/Kconfig
drivers/net/ethernet/intel/e1000e/defines.h
drivers/net/ethernet/intel/e1000e/ptp.c
drivers/net/ethernet/intel/e1000e/regs.h
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/jme.c
drivers/net/ethernet/marvell/mv643xx_eth.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/marvell/mvpp2.c
drivers/net/ethernet/mellanox/mlx4/catas.c
drivers/net/ethernet/mellanox/mlx4/cmd.c
drivers/net/ethernet/mellanox/mlx4/cq.c
drivers/net/ethernet/mellanox/mlx4/en_clock.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_port.c
drivers/net/ethernet/mellanox/mlx4/en_resources.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/eq.c
drivers/net/ethernet/mellanox/mlx4/main.c
drivers/net/ethernet/mellanox/mlx4/pd.c
drivers/net/ethernet/mellanox/mlx4/port.c
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
drivers/net/ethernet/mellanox/mlxsw/pci.c
drivers/net/ethernet/mellanox/mlxsw/port.h
drivers/net/ethernet/mellanox/mlxsw/reg.h
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
drivers/net/ethernet/moxa/moxart_ether.c
drivers/net/ethernet/moxa/moxart_ether.h
drivers/net/ethernet/neterion/vxge/vxge-main.c
drivers/net/ethernet/qualcomm/qca_spi.c
drivers/net/ethernet/realtek/r8169.c
drivers/net/ethernet/renesas/ravb_main.c
drivers/net/ethernet/renesas/sh_eth.c
drivers/net/ethernet/rocker/rocker.c
drivers/net/ethernet/smsc/smc91x.c
drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
drivers/net/ethernet/sun/sunvnet.c
drivers/net/ethernet/synopsys/dwc_eth_qos.c
drivers/net/ethernet/ti/cpsw-phy-sel.c
drivers/net/ethernet/ti/davinci_cpdma.c
drivers/net/ethernet/ti/netcp_core.c
drivers/net/fddi/defxx.c
drivers/net/geneve.c
drivers/net/hyperv/hyperv_net.h
drivers/net/hyperv/netvsc.c
drivers/net/hyperv/netvsc_drv.c
drivers/net/irda/bfin_sir.h
drivers/net/macvlan.c
drivers/net/phy/Kconfig
drivers/net/phy/bcm7xxx.c
drivers/net/phy/dp83640.c
drivers/net/phy/marvell.c
drivers/net/phy/micrel.c
drivers/net/phy/phy.c
drivers/net/phy/phy_device.c
drivers/net/phy/smsc.c
drivers/net/ppp/ppp_generic.c
drivers/net/ppp/pppoe.c
drivers/net/ppp/pptp.c
drivers/net/usb/Kconfig
drivers/net/usb/Makefile
drivers/net/usb/ax88172a.c
drivers/net/usb/cdc_ncm.c
drivers/net/usb/lan78xx.c
drivers/net/usb/qmi_wwan.c
drivers/net/usb/usbnet.c
drivers/net/vmxnet3/vmxnet3_defs.h
drivers/net/vmxnet3/vmxnet3_drv.c
drivers/net/vmxnet3/vmxnet3_int.h
drivers/net/vrf.c
drivers/net/vxlan.c
drivers/net/wan/dscc4.c
drivers/net/wireless/ath/ath9k/eeprom.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.h
drivers/net/wireless/intel/iwlwifi/Kconfig
drivers/net/wireless/intel/iwlwifi/iwl-7000.c
drivers/net/wireless/intel/iwlwifi/iwl-8000.c
drivers/net/wireless/intel/iwlwifi/iwl-drv.c
drivers/net/wireless/intel/iwlwifi/mvm/fw-api-tx.h
drivers/net/wireless/intel/iwlwifi/mvm/fw.c
drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
drivers/net/wireless/intel/iwlwifi/mvm/ops.c
drivers/net/wireless/intel/iwlwifi/mvm/rs.c
drivers/net/wireless/intel/iwlwifi/mvm/scan.c
drivers/net/wireless/intel/iwlwifi/mvm/tx.c
drivers/net/wireless/intel/iwlwifi/pcie/drv.c
drivers/net/wireless/intel/iwlwifi/pcie/internal.h
drivers/net/wireless/intel/iwlwifi/pcie/rx.c
drivers/net/wireless/intel/iwlwifi/pcie/trans.c
drivers/net/wireless/mac80211_hwsim.c
drivers/net/wireless/ralink/rt2x00/rt2400pci.c
drivers/net/wireless/ralink/rt2x00/rt2500pci.c
drivers/net/wireless/ralink/rt2x00/rt2500usb.c
drivers/net/wireless/ralink/rt2x00/rt2800lib.c
drivers/net/wireless/ralink/rt2x00/rt2x00.h
drivers/net/wireless/ralink/rt2x00/rt2x00config.c
drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
drivers/net/wireless/ralink/rt2x00/rt61pci.c
drivers/net/wireless/ralink/rt2x00/rt73usb.c
drivers/net/wireless/realtek/rtlwifi/rc.c
drivers/net/wireless/realtek/rtlwifi/regd.c
drivers/net/wireless/ti/wlcore/io.c
drivers/net/wireless/ti/wlcore/io.h
drivers/net/xen-netfront.c
drivers/nvdimm/bus.c
drivers/nvdimm/e820.c
drivers/nvdimm/namespace_devs.c
drivers/nvdimm/pfn_devs.c
drivers/nvdimm/pmem.c
drivers/nvme/host/Kconfig
drivers/nvme/host/core.c
drivers/nvme/host/lightnvm.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvmem/core.c
drivers/nvmem/qfprom.c
drivers/of/irq.c
drivers/of/of_mdio.c
drivers/parisc/eisa_enumerator.c
drivers/pci/host/Kconfig
drivers/pci/host/pci-keystone-dw.c
drivers/pci/host/pci-layerscape.c
drivers/pci/host/pcie-iproc.c
drivers/pci/hotplug/acpiphp_glue.c
drivers/pci/pci.c
drivers/pci/pcie/aer/aerdrv.c
drivers/pci/pcie/aer/aerdrv.h
drivers/pci/pcie/aer/aerdrv_core.c
drivers/pci/xen-pcifront.c
drivers/phy/Kconfig
drivers/phy/phy-core.c
drivers/phy/phy-twl4030-usb.c
drivers/pinctrl/mediatek/pinctrl-mtk-common.c
drivers/pinctrl/mvebu/pinctrl-mvebu.c
drivers/pinctrl/nomadik/pinctrl-abx500.c
drivers/pinctrl/pxa/pinctrl-pxa2xx.c
drivers/pinctrl/samsung/pinctrl-samsung.c
drivers/pinctrl/sunxi/pinctrl-sun8i-h3.c
drivers/platform/x86/intel-hid.c
drivers/platform/x86/intel_scu_ipcutil.c
drivers/pnp/quirks.c
drivers/power/bq27xxx_battery_i2c.c
drivers/ptp/ptp_chardev.c
drivers/ptp/ptp_ixp46x.c
drivers/rapidio/rio.c
drivers/s390/block/dasd.c
drivers/s390/block/dasd_alias.c
drivers/s390/block/dasd_diag.c
drivers/s390/cio/chp.c
drivers/s390/cio/chp.h
drivers/s390/cio/chsc.c
drivers/s390/crypto/zcrypt_error.h
drivers/s390/crypto/zcrypt_msgtype50.c
drivers/s390/crypto/zcrypt_msgtype6.c
drivers/scsi/device_handler/scsi_dh_rdac.c
drivers/scsi/hisi_sas/Kconfig
drivers/scsi/hisi_sas/hisi_sas_v1_hw.c
drivers/scsi/ipr.c
drivers/scsi/qla2xxx/qla_init.c
drivers/scsi/qla2xxx/qla_isr.c
drivers/scsi/qla2xxx/qla_mid.c
drivers/scsi/qla2xxx/qla_os.c
drivers/scsi/qla2xxx/qla_target.c
drivers/scsi/qla2xxx/qla_target.h
drivers/scsi/qla2xxx/qla_tmpl.c
drivers/scsi/qla2xxx/tcm_qla2xxx.c
drivers/scsi/scsi_devinfo.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_sysfs.c
drivers/scsi/sd.c
drivers/scsi/sg.c
drivers/scsi/sr.c
drivers/scsi/storvsc_drv.c
drivers/sh/pm_runtime.c
drivers/sh/superhyway/superhyway.c
drivers/spi/spi-atmel.c
drivers/spi/spi-bcm2835aux.c
drivers/spi/spi-fsl-espi.c
drivers/spi/spi-imx.c
drivers/spi/spi-loopback-test.c
drivers/spi/spi-omap2-mcspi.c
drivers/spi/spi-rockchip.c
drivers/ssb/Kconfig
drivers/ssb/main.c
drivers/staging/iio/adc/Kconfig
drivers/staging/iio/meter/ade7753.c
drivers/staging/media/davinci_vpfe/vpfe_video.c
drivers/staging/panel/panel.c
drivers/staging/rdma/Kconfig
drivers/staging/rdma/Makefile
drivers/staging/rdma/amso1100/Kbuild [deleted file]
drivers/staging/rdma/amso1100/Kconfig [deleted file]
drivers/staging/rdma/amso1100/TODO [deleted file]
drivers/staging/rdma/amso1100/c2.c [deleted file]
drivers/staging/rdma/amso1100/c2.h [deleted file]
drivers/staging/rdma/amso1100/c2_ae.c [deleted file]
drivers/staging/rdma/amso1100/c2_ae.h [deleted file]
drivers/staging/rdma/amso1100/c2_alloc.c [deleted file]
drivers/staging/rdma/amso1100/c2_cm.c [deleted file]
drivers/staging/rdma/amso1100/c2_cq.c [deleted file]
drivers/staging/rdma/amso1100/c2_intr.c [deleted file]
drivers/staging/rdma/amso1100/c2_mm.c [deleted file]
drivers/staging/rdma/amso1100/c2_mq.c [deleted file]
drivers/staging/rdma/amso1100/c2_mq.h [deleted file]
drivers/staging/rdma/amso1100/c2_pd.c [deleted file]
drivers/staging/rdma/amso1100/c2_provider.c [deleted file]
drivers/staging/rdma/amso1100/c2_provider.h [deleted file]
drivers/staging/rdma/amso1100/c2_qp.c [deleted file]
drivers/staging/rdma/amso1100/c2_rnic.c [deleted file]
drivers/staging/rdma/amso1100/c2_status.h [deleted file]
drivers/staging/rdma/amso1100/c2_user.h [deleted file]
drivers/staging/rdma/amso1100/c2_vq.c [deleted file]
drivers/staging/rdma/amso1100/c2_vq.h [deleted file]
drivers/staging/rdma/amso1100/c2_wr.h [deleted file]
drivers/staging/rdma/ehca/Kconfig [deleted file]
drivers/staging/rdma/ehca/Makefile [deleted file]
drivers/staging/rdma/ehca/TODO [deleted file]
drivers/staging/rdma/ehca/ehca_av.c [deleted file]
drivers/staging/rdma/ehca/ehca_classes.h [deleted file]
drivers/staging/rdma/ehca/ehca_classes_pSeries.h [deleted file]
drivers/staging/rdma/ehca/ehca_cq.c [deleted file]
drivers/staging/rdma/ehca/ehca_eq.c [deleted file]
drivers/staging/rdma/ehca/ehca_hca.c [deleted file]
drivers/staging/rdma/ehca/ehca_irq.c [deleted file]
drivers/staging/rdma/ehca/ehca_irq.h [deleted file]
drivers/staging/rdma/ehca/ehca_iverbs.h [deleted file]
drivers/staging/rdma/ehca/ehca_main.c [deleted file]
drivers/staging/rdma/ehca/ehca_mcast.c [deleted file]
drivers/staging/rdma/ehca/ehca_mrmw.c [deleted file]
drivers/staging/rdma/ehca/ehca_mrmw.h [deleted file]
drivers/staging/rdma/ehca/ehca_pd.c [deleted file]
drivers/staging/rdma/ehca/ehca_qes.h [deleted file]
drivers/staging/rdma/ehca/ehca_qp.c [deleted file]
drivers/staging/rdma/ehca/ehca_reqs.c [deleted file]
drivers/staging/rdma/ehca/ehca_sqp.c [deleted file]
drivers/staging/rdma/ehca/ehca_tools.h [deleted file]
drivers/staging/rdma/ehca/ehca_uverbs.c [deleted file]
drivers/staging/rdma/ehca/hcp_if.c [deleted file]
drivers/staging/rdma/ehca/hcp_if.h [deleted file]
drivers/staging/rdma/ehca/hcp_phyp.c [deleted file]
drivers/staging/rdma/ehca/hcp_phyp.h [deleted file]
drivers/staging/rdma/ehca/hipz_fns.h [deleted file]
drivers/staging/rdma/ehca/hipz_fns_core.h [deleted file]
drivers/staging/rdma/ehca/hipz_hw.h [deleted file]
drivers/staging/rdma/ehca/ipz_pt_fn.c [deleted file]
drivers/staging/rdma/ehca/ipz_pt_fn.h [deleted file]
drivers/staging/rdma/ipath/Kconfig [deleted file]
drivers/staging/rdma/ipath/Makefile [deleted file]
drivers/staging/rdma/ipath/TODO [deleted file]
drivers/staging/rdma/ipath/ipath_common.h [deleted file]
drivers/staging/rdma/ipath/ipath_cq.c [deleted file]
drivers/staging/rdma/ipath/ipath_debug.h [deleted file]
drivers/staging/rdma/ipath/ipath_diag.c [deleted file]
drivers/staging/rdma/ipath/ipath_dma.c [deleted file]
drivers/staging/rdma/ipath/ipath_driver.c [deleted file]
drivers/staging/rdma/ipath/ipath_eeprom.c [deleted file]
drivers/staging/rdma/ipath/ipath_file_ops.c [deleted file]
drivers/staging/rdma/ipath/ipath_fs.c [deleted file]
drivers/staging/rdma/ipath/ipath_iba6110.c [deleted file]
drivers/staging/rdma/ipath/ipath_init_chip.c [deleted file]
drivers/staging/rdma/ipath/ipath_intr.c [deleted file]
drivers/staging/rdma/ipath/ipath_kernel.h [deleted file]
drivers/staging/rdma/ipath/ipath_keys.c [deleted file]
drivers/staging/rdma/ipath/ipath_mad.c [deleted file]
drivers/staging/rdma/ipath/ipath_mmap.c [deleted file]
drivers/staging/rdma/ipath/ipath_mr.c [deleted file]
drivers/staging/rdma/ipath/ipath_qp.c [deleted file]
drivers/staging/rdma/ipath/ipath_rc.c [deleted file]
drivers/staging/rdma/ipath/ipath_registers.h [deleted file]
drivers/staging/rdma/ipath/ipath_ruc.c [deleted file]
drivers/staging/rdma/ipath/ipath_sdma.c [deleted file]
drivers/staging/rdma/ipath/ipath_srq.c [deleted file]
drivers/staging/rdma/ipath/ipath_stats.c [deleted file]
drivers/staging/rdma/ipath/ipath_sysfs.c [deleted file]
drivers/staging/rdma/ipath/ipath_uc.c [deleted file]
drivers/staging/rdma/ipath/ipath_ud.c [deleted file]
drivers/staging/rdma/ipath/ipath_user_pages.c [deleted file]
drivers/staging/rdma/ipath/ipath_user_sdma.c [deleted file]
drivers/staging/rdma/ipath/ipath_user_sdma.h [deleted file]
drivers/staging/rdma/ipath/ipath_verbs.c [deleted file]
drivers/staging/rdma/ipath/ipath_verbs.h [deleted file]
drivers/staging/rdma/ipath/ipath_verbs_mcast.c [deleted file]
drivers/staging/rdma/ipath/ipath_wc_ppc64.c [deleted file]
drivers/staging/rdma/ipath/ipath_wc_x86_64.c [deleted file]
drivers/staging/speakup/Kconfig
drivers/staging/speakup/main.c
drivers/staging/speakup/selection.c
drivers/staging/speakup/serialio.c
drivers/target/target_core_configfs.c
drivers/target/target_core_device.c
drivers/target/target_core_file.c
drivers/target/target_core_iblock.c
drivers/target/target_core_internal.h
drivers/target/target_core_tmr.c
drivers/target/target_core_transport.c
drivers/target/target_core_user.c
drivers/thermal/Kconfig
drivers/thermal/cpu_cooling.c
drivers/thermal/of-thermal.c
drivers/thermal/rcar_thermal.c
drivers/thermal/spear_thermal.c
drivers/tty/n_tty.c
drivers/tty/pty.c
drivers/tty/serial/8250/8250_pci.c
drivers/tty/serial/omap-serial.c
drivers/tty/tty_io.c
drivers/tty/tty_mutex.c
drivers/tty/vt/vt.c
drivers/usb/chipidea/ci_hdrc_pci.c
drivers/usb/chipidea/debug.c
drivers/usb/chipidea/otg.c
drivers/usb/class/cdc-acm.c
drivers/usb/class/cdc-acm.h
drivers/usb/dwc2/Kconfig
drivers/usb/dwc2/core.c
drivers/usb/dwc2/hcd_ddma.c
drivers/usb/dwc2/hcd_intr.c
drivers/usb/dwc2/platform.c
drivers/usb/dwc3/core.h
drivers/usb/dwc3/ep0.c
drivers/usb/dwc3/gadget.c
drivers/usb/gadget/legacy/inode.c
drivers/usb/gadget/udc/fsl_qe_udc.c
drivers/usb/gadget/udc/net2280.h
drivers/usb/gadget/udc/udc-core.c
drivers/usb/host/xhci-ext-caps.h
drivers/usb/host/xhci-mtk-sch.c
drivers/usb/host/xhci-mtk.c
drivers/usb/host/xhci-pci.c
drivers/usb/host/xhci-plat.c
drivers/usb/host/xhci-ring.c
drivers/usb/host/xhci.c
drivers/usb/host/xhci.h
drivers/usb/musb/musb_host.c
drivers/usb/musb/ux500.c
drivers/usb/phy/phy-msm-usb.c
drivers/usb/phy/phy-mxs-usb.c
drivers/usb/serial/Kconfig
drivers/usb/serial/Makefile
drivers/usb/serial/cp210x.c
drivers/usb/serial/ftdi_sio.c
drivers/usb/serial/ftdi_sio_ids.h
drivers/usb/serial/mxu11x0.c [deleted file]
drivers/usb/serial/option.c
drivers/usb/serial/qcserial.c
drivers/usb/serial/visor.c
drivers/vfio/pci/vfio_pci.c
drivers/vfio/platform/vfio_platform_common.c
drivers/vfio/vfio.c
drivers/vfio/vfio_iommu_type1.c
drivers/vhost/vhost.c
drivers/video/console/fbcon.c
drivers/video/fbdev/acornfb.c
drivers/video/fbdev/amba-clcd-versatile.c
drivers/video/fbdev/amba-clcd.c
drivers/video/fbdev/atmel_lcdfb.c
drivers/video/fbdev/da8xx-fb.c
drivers/video/fbdev/ep93xx-fb.c
drivers/video/fbdev/exynos/s6e8ax0.c
drivers/video/fbdev/gbefb.c
drivers/video/fbdev/imxfb.c
drivers/video/fbdev/mmp/hw/mmp_ctrl.c
drivers/video/fbdev/mx3fb.c
drivers/video/fbdev/nuc900fb.c
drivers/video/fbdev/ocfb.c
drivers/video/fbdev/omap/lcdc.c
drivers/video/fbdev/pxa168fb.c
drivers/video/fbdev/pxafb.c
drivers/video/fbdev/s3c-fb.c
drivers/video/fbdev/s3c2410fb.c
drivers/video/fbdev/sa1100fb.c
drivers/virtio/virtio_pci_common.c
drivers/virtio/virtio_pci_modern.c
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/max63xx_wdt.c
drivers/watchdog/pcwd_usb.c
drivers/watchdog/sp805_wdt.c
drivers/watchdog/sun4v_wdt.c [new file with mode: 0644]
drivers/xen/balloon.c
drivers/xen/tmem.c
drivers/xen/xen-pciback/pciback_ops.c
drivers/xen/xen-scsiback.c
drivers/xen/xenbus/xenbus_dev_frontend.c
fs/affs/file.c
fs/binfmt_elf.c
fs/block_dev.c
fs/btrfs/async-thread.c
fs/btrfs/backref.c
fs/btrfs/compression.c
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-inode.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/free-space-tree.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/sysfs.c
fs/btrfs/sysfs.h
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/tree-log.c
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.h
fs/cifs/cifs_dfs_ref.c
fs/cifs/cifsencrypt.c
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/cifssmb.c
fs/cifs/connect.c
fs/cifs/smb2pdu.c
fs/compat_ioctl.c
fs/dax.c
fs/dcache.c
fs/devpts/inode.c
fs/direct-io.c
fs/efivarfs/file.c
fs/efivarfs/inode.c
fs/efivarfs/internal.h
fs/efivarfs/super.c
fs/eventpoll.c
fs/ext2/file.c
fs/ext2/inode.c
fs/ext4/balloc.c
fs/ext4/crypto.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/fs-writeback.c
fs/hpfs/namei.c
fs/inode.c
fs/jffs2/README.Locking
fs/jffs2/build.c
fs/jffs2/dir.c
fs/jffs2/file.c
fs/jffs2/gc.c
fs/jffs2/nodelist.h
fs/namei.c
fs/ncpfs/dir.c
fs/nfs/blocklayout/extent_tree.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/nfs42proc.c
fs/nfs/nfs4proc.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/notify/mark.c
fs/ocfs2/aops.c
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/mmap.c
fs/overlayfs/dir.c
fs/overlayfs/inode.c
fs/overlayfs/super.c
fs/pnode.c
fs/proc/task_mmu.c
fs/proc/task_nommu.c
fs/read_write.c
fs/super.c
fs/timerfd.c
fs/userfaultfd.c
fs/xattr.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_log_recover.c
include/acpi/cppc_acpi.h
include/asm-generic/cputime_nsecs.h
include/asm-generic/pgtable.h
include/asm-generic/qspinlock.h
include/asm-generic/qspinlock_types.h
include/asm-generic/vmlinux.lds.h
include/drm/drm_atomic_helper.h
include/drm/drm_cache.h
include/drm/drm_crtc.h
include/drm/drm_dp_mst_helper.h
include/drm/drm_fixed.h
include/dt-bindings/clock/tegra210-car.h
include/linux/ata.h
include/linux/atomic.h
include/linux/bio.h
include/linux/blkdev.h
include/linux/cache.h
include/linux/ceph/ceph_features.h
include/linux/cgroup-defs.h
include/linux/cleancache.h
include/linux/clockchips.h
include/linux/clocksource.h
include/linux/compiler.h
include/linux/cpu.h
include/linux/cpuhotplug.h [new file with mode: 0644]
include/linux/cpuset.h
include/linux/crush/crush.h
include/linux/dax.h
include/linux/dcache.h
include/linux/devpts_fs.h
include/linux/dma-mapping.h
include/linux/efi.h
include/linux/fs.h
include/linux/fsnotify_backend.h
include/linux/ftrace.h
include/linux/gfp.h
include/linux/hrtimer.h
include/linux/init.h
include/linux/intel-iommu.h
include/linux/iommu.h
include/linux/ioport.h
include/linux/irq.h
include/linux/irqchip/mips-gic.h
include/linux/irqdomain.h
include/linux/kasan.h
include/linux/kvm_host.h
include/linux/latencytop.h
include/linux/libata.h
include/linux/libnvdimm.h
include/linux/lightnvm.h
include/linux/list.h
include/linux/lockdep.h
include/linux/memcontrol.h
include/linux/mlx4/device.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/module.h
include/linux/netdevice.h
include/linux/nfs_fs.h
include/linux/nfs_xdr.h
include/linux/notifier.h
include/linux/of.h
include/linux/pci.h
include/linux/perf_event.h
include/linux/pfn.h
include/linux/pfn_t.h
include/linux/posix-timers.h
include/linux/power/bq27xxx_battery.h
include/linux/pps_kernel.h
include/linux/ptp_clock_kernel.h
include/linux/radix-tree.h
include/linux/raid/pq.h
include/linux/random.h
include/linux/rcupdate.h
include/linux/rmap.h
include/linux/sched.h
include/linux/sched/sysctl.h
include/linux/skbuff.h
include/linux/soc/ti/knav_dma.h
include/linux/srcu.h
include/linux/stmmac.h
include/linux/swait.h [new file with mode: 0644]
include/linux/swiotlb.h
include/linux/tick.h
include/linux/timekeeper_internal.h
include/linux/timekeeping.h
include/linux/trace_events.h
include/linux/tracepoint.h
include/linux/tty.h
include/linux/ucs2_string.h
include/linux/wait.h
include/linux/workqueue.h
include/linux/writeback.h
include/media/videobuf2-core.h
include/net/af_unix.h
include/net/bluetooth/l2cap.h
include/net/dst_metadata.h
include/net/inet_connection_sock.h
include/net/ip6_route.h
include/net/ip_fib.h
include/net/ip_tunnels.h
include/net/iw_handler.h
include/net/netfilter/nf_conntrack_core.h
include/net/scm.h
include/net/sctp/structs.h
include/net/sock.h
include/net/sock_reuseport.h
include/net/tcp.h
include/sound/hdaudio.h
include/sound/rawmidi.h
include/target/target_core_backend.h
include/target/target_core_base.h
include/trace/events/asoc.h
include/trace/events/cpuhp.h [new file with mode: 0644]
include/trace/events/fence.h
include/trace/events/timer.h
include/uapi/drm/etnaviv_drm.h
include/uapi/linux/bpf.h
include/uapi/linux/fs.h
include/uapi/linux/media.h
include/uapi/linux/ndctl.h
include/uapi/linux/ptp_clock.h
init/main.c
ipc/shm.c
kernel/bpf/arraymap.c
kernel/bpf/verifier.c
kernel/cgroup.c
kernel/cpu.c
kernel/cpuset.c
kernel/debug/kdb/kdb_bp.c
kernel/events/core.c
kernel/events/hw_breakpoint.c
kernel/events/ring_buffer.c
kernel/events/uprobes.c
kernel/futex.c
kernel/irq/Kconfig
kernel/irq/Makefile
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/ipi.c [new file with mode: 0644]
kernel/irq/irqdesc.c
kernel/irq/irqdomain.c
kernel/irq/manage.c
kernel/irq/proc.c
kernel/irq/spurious.c
kernel/kexec_core.c
kernel/kexec_file.c
kernel/latencytop.c
kernel/locking/lockdep.c
kernel/locking/mcs_spinlock.h
kernel/locking/mutex.c
kernel/locking/qspinlock.c
kernel/locking/qspinlock_paravirt.h
kernel/locking/qspinlock_stat.h
kernel/locking/rtmutex.c
kernel/memremap.c
kernel/module.c
kernel/pid.c
kernel/power/Kconfig
kernel/profile.c
kernel/rcu/rcutorture.c
kernel/rcu/tiny_plugin.h
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_plugin.h
kernel/rcu/update.c
kernel/resource.c
kernel/sched/Makefile
kernel/sched/clock.c
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stats.h
kernel/sched/swait.c [new file with mode: 0644]
kernel/seccomp.c
kernel/signal.c
kernel/smp.c
kernel/smpboot.c
kernel/smpboot.h
kernel/softirq.c
kernel/sysctl.c
kernel/time/clocksource.c
kernel/time/hrtimer.c
kernel/time/itimer.c
kernel/time/jiffies.c
kernel/time/ntp.c
kernel/time/posix-cpu-timers.c
kernel/time/posix-timers.c
kernel/time/tick-sched.c
kernel/time/tick-sched.h
kernel/time/timekeeping.c
kernel/time/timer_list.c
kernel/trace/bpf_trace.c
kernel/trace/ftrace.c
kernel/trace/trace.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_kprobe.c
kernel/trace/trace_stack.c
kernel/trace/trace_syscalls.c
kernel/tsacct.c
kernel/workqueue.c
lib/Kconfig.debug
lib/Kconfig.ubsan
lib/atomic64_test.c
lib/cpumask.c
lib/debugobjects.c
lib/dump_stack.c
lib/klist.c
lib/list_debug.c
lib/radix-tree.c
lib/scatterlist.c
lib/test-string_helpers.c
lib/test_static_keys.c
lib/ucs2_string.c
lib/vsprintf.c
mm/Kconfig
mm/backing-dev.c
mm/cleancache.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/kasan/kasan.c
mm/memblock.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/migrate.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/page_alloc.c
mm/pgtable-generic.c
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slob.c
mm/slub.c
mm/util.c
mm/vmpressure.c
mm/vmscan.c
mm/vmstat.c
net/appletalk/ddp.c
net/batman-adv/gateway_client.c
net/batman-adv/hard-interface.c
net/batman-adv/translation-table.c
net/bluetooth/6lowpan.c
net/bluetooth/hci_core.c
net/bluetooth/hci_request.c
net/bluetooth/l2cap_core.c
net/bluetooth/l2cap_sock.c
net/bluetooth/smp.c
net/bridge/br.c
net/bridge/br_fdb.c
net/bridge/br_mdb.c
net/caif/cfrfml.c
net/ceph/crush/mapper.c
net/ceph/messenger.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/core/dev.c
net/core/filter.c
net/core/flow_dissector.c
net/core/rtnetlink.c
net/core/scm.c
net/core/skbuff.c
net/core/sock_reuseport.c
net/core/sysctl_net_core.c
net/dccp/ipv4.c
net/dccp/ipv6.c
net/dsa/slave.c
net/ipv4/Kconfig
net/ipv4/devinet.c
net/ipv4/fib_trie.c
net/ipv4/igmp.c
net/ipv4/inet_connection_sock.c
net/ipv4/inet_diag.c
net/ipv4/ip_fragment.c
net/ipv4/ip_gre.c
net/ipv4/ip_input.c
net/ipv4/ip_output.c
net/ipv4/ip_sockglue.c
net/ipv4/ip_tunnel.c
net/ipv4/ipconfig.c
net/ipv4/netfilter/nf_defrag_ipv4.c
net/ipv4/ping.c
net/ipv4/raw.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_metrics.c
net/ipv4/tcp_minisocks.c
net/ipv4/udp.c
net/ipv4/udp_tunnel.c
net/ipv6/Kconfig
net/ipv6/addrconf.c
net/ipv6/datagram.c
net/ipv6/exthdrs_core.c
net/ipv6/ip6_flowlabel.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_output.c
net/ipv6/ip6_tunnel.c
net/ipv6/mcast.c
net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
net/ipv6/route.c
net/ipv6/sit.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/irda/ircomm/ircomm_param.c
net/iucv/af_iucv.c
net/l2tp/l2tp_netlink.c
net/mac80211/agg-rx.c
net/mac80211/ibss.c
net/mac80211/ieee80211_i.h
net/mac80211/main.c
net/mac80211/mesh.c
net/mac80211/mesh.h
net/mac80211/mlme.c
net/mac80211/offchannel.c
net/mac80211/rc80211_minstrel.c
net/mac80211/rc80211_minstrel_ht.c
net/mac80211/rx.c
net/mac80211/scan.c
net/mac80211/sta_info.c
net/mac80211/status.c
net/mac80211/util.c
net/netfilter/Kconfig
net/netfilter/ipset/ip_set_hash_netiface.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_tables_netdev.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_cttimeout.c
net/netfilter/nft_byteorder.c
net/netfilter/nft_counter.c
net/netfilter/nft_ct.c
net/netfilter/xt_TCPMSS.c
net/netfilter/xt_TEE.c
net/netlink/af_netlink.c
net/openvswitch/vport-vxlan.c
net/rfkill/core.c
net/sched/act_ipt.c
net/sched/sch_api.c
net/sched/sch_drr.c
net/sctp/input.c
net/sctp/ipv6.c
net/sctp/proc.c
net/sctp/protocol.c
net/sctp/sm_sideeffect.c
net/sctp/socket.c
net/sctp/transport.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/cache.c
net/sunrpc/xprtrdma/backchannel.c
net/switchdev/switchdev.c
net/tipc/link.c
net/tipc/node.c
net/tipc/socket.c
net/tipc/subscr.c
net/unix/af_unix.c
net/unix/diag.c
net/unix/garbage.c
net/vmw_vsock/af_vsock.c
net/wireless/core.c
net/wireless/nl80211.c
net/wireless/reg.c
net/wireless/sme.c
net/wireless/wext-core.c
scripts/checkpatch.pl
scripts/ld-version.sh
scripts/mod/modpost.c
scripts/prune-kernel [new file with mode: 0755]
scripts/sortextable.c
security/integrity/evm/evm_main.c
security/keys/key.c
security/selinux/hooks.c
security/selinux/nlmsgtab.c
sound/arm/pxa2xx-pcm-lib.c
sound/core/Kconfig
sound/core/compress_offload.c
sound/core/control_compat.c
sound/core/oss/pcm_oss.c
sound/core/pcm_compat.c
sound/core/pcm_native.c
sound/core/rawmidi.c
sound/core/rawmidi_compat.c
sound/core/seq/oss/seq_oss.c
sound/core/seq/oss/seq_oss_device.h
sound/core/seq/oss/seq_oss_init.c
sound/core/seq/oss/seq_oss_synth.c
sound/core/seq/seq_clientmgr.c
sound/core/seq/seq_memory.c
sound/core/seq/seq_ports.c
sound/core/seq/seq_timer.c
sound/core/seq/seq_virmidi.c
sound/core/timer.c
sound/core/timer_compat.c
sound/drivers/dummy.c
sound/firewire/bebob/bebob_stream.c
sound/firewire/digi00x/amdtp-dot.c
sound/firewire/tascam/tascam-transaction.c
sound/firewire/tascam/tascam.c
sound/firewire/tascam/tascam.h
sound/hda/hdac_controller.c
sound/isa/Kconfig
sound/pci/Kconfig
sound/pci/emu10k1/emu10k1_main.c
sound/pci/hda/hda_controller.c
sound/pci/hda/hda_generic.c
sound/pci/hda/hda_intel.c
sound/pci/hda/hda_jack.c
sound/pci/hda/hda_jack.h
sound/pci/hda/patch_ca0132.c
sound/pci/hda/patch_cirrus.c
sound/pci/hda/patch_hdmi.c
sound/pci/hda/patch_realtek.c
sound/pci/hda/patch_sigmatel.c
sound/pci/rme9652/hdsp.c
sound/pci/rme9652/hdspm.c
sound/soc/amd/acp-pcm-dma.c
sound/soc/codecs/ab8500-codec.c
sound/soc/codecs/adau17x1.h
sound/soc/codecs/arizona.c
sound/soc/codecs/cs42l51.c
sound/soc/codecs/da732x.c
sound/soc/codecs/max98088.c
sound/soc/codecs/max98095.c
sound/soc/codecs/rt286.c
sound/soc/codecs/rt5645.c
sound/soc/codecs/rt5659.c
sound/soc/codecs/rt5659.h
sound/soc/codecs/sigmadsp-i2c.c
sound/soc/codecs/tlv320dac33.c
sound/soc/codecs/wl1273.c
sound/soc/codecs/wm5110.c
sound/soc/codecs/wm8753.c
sound/soc/codecs/wm8904.c
sound/soc/codecs/wm8958-dsp2.c
sound/soc/codecs/wm8960.c
sound/soc/codecs/wm8983.c
sound/soc/codecs/wm8985.c
sound/soc/codecs/wm8994.c
sound/soc/codecs/wm8996.c
sound/soc/codecs/wm9081.c
sound/soc/codecs/wm9713.c
sound/soc/codecs/wm_adsp.c
sound/soc/dwc/designware_i2s.c
sound/soc/fsl/imx-pcm-fiq.c
sound/soc/fsl/imx-spdif.c
sound/soc/generic/simple-card.c
sound/soc/intel/Kconfig
sound/soc/intel/atom/sst-mfld-platform-pcm.c
sound/soc/intel/boards/cht_bsw_rt5645.c
sound/soc/intel/boards/mfld_machine.c
sound/soc/intel/boards/skl_rt286.c
sound/soc/intel/common/Makefile
sound/soc/intel/common/sst-acpi.c
sound/soc/intel/common/sst-match-acpi.c
sound/soc/intel/skylake/skl-messages.c
sound/soc/intel/skylake/skl-pcm.c
sound/soc/intel/skylake/skl-topology.c
sound/soc/intel/skylake/skl.c
sound/soc/mediatek/Kconfig
sound/soc/mxs/mxs-saif.c
sound/soc/nuc900/nuc900-pcm.c
sound/soc/omap/n810.c
sound/soc/omap/omap-pcm.c
sound/soc/omap/rx51.c
sound/soc/pxa/corgi.c
sound/soc/pxa/magician.c
sound/soc/pxa/poodle.c
sound/soc/pxa/spitz.c
sound/soc/pxa/tosa.c
sound/soc/qcom/lpass-cpu.c
sound/soc/qcom/lpass-platform.c
sound/soc/samsung/i2s.c
sound/soc/soc-dapm.c
sound/soc/soc-pcm.c
sound/sparc/Kconfig
sound/usb/midi.c
sound/usb/quirks.c
tools/build/Makefile.build
tools/build/Makefile.feature
tools/build/feature/Makefile
tools/build/feature/test-all.c
tools/build/feature/test-compile.c
tools/build/feature/test-libcrypto.c [new file with mode: 0644]
tools/lib/api/Build
tools/lib/api/Makefile
tools/lib/api/debug-internal.h [new file with mode: 0644]
tools/lib/api/debug.c [new file with mode: 0644]
tools/lib/api/debug.h [new file with mode: 0644]
tools/lib/api/fs/fs.c
tools/lib/api/fs/fs.h
tools/lib/bpf/libbpf.c
tools/lib/lockdep/Makefile
tools/lib/lockdep/common.c
tools/lib/lockdep/include/liblockdep/common.h
tools/lib/lockdep/lockdep.c
tools/lib/lockdep/preload.c
tools/lib/lockdep/tests/AA.c
tools/lib/lockdep/tests/ABA.c [new file with mode: 0644]
tools/lib/lockdep/tests/ABBA_2threads.c [new file with mode: 0644]
tools/lib/lockdep/uinclude/linux/compiler.h
tools/lib/traceevent/event-parse.c
tools/lib/traceevent/event-parse.h
tools/perf/Documentation/perf-config.txt
tools/perf/Documentation/perf-inject.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-top.txt
tools/perf/Documentation/perfconfig.example
tools/perf/Documentation/tips.txt
tools/perf/Makefile
tools/perf/Makefile.perf
tools/perf/arch/arm/Makefile
tools/perf/arch/arm64/Makefile
tools/perf/arch/powerpc/Makefile
tools/perf/arch/powerpc/util/Build
tools/perf/arch/powerpc/util/book3s_hcalls.h [new file with mode: 0644]
tools/perf/arch/powerpc/util/book3s_hv_exits.h [new file with mode: 0644]
tools/perf/arch/powerpc/util/kvm-stat.c [new file with mode: 0644]
tools/perf/arch/s390/util/kvm-stat.c
tools/perf/arch/x86/Makefile
tools/perf/arch/x86/tests/intel-cqm.c
tools/perf/arch/x86/tests/rdpmc.c
tools/perf/arch/x86/util/intel-bts.c
tools/perf/arch/x86/util/intel-pt.c
tools/perf/arch/x86/util/kvm-stat.c
tools/perf/bench/mem-memcpy-x86-64-asm.S
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-config.c
tools/perf/builtin-diff.c
tools/perf/builtin-help.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-mem.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/config/Makefile
tools/perf/jvmti/Makefile [new file with mode: 0644]
tools/perf/jvmti/jvmti_agent.c [new file with mode: 0644]
tools/perf/jvmti/jvmti_agent.h [new file with mode: 0644]
tools/perf/jvmti/libjvmti.c [new file with mode: 0644]
tools/perf/perf.c
tools/perf/perf.h
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
tools/perf/tests/.gitignore
tools/perf/tests/Build
tools/perf/tests/bp_signal.c
tools/perf/tests/bpf-script-test-relocation.c [new file with mode: 0644]
tools/perf/tests/bpf.c
tools/perf/tests/code-reading.c
tools/perf/tests/hists_cumulate.c
tools/perf/tests/hists_filter.c
tools/perf/tests/hists_output.c
tools/perf/tests/llvm.c
tools/perf/tests/llvm.h
tools/perf/tests/make
tools/perf/tests/parse-events.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/ui/browser.c
tools/perf/ui/browser.h
tools/perf/ui/browsers/annotate.c
tools/perf/ui/browsers/hists.c
tools/perf/ui/gtk/hists.c
tools/perf/ui/hist.c
tools/perf/ui/stdio/hist.c
tools/perf/util/Build
tools/perf/util/auxtrace.c
tools/perf/util/auxtrace.h
tools/perf/util/bpf-loader.c
tools/perf/util/bpf-loader.h
tools/perf/util/build-id.c
tools/perf/util/build-id.h
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/color.c
tools/perf/util/config.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/ctype.c
tools/perf/util/data-convert-bt.c
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/demangle-java.c [new file with mode: 0644]
tools/perf/util/demangle-java.h [new file with mode: 0644]
tools/perf/util/dso.c
tools/perf/util/env.c
tools/perf/util/env.h
tools/perf/util/event.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/genelf.c [new file with mode: 0644]
tools/perf/util/genelf.h [new file with mode: 0644]
tools/perf/util/genelf_debug.c [new file with mode: 0644]
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/help-unknown-cmd.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/intel-pt.c
tools/perf/util/jit.h [new file with mode: 0644]
tools/perf/util/jitdump.c [new file with mode: 0644]
tools/perf/util/jitdump.h [new file with mode: 0644]
tools/perf/util/kvm-stat.h
tools/perf/util/machine.h
tools/perf/util/mem-events.c [new file with mode: 0644]
tools/perf/util/mem-events.h [new file with mode: 0644]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.l
tools/perf/util/parse-events.y
tools/perf/util/pmu.c
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/setup.py
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat-shadow.c
tools/perf/util/stat.c
tools/perf/util/stat.h
tools/perf/util/strbuf.c
tools/perf/util/strbuf.h
tools/perf/util/symbol-elf.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/trace-event.c
tools/perf/util/tsc.c
tools/perf/util/util.c
tools/perf/util/util.h
tools/power/x86/turbostat/turbostat.c
tools/testing/nvdimm/test/iomap.c
tools/testing/nvdimm/test/nfit.c
tools/testing/selftests/efivarfs/efivarfs.sh
tools/testing/selftests/efivarfs/open-unlink.c
tools/testing/selftests/ftrace/test.d/instances/instance.tc
tools/testing/selftests/rcutorture/bin/parse-console.sh
tools/testing/selftests/timers/valid-adjtimex.c
tools/testing/selftests/x86/Makefile
tools/testing/selftests/x86/check_initial_reg_state.c [new file with mode: 0644]
tools/testing/selftests/x86/ptrace_syscall.c
tools/testing/selftests/x86/sigreturn.c
tools/testing/selftests/x86/syscall_nt.c
tools/virtio/asm/barrier.h
tools/virtio/linux/compiler.h [new file with mode: 0644]
tools/virtio/linux/kernel.h
tools/virtio/ringtest/Makefile [new file with mode: 0644]
tools/virtio/ringtest/README [new file with mode: 0644]
tools/virtio/ringtest/main.c [new file with mode: 0644]
tools/virtio/ringtest/main.h [new file with mode: 0644]
tools/virtio/ringtest/ring.c [new file with mode: 0644]
tools/virtio/ringtest/run-on-all.sh [new file with mode: 0755]
tools/virtio/ringtest/virtio_ring_0_9.c [new file with mode: 0644]
tools/virtio/ringtest/virtio_ring_poll.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/vgic.c
virt/kvm/async_pf.c
virt/kvm/kvm_main.c

index b1e9a97..7e6c533 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -21,6 +21,7 @@ Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
 Andrew Morton <akpm@linux-foundation.org>
 Andrew Vasquez <andrew.vasquez@qlogic.com>
 Andy Adamson <andros@citi.umich.edu>
+Antonio Ospite <ao2@ao2.it> <ao2@amarulasolutions.com>
 Archit Taneja <archit@ti.com>
 Arnaud Patard <arnaud.patard@rtp-net.org>
 Arnd Bergmann <arnd@arndb.de>
index 1af3842..0ee0f33 100644 (file)
            <entry><constant>MEDIA_ENT_F_CONN_COMPOSITE</constant></entry>
            <entry>Connector for a RGB composite signal.</entry>
          </row>
-         <row>
-           <entry><constant>MEDIA_ENT_F_CONN_TEST</constant></entry>
-           <entry>Connector for a test generator.</entry>
-         </row>
          <row>
            <entry><constant>MEDIA_ENT_F_CAM_SENSOR</constant></entry>
            <entry>Camera video sensor entity.</entry>
index 7b57fc0..49585b6 100644 (file)
@@ -3,7 +3,7 @@ Linux IOMMU Support
 
 The architecture spec can be obtained from the below location.
 
-http://www.intel.com/technology/virtualization/
+http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
 
 This guide gives a quick cheat sheet for some basic understanding.
 
index 65b3eac..ff49cf9 100644 (file)
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2.  It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors.  All
 future changes must be reflected in this document.  Documentation for
-v1 is available under Documentation/cgroup-legacy/.
+v1 is available under Documentation/cgroup-v1/.
 
 CONTENTS
 
@@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back.
                Amount of memory used to cache filesystem data,
                including tmpfs and shared memory.
 
+         sock
+
+               Amount of memory used in network transmission buffers
+
          file_mapped
 
                Amount of cached filesystem data mapped with mmap()
index a2bd593..66422d6 100644 (file)
@@ -23,6 +23,7 @@ Optional properties:
   during suspend.
 - ti,no-reset-on-init: When present, the module should not be reset at init
 - ti,no-idle-on-init: When present, the module should not be idled at init
+- ti,no-idle: When present, the module is never allowed to idle.
 
 Example:
 
index ace0599..20df350 100644 (file)
@@ -30,7 +30,7 @@ that they are defined using standard clock bindings with following
 clock-output-names:
  - "xin24m" - crystal input - required,
  - "ext_i2s" - external I2S clock - optional,
- - "ext_gmac" - external GMAC clock - optional
+ - "rmii_clkin" - external EMAC clock - optional
 
 Example: Clock controller node:
 
diff --git a/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt
new file mode 100644 (file)
index 0000000..f6f1c14
--- /dev/null
@@ -0,0 +1,26 @@
+Alpine MSIX controller
+
+See arm,gic-v3.txt for SPI and MSI definitions.
+
+Required properties:
+
+- compatible: should be "al,alpine-msix"
+- reg: physical base address and size of the registers
+- interrupt-parent: specifies the parent interrupt controller.
+- interrupt-controller: identifies the node as an interrupt controller
+- msi-controller: identifies the node as an PCI Message Signaled Interrupt
+                 controller
+- al,msi-base-spi: SPI base of the MSI frame
+- al,msi-num-spis: number of SPIs assigned to the MSI frame, relative to SPI0
+
+Example:
+
+msix: msix {
+       compatible = "al,alpine-msix";
+       reg = <0x0 0xfbe00000 0x0 0x100000>;
+       interrupt-parent = <&gic>;
+       interrupt-controller;
+       msi-controller;
+       al,msi-base-spi = <160>;
+       al,msi-num-spis = <160>;
+};
index 7803e77..007a5b4 100644 (file)
@@ -24,9 +24,8 @@ Main node required properties:
                1 = edge triggered
                4 = level triggered
 
-  Cells 4 and beyond are reserved for future use. When the 1st cell
-  has a value of 0 or 1, cells 4 and beyond act as padding, and may be
-  ignored. It is recommended that padding cells have a value of 0.
+  Cells 4 and beyond are reserved for future use and must have a value
+  of 0 if present.
 
 - reg : Specifies base physical address(s) and size of the GIC
   registers, in the following order:
index 5a1cb4b..793c20f 100644 (file)
@@ -16,6 +16,7 @@ Main node required properties:
        "arm,cortex-a15-gic"
        "arm,cortex-a7-gic"
        "arm,cortex-a9-gic"
+       "arm,eb11mp-gic"
        "arm,gic-400"
        "arm,pl390"
        "arm,tc11mp-gic"
diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt
new file mode 100644 (file)
index 0000000..8af0a8e
--- /dev/null
@@ -0,0 +1,44 @@
+
+* Marvell ODMI for MSI support
+
+Some Marvell SoCs have an On-Die Message Interrupt (ODMI) controller
+which can be used by on-board peripheral for MSI interrupts.
+
+Required properties:
+
+- compatible           : The value here should contain:
+
+    "marvell,ap806-odmi-controller", "marvell,odmi-controller".
+
+- interrupt,controller : Identifies the node as an interrupt controller.
+
+- msi-controller       : Identifies the node as an MSI controller.
+
+- marvell,odmi-frames  : Number of ODMI frames available. Each frame
+                         provides a number of events.
+
+- reg                  : List of register definitions, one for each
+                         ODMI frame.
+
+- marvell,spi-base     : List of GIC base SPI interrupts, one for each
+                         ODMI frame. Those SPI interrupts are 0-based,
+                         i.e marvell,spi-base = <128> will use SPI #96.
+                         See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt
+                         for details about the GIC Device Tree binding.
+
+- interrupt-parent     : Reference to the parent interrupt controller.
+
+Example:
+
+       odmi: odmi@300000 {
+               compatible = "marvell,ap806-odm-controller",
+                            "marvell,odmi-controller";
+               interrupt-controller;
+               msi-controller;
+               marvell,odmi-frames = <4>;
+               reg = <0x300000 0x4000>,
+                     <0x304000 0x4000>,
+                     <0x308000 0x4000>,
+                     <0x30C000 0x4000>;
+               marvell,spi-base = <128>, <136>, <144>, <152>;
+       };
index aae4c38..1735953 100644 (file)
@@ -23,6 +23,12 @@ Optional properties:
 - mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors
   to which the GIC may not route interrupts.  Valid values are 2 - 7.
   This property is ignored if the CPU is started in EIC mode.
+- mti,reserved-ipi-vectors : Specifies the range of GIC interrupts that are
+  reserved for IPIs.
+  It accepts 2 values, the 1st is the starting interrupt and the 2nd is the size
+  of the reserved range.
+  If not specified, the driver will allocate the last 2 * number of VPEs in the
+  system.
 
 Required properties for timer sub-node:
 - compatible : Should be "mti,gic-timer".
@@ -44,6 +50,7 @@ Example:
                #interrupt-cells = <3>;
 
                mti,reserved-cpu-vectors = <7>;
+               mti,reserved-ipi-vectors = <40 8>;
 
                timer {
                        compatible = "mti,gic-timer";
diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
new file mode 100644 (file)
index 0000000..1f441fa
--- /dev/null
@@ -0,0 +1,49 @@
+Sigma Designs SMP86xx/SMP87xx secondary interrupt controller
+
+Required properties:
+- compatible: should be "sigma,smp8642-intc"
+- reg: physical address of MMIO region
+- ranges: address space mapping of child nodes
+- interrupt-parent: phandle of parent interrupt controller
+- interrupt-controller: boolean
+- #address-cells: should be <1>
+- #size-cells: should be <1>
+
+One child node per control block with properties:
+- reg: address of registers for this control block
+- interrupt-controller: boolean
+- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt
+- interrupts: interrupt spec of primary interrupt controller
+
+Example:
+
+interrupt-controller@6e000 {
+       compatible = "sigma,smp8642-intc";
+       reg = <0x6e000 0x400>;
+       ranges = <0x0 0x6e000 0x400>;
+       interrupt-parent = <&gic>;
+       interrupt-controller;
+       #address-cells = <1>;
+       #size-cells = <1>;
+
+       irq0: interrupt-controller@0 {
+               reg = <0x000 0x100>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
+       };
+
+       irq1: interrupt-controller@100 {
+               reg = <0x100 0x100>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               interrupts = <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>;
+       };
+
+       irq2: interrupt-controller@300 {
+               reg = <0x300 0x100>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
+       };
+};
index 451fef2..10587bd 100644 (file)
@@ -68,7 +68,7 @@ ethernet@f0b60000 {
                phy1: ethernet-phy@1 {
                        max-speed = <1000>;
                        reg = <0x1>;
-                       compatible = "brcm,28nm-gphy", "ethernet-phy-ieee802.3-c22";
+                       compatible = "ethernet-phy-ieee802.3-c22";
                };
        };
 };
@@ -115,7 +115,7 @@ ethernet@f0ba0000 {
                phy0: ethernet-phy@0 {
                        max-speed = <1000>;
                        reg = <0x0>;
-                       compatible = "brcm,bcm53125", "ethernet-phy-ieee802.3-c22";
+                       compatible = "ethernet-phy-ieee802.3-c22";
                };
        };
 };
index 80411b2..ecacfa4 100644 (file)
@@ -4,8 +4,6 @@ Required properties:
 - compatible: should be "hisilicon,hns-dsaf-v1" or "hisilicon,hns-dsaf-v2".
   "hisilicon,hns-dsaf-v1" is for hip05.
   "hisilicon,hns-dsaf-v2" is for Hi1610 and Hi1612.
-- dsa-name: dsa fabric name who provide this interface.
-  should be "dsafX", X is the dsaf id.
 - mode: dsa fabric mode string. only support one of dsaf modes like these:
                "2port-64vf",
                "6port-16rss",
@@ -26,9 +24,8 @@ Required properties:
 
 Example:
 
-dsa: dsa@c7000000 {
+dsaf0: dsa@c7000000 {
        compatible = "hisilicon,hns-dsaf-v1";
-       dsa_name = "dsaf0";
        mode = "6port-16rss";
        interrupt-parent = <&mbigen_dsa>;
        reg = <0x0 0xC0000000 0x0 0x420000
index 41d19be..e6a9d1c 100644 (file)
@@ -4,8 +4,9 @@ Required properties:
 - compatible: "hisilicon,hns-nic-v1" or "hisilicon,hns-nic-v2".
   "hisilicon,hns-nic-v1" is for hip05.
   "hisilicon,hns-nic-v2" is for Hi1610 and Hi1612.
-- ae-name: accelerator name who provides this interface,
-  is simply a name referring to the name of name in the accelerator node.
+- ae-handle: accelerator engine handle for hns,
+  specifies a reference to the associating hardware driver node.
+  see Documentation/devicetree/bindings/net/hisilicon-hns-dsaf.txt
 - port-id: is the index of port provided by DSAF (the accelerator). DSAF can
   connect to 8 PHYs. Port 0 to 1 are both used for adminstration purpose. They
   are called debug ports.
@@ -41,7 +42,7 @@ Example:
 
        ethernet@0{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <0>;
                local-mac-address = [a2 14 e4 4b 56 76];
        };
index aeea50c..d0cb869 100644 (file)
@@ -6,12 +6,17 @@ Required properties:
 - interrupts: interrupt for the device
 - phy: See ethernet.txt file in the same directory.
 - phy-mode: See ethernet.txt file in the same directory
-- clocks: a pointer to the reference clock for this device.
+- clocks: List of clocks for this device. At least one clock is
+  mandatory for the core clock. If several clocks are given, then the
+  clock-names property must be used to identify them.
 
 Optional properties:
 - tx-csum-limit: maximum mtu supported by port that allow TX checksum.
   Value is presented in bytes. If not used, by default 1600B is set for
   "marvell,armada-370-neta" and 9800B for others.
+- clock-names: List of names corresponding to clocks property; shall be
+  "core" for core clock and "bus" for the optional bus clock.
+
 
 Example:
 
index 7938411..694987d 100644 (file)
@@ -38,7 +38,6 @@ Example :
 
                        phy11: ethernet-phy@1 {
                                reg = <1>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -48,7 +47,6 @@ Example :
                        };
                        phy12: ethernet-phy@2 {
                                reg = <2>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -58,7 +56,6 @@ Example :
                        };
                        phy13: ethernet-phy@3 {
                                reg = <3>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -68,7 +65,6 @@ Example :
                        };
                        phy14: ethernet-phy@4 {
                                reg = <4>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -85,7 +81,6 @@ Example :
 
                        phy21: ethernet-phy@1 {
                                reg = <1>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -95,7 +90,6 @@ Example :
                        };
                        phy22: ethernet-phy@2 {
                                reg = <2>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -105,7 +99,6 @@ Example :
                        };
                        phy23: ethernet-phy@3 {
                                reg = <3>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -115,7 +108,6 @@ Example :
                        };
                        phy24: ethernet-phy@4 {
                                reg = <4>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
index f65606f..491f5bd 100644 (file)
@@ -47,7 +47,6 @@ Example :
 
                        phy11: ethernet-phy@1 {
                                reg = <1>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -57,7 +56,6 @@ Example :
                        };
                        phy12: ethernet-phy@2 {
                                reg = <2>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -67,7 +65,6 @@ Example :
                        };
                        phy13: ethernet-phy@3 {
                                reg = <3>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -77,7 +74,6 @@ Example :
                        };
                        phy14: ethernet-phy@4 {
                                reg = <4>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -94,7 +90,6 @@ Example :
 
                        phy21: ethernet-phy@1 {
                                reg = <1>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -104,7 +99,6 @@ Example :
                        };
                        phy22: ethernet-phy@2 {
                                reg = <2>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -114,7 +108,6 @@ Example :
                        };
                        phy23: ethernet-phy@3 {
                                reg = <3>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
@@ -124,7 +117,6 @@ Example :
                        };
                        phy24: ethernet-phy@4 {
                                reg = <4>;
-                               compatible = "marvell,88e1149r";
                                marvell,reg-init = <3 0x10 0 0x5777>,
                                        <3 0x11 0 0x00aa>,
                                        <3 0x12 0 0x4105>,
index 525e165..bc1c3c8 100644 (file)
@@ -17,8 +17,7 @@ Optional Properties:
   "ethernet-phy-ieee802.3-c22" or "ethernet-phy-ieee802.3-c45" for
   PHYs that implement IEEE802.3 clause 22 or IEEE802.3 clause 45
   specifications. If neither of these are specified, the default is to
-  assume clause 22. The compatible list may also contain other
-  elements.
+  assume clause 22.
 
   If the phy's identifier is known then the list may contain an entry
   of the form: "ethernet-phy-idAAAA.BBBB" where
@@ -28,6 +27,9 @@ Optional Properties:
             4 hex digits. This is the chip vendor OUI bits 19:24,
             followed by 10 bits of a vendor specific ID.
 
+  The compatible list should not contain other values than those
+  listed here.
+
 - max-speed: Maximum PHY supported speed (10, 100, 1000...)
 
 - broken-turn-around: If set, indicates the PHY device does not correctly
index 81a9f9e..c8ac222 100644 (file)
@@ -82,8 +82,8 @@ Example:
                                  "ch16", "ch17", "ch18", "ch19",
                                  "ch20", "ch21", "ch22", "ch23",
                                  "ch24";
-               clocks = <&mstp8_clks R8A7795_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               clocks = <&cpg CPG_MOD 812>;
+               power-domains = <&cpg>;
                phy-mode = "rgmii-id";
                phy-handle = <&phy0>;
 
index 4e8b90e..07a7509 100644 (file)
@@ -8,6 +8,7 @@ OHCI and EHCI controllers.
 Required properties:
 - compatible: "renesas,pci-r8a7790" for the R8A7790 SoC;
              "renesas,pci-r8a7791" for the R8A7791 SoC;
+             "renesas,pci-r8a7793" for the R8A7793 SoC;
              "renesas,pci-r8a7794" for the R8A7794 SoC;
              "renesas,pci-rcar-gen2" for a generic R-Car Gen2 compatible device
 
index 558fe52..6cf9969 100644 (file)
@@ -4,6 +4,7 @@ Required properties:
 compatible: "renesas,pcie-r8a7779" for the R8A7779 SoC;
            "renesas,pcie-r8a7790" for the R8A7790 SoC;
            "renesas,pcie-r8a7791" for the R8A7791 SoC;
+           "renesas,pcie-r8a7793" for the R8A7793 SoC;
            "renesas,pcie-r8a7795" for the R8A7795 SoC;
            "renesas,pcie-rcar-gen2" for a generic R-Car Gen2 compatible device.
 
index d181096..4f05d20 100644 (file)
@@ -26,11 +26,7 @@ Example:
                ti,pmic-shutdown-controller;
 
                regulators {
-                       #address-cells = <1>;
-                       #size-cells = <0>;
-
                        dcdc1_reg: dcdc1 {
-                               reg = <0>;
                                regulator-min-microvolt = <900000>;
                                regulator-max-microvolt = <1800000>;
                                regulator-boot-on;
@@ -38,7 +34,6 @@ Example:
                        };
 
                        dcdc2_reg: dcdc2 {
-                               reg = <1>;
                                regulator-min-microvolt = <900000>;
                                regulator-max-microvolt = <3300000>;
                                regulator-boot-on;
@@ -46,7 +41,6 @@ Example:
                        };
 
                        dcdc3_reg: dcc3 {
-                               reg = <2>;
                                regulator-min-microvolt = <900000>;
                                regulator-max-microvolt = <1500000>;
                                regulator-boot-on;
@@ -54,7 +48,6 @@ Example:
                        };
 
                        ldo1_reg: ldo1 {
-                               reg = <3>;
                                regulator-min-microvolt = <1000000>;
                                regulator-max-microvolt = <3300000>;
                                regulator-boot-on;
@@ -62,7 +55,6 @@ Example:
                        };
 
                        ldo2_reg: ldo2 {
-                               reg = <4>;
                                regulator-min-microvolt = <900000>;
                                regulator-max-microvolt = <3300000>;
                                regulator-boot-on;
@@ -70,7 +62,6 @@ Example:
                        };
 
                        ldo3_reg: ldo3 {
-                               reg = <5>;
                                regulator-min-microvolt = <1800000>;
                                regulator-max-microvolt = <3300000>;
                                regulator-boot-on;
@@ -78,7 +69,6 @@ Example:
                        };
 
                        ldo4_reg: ldo4 {
-                               reg = <6>;
                                regulator-min-microvolt = <1800000>;
                                regulator-max-microvolt = <3300000>;
                                regulator-boot-on;
index ac2fcd6..1068ffc 100644 (file)
@@ -14,6 +14,10 @@ Required properties:
   interrupt number is the rtc alarm interrupt and second interrupt number
   is the rtc tick interrupt. The number of cells representing a interrupt
   depends on the parent interrupt controller.
+- clocks: Must contain a list of phandle and clock specifier for the rtc
+          and source clocks.
+- clock-names: Must contain "rtc" and "rtc_src" entries sorted in the
+               same order as the clocks property.
 
 Example:
 
@@ -21,4 +25,6 @@ Example:
                compatible = "samsung,s3c6410-rtc";
                reg = <0x10070000 0x100>;
                interrupts = <44 0 45 0>;
+               clocks = <&clock CLK_RTC>, <&s2mps11_osc S2MPS11_CLK_AP>;
+               clock-names = "rtc", "rtc_src";
        };
index 35ae1fb..ed94c21 100644 (file)
@@ -9,7 +9,7 @@ Optional properties:
 - fsl,uart-has-rtscts : Indicate the uart has rts and cts
 - fsl,irda-mode : Indicate the uart supports irda mode
 - fsl,dte-mode : Indicate the uart works in DTE mode. The uart works
-                  is DCE mode by default.
+                  in DCE mode by default.
 
 Note: Each uart controller should have an alias correctly numbered
 in "aliases" node.
index ce55c0a..4da41bf 100644 (file)
@@ -30,6 +30,8 @@ The compatible list for this generic sound card currently:
  "fsl,imx-audio-sgtl5000"
  (compatible with Documentation/devicetree/bindings/sound/imx-audio-sgtl5000.txt)
 
+ "fsl,imx-audio-wm8960"
+
 Required properties:
 
   - compatible         : Contains one of entries in the compatible list.
index 332e625..e5ee3f1 100644 (file)
@@ -1,8 +1,9 @@
 * Renesas R-Car Thermal
 
 Required properties:
-- compatible           : "renesas,thermal-<soctype>", "renesas,rcar-thermal"
-                         as fallback.
+- compatible           : "renesas,thermal-<soctype>",
+                          "renesas,rcar-gen2-thermal" (with thermal-zone) or
+                          "renesas,rcar-thermal" (without thermal-zone) as fallback.
                          Examples with soctypes are:
                            - "renesas,thermal-r8a73a4" (R-Mobile APE6)
                            - "renesas,thermal-r8a7779" (R-Car H1)
@@ -36,3 +37,35 @@ thermal@e61f0000 {
                0xe61f0300 0x38>;
        interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>;
 };
+
+Example (with thermal-zone):
+
+thermal-zones {
+       cpu_thermal: cpu-thermal {
+               polling-delay-passive   = <1000>;
+               polling-delay           = <5000>;
+
+               thermal-sensors = <&thermal>;
+
+               trips {
+                       cpu-crit {
+                               temperature     = <115000>;
+                               hysteresis      = <0>;
+                               type            = "critical";
+                       };
+               };
+               cooling-maps {
+               };
+       };
+};
+
+thermal: thermal@e61f0000 {
+       compatible =    "renesas,thermal-r8a7790",
+                       "renesas,rcar-gen2-thermal",
+                       "renesas,rcar-thermal";
+       reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
+       interrupts = <0 69 IRQ_TYPE_LEVEL_HIGH>;
+       clocks = <&mstp5_clks R8A7790_CLK_THERMAL>;
+       power-domains = <&cpg_clocks>;
+       #thermal-sensor-cells = <0>;
+};
index c477af0..686a64b 100644 (file)
@@ -14,3 +14,10 @@ filesystem.
 efivarfs is typically mounted like this,
 
        mount -t efivarfs none /sys/firmware/efi/efivars
+
+Due to the presence of numerous firmware bugs where removing non-standard
+UEFI variables causes the system firmware to fail to POST, efivarfs
+files that are not well-known standardized variables are created
+as immutable files.  This doesn't prevent removal - "chattr -i" will work -
+but it does prevent this kind of failure from being accomplished
+accidentally.
index fde9fd0..843b045 100644 (file)
@@ -240,8 +240,8 @@ Table 1-2: Contents of the status files (as of 4.1)
  RssFile                     size of resident file mappings
  RssShmem                    size of resident shmem memory (includes SysV shm,
                              mapping of tmpfs and shared anonymous mappings)
- VmData                      size of data, stack, and text segments
- VmStk                       size of data, stack, and text segments
+ VmData                      size of private data segments
+ VmStk                       size of stack segments
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
@@ -356,7 +356,7 @@ address           perms offset  dev   inode      pathname
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
 a7cb2000-a7eb2000 rw-p 00000000 00:00 0
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0          [stack:1001]
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0
 a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
 a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
 a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
@@ -388,7 +388,6 @@ is not associated with a file:
 
  [heap]                   = the heap of the program
  [stack]                  = the stack of the main process
- [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
 
@@ -396,10 +395,8 @@ is not associated with a file:
 
 The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
 of the individual tasks of a process. In this file you will see a mapping marked
-as [stack] if that task sees it as a stack. This is a key difference from the
-content of /proc/PID/maps, where you will see all mappings that are being used
-as stack by all of those tasks. Hence, for the example above, the task-level
-map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+as [stack] if that task sees it as a stack. Hence, for the example above, the
+task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
 
 08048000-08049000 r-xp 00000000 03:00 8312       /opt/test
 08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
index 87d40a7..4d9ca7d 100644 (file)
@@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        clearcpuid=BITNUM [X86]
                        Disable CPUID feature X for the kernel. See
-                       arch/x86/include/asm/cpufeature.h for the valid bit
+                       arch/x86/include/asm/cpufeatures.h for the valid bit
                        numbers. Note the Linux specific bits are not necessarily
                        stable over kernel options, but the vendor specific
                        ones should be.
@@ -1496,6 +1496,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        could change it dynamically, usually by
                        /sys/module/printk/parameters/ignore_loglevel.
 
+       ignore_rlimit_data
+                       Ignore RLIMIT_DATA setting for data mappings,
+                       print warning at first misuse.  Can be changed via
+                       /sys/module/kernel/parameters/ignore_rlimit_data.
+
        ihash_entries=  [KNL]
                        Set number of hash buckets for inode cache.
 
@@ -1682,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        ip=             [IP_PNP]
                        See Documentation/filesystems/nfs/nfsroot.txt.
 
+       irqaffinity=    [SMP] Set the default irq affinity mask
+                       Format:
+                       <cpu number>,...,<cpu number>
+                       or
+                       <cpu number>-<cpu number>
+                       (must be a positive range in ascending order)
+                       or a mixture
+                       <cpu number>,...,<cpu number>-<cpu number>
+
        irqfixup        [HW]
                        When an interrupt is not handled search all handlers
                        for it. Intended to get systems with badly broken
@@ -2561,6 +2575,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        nointroute      [IA-64]
 
+       noinvpcid       [X86] Disable the INVPCID cpu feature.
+
        nojitter        [IA-64] Disables jitter checking for ITC timers.
 
        no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
@@ -3486,6 +3502,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        ro              [KNL] Mount root device read-only on boot
 
+       rodata=         [KNL]
+               on      Mark read-only kernel memory as read-only (default).
+               off     Leave read-only kernel memory writable for debugging.
+
        root=           [KNL] Root filesystem
                        See name_to_dev_t comment in init/do_mounts.c.
 
@@ -3523,6 +3543,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        sched_debug     [KNL] Enables verbose scheduler debug messages.
 
+       schedstats=     [KNL,X86] Enable or disable scheduled statistics.
+                       Allowed values are enable and disable. This feature
+                       incurs a small amount of overhead in the scheduler
+                       but is useful for debugging and performance tuning.
+
        skew_tick=      [KNL] Offset the periodic timer tick per cpu to mitigate
                        xtime_lock contention on larger systems, and/or RCU lock
                        contention on all systems with CONFIG_MAXSMP set.
@@ -4230,6 +4255,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        The default value of this parameter is determined by
                        the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
 
+       workqueue.debug_force_rr_cpu
+                       Workqueue used to implicitly guarantee that work
+                       items queued without explicit CPU specified are put
+                       on the local CPU.  This guarantee is no longer true
+                       and while local CPU is still preferred work items
+                       may be put on foreign CPUs.  This debug option
+                       forces round-robin CPU selection to flush out
+                       usages which depend on the now broken guarantee.
+                       When enabled, memory and cache locality will be
+                       impacted.
+
        x2apic_phys     [X86-64,APIC] Use x2apic physical mode instead of
                        default x2apic cluster mode on platforms
                        supporting x2apic.
index ceb44a0..73b36d7 100644 (file)
@@ -594,7 +594,7 @@ tcp_fastopen - INTEGER
 
 tcp_syn_retries - INTEGER
        Number of times initial SYNs for an active TCP connection attempt
-       will be retransmitted. Should not be higher than 255. Default value
+       will be retransmitted. Should not be higher than 127. Default value
        is 6, which corresponds to 63seconds till the last retransmission
        with the current initial RTO of 1second. With this the final timeout
        for an active TCP connection attempt will happen after 127seconds.
index 6c6247a..d99012f 100644 (file)
@@ -277,13 +277,15 @@ int main(int argc, char *argv[])
                               "  %d external time stamp channels\n"
                               "  %d programmable periodic signals\n"
                               "  %d pulse per second\n"
-                              "  %d programmable pins\n",
+                              "  %d programmable pins\n"
+                              "  %d cross timestamping\n",
                               caps.max_adj,
                               caps.n_alarm,
                               caps.n_ext_ts,
                               caps.n_per_out,
                               caps.pps,
-                              caps.n_pins);
+                              caps.n_pins,
+                              caps.cross_timestamping);
                }
        }
 
index a93b414..f4444c9 100644 (file)
@@ -58,6 +58,8 @@ show up in /proc/sys/kernel:
 - panic_on_stackoverflow
 - panic_on_unrecovered_nmi
 - panic_on_warn
+- perf_cpu_time_max_percent
+- perf_event_paranoid
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -639,6 +641,17 @@ allowed to execute.
 
 ==============================================================
 
+perf_event_paranoid:
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN).  The default value is 1.
+
+ -1: Allow use of (almost) all events by all users
+>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
+>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+
+==============================================================
 
 pid_max:
 
@@ -760,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued.
 
 ==============================================================
 
+sched_schedstats:
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+==============================================================
+
 sg-big-buff:
 
 This file shows the size of the generic SCSI (sg) buffer.
index 767392f..a484d2c 100644 (file)
@@ -1,9 +1,7 @@
                High Precision Event Timer Driver for Linux
 
 The High Precision Event Timer (HPET) hardware follows a specification
-by Intel and Microsoft which can be found at
-
-       http://www.intel.com/hardwaredesign/hpetspec_1.pdf
+by Intel and Microsoft, revision 1.
 
 Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
 and up to 32 comparators.  Normally three or more comparators are provided,
index 053f613..07e4cdf 100644 (file)
@@ -3025,7 +3025,7 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
-4.90 KVM_SMI
+4.96 KVM_SMI
 
 Capability: KVM_CAP_X86_SMM
 Architectures: x86
index daf9c0f..c817310 100644 (file)
@@ -358,7 +358,8 @@ In the first case there are two additional complications:
 - if CR4.SMEP is enabled: since we've turned the page into a kernel page,
   the kernel may now execute it.  We handle this by also setting spte.nx.
   If we get a user fetch or read fault, we'll change spte.u=1 and
-  spte.nx=gpte.nx back.
+  spte.nx=gpte.nx back.  For this to work, KVM forces EFER.NX to 1 when
+  shadow paging is in use.
 - if CR4.SMAP is disabled: since the page has been changed to a kernel
   page, it can not be reused when CR4.SMAP is enabled. We set
   CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note,
index 9f9ec9f..4e4b6f1 100644 (file)
@@ -400,3 +400,7 @@ wm8350_wdt:
 nowayout: Watchdog cannot be stopped once started
        (default=kernel config parameter)
 -------------------------------------------------
+sun4v_wdt:
+timeout_ms: Watchdog timeout in milliseconds 1..180000, default=60000)
+nowayout: Watchdog cannot be stopped once started
+-------------------------------------------------
index d62bea6..c956d99 100644 (file)
@@ -40,3 +40,28 @@ cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin)
 find . | cpio -o -H newc >../ucode.cpio
 cd ..
 cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img
+
+Builtin microcode
+=================
+
+We can also load builtin microcode supplied through the regular firmware
+builtin method CONFIG_FIRMWARE_IN_KERNEL. Here's an example:
+
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
+CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
+
+This basically means, you have the following tree structure locally:
+
+/lib/firmware/
+|-- amd-ucode
+...
+|   |-- microcode_amd_fam15h.bin
+...
+|-- intel-ucode
+...
+|   |-- 06-3a-09
+...
+
+so that the build system can find those files and integrate them into
+the final kernel image. The early loader finds them and applies them.
index 32901aa..e396bcd 100644 (file)
@@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered,
 only use exceptions for code in the .text section.  Any other section
 will cause the exception table to not be sorted correctly, and the
 exceptions will fail.
+
+Things changed when 64-bit support was added to x86 Linux. Rather than
+double the size of the exception table by expanding the two entries
+from 32-bits to 64 bits, a clever trick was used to store addresses
+as relative offsets from the table itself. The assembly code changed
+from:
+       .long 1b,3b
+to:
+        .long (from) - .
+        .long (to) - .
+
+and the C-code that uses these values converts back to absolute addresses
+like this:
+
+       ex_insn_addr(const struct exception_table_entry *x)
+       {
+               return (unsigned long)&x->insn + x->insn;
+       }
+
+In v4.6 the exception table entry was expanded with a new field "handler".
+This is also 32-bits wide and contains a third relative function
+pointer which points to one of:
+
+1) int ex_handler_default(const struct exception_table_entry *fixup)
+   This is legacy case that just jumps to the fixup code
+2) int ex_handler_fault(const struct exception_table_entry *fixup)
+   This case provides the fault number of the trap that occurred at
+   entry->insn. It is used to distinguish page faults from machine
+   check.
+3) int ex_handler_ext(const struct exception_table_entry *fixup)
+   This case is used for uaccess_err ... we need to set a flag
+   in the task structure. Before the handler functions existed this
+   case was handled by adding a large offset to the fixup to tag
+   it as special.
+More functions can easily be added.
index 68ed311..0965a71 100644 (file)
@@ -60,6 +60,8 @@ Machine check
                threshold to 1. Enabling this may make memory predictive failure
                analysis less effective if the bios sets thresholds for memory
                errors since we will not see details for all errors.
+   mce=recovery
+               Force-enable recoverable machine check code paths
 
    nomce (for compatibility with i386): same as mce=off
 
index 30aca4a..57adf39 100644 (file)
@@ -223,9 +223,7 @@ F:  drivers/scsi/aacraid/
 
 ABI/API
 L:     linux-api@vger.kernel.org
-F:     Documentation/ABI/
 F:     include/linux/syscalls.h
-F:     include/uapi/
 F:     kernel/sys_ni.c
 
 ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
@@ -686,13 +684,6 @@ M: Michael Hanselmann <linux-kernel@hansmi.ch>
 S:     Supported
 F:     drivers/macintosh/ams/
 
-AMSO1100 RNIC DRIVER
-M:     Tom Tucker <tom@opengridcomputing.com>
-M:     Steve Wise <swise@opengridcomputing.com>
-L:     linux-rdma@vger.kernel.org
-S:     Maintained
-F:     drivers/infiniband/hw/amso1100/
-
 ANALOG DEVICES INC AD9389B DRIVER
 M:     Hans Verkuil <hans.verkuil@cisco.com>
 L:     linux-media@vger.kernel.org
@@ -929,17 +920,24 @@ M:        Emilio López <emilio@elopez.com.ar>
 S:     Maintained
 F:     drivers/clk/sunxi/
 
-ARM/Amlogic MesonX SoC support
+ARM/Amlogic Meson SoC support
 M:     Carlo Caione <carlo@caione.org>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L:     linux-meson@googlegroups.com
+W:     http://linux-meson.com/
 S:     Maintained
-F:     drivers/media/rc/meson-ir.c
-N:     meson[x68]
+F:     arch/arm/mach-meson/
+F:     arch/arm/boot/dts/meson*
+N:     meson
 
 ARM/Annapurna Labs ALPINE ARCHITECTURE
 M:     Tsahee Zidenberg <tsahee@annapurnalabs.com>
+M:     Antoine Tenart <antoine.tenart@free-electrons.com>
 S:     Maintained
 F:     arch/arm/mach-alpine/
+F:     arch/arm/boot/dts/alpine*
+F:     arch/arm64/boot/dts/al/
+F:     drivers/*/*alpine*
 
 ARM/ATMEL AT91RM9200, AT91SAM9 AND SAMA5 SOC SUPPORT
 M:     Nicolas Ferre <nicolas.ferre@atmel.com>
@@ -967,6 +965,8 @@ M:  Rob Herring <robh@kernel.org>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-highbank/
+F:     arch/arm/boot/dts/highbank.dts
+F:     arch/arm/boot/dts/ecx-*.dts*
 
 ARM/CAVIUM NETWORKS CNS3XXX MACHINE SUPPORT
 M:     Krzysztof Halasa <khalasa@piap.pl>
@@ -1042,6 +1042,7 @@ M:        Barry Song <baohua@kernel.org>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/baohua/linux.git
 S:     Maintained
+F:     arch/arm/boot/dts/prima2*
 F:     arch/arm/mach-prima2/
 F:     drivers/clk/sirf/
 F:     drivers/clocksource/timer-prima2.c
@@ -1143,6 +1144,10 @@ W:       http://www.hisilicon.com
 S:     Supported
 T:     git git://github.com/hisilicon/linux-hisi.git
 F:     arch/arm/mach-hisi/
+F:     arch/arm/boot/dts/hi3*
+F:     arch/arm/boot/dts/hip*
+F:     arch/arm/boot/dts/hisi*
+F:     arch/arm64/boot/dts/hisilicon/
 
 ARM/HP JORNADA 7XX MACHINE SUPPORT
 M:     Kristoffer Ericson <kristoffer.ericson@gmail.com>
@@ -1219,6 +1224,7 @@ M:        Santosh Shilimkar <ssantosh@kernel.org>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-keystone/
+F:     arch/arm/boot/dts/k2*
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux-keystone.git
 
 ARM/TEXAS INSTRUMENT KEYSTONE CLOCK FRAMEWORK
@@ -1287,6 +1293,7 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-berlin/
 F:     arch/arm/boot/dts/berlin*
+F:     arch/arm64/boot/dts/marvell/berlin*
 
 
 ARM/Marvell Dove/MV78xx0/Orion SOC support
@@ -1425,6 +1432,7 @@ S:        Maintained
 F:     arch/arm/boot/dts/qcom-*.dts
 F:     arch/arm/boot/dts/qcom-*.dtsi
 F:     arch/arm/mach-qcom/
+F:     arch/arm64/boot/dts/qcom/*
 F:     drivers/soc/qcom/
 F:     drivers/tty/serial/msm_serial.h
 F:     drivers/tty/serial/msm_serial.c
@@ -1441,8 +1449,8 @@ S:        Maintained
 ARM/RENESAS ARM64 ARCHITECTURE
 M:     Simon Horman <horms@verge.net.au>
 M:     Magnus Damm <magnus.damm@gmail.com>
-L:     linux-sh@vger.kernel.org
-Q:     http://patchwork.kernel.org/project/linux-sh/list/
+L:     linux-renesas-soc@vger.kernel.org
+Q:     http://patchwork.kernel.org/project/linux-renesas-soc/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
 S:     Supported
 F:     arch/arm64/boot/dts/renesas/
@@ -1484,6 +1492,8 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 L:     linux-samsung-soc@vger.kernel.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/boot/dts/s3c*
+F:     arch/arm/boot/dts/s5p*
+F:     arch/arm/boot/dts/samsung*
 F:     arch/arm/boot/dts/exynos*
 F:     arch/arm64/boot/dts/exynos/
 F:     arch/arm/plat-samsung/
@@ -1563,6 +1573,7 @@ S:        Maintained
 F:     arch/arm/mach-socfpga/
 F:     arch/arm/boot/dts/socfpga*
 F:     arch/arm/configs/socfpga_defconfig
+F:     arch/arm64/boot/dts/altera/
 W:     http://www.rocketboards.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/dinguyen/linux.git
 
@@ -1716,7 +1727,7 @@ M:        Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/boot/dts/vexpress*
-F:     arch/arm64/boot/dts/arm/vexpress*
+F:     arch/arm64/boot/dts/arm/
 F:     arch/arm/mach-vexpress/
 F:     */*/vexpress*
 F:     */*/*/vexpress*
@@ -2343,6 +2354,7 @@ F:        arch/arm/mach-bcm/
 F:     arch/arm/boot/dts/bcm113*
 F:     arch/arm/boot/dts/bcm216*
 F:     arch/arm/boot/dts/bcm281*
+F:     arch/arm64/boot/dts/broadcom/
 F:     arch/arm/configs/bcm_defconfig
 F:     drivers/mmc/host/sdhci-bcm-kona.c
 F:     drivers/clocksource/bcm_kona_timer.c
@@ -2357,14 +2369,6 @@ T:       git git://git.kernel.org/pub/scm/linux/kernel/git/rpi/linux-rpi.git
 S:     Maintained
 N:     bcm2835
 
-BROADCOM BCM33XX MIPS ARCHITECTURE
-M:     Kevin Cernekee <cernekee@gmail.com>
-L:     linux-mips@linux-mips.org
-S:     Maintained
-F:     arch/mips/bcm3384/*
-F:     arch/mips/include/asm/mach-bcm3384/*
-F:     arch/mips/kernel/*bmips*
-
 BROADCOM BCM47XX MIPS ARCHITECTURE
 M:     Hauke Mehrtens <hauke@hauke-m.de>
 M:     RafaÅ‚ MiÅ‚ecki <zajec5@gmail.com>
@@ -2418,6 +2422,7 @@ F:        arch/mips/bmips/*
 F:     arch/mips/include/asm/mach-bmips/*
 F:     arch/mips/kernel/*bmips*
 F:     arch/mips/boot/dts/brcm/bcm*.dts*
+F:     drivers/irqchip/irq-bcm63*
 F:     drivers/irqchip/irq-bcm7*
 F:     drivers/irqchip/irq-brcmstb*
 F:     include/linux/bcm963xx_nvram.h
@@ -3445,9 +3450,8 @@ S:        Maintained
 F:     drivers/usb/dwc2/
 
 DESIGNWARE USB3 DRD IP DRIVER
-M:     Felipe Balbi <balbi@ti.com>
+M:     Felipe Balbi <balbi@kernel.org>
 L:     linux-usb@vger.kernel.org
-L:     linux-omap@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
 S:     Maintained
 F:     drivers/usb/dwc3/
@@ -4184,13 +4188,6 @@ W:       http://aeschi.ch.eu.org/efs/
 S:     Orphan
 F:     fs/efs/
 
-EHCA (IBM GX bus InfiniBand adapter) DRIVER
-M:     Hoang-Nam Nguyen <hnguyen@de.ibm.com>
-M:     Christoph Raisch <raisch@de.ibm.com>
-L:     linux-rdma@vger.kernel.org
-S:     Supported
-F:     drivers/infiniband/hw/ehca/
-
 EHEA (IBM pSeries eHEA 10Gb ethernet adapter) DRIVER
 M:     Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
 L:     netdev@vger.kernel.org
@@ -4522,6 +4519,12 @@ L:       linuxppc-dev@lists.ozlabs.org
 S:     Maintained
 F:     drivers/dma/fsldma.*
 
+FREESCALE GPMI NAND DRIVER
+M:     Han Xu <han.xu@nxp.com>
+L:     linux-mtd@lists.infradead.org
+S:     Maintained
+F:     drivers/mtd/nand/gpmi-nand/*
+
 FREESCALE I2C CPM DRIVER
 M:     Jochen Friedrich <jochen@scram.de>
 L:     linuxppc-dev@lists.ozlabs.org
@@ -4538,7 +4541,7 @@ F:        include/linux/platform_data/video-imxfb.h
 F:     drivers/video/fbdev/imxfb.c
 
 FREESCALE QUAD SPI DRIVER
-M:     Han Xu <han.xu@freescale.com>
+M:     Han Xu <han.xu@nxp.com>
 L:     linux-mtd@lists.infradead.org
 S:     Maintained
 F:     drivers/mtd/spi-nor/fsl-quadspi.c
@@ -4552,6 +4555,15 @@ S:       Maintained
 F:     drivers/net/ethernet/freescale/fs_enet/
 F:     include/linux/fs_enet_pd.h
 
+FREESCALE IMX / MXC FEC DRIVER
+M:     Fugang Duan <fugang.duan@nxp.com>
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     drivers/net/ethernet/freescale/fec_main.c
+F:     drivers/net/ethernet/freescale/fec_ptp.c
+F:     drivers/net/ethernet/freescale/fec.h
+F:     Documentation/devicetree/bindings/net/fsl-fec.txt
+
 FREESCALE QUICC ENGINE LIBRARY
 L:     linuxppc-dev@lists.ozlabs.org
 S:     Orphan
@@ -5809,12 +5821,6 @@ M:       Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:     Maintained
 F:     net/ipv4/netfilter/ipt_MASQUERADE.c
 
-IPATH DRIVER
-M:     Mike Marciniszyn <infinipath@intel.com>
-L:     linux-rdma@vger.kernel.org
-S:     Maintained
-F:     drivers/staging/rdma/ipath/
-
 IPMI SUBSYSTEM
 M:     Corey Minyard <minyard@acm.org>
 L:     openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
@@ -6144,7 +6150,7 @@ F:        include/uapi/linux/sunrpc/
 
 KERNEL SELFTEST FRAMEWORK
 M:     Shuah Khan <shuahkh@osg.samsung.com>
-L:     linux-api@vger.kernel.org
+L:     linux-kselftest@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/shuah/linux-kselftest
 S:     Maintained
 F:     tools/testing/selftests
@@ -6774,6 +6780,7 @@ S:        Maintained
 F:     Documentation/networking/mac80211-injection.txt
 F:     include/net/mac80211.h
 F:     net/mac80211/
+F:     drivers/net/wireless/mac80211_hwsim.[ch]
 
 MACVLAN DRIVER
 M:     Patrick McHardy <kaber@trash.net>
@@ -7370,7 +7377,7 @@ F:        drivers/tty/isicom.c
 F:     include/linux/isicom.h
 
 MUSB MULTIPOINT HIGH SPEED DUAL-ROLE CONTROLLER
-M:     Felipe Balbi <balbi@ti.com>
+M:     Bin Liu <b-liu@ti.com>
 L:     linux-usb@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
 S:     Maintained
@@ -7393,6 +7400,17 @@ W:       https://www.myricom.com/support/downloads/myri10ge.html
 S:     Supported
 F:     drivers/net/ethernet/myricom/myri10ge/
 
+NAND FLASH SUBSYSTEM
+M:     Boris Brezillon <boris.brezillon@free-electrons.com>
+R:     Richard Weinberger <richard@nod.at>
+L:     linux-mtd@lists.infradead.org
+W:     http://www.linux-mtd.infradead.org/
+Q:     http://patchwork.ozlabs.org/project/linux-mtd/list/
+T:     git git://github.com/linux-nand/linux.git
+S:     Maintained
+F:     drivers/mtd/nand/
+F:     include/linux/mtd/nand*.h
+
 NATSEMI ETHERNET DRIVER (DP8381x)
 S:     Orphan
 F:     drivers/net/ethernet/natsemi/natsemi.c
@@ -7702,13 +7720,13 @@ S:      Maintained
 F:     arch/nios2/
 
 NOKIA N900 POWER SUPPLY DRIVERS
-M:     Pali Rohár <pali.rohar@gmail.com>
-S:     Maintained
+R:     Pali Rohár <pali.rohar@gmail.com>
 F:     include/linux/power/bq2415x_charger.h
 F:     include/linux/power/bq27xxx_battery.h
 F:     include/linux/power/isp1704_charger.h
 F:     drivers/power/bq2415x_charger.c
 F:     drivers/power/bq27xxx_battery.c
+F:     drivers/power/bq27xxx_battery_i2c.c
 F:     drivers/power/isp1704_charger.c
 F:     drivers/power/rx51_battery.c
 
@@ -7939,11 +7957,9 @@ F:       drivers/media/platform/omap3isp/
 F:     drivers/staging/media/omap4iss/
 
 OMAP USB SUPPORT
-M:     Felipe Balbi <balbi@ti.com>
 L:     linux-usb@vger.kernel.org
 L:     linux-omap@vger.kernel.org
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
-S:     Maintained
+S:     Orphan
 F:     drivers/usb/*/*omap*
 F:     arch/arm/*omap*/usb*
 
@@ -8460,6 +8476,7 @@ PERFORMANCE EVENTS SUBSYSTEM
 M:     Peter Zijlstra <peterz@infradead.org>
 M:     Ingo Molnar <mingo@redhat.com>
 M:     Arnaldo Carvalho de Melo <acme@kernel.org>
+R:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
 L:     linux-kernel@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
 S:     Supported
@@ -8818,6 +8835,7 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 T:     git git://github.com/hzhuang1/linux.git
 T:     git git://github.com/rjarzmik/linux.git
 S:     Maintained
+F:     arch/arm/boot/dts/pxa*
 F:     arch/arm/mach-pxa/
 F:     drivers/dma/pxa*
 F:     drivers/pcmcia/pxa2xx*
@@ -8847,6 +8865,7 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 T:     git git://github.com/hzhuang1/linux.git
 T:     git git://git.linaro.org/people/ycmiao/pxa-linux.git
 S:     Maintained
+F:     arch/arm/boot/dts/mmp*
 F:     arch/arm/mach-mmp/
 
 PXA MMCI DRIVER
@@ -9572,6 +9591,12 @@ M:       Andreas Noever <andreas.noever@gmail.com>
 S:     Maintained
 F:     drivers/thunderbolt/
 
+TI BQ27XXX POWER SUPPLY DRIVER
+R:     Andrew F. Davis <afd@ti.com>
+F:     include/linux/power/bq27xxx_battery.h
+F:     drivers/power/bq27xxx_battery.c
+F:     drivers/power/bq27xxx_battery_i2c.c
+
 TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
 M:     John Stultz <john.stultz@linaro.org>
 M:     Thomas Gleixner <tglx@linutronix.de>
@@ -9793,10 +9818,11 @@ S:      Supported
 F:     drivers/scsi/be2iscsi/
 
 Emulex 10Gbps NIC BE2, BE3-R, Lancer, Skyhawk-R DRIVER
-M:     Sathya Perla <sathya.perla@avagotech.com>
-M:     Ajit Khaparde <ajit.khaparde@avagotech.com>
-M:     Padmanabh Ratnakar <padmanabh.ratnakar@avagotech.com>
-M:     Sriharsha Basavapatna <sriharsha.basavapatna@avagotech.com>
+M:     Sathya Perla <sathya.perla@broadcom.com>
+M:     Ajit Khaparde <ajit.khaparde@broadcom.com>
+M:     Padmanabh Ratnakar <padmanabh.ratnakar@broadcom.com>
+M:     Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
+M:     Somnath Kotur <somnath.kotur@broadcom.com>
 L:     netdev@vger.kernel.org
 W:     http://www.emulex.com
 S:     Supported
@@ -10158,6 +10184,7 @@ S:      Supported
 F:     drivers/media/pci/solo6x10/
 
 SOFTWARE RAID (Multiple Disks) SUPPORT
+M:     Shaohua Li <shli@kernel.org>
 L:     linux-raid@vger.kernel.org
 T:     git git://neil.brown.name/md
 S:     Supported
@@ -10173,7 +10200,7 @@ F:      drivers/net/ethernet/natsemi/sonic.*
 
 SONICS SILICON BACKPLANE DRIVER (SSB)
 M:     Michael Buesch <m@bues.ch>
-L:     netdev@vger.kernel.org
+L:     linux-wireless@vger.kernel.org
 S:     Maintained
 F:     drivers/ssb/
 F:     include/linux/ssb/
@@ -10291,6 +10318,7 @@ L:      spear-devel@list.st.com
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:     http://www.st.com/spear
 S:     Maintained
+F:     arch/arm/boot/dts/spear*
 F:     arch/arm/mach-spear/
 
 SPEAR CLOCK FRAMEWORK SUPPORT
@@ -11318,7 +11346,7 @@ F:      Documentation/usb/ehci.txt
 F:     drivers/usb/host/ehci*
 
 USB GADGET/PERIPHERAL SUBSYSTEM
-M:     Felipe Balbi <balbi@ti.com>
+M:     Felipe Balbi <balbi@kernel.org>
 L:     linux-usb@vger.kernel.org
 W:     http://www.linux-usb.org/gadget
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
@@ -11394,7 +11422,7 @@ S:      Maintained
 F:     drivers/net/usb/pegasus.*
 
 USB PHY LAYER
-M:     Felipe Balbi <balbi@ti.com>
+M:     Felipe Balbi <balbi@kernel.org>
 L:     linux-usb@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
 S:     Maintained
@@ -12024,7 +12052,6 @@ F:      arch/arm64/xen/
 F:     arch/arm64/include/asm/xen/
 
 XEN NETWORK BACKEND DRIVER
-M:     Ian Campbell <ian.campbell@citrix.com>
 M:     Wei Liu <wei.liu2@citrix.com>
 L:     xen-devel@lists.xenproject.org (moderated for non-subscribers)
 L:     netdev@vger.kernel.org
@@ -12133,7 +12160,7 @@ F:      drivers/net/hamradio/*scc.c
 F:     drivers/net/hamradio/z8530.h
 
 ZBUD COMPRESSED PAGE ALLOCATOR
-M:     Seth Jennings <sjennings@variantweb.net>
+M:     Seth Jennings <sjenning@redhat.com>
 L:     linux-mm@kvack.org
 S:     Maintained
 F:     mm/zbud.c
@@ -12188,7 +12215,7 @@ F:      include/linux/zsmalloc.h
 F:     Documentation/vm/zsmalloc.txt
 
 ZSWAP COMPRESSED SWAP CACHING
-M:     Seth Jennings <sjennings@variantweb.net>
+M:     Seth Jennings <sjenning@redhat.com>
 L:     linux-mm@kvack.org
 S:     Maintained
 F:     mm/zswap.c
index c65fe37..7b3ecdc 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 5
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION =
 NAME = Blurry Fish Butt
 
 # *DOCUMENTATION*
index 2f24447..46bf263 100644 (file)
@@ -168,7 +168,7 @@ smp_callin(void)
              cpuid, current, current->active_mm));
 
        preempt_disable();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /* Wait until hwrpb->txrdy is clear for cpu.  Return -1 on timeout.  */
index 76dde9d..8a188bc 100644 (file)
@@ -12,8 +12,6 @@ config ARC
        select BUILDTIME_EXTABLE_SORT
        select COMMON_CLK
        select CLONE_BACKWARDS
-       # ARC Busybox based initramfs absolutely relies on DEVTMPFS for /dev
-       select DEVTMPFS if !INITRAMFS_SOURCE=""
        select GENERIC_ATOMIC64
        select GENERIC_CLOCKEVENTS
        select GENERIC_FIND_FIRST_BIT
@@ -275,14 +273,6 @@ config ARC_DCCM_BASE
        default "0xA0000000"
        depends on ARC_HAS_DCCM
 
-config ARC_HAS_HW_MPY
-       bool "Use Hardware Multiplier (Normal or Faster XMAC)"
-       default y
-       help
-         Influences how gcc generates code for MPY operations.
-         If enabled, MPYxx insns are generated, provided by Standard/XMAC
-         Multipler. Otherwise software multipy lib is used
-
 choice
        prompt "MMU Version"
        default ARC_MMU_V3 if ARC_CPU_770
@@ -338,6 +328,19 @@ config ARC_PAGE_SIZE_4K
 
 endchoice
 
+choice
+       prompt "MMU Super Page Size"
+       depends on ISA_ARCV2 && TRANSPARENT_HUGEPAGE
+       default ARC_HUGEPAGE_2M
+
+config ARC_HUGEPAGE_2M
+       bool "2MB"
+
+config ARC_HUGEPAGE_16M
+       bool "16MB"
+
+endchoice
+
 if ISA_ARCOMPACT
 
 config ARC_COMPACT_IRQ_LEVELS
@@ -410,7 +413,7 @@ config ARC_HAS_RTC
        default n
        depends on !SMP
 
-config ARC_HAS_GRTC
+config ARC_HAS_GFRC
        bool "SMP synchronized 64-bit cycle counter"
        default y
        depends on SMP
@@ -529,14 +532,6 @@ config ARC_DBG_TLB_MISS_COUNT
          Counts number of I and D TLB Misses and exports them via Debugfs
          The counters can be cleared via Debugfs as well
 
-if SMP
-
-config ARC_IPI_DBG
-       bool "Debug Inter Core interrupts"
-       default n
-
-endif
-
 endif
 
 config ARC_UBOOT_SUPPORT
@@ -566,6 +561,12 @@ endmenu
 endmenu         # "ARC Architecture Configuration"
 
 source "mm/Kconfig"
+
+config FORCE_MAX_ZONEORDER
+       int "Maximum zone order"
+       default "12" if ARC_HUGEPAGE_16M
+       default "11"
+
 source "net/Kconfig"
 source "drivers/Kconfig"
 source "fs/Kconfig"
index aeb1902..c8230f3 100644 (file)
@@ -74,10 +74,6 @@ ldflags-$(CONFIG_CPU_BIG_ENDIAN)     += -EB
 # --build-id w/o "-marclinux". Default arc-elf32-ld is OK
 ldflags-$(upto_gcc44)                  += -marclinux
 
-ifndef CONFIG_ARC_HAS_HW_MPY
-       cflags-y        += -mno-mpy
-endif
-
 LIBGCC := $(shell $(CC) $(cflags-y) --print-libgcc-file-name)
 
 # Modules with short calls might break for calls into builtin-kernel
index f1ac981..5d4e2a0 100644 (file)
@@ -39,6 +39,7 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -73,7 +74,6 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_DESIGNWARE_PLATFORM=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
 CONFIG_LOGO=y
@@ -91,12 +91,10 @@ CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_DW=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_NTFS_FS=y
 CONFIG_TMPFS=y
-CONFIG_JFFS2_FS=y
 CONFIG_NFS_FS=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
index 323486d..87ee46b 100644 (file)
@@ -39,14 +39,10 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_AXS=y
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_NETDEVICES=y
@@ -78,14 +74,12 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_DESIGNWARE_PLATFORM=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
 # CONFIG_LOGO_LINUX_CLUT224 is not set
-CONFIG_USB=y
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_HCD_PLATFORM=y
 CONFIG_USB_OHCI_HCD=y
@@ -97,12 +91,10 @@ CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_DW=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_NTFS_FS=y
 CONFIG_TMPFS=y
-CONFIG_JFFS2_FS=y
 CONFIG_NFS_FS=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
index 66191cd..d80daf4 100644 (file)
@@ -40,14 +40,10 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_AXS=y
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_NETDEVICES=y
@@ -79,14 +75,12 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_DESIGNWARE_PLATFORM=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
 # CONFIG_LOGO_LINUX_CLUT224 is not set
-CONFIG_USB=y
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_HCD_PLATFORM=y
 CONFIG_USB_OHCI_HCD=y
@@ -98,12 +92,10 @@ CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_DW=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_NTFS_FS=y
 CONFIG_TMPFS=y
-CONFIG_JFFS2_FS=y
 CONFIG_NFS_FS=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
index 138f9d8..f410953 100644 (file)
@@ -4,6 +4,7 @@ CONFIG_DEFAULT_HOSTNAME="ARCLinux"
 # CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
@@ -26,7 +27,6 @@ CONFIG_ARC_PLAT_SIM=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700"
 CONFIG_PREEMPT=y
 # CONFIG_COMPACTION is not set
-# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -34,6 +34,7 @@ CONFIG_UNIX_DIAG=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -51,7 +52,6 @@ CONFIG_SERIAL_ARC=y
 CONFIG_SERIAL_ARC_CONSOLE=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
-# CONFIG_VGA_CONSOLE is not set
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
 # CONFIG_IOMMU_SUPPORT is not set
@@ -63,4 +63,3 @@ CONFIG_NFS_FS=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 # CONFIG_DEBUG_PREEMPT is not set
-CONFIG_XZ_DEC=y
index f68838e..cfaa33c 100644 (file)
@@ -35,6 +35,7 @@ CONFIG_UNIX_DIAG=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -49,7 +50,6 @@ CONFIG_SERIAL_ARC=y
 CONFIG_SERIAL_ARC_CONSOLE=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
-# CONFIG_VGA_CONSOLE is not set
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
 # CONFIG_IOMMU_SUPPORT is not set
@@ -61,4 +61,3 @@ CONFIG_NFS_FS=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 # CONFIG_DEBUG_PREEMPT is not set
-CONFIG_XZ_DEC=y
index 96bd1c2..bb2a8dc 100644 (file)
@@ -2,6 +2,7 @@ CONFIG_CROSS_COMPILE="arc-linux-"
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_DEFAULT_HOSTNAME="ARCLinux"
 # CONFIG_SWAP is not set
+# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
@@ -21,13 +22,11 @@ CONFIG_MODULES=y
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARC_PLAT_SIM=y
-CONFIG_ARC_BOARD_ML509=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu"
 CONFIG_PREEMPT=y
 # CONFIG_COMPACTION is not set
-# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -35,6 +34,7 @@ CONFIG_UNIX_DIAG=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -49,7 +49,6 @@ CONFIG_SERIAL_ARC=y
 CONFIG_SERIAL_ARC_CONSOLE=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
-# CONFIG_VGA_CONSOLE is not set
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
 # CONFIG_IOMMU_SUPPORT is not set
@@ -60,4 +59,3 @@ CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_XZ_DEC=y
index 31e1d95..646182e 100644 (file)
@@ -33,6 +33,7 @@ CONFIG_UNIX_DIAG=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -58,7 +59,6 @@ CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_HID is not set
index fcae666..ceca254 100644 (file)
@@ -34,12 +34,12 @@ CONFIG_UNIX_DIAG=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 # CONFIG_BLK_DEV is not set
 CONFIG_NETDEVICES=y
-CONFIG_NET_OSCI_LAN=y
 CONFIG_INPUT_EVDEV=y
 # CONFIG_MOUSE_PS2_ALPS is not set
 # CONFIG_MOUSE_PS2_LOGIPS2PP is not set
@@ -58,7 +58,6 @@ CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_HID is not set
index b01b659..4b6da90 100644 (file)
@@ -2,6 +2,7 @@ CONFIG_CROSS_COMPILE="arc-linux-"
 CONFIG_DEFAULT_HOSTNAME="ARCLinux"
 # CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
+# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
@@ -18,15 +19,11 @@ CONFIG_MODULES=y
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARC_PLAT_SIM=y
-CONFIG_ARC_BOARD_ML509=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
-CONFIG_ARC_HAS_LL64=y
-# CONFIG_ARC_HAS_RTSC is not set
 CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs_idu"
 CONFIG_PREEMPT=y
 # CONFIG_COMPACTION is not set
-# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=y
@@ -40,6 +37,7 @@ CONFIG_INET=y
 # CONFIG_INET_LRO is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_STANDALONE is not set
 # CONFIG_PREVENT_FIRMWARE_BUILD is not set
 # CONFIG_FIRMWARE_IN_KERNEL is not set
@@ -56,14 +54,11 @@ CONFIG_NETDEVICES=y
 # CONFIG_NET_VENDOR_STMICRO is not set
 # CONFIG_NET_VENDOR_VIA is not set
 # CONFIG_NET_VENDOR_WIZNET is not set
-CONFIG_NET_OSCI_LAN=y
 # CONFIG_WLAN is not set
 CONFIG_INPUT_EVDEV=y
 CONFIG_MOUSE_PS2_TOUCHKIT=y
 # CONFIG_SERIO_SERPORT is not set
-CONFIG_SERIO_LIBPS2=y
 CONFIG_SERIO_ARC_PS2=y
-CONFIG_VT_HW_CONSOLE_BINDING=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
@@ -75,9 +70,6 @@ CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
 CONFIG_FB=y
-CONFIG_ARCPGU_RGB888=y
-CONFIG_ARCPGU_DISPTYPE=0
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_HID is not set
index 3b4dc9c..9b342ea 100644 (file)
@@ -3,6 +3,7 @@ CONFIG_CROSS_COMPILE="arc-linux-"
 CONFIG_DEFAULT_HOSTNAME="tb10x"
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
@@ -26,12 +27,10 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLOCK is not set
 CONFIG_ARC_PLAT_TB10X=y
 CONFIG_ARC_CACHE_LINE_SHIFT=5
-CONFIG_ARC_STACK_NONEXEC=y
 CONFIG_HZ=250
 CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk"
 CONFIG_PREEMPT_VOLUNTARY=y
 # CONFIG_COMPACTION is not set
-# CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -44,8 +43,8 @@ CONFIG_IP_MULTICAST=y
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
+CONFIG_DEVTMPFS=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
-CONFIG_PROC_DEVICETREE=y
 CONFIG_NETDEVICES=y
 # CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
@@ -55,9 +54,6 @@ CONFIG_NETDEVICES=y
 # CONFIG_NET_VENDOR_NATSEMI is not set
 # CONFIG_NET_VENDOR_SEEQ is not set
 CONFIG_STMMAC_ETH=y
-CONFIG_STMMAC_DEBUG_FS=y
-CONFIG_STMMAC_DA=y
-CONFIG_STMMAC_CHAINED=y
 # CONFIG_NET_VENDOR_WIZNET is not set
 # CONFIG_WLAN is not set
 # CONFIG_INPUT is not set
@@ -91,7 +87,6 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_LEDS_TRIGGER_TRANSIENT=y
 CONFIG_DMADEVICES=y
 CONFIG_DW_DMAC=y
-CONFIG_NET_DMA=y
 CONFIG_ASYNC_TX_DMA=y
 # CONFIG_IOMMU_SUPPORT is not set
 # CONFIG_DNOTIFY is not set
@@ -100,17 +95,16 @@ CONFIG_TMPFS=y
 CONFIG_CONFIGFS_FS=y
 # CONFIG_MISC_FILESYSTEMS is not set
 # CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
 CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_MEMORY_INIT=y
-CONFIG_DEBUG_STACKOVERFLOW=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
index f36c047..7359859 100644 (file)
@@ -16,7 +16,7 @@ CONFIG_ARC_PLAT_AXS10X=y
 CONFIG_AXS103=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
-# CONFIG_ARC_HAS_GRTC is not set
+# CONFIG_ARC_HAS_GFRC is not set
 CONFIG_ARC_UBOOT_SUPPORT=y
 CONFIG_ARC_BUILTIN_DTB_NAME="vdk_hs38_smp"
 CONFIG_PREEMPT=y
index 7fac7d8..f9f4c6f 100644 (file)
@@ -10,7 +10,8 @@
 #define _ASM_ARC_ARCREGS_H
 
 /* Build Configuration Registers */
-#define ARC_REG_DCCMBASE_BCR   0x61    /* DCCM Base Addr */
+#define ARC_REG_AUX_DCCM       0x18    /* DCCM Base Addr ARCv2 */
+#define ARC_REG_DCCM_BASE_BUILD        0x61    /* DCCM Base Addr ARCompact */
 #define ARC_REG_CRC_BCR                0x62
 #define ARC_REG_VECBASE_BCR    0x68
 #define ARC_REG_PERIBASE_BCR   0x69
 #define ARC_REG_DPFP_BCR       0x6C    /* ARCompact: Dbl Precision FPU */
 #define ARC_REG_FP_V2_BCR      0xc8    /* ARCv2 FPU */
 #define ARC_REG_SLC_BCR                0xce
-#define ARC_REG_DCCM_BCR       0x74    /* DCCM Present + SZ */
+#define ARC_REG_DCCM_BUILD     0x74    /* DCCM size (common) */
 #define ARC_REG_TIMERS_BCR     0x75
 #define ARC_REG_AP_BCR         0x76
-#define ARC_REG_ICCM_BCR       0x78
+#define ARC_REG_ICCM_BUILD     0x78    /* ICCM size (common) */
 #define ARC_REG_XY_MEM_BCR     0x79
 #define ARC_REG_MAC_BCR                0x7a
 #define ARC_REG_MUL_BCR                0x7b
@@ -36,6 +37,7 @@
 #define ARC_REG_IRQ_BCR                0xF3
 #define ARC_REG_SMART_BCR      0xFF
 #define ARC_REG_CLUSTER_BCR    0xcf
+#define ARC_REG_AUX_ICCM       0x208   /* ICCM Base Addr (ARCv2) */
 
 /* status32 Bits Positions */
 #define STATUS_AE_BIT          5       /* Exception active */
@@ -246,7 +248,7 @@ struct bcr_perip {
 #endif
 };
 
-struct bcr_iccm {
+struct bcr_iccm_arcompact {
 #ifdef CONFIG_CPU_BIG_ENDIAN
        unsigned int base:16, pad:5, sz:3, ver:8;
 #else
@@ -254,17 +256,15 @@ struct bcr_iccm {
 #endif
 };
 
-/* DCCM Base Address Register: ARC_REG_DCCMBASE_BCR */
-struct bcr_dccm_base {
+struct bcr_iccm_arcv2 {
 #ifdef CONFIG_CPU_BIG_ENDIAN
-       unsigned int addr:24, ver:8;
+       unsigned int pad:8, sz11:4, sz01:4, sz10:4, sz00:4, ver:8;
 #else
-       unsigned int ver:8, addr:24;
+       unsigned int ver:8, sz00:4, sz10:4, sz01:4, sz11:4, pad:8;
 #endif
 };
 
-/* DCCM RAM Configuration Register: ARC_REG_DCCM_BCR */
-struct bcr_dccm {
+struct bcr_dccm_arcompact {
 #ifdef CONFIG_CPU_BIG_ENDIAN
        unsigned int res:21, sz:3, ver:8;
 #else
@@ -272,6 +272,14 @@ struct bcr_dccm {
 #endif
 };
 
+struct bcr_dccm_arcv2 {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       unsigned int pad2:12, cyc:3, pad1:1, sz1:4, sz0:4, ver:8;
+#else
+       unsigned int ver:8, sz0:4, sz1:4, pad1:1, cyc:3, pad2:12;
+#endif
+};
+
 /* ARCompact: Both SP and DP FPU BCRs have same format */
 struct bcr_fp_arcompact {
 #ifdef CONFIG_CPU_BIG_ENDIAN
@@ -315,9 +323,9 @@ struct bcr_bpu_arcv2 {
 
 struct bcr_generic {
 #ifdef CONFIG_CPU_BIG_ENDIAN
-       unsigned int pad:24, ver:8;
+       unsigned int info:24, ver:8;
 #else
-       unsigned int ver:8, pad:24;
+       unsigned int ver:8, info:24;
 #endif
 };
 
@@ -349,14 +357,13 @@ struct cpuinfo_arc {
        struct cpuinfo_arc_bpu bpu;
        struct bcr_identity core;
        struct bcr_isa isa;
-       struct bcr_timer timers;
        unsigned int vec_base;
        struct cpuinfo_arc_ccm iccm, dccm;
        struct {
                unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, pad1:3,
                             fpu_sp:1, fpu_dp:1, pad2:6,
                             debug:1, ap:1, smart:1, rtt:1, pad3:4,
-                            pad4:8;
+                            timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4;
        } extn;
        struct bcr_mpy extn_mpy;
        struct bcr_extn_xymem extn_xymem;
index 4fd7d62..49014f0 100644 (file)
 #ifdef CONFIG_ISA_ARCOMPACT
 #define TIMER0_IRQ      3
 #define TIMER1_IRQ      4
-#define IPI_IRQ                (NR_CPU_IRQS-1) /* dummy to enable SMP build for up hardware */
 #else
 #define TIMER0_IRQ      16
 #define TIMER1_IRQ      17
-#define IPI_IRQ         19
 #endif
 
 #include <linux/interrupt.h>
index 258b0e5..37c2f75 100644 (file)
@@ -22,6 +22,7 @@
 #define AUX_IRQ_CTRL           0x00E
 #define AUX_IRQ_ACT            0x043   /* Active Intr across all levels */
 #define AUX_IRQ_LVL_PEND       0x200   /* Pending Intr across all levels */
+#define AUX_IRQ_HINT           0x201   /* For generating Soft Interrupts */
 #define AUX_IRQ_PRIORITY       0x206
 #define ICAUSE                 0x40a
 #define AUX_IRQ_SELECT         0x40b
 /* Was Intr taken in User Mode */
 #define AUX_IRQ_ACT_BIT_U      31
 
-/* 0 is highest level, but taken by FIRQs, if present in design */
-#define ARCV2_IRQ_DEF_PRIO             0
+/*
+ * User space should be interruptable even by lowest prio interrupt
+ * Safe even if actual interrupt priorities is fewer or even one
+ */
+#define ARCV2_IRQ_DEF_PRIO     15
 
 /* seed value for status register */
 #define ISA_INIT_STATUS_BITS   (STATUS_IE_MASK | STATUS_AD_MASK | \
@@ -112,6 +116,16 @@ static inline int arch_irqs_disabled(void)
        return arch_irqs_disabled_flags(arch_local_save_flags());
 }
 
+static inline void arc_softirq_trigger(int irq)
+{
+       write_aux_reg(AUX_IRQ_HINT, irq);
+}
+
+static inline void arc_softirq_clear(int irq)
+{
+       write_aux_reg(AUX_IRQ_HINT, 0);
+}
+
 #else
 
 .macro IRQ_DISABLE  scratch
index 46f4e53..847e3bb 100644 (file)
@@ -39,8 +39,8 @@ struct mcip_cmd {
 #define CMD_DEBUG_SET_MASK             0x34
 #define CMD_DEBUG_SET_SELECT           0x36
 
-#define CMD_GRTC_READ_LO               0x42
-#define CMD_GRTC_READ_HI               0x43
+#define CMD_GFRC_READ_LO               0x42
+#define CMD_GFRC_READ_HI               0x43
 
 #define CMD_IDU_ENABLE                 0x71
 #define CMD_IDU_DISABLE                        0x72
index 57af2f0..d426d42 100644 (file)
 #define __S111  PAGE_U_X_W_R
 
 /****************************************************************
- * Page Table Lookup split
+ * 2 tier (PGD:PTE) software page walker
  *
- * We implement 2 tier paging and since this is all software, we are free
- * to customize the span of a PGD / PTE entry to suit us
- *
- *                     32 bit virtual address
+ * [31]                    32 bit virtual address              [0]
  * -------------------------------------------------------
- * | BITS_FOR_PGD    |  BITS_FOR_PTE    |  BITS_IN_PAGE  |
+ * |               | <------------ PGDIR_SHIFT ----------> |
+ * |              |                                     |
+ * | BITS_FOR_PGD  |  BITS_FOR_PTE  | <-- PAGE_SHIFT --> |
  * -------------------------------------------------------
  *       |                  |                |
  *       |                  |                --> off in page frame
- *       |                 |
  *       |                  ---> index into Page Table
- *       |
  *       ----> index into Page Directory
+ *
+ * In a single page size configuration, only PAGE_SHIFT is fixed
+ * So both PGD and PTE sizing can be tweaked
+ *  e.g. 8K page (PAGE_SHIFT 13) can have
+ *  - PGDIR_SHIFT 21  -> 11:8:13 address split
+ *  - PGDIR_SHIFT 24  -> 8:11:13 address split
+ *
+ * If Super Page is configured, PGDIR_SHIFT becomes fixed too,
+ * so the sizing flexibility is gone.
  */
 
-#define BITS_IN_PAGE   PAGE_SHIFT
-
-/* Optimal Sizing of Pg Tbl - based on MMU page size */
-#if defined(CONFIG_ARC_PAGE_SIZE_8K)
-#define BITS_FOR_PTE   8               /* 11:8:13 */
-#elif defined(CONFIG_ARC_PAGE_SIZE_16K)
-#define BITS_FOR_PTE   8               /* 10:8:14 */
-#elif defined(CONFIG_ARC_PAGE_SIZE_4K)
-#define BITS_FOR_PTE   9               /* 11:9:12 */
+#if defined(CONFIG_ARC_HUGEPAGE_16M)
+#define PGDIR_SHIFT    24
+#elif defined(CONFIG_ARC_HUGEPAGE_2M)
+#define PGDIR_SHIFT    21
+#else
+/*
+ * Only Normal page support so "hackable" (see comment above)
+ * Default value provides 11:8:13 (8K), 11:9:12 (4K)
+ */
+#define PGDIR_SHIFT    21
 #endif
 
-#define BITS_FOR_PGD   (32 - BITS_FOR_PTE - BITS_IN_PAGE)
+#define BITS_FOR_PTE   (PGDIR_SHIFT - PAGE_SHIFT)
+#define BITS_FOR_PGD   (32 - PGDIR_SHIFT)
 
-#define PGDIR_SHIFT    (32 - BITS_FOR_PGD)
 #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)    /* vaddr span, not PDG sz */
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
index cbfec79..c126460 100644 (file)
@@ -45,11 +45,12 @@ VECTOR      reserved                ; Reserved slots
 VECTOR handle_interrupt        ; (16) Timer0
 VECTOR handle_interrupt        ; unused (Timer1)
 VECTOR handle_interrupt        ; unused (WDT)
-VECTOR handle_interrupt        ; (19) ICI (inter core interrupt)
-VECTOR handle_interrupt
-VECTOR handle_interrupt
-VECTOR handle_interrupt
-VECTOR handle_interrupt        ; (23) End of fixed IRQs
+VECTOR handle_interrupt        ; (19) Inter core Interrupt (IPI)
+VECTOR handle_interrupt        ; (20) perf Interrupt
+VECTOR handle_interrupt        ; (21) Software Triggered Intr (Self IPI)
+VECTOR handle_interrupt        ; unused
+VECTOR handle_interrupt        ; (23) unused
+# End of fixed IRQs
 
 .rept CONFIG_ARC_NUMBER_OF_INTERRUPTS - 8
        VECTOR  handle_interrupt
@@ -211,7 +212,11 @@ debug_marker_syscall:
 ; (since IRQ NOT allowed in DS in ARCv2, this can only happen if orig
 ; entry was via Exception in DS which got preempted in kernel).
 ;
-; IRQ RTIE won't reliably restore DE bit and/or BTA, needs handling
+; IRQ RTIE won't reliably restore DE bit and/or BTA, needs workaround
+;
+; Solution is return from Intr w/o any delay slot quirks into a kernel trampoline
+; and from pure kernel mode return to delay slot which handles DS bit/BTA correctly
+
 .Lintr_ret_to_delay_slot:
 debug_marker_ds:
 
@@ -222,18 +227,23 @@ debug_marker_ds:
        ld      r2, [sp, PT_ret]
        ld      r3, [sp, PT_status32]
 
+       ; STAT32 for Int return created from scratch
+       ; (No delay dlot, disable Further intr in trampoline)
+
        bic     r0, r3, STATUS_U_MASK|STATUS_DE_MASK|STATUS_IE_MASK|STATUS_L_MASK
        st      r0, [sp, PT_status32]
 
        mov     r1, .Lintr_ret_to_delay_slot_2
        st      r1, [sp, PT_ret]
 
+       ; Orig exception PC/STAT32 safekept @orig_r0 and @event stack slots
        st      r2, [sp, 0]
        st      r3, [sp, 4]
 
        b       .Lisr_ret_fast_path
 
 .Lintr_ret_to_delay_slot_2:
+       ; Trampoline to restore orig exception PC/STAT32/BTA/AUX_USER_SP
        sub     sp, sp, SZ_PT_REGS
        st      r9, [sp, -4]
 
@@ -243,11 +253,19 @@ debug_marker_ds:
        ld      r9, [sp, 4]
        sr      r9, [erstatus]
 
+       ; restore AUX_USER_SP if returning to U mode
+       bbit0   r9, STATUS_U_BIT, 1f
+       ld      r9, [sp, PT_sp]
+       sr      r9, [AUX_USER_SP]
+
+1:
        ld      r9, [sp, 8]
        sr      r9, [erbta]
 
        ld      r9, [sp, -4]
        add     sp, sp, SZ_PT_REGS
+
+       ; return from pure kernel mode to delay slot
        rtie
 
 END(ret_from_exception)
index 0394f9f..9425263 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/irqchip.h>
 #include <asm/irq.h>
 
+static int irq_prio;
+
 /*
  * Early Hardware specific Interrupt setup
  * -Called very early (start_kernel -> setup_arch -> setup_processor)
@@ -24,6 +26,14 @@ void arc_init_IRQ(void)
 {
        unsigned int tmp;
 
+       struct irq_build {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+               unsigned int pad:3, firq:1, prio:4, exts:8, irqs:8, ver:8;
+#else
+               unsigned int ver:8, irqs:8, exts:8, prio:4, firq:1, pad:3;
+#endif
+       } irq_bcr;
+
        struct aux_irq_ctrl {
 #ifdef CONFIG_CPU_BIG_ENDIAN
                unsigned int res3:18, save_idx_regs:1, res2:1,
@@ -46,28 +56,25 @@ void arc_init_IRQ(void)
 
        WRITE_AUX(AUX_IRQ_CTRL, ictrl);
 
-       /* setup status32, don't enable intr yet as kernel doesn't want */
-       tmp = read_aux_reg(0xa);
-       tmp |= ISA_INIT_STATUS_BITS;
-       tmp &= ~STATUS_IE_MASK;
-       asm volatile("flag %0   \n"::"r"(tmp));
-
        /*
         * ARCv2 core intc provides multiple interrupt priorities (upto 16).
         * Typical builds though have only two levels (0-high, 1-low)
         * Linux by default uses lower prio 1 for most irqs, reserving 0 for
         * NMI style interrupts in future (say perf)
-        *
-        * Read the intc BCR to confirm that Linux default priority is avail
-        * in h/w
-        *
-        * Note:
-        *  IRQ_BCR[27..24] contains N-1 (for N priority levels) and prio level
-        *  is 0 based.
         */
-       tmp = (read_aux_reg(ARC_REG_IRQ_BCR) >> 24 ) & 0xF;
-       if (ARCV2_IRQ_DEF_PRIO > tmp)
-               panic("Linux default irq prio incorrect\n");
+
+       READ_BCR(ARC_REG_IRQ_BCR, irq_bcr);
+
+       irq_prio = irq_bcr.prio;        /* Encoded as N-1 for N levels */
+       pr_info("archs-intc\t: %d priority levels (default %d)%s\n",
+               irq_prio + 1, irq_prio,
+               irq_bcr.firq ? " FIRQ (not used)":"");
+
+       /* setup status32, don't enable intr yet as kernel doesn't want */
+       tmp = read_aux_reg(0xa);
+       tmp |= STATUS_AD_MASK | (irq_prio << 1);
+       tmp &= ~STATUS_IE_MASK;
+       asm volatile("flag %0   \n"::"r"(tmp));
 }
 
 static void arcv2_irq_mask(struct irq_data *data)
@@ -86,7 +93,7 @@ void arcv2_irq_enable(struct irq_data *data)
 {
        /* set default priority */
        write_aux_reg(AUX_IRQ_SELECT, data->irq);
-       write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO);
+       write_aux_reg(AUX_IRQ_PRIORITY, irq_prio);
 
        /*
         * hw auto enables (linux unmask) all by default
index 06bcedf..224d1c3 100644 (file)
@@ -81,9 +81,6 @@ static int arc_intc_domain_map(struct irq_domain *d, unsigned int irq,
 {
        switch (irq) {
        case TIMER0_IRQ:
-#ifdef CONFIG_SMP
-       case IPI_IRQ:
-#endif
                irq_set_chip_and_handler(irq, &onchip_intc, handle_percpu_irq);
                break;
        default:
index bd237ac..c41c364 100644 (file)
 #include <linux/smp.h>
 #include <linux/irq.h>
 #include <linux/spinlock.h>
+#include <asm/irqflags-arcv2.h>
 #include <asm/mcip.h>
 #include <asm/setup.h>
 
+#define IPI_IRQ                19
+#define SOFTIRQ_IRQ    21
+
 static char smp_cpuinfo_buf[128];
 static int idu_detected;
 
@@ -22,6 +26,7 @@ static DEFINE_RAW_SPINLOCK(mcip_lock);
 static void mcip_setup_per_cpu(int cpu)
 {
        smp_ipi_irq_setup(cpu, IPI_IRQ);
+       smp_ipi_irq_setup(cpu, SOFTIRQ_IRQ);
 }
 
 static void mcip_ipi_send(int cpu)
@@ -29,46 +34,44 @@ static void mcip_ipi_send(int cpu)
        unsigned long flags;
        int ipi_was_pending;
 
+       /* ARConnect can only send IPI to others */
+       if (unlikely(cpu == raw_smp_processor_id())) {
+               arc_softirq_trigger(SOFTIRQ_IRQ);
+               return;
+       }
+
+       raw_spin_lock_irqsave(&mcip_lock, flags);
+
        /*
-        * NOTE: We must spin here if the other cpu hasn't yet
-        * serviced a previous message. This can burn lots
-        * of time, but we MUST follows this protocol or
-        * ipi messages can be lost!!!
-        * Also, we must release the lock in this loop because
-        * the other side may get to this same loop and not
-        * be able to ack -- thus causing deadlock.
+        * If receiver already has a pending interrupt, elide sending this one.
+        * Linux cross core calling works well with concurrent IPIs
+        * coalesced into one
+        * see arch/arc/kernel/smp.c: ipi_send_msg_one()
         */
+       __mcip_cmd(CMD_INTRPT_READ_STATUS, cpu);
+       ipi_was_pending = read_aux_reg(ARC_REG_MCIP_READBACK);
+       if (!ipi_was_pending)
+               __mcip_cmd(CMD_INTRPT_GENERATE_IRQ, cpu);
 
-       do {
-               raw_spin_lock_irqsave(&mcip_lock, flags);
-               __mcip_cmd(CMD_INTRPT_READ_STATUS, cpu);
-               ipi_was_pending = read_aux_reg(ARC_REG_MCIP_READBACK);
-               if (ipi_was_pending == 0)
-                       break; /* break out but keep lock */
-               raw_spin_unlock_irqrestore(&mcip_lock, flags);
-       } while (1);
-
-       __mcip_cmd(CMD_INTRPT_GENERATE_IRQ, cpu);
        raw_spin_unlock_irqrestore(&mcip_lock, flags);
-
-#ifdef CONFIG_ARC_IPI_DBG
-       if (ipi_was_pending)
-               pr_info("IPI ACK delayed from cpu %d\n", cpu);
-#endif
 }
 
 static void mcip_ipi_clear(int irq)
 {
        unsigned int cpu, c;
        unsigned long flags;
-       unsigned int __maybe_unused copy;
+
+       if (unlikely(irq == SOFTIRQ_IRQ)) {
+               arc_softirq_clear(irq);
+               return;
+       }
 
        raw_spin_lock_irqsave(&mcip_lock, flags);
 
        /* Who sent the IPI */
        __mcip_cmd(CMD_INTRPT_CHECK_SOURCE, 0);
 
-       copy = cpu = read_aux_reg(ARC_REG_MCIP_READBACK);       /* 1,2,4,8... */
+       cpu = read_aux_reg(ARC_REG_MCIP_READBACK);      /* 1,2,4,8... */
 
        /*
         * In rare case, multiple concurrent IPIs sent to same target can
@@ -82,12 +85,6 @@ static void mcip_ipi_clear(int irq)
        } while (cpu);
 
        raw_spin_unlock_irqrestore(&mcip_lock, flags);
-
-#ifdef CONFIG_ARC_IPI_DBG
-       if (c != __ffs(copy))
-               pr_info("IPIs from %x coalesced to %x\n",
-                       copy, raw_smp_processor_id());
-#endif
 }
 
 static void mcip_probe_n_setup(void)
@@ -96,13 +93,13 @@ static void mcip_probe_n_setup(void)
 #ifdef CONFIG_CPU_BIG_ENDIAN
                unsigned int pad3:8,
                             idu:1, llm:1, num_cores:6,
-                            iocoh:1,  grtc:1, dbg:1, pad2:1,
+                            iocoh:1,  gfrc:1, dbg:1, pad2:1,
                             msg:1, sem:1, ipi:1, pad:1,
                             ver:8;
 #else
                unsigned int ver:8,
                             pad:1, ipi:1, sem:1, msg:1,
-                            pad2:1, dbg:1, grtc:1, iocoh:1,
+                            pad2:1, dbg:1, gfrc:1, iocoh:1,
                             num_cores:6, llm:1, idu:1,
                             pad3:8;
 #endif
@@ -111,12 +108,13 @@ static void mcip_probe_n_setup(void)
        READ_BCR(ARC_REG_MCIP_BCR, mp);
 
        sprintf(smp_cpuinfo_buf,
-               "Extn [SMP]\t: ARConnect (v%d): %d cores with %s%s%s%s\n",
+               "Extn [SMP]\t: ARConnect (v%d): %d cores with %s%s%s%s%s\n",
                mp.ver, mp.num_cores,
                IS_AVAIL1(mp.ipi, "IPI "),
                IS_AVAIL1(mp.idu, "IDU "),
+               IS_AVAIL1(mp.llm, "LLM "),
                IS_AVAIL1(mp.dbg, "DEBUG "),
-               IS_AVAIL1(mp.grtc, "GRTC"));
+               IS_AVAIL1(mp.gfrc, "GFRC"));
 
        idu_detected = mp.idu;
 
@@ -125,8 +123,8 @@ static void mcip_probe_n_setup(void)
                __mcip_cmd_data(CMD_DEBUG_SET_MASK, 0xf, 0xf);
        }
 
-       if (IS_ENABLED(CONFIG_ARC_HAS_GRTC) && !mp.grtc)
-               panic("kernel trying to use non-existent GRTC\n");
+       if (IS_ENABLED(CONFIG_ARC_HAS_GFRC) && !mp.gfrc)
+               panic("kernel trying to use non-existent GFRC\n");
 }
 
 struct plat_smp_ops plat_smp_ops = {
index e1b8744..cdc821d 100644 (file)
@@ -42,9 +42,57 @@ struct task_struct *_current_task[NR_CPUS];  /* For stack switching */
 
 struct cpuinfo_arc cpuinfo_arc700[NR_CPUS];
 
+static void read_decode_ccm_bcr(struct cpuinfo_arc *cpu)
+{
+       if (is_isa_arcompact()) {
+               struct bcr_iccm_arcompact iccm;
+               struct bcr_dccm_arcompact dccm;
+
+               READ_BCR(ARC_REG_ICCM_BUILD, iccm);
+               if (iccm.ver) {
+                       cpu->iccm.sz = 4096 << iccm.sz; /* 8K to 512K */
+                       cpu->iccm.base_addr = iccm.base << 16;
+               }
+
+               READ_BCR(ARC_REG_DCCM_BUILD, dccm);
+               if (dccm.ver) {
+                       unsigned long base;
+                       cpu->dccm.sz = 2048 << dccm.sz; /* 2K to 256K */
+
+                       base = read_aux_reg(ARC_REG_DCCM_BASE_BUILD);
+                       cpu->dccm.base_addr = base & ~0xF;
+               }
+       } else {
+               struct bcr_iccm_arcv2 iccm;
+               struct bcr_dccm_arcv2 dccm;
+               unsigned long region;
+
+               READ_BCR(ARC_REG_ICCM_BUILD, iccm);
+               if (iccm.ver) {
+                       cpu->iccm.sz = 256 << iccm.sz00;        /* 512B to 16M */
+                       if (iccm.sz00 == 0xF && iccm.sz01 > 0)
+                               cpu->iccm.sz <<= iccm.sz01;
+
+                       region = read_aux_reg(ARC_REG_AUX_ICCM);
+                       cpu->iccm.base_addr = region & 0xF0000000;
+               }
+
+               READ_BCR(ARC_REG_DCCM_BUILD, dccm);
+               if (dccm.ver) {
+                       cpu->dccm.sz = 256 << dccm.sz0;
+                       if (dccm.sz0 == 0xF && dccm.sz1 > 0)
+                               cpu->dccm.sz <<= dccm.sz1;
+
+                       region = read_aux_reg(ARC_REG_AUX_DCCM);
+                       cpu->dccm.base_addr = region & 0xF0000000;
+               }
+       }
+}
+
 static void read_arc_build_cfg_regs(void)
 {
        struct bcr_perip uncached_space;
+       struct bcr_timer timer;
        struct bcr_generic bcr;
        struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
        unsigned long perip_space;
@@ -53,7 +101,11 @@ static void read_arc_build_cfg_regs(void)
        READ_BCR(AUX_IDENTITY, cpu->core);
        READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa);
 
-       READ_BCR(ARC_REG_TIMERS_BCR, cpu->timers);
+       READ_BCR(ARC_REG_TIMERS_BCR, timer);
+       cpu->extn.timer0 = timer.t0;
+       cpu->extn.timer1 = timer.t1;
+       cpu->extn.rtc = timer.rtc;
+
        cpu->vec_base = read_aux_reg(AUX_INTR_VEC_BASE);
 
        READ_BCR(ARC_REG_D_UNCACH_BCR, uncached_space);
@@ -71,36 +123,11 @@ static void read_arc_build_cfg_regs(void)
        cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR) ? 1 : 0;        /* 1,3 */
        cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR) ? 1 : 0;
        cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR) > 1 ? 1 : 0; /* 2 */
-
-       /* Note that we read the CCM BCRs independent of kernel config
-        * This is to catch the cases where user doesn't know that
-        * CCMs are present in hardware build
-        */
-       {
-               struct bcr_iccm iccm;
-               struct bcr_dccm dccm;
-               struct bcr_dccm_base dccm_base;
-               unsigned int bcr_32bit_val;
-
-               bcr_32bit_val = read_aux_reg(ARC_REG_ICCM_BCR);
-               if (bcr_32bit_val) {
-                       iccm = *((struct bcr_iccm *)&bcr_32bit_val);
-                       cpu->iccm.base_addr = iccm.base << 16;
-                       cpu->iccm.sz = 0x2000 << (iccm.sz - 1);
-               }
-
-               bcr_32bit_val = read_aux_reg(ARC_REG_DCCM_BCR);
-               if (bcr_32bit_val) {
-                       dccm = *((struct bcr_dccm *)&bcr_32bit_val);
-                       cpu->dccm.sz = 0x800 << (dccm.sz);
-
-                       READ_BCR(ARC_REG_DCCMBASE_BCR, dccm_base);
-                       cpu->dccm.base_addr = dccm_base.addr << 8;
-               }
-       }
-
        READ_BCR(ARC_REG_XY_MEM_BCR, cpu->extn_xymem);
 
+       /* Read CCM BCRs for boot reporting even if not enabled in Kconfig */
+       read_decode_ccm_bcr(cpu);
+
        read_decode_mmu_bcr();
        read_decode_cache_bcr();
 
@@ -208,9 +235,9 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
                       (unsigned int)(arc_get_core_freq() / 10000) % 100);
 
        n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s\nISA Extn\t: ",
-                      IS_AVAIL1(cpu->timers.t0, "Timer0 "),
-                      IS_AVAIL1(cpu->timers.t1, "Timer1 "),
-                      IS_AVAIL2(cpu->timers.rtc, "64-bit RTC ",
+                      IS_AVAIL1(cpu->extn.timer0, "Timer0 "),
+                      IS_AVAIL1(cpu->extn.timer1, "Timer1 "),
+                      IS_AVAIL2(cpu->extn.rtc, "Local-64-bit-Ctr ",
                                 CONFIG_ARC_HAS_RTC));
 
        n += i = scnprintf(buf + n, len - n, "%s%s%s%s%s",
@@ -232,8 +259,6 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
 
                        n += scnprintf(buf + n, len - n, "mpy[opt %d] ", opt);
                }
-               n += scnprintf(buf + n, len - n, "%s",
-                              IS_USED_CFG(CONFIG_ARC_HAS_HW_MPY));
        }
 
        n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n",
@@ -293,13 +318,13 @@ static void arc_chk_core_config(void)
        struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
        int fpu_enabled;
 
-       if (!cpu->timers.t0)
+       if (!cpu->extn.timer0)
                panic("Timer0 is not present!\n");
 
-       if (!cpu->timers.t1)
+       if (!cpu->extn.timer1)
                panic("Timer1 is not present!\n");
 
-       if (IS_ENABLED(CONFIG_ARC_HAS_RTC) && !cpu->timers.rtc)
+       if (IS_ENABLED(CONFIG_ARC_HAS_RTC) && !cpu->extn.rtc)
                panic("RTC is not present\n");
 
 #ifdef CONFIG_ARC_HAS_DCCM
@@ -334,6 +359,7 @@ static void arc_chk_core_config(void)
                panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n");
 
        if (is_isa_arcv2() && IS_ENABLED(CONFIG_SMP) && cpu->isa.atomic &&
+           IS_ENABLED(CONFIG_ARC_HAS_LLSC) &&
            !IS_ENABLED(CONFIG_ARC_STAR_9000923308))
                panic("llock/scond livelock workaround missing\n");
 }
index ef6e9e1..4cb3add 100644 (file)
@@ -142,7 +142,7 @@ void start_kernel_secondary(void)
 
        local_irq_enable();
        preempt_disable();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /*
@@ -336,11 +336,8 @@ irqreturn_t do_IPI(int irq, void *dev_id)
                int rc;
 
                rc = __do_IPI(msg);
-#ifdef CONFIG_ARC_IPI_DBG
-               /* IPI received but no valid @msg */
                if (rc)
                        pr_info("IPI with bogus msg %ld in %ld\n", msg, copy);
-#endif
                pending &= ~(1U << msg);
        } while (pending);
 
index dfad287..156d983 100644 (file)
@@ -62,7 +62,7 @@
 
 /********** Clock Source Device *********/
 
-#ifdef CONFIG_ARC_HAS_GRTC
+#ifdef CONFIG_ARC_HAS_GFRC
 
 static int arc_counter_setup(void)
 {
@@ -83,10 +83,10 @@ static cycle_t arc_counter_read(struct clocksource *cs)
 
        local_irq_save(flags);
 
-       __mcip_cmd(CMD_GRTC_READ_LO, 0);
+       __mcip_cmd(CMD_GFRC_READ_LO, 0);
        stamp.l = read_aux_reg(ARC_REG_MCIP_READBACK);
 
-       __mcip_cmd(CMD_GRTC_READ_HI, 0);
+       __mcip_cmd(CMD_GFRC_READ_HI, 0);
        stamp.h = read_aux_reg(ARC_REG_MCIP_READBACK);
 
        local_irq_restore(flags);
@@ -95,7 +95,7 @@ static cycle_t arc_counter_read(struct clocksource *cs)
 }
 
 static struct clocksource arc_counter = {
-       .name   = "ARConnect GRTC",
+       .name   = "ARConnect GFRC",
        .rating = 400,
        .read   = arc_counter_read,
        .mask   = CLOCKSOURCE_MASK(64),
index 4c23a68..43788b1 100644 (file)
@@ -106,6 +106,15 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
 endif
 
+# -fstack-protector-strong triggers protection checks in this code,
+# but it is being used too early to link to meaningful stack_chk logic.
+nossp_flags := $(call cc-option, -fno-stack-protector)
+CFLAGS_atags_to_fdt.o := $(nossp_flags)
+CFLAGS_fdt.o := $(nossp_flags)
+CFLAGS_fdt_ro.o := $(nossp_flags)
+CFLAGS_fdt_rw.o := $(nossp_flags)
+CFLAGS_fdt_wip.o := $(nossp_flags)
+
 ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj)
 asflags-y := -DZIMAGE
 
@@ -186,5 +195,7 @@ CFLAGS_font.o := -Dstatic=
 $(obj)/font.c: $(FONTC)
        $(call cmd,shipped)
 
+AFLAGS_hyp-stub.o := -Wa,-march=armv7-a
+
 $(obj)/hyp-stub.S: $(srctree)/arch/$(SRCARCH)/kernel/hyp-stub.S
        $(call cmd,shipped)
index f3db13d..0cc150b 100644 (file)
        };
 };
 
+
+/include/ "tps65217.dtsi"
+
 &tps {
-       compatible = "ti,tps65217";
        /*
         * Configure pmic to enter OFF-state instead of SLEEP-state ("RTC-only
         * mode") at poweroff.  Most BeagleBone versions do not support RTC-only
        ti,pmic-shutdown-controller;
 
        regulators {
-               #address-cells = <1>;
-               #size-cells = <0>;
-
                dcdc1_reg: regulator@0 {
-                       reg = <0>;
                        regulator-name = "vdds_dpr";
                        regulator-always-on;
                };
 
                dcdc2_reg: regulator@1 {
-                       reg = <1>;
                        /* VDD_MPU voltage limits 0.95V - 1.26V with +/-4% tolerance */
                        regulator-name = "vdd_mpu";
                        regulator-min-microvolt = <925000>;
                };
 
                dcdc3_reg: regulator@2 {
-                       reg = <2>;
                        /* VDD_CORE voltage limits 0.95V - 1.1V with +/-4% tolerance */
                        regulator-name = "vdd_core";
                        regulator-min-microvolt = <925000>;
                };
 
                ldo1_reg: regulator@3 {
-                       reg = <3>;
                        regulator-name = "vio,vrtc,vdds";
                        regulator-always-on;
                };
 
                ldo2_reg: regulator@4 {
-                       reg = <4>;
                        regulator-name = "vdd_3v3aux";
                        regulator-always-on;
                };
 
                ldo3_reg: regulator@5 {
-                       reg = <5>;
                        regulator-name = "vdd_1v8";
                        regulator-always-on;
                };
 
                ldo4_reg: regulator@6 {
-                       reg = <6>;
                        regulator-name = "vdd_3v3a";
                        regulator-always-on;
                };
index fda457b..857d989 100644 (file)
 
 };
 
-&tps {
-       compatible = "ti,tps65217";
+/include/ "tps65217.dtsi"
 
+&tps {
        regulators {
-               #address-cells = <1>;
-               #size-cells = <0>;
-
                dcdc1_reg: regulator@0 {
-                       reg = <0>;
                        regulator-name = "vdds_dpr";
                        regulator-always-on;
                };
 
                dcdc2_reg: regulator@1 {
-                       reg = <1>;
                        /* VDD_MPU voltage limits 0.95V - 1.26V with +/-4% tolerance */
                        regulator-name = "vdd_mpu";
                        regulator-min-microvolt = <925000>;
                };
 
                dcdc3_reg: regulator@2 {
-                       reg = <2>;
                        /* VDD_CORE voltage limits 0.95V - 1.1V with +/-4% tolerance */
                        regulator-name = "vdd_core";
                        regulator-min-microvolt = <925000>;
                };
 
                ldo1_reg: regulator@3 {
-                       reg = <3>;
                        regulator-name = "vio,vrtc,vdds";
                        regulator-boot-on;
                        regulator-always-on;
                };
 
                ldo2_reg: regulator@4 {
-                       reg = <4>;
                        regulator-name = "vdd_3v3aux";
                        regulator-boot-on;
                        regulator-always-on;
                };
 
                ldo3_reg: regulator@5 {
-                       reg = <5>;
                        regulator-name = "vdd_1v8";
                        regulator-boot-on;
                        regulator-always-on;
                };
 
                ldo4_reg: regulator@6 {
-                       reg = <6>;
                        regulator-name = "vdd_3v3d";
                        regulator-boot-on;
                        regulator-always-on;
index 77559a1..f313999 100644 (file)
        wp-gpios = <&gpio3 18 0>;
 };
 
-&tps {
-       compatible = "ti,tps65217";
+#include "tps65217.dtsi"
 
+&tps {
        regulators {
-               #address-cells = <1>;
-               #size-cells = <0>;
-
                dcdc1_reg: regulator@0 {
-                       reg = <0>;
                        /* +1.5V voltage with Â±4% tolerance */
                        regulator-min-microvolt = <1450000>;
                        regulator-max-microvolt = <1550000>;
                };
 
                dcdc2_reg: regulator@1 {
-                       reg = <1>;
                        /* VDD_MPU voltage limits 0.95V - 1.1V with Â±4% tolerance */
                        regulator-name = "vdd_mpu";
                        regulator-min-microvolt = <915000>;
                };
 
                dcdc3_reg: regulator@2 {
-                       reg = <2>;
                        /* VDD_CORE voltage limits 0.95V - 1.1V with Â±4% tolerance */
                        regulator-name = "vdd_core";
                        regulator-min-microvolt = <915000>;
                };
 
                ldo1_reg: regulator@3 {
-                       reg = <3>;
                        /* +1.8V voltage with Â±4% tolerance */
                        regulator-min-microvolt = <1750000>;
                        regulator-max-microvolt = <1870000>;
                };
 
                ldo2_reg: regulator@4 {
-                       reg = <4>;
                        /* +3.3V voltage with Â±4% tolerance */
                        regulator-min-microvolt = <3175000>;
                        regulator-max-microvolt = <3430000>;
                };
 
                ldo3_reg: regulator@5 {
-                       reg = <5>;
                        /* +1.8V voltage with Â±4% tolerance */
                        regulator-min-microvolt = <1750000>;
                        regulator-max-microvolt = <1870000>;
                };
 
                ldo4_reg: regulator@6 {
-                       reg = <6>;
                        /* +3.3V voltage with Â±4% tolerance */
                        regulator-min-microvolt = <3175000>;
                        regulator-max-microvolt = <3430000>;
index 471a3a7..8867aaa 100644 (file)
        vin-supply = <&vbat>;
 };
 
-&tps {
-       compatible = "ti,tps65217";
+/include/ "tps65217.dtsi"
 
+&tps {
        backlight {
                isel = <1>; /* ISET1 */
                fdim = <200>; /* TPS65217_BL_FDIM_200HZ */
        };
 
        regulators {
-               #address-cells = <1>;
-               #size-cells = <0>;
-
                dcdc1_reg: regulator@0 {
-                       reg = <0>;
                        /* VDD_1V8 system supply */
                        regulator-always-on;
                };
 
                dcdc2_reg: regulator@1 {
-                       reg = <1>;
                        /* VDD_CORE voltage limits 0.95V - 1.26V with +/-4% tolerance */
                        regulator-name = "vdd_core";
                        regulator-min-microvolt = <925000>;
                };
 
                dcdc3_reg: regulator@2 {
-                       reg = <2>;
                        /* VDD_MPU voltage limits 0.95V - 1.1V with +/-4% tolerance */
                        regulator-name = "vdd_mpu";
                        regulator-min-microvolt = <925000>;
                };
 
                ldo1_reg: regulator@3 {
-                       reg = <3>;
                        /* VRTC 1.8V always-on supply */
                        regulator-name = "vrtc,vdds";
                        regulator-always-on;
                };
 
                ldo2_reg: regulator@4 {
-                       reg = <4>;
                        /* 3.3V rail */
                        regulator-name = "vdd_3v3aux";
                        regulator-always-on;
                };
 
                ldo3_reg: regulator@5 {
-                       reg = <5>;
                        /* VDD_3V3A 3.3V rail */
                        regulator-name = "vdd_3v3a";
                        regulator-min-microvolt = <3300000>;
                };
 
                ldo4_reg: regulator@6 {
-                       reg = <6>;
                        /* VDD_3V3B 3.3V rail */
                        regulator-name = "vdd_3v3b";
                        regulator-always-on;
index 1b5b044..865de85 100644 (file)
@@ -46,7 +46,7 @@
                        gpios = <&gpio1 29 GPIO_ACTIVE_HIGH>;
                        linux,code = <KEY_BACK>;
                        debounce-interval = <1000>;
-                       gpio-key,wakeup;
+                       wakeup-source;
                };
 
                front_button {
@@ -54,7 +54,7 @@
                        gpios = <&gpio1 25 GPIO_ACTIVE_HIGH>;
                        linux,code = <KEY_FRONT>;
                        debounce-interval = <1000>;
-                       gpio-key,wakeup;
+                       wakeup-source;
                };
        };
 
index d38edfa..3303c28 100644 (file)
        pinctrl-0 = <&uart4_pins>;
 };
 
+#include "tps65217.dtsi"
+
 &tps {
-       compatible = "ti,tps65217";
        ti,pmic-shutdown-controller;
 
        interrupt-parent = <&intc>;
        interrupts = <7>;       /* NNMI */
 
        regulators {
-               #address-cells = <1>;
-               #size-cells = <0>;
-
                dcdc1_reg: regulator@0 {
-                       reg = <0>;
                        /* VDDS_DDR */
                        regulator-min-microvolt = <1500000>;
                        regulator-max-microvolt = <1500000>;
                };
 
                dcdc2_reg: regulator@1 {
-                       reg = <1>;
                        /* VDD_MPU voltage limits 0.95V - 1.26V with +/-4% tolerance */
                        regulator-name = "vdd_mpu";
                        regulator-min-microvolt = <925000>;
                };
 
                dcdc3_reg: regulator@2 {
-                       reg = <2>;
                        /* VDD_CORE voltage limits 0.95V - 1.1V with +/-4% tolerance */
                        regulator-name = "vdd_core";
                        regulator-min-microvolt = <925000>;
                };
 
                ldo1_reg: regulator@3 {
-                       reg = <3>;
                        /* VRTC / VIO / VDDS*/
                        regulator-always-on;
                        regulator-min-microvolt = <1800000>;
                };
 
                ldo2_reg: regulator@4 {
-                       reg = <4>;
                        /* VDD_3V3AUX */
                        regulator-always-on;
                        regulator-min-microvolt = <3300000>;
                };
 
                ldo3_reg: regulator@5 {
-                       reg = <5>;
                        /* VDD_1V8 */
                        regulator-min-microvolt = <1800000>;
                        regulator-max-microvolt = <1800000>;
                };
 
                ldo4_reg: regulator@6 {
-                       reg = <6>;
                        /* VDD_3V3A */
                        regulator-min-microvolt = <3300000>;
                        regulator-max-microvolt = <3300000>;
index 04885f9..1fafaad 100644 (file)
                        ti,mbox-num-users = <4>;
                        ti,mbox-num-fifos = <8>;
                        mbox_wkupm3: wkup_m3 {
+                               ti,mbox-send-noirq;
                                ti,mbox-tx = <0 0 0>;
                                ti,mbox-rx = <0 0 3>;
                        };
index df955ba..92068fb 100644 (file)
@@ -73,7 +73,7 @@
        global_timer: timer@48240200 {
                compatible = "arm,cortex-a9-global-timer";
                reg = <0x48240200 0x100>;
-               interrupts = <GIC_PPI 11 IRQ_TYPE_LEVEL_HIGH>;
+               interrupts = <GIC_PPI 11 IRQ_TYPE_EDGE_RISING>;
                interrupt-parent = <&gic>;
                clocks = <&mpu_periphclk>;
        };
@@ -81,7 +81,7 @@
        local_timer: timer@48240600 {
                compatible = "arm,cortex-a9-twd-timer";
                reg = <0x48240600 0x100>;
-               interrupts = <GIC_PPI 13 IRQ_TYPE_LEVEL_HIGH>;
+               interrupts = <GIC_PPI 13 IRQ_TYPE_EDGE_RISING>;
                interrupt-parent = <&gic>;
                clocks = <&mpu_periphclk>;
        };
                        ti,mbox-num-users = <4>;
                        ti,mbox-num-fifos = <8>;
                        mbox_wkupm3: wkup_m3 {
+                               ti,mbox-send-noirq;
                                ti,mbox-tx = <0 0 0>;
                                ti,mbox-rx = <0 0 3>;
                        };
index 64d4332..ecd09ab 100644 (file)
                pinctrl-names = "default";
                pinctrl-0 = <&pixcir_ts_pins>;
                reg = <0x5c>;
-               interrupt-parent = <&gpio3>;
-               interrupts = <22 0>;
 
                attb-gpio = <&gpio3 22 GPIO_ACTIVE_HIGH>;
 
                 * 0x264 represents the offset of padconf register of
                 * gpio3_22 from am43xx_pinmux base.
                 */
-               interrupts-extended = <&gpio3 22 IRQ_TYPE_NONE>,
+               interrupts-extended = <&gpio3 22 IRQ_TYPE_EDGE_FALLING>,
                                      <&am43xx_pinmux 0x264>;
                interrupt-names = "tsc", "wakeup";
 
index 746fd2b..d580e2b 100644 (file)
                pinctrl-0 = <&pixcir_ts_pins>;
                reg = <0x5c>;
                interrupt-parent = <&gpio1>;
-               interrupts = <17 0>;
+               interrupts = <17 IRQ_TYPE_EDGE_FALLING>;
 
                attb-gpio = <&gpio1 17 GPIO_ACTIVE_HIGH>;
 
index 36c0fa6..a0986c6 100644 (file)
 
                sound0_master: simple-audio-card,codec {
                        sound-dai = <&tlv320aic3104>;
+                       assigned-clocks = <&clkoutmux2_clk_mux>;
+                       assigned-clock-parents = <&sys_clk2_dclk_div>;
                        clocks = <&clkout2_clk>;
                };
        };
        pinctrl-names = "default", "sleep";
        pinctrl-0 = <&mcasp3_pins_default>;
        pinctrl-1 = <&mcasp3_pins_sleep>;
+       assigned-clocks = <&mcasp3_ahclkx_mux>;
+       assigned-clock-parents = <&sys_clkin2>;
        status = "okay";
 
        op-mode = <0>;  /* MCASP_IIS_MODE */
index c538826..1c06cb7 100644 (file)
                        DRA7XX_CORE_IOPAD(0x35b8, PIN_INPUT_PULLDOWN | MUX_MODE3) /* vin2a_d20.rgmii1_rd3 */
                        DRA7XX_CORE_IOPAD(0x35bc, PIN_INPUT_PULLDOWN | MUX_MODE3) /* vin2a_d21.rgmii1_rd2 */
                        DRA7XX_CORE_IOPAD(0x35c0, PIN_INPUT_PULLDOWN | MUX_MODE3) /* vin2a_d22.rgmii1_rd1 */
-                       DRA7XX_CORE_IOPAD(0x35c4, PIN_INPUT_PULLUP | MUX_MODE3) /* vin2a_d23.rgmii1_rd0 */
+                       DRA7XX_CORE_IOPAD(0x35c4, PIN_INPUT_PULLDOWN | MUX_MODE3) /* vin2a_d23.rgmii1_rd0 */
                >;
        };
 
        pinctrl-names = "default";
        pinctrl-0 = <&qspi1_pins>;
 
-       spi-max-frequency = <20000000>;
+       spi-max-frequency = <48000000>;
 
        spi_flash: spi_flash@0 {
                #address-cells = <1>;
                #size-cells = <1>;
                compatible = "spansion,m25p80", "jedec,spi-nor";
                reg = <0>;                              /* CS0 */
-               spi-max-frequency = <20000000>;
+               spi-max-frequency = <48000000>;
 
                partition@0 {
                        label = "uboot";
                ti,debounce-tol = /bits/ 16 <10>;
                ti,debounce-rep = /bits/ 16 <1>;
 
-               linux,wakeup;
+               wakeup-source;
        };
 };
 
 
 &cpsw_emac0 {
        phy_id = <&davinci_mdio>, <0>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-txid";
        dual_emac_res_vlan = <0>;
 };
 
 &cpsw_emac1 {
        phy_id = <&davinci_mdio>, <1>;
-       phy-mode = "rgmii";
+       phy-mode = "rgmii-txid";
        dual_emac_res_vlan = <1>;
 };
 
 };
 
 &usb2 {
-       dr_mode = "peripheral";
+       dr_mode = "host";
 };
 
 &mcasp3 {
index 77bb8e1..988e996 100644 (file)
@@ -25,8 +25,8 @@
 &dra7_pmx_core {
        uart3_pins_default: uart3_pins_default {
                pinctrl-single,pins = <
-                       DRA7XX_CORE_IOPAD(0x37f8, PIN_INPUT_SLEW | MUX_MODE2)   /* uart2_ctsn.uart3_rxd */
-                       DRA7XX_CORE_IOPAD(0x37fc, PIN_INPUT_SLEW | MUX_MODE1)   /* uart2_rtsn.uart3_txd */
+                       DRA7XX_CORE_IOPAD(0x3648, PIN_INPUT_SLEW | MUX_MODE0)   /* uart3_rxd */
+                       DRA7XX_CORE_IOPAD(0x364c, PIN_INPUT_SLEW | MUX_MODE0)   /* uart3_txd */
                >;
        };
 
        pinctrl-0 = <&i2c5_pins_default>;
        clock-frequency = <400000>;
 
-       eeprom_base: atmel@50 {
+       eeprom_base: atmel@54 {
                compatible = "atmel,24c08";
-               reg = <0x50>;
+               reg = <0x54>;
                pagesize = <16>;
        };
 
index 23fc670..5c21b23 100644 (file)
@@ -70,8 +70,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index f774101..ebe1d26 100644 (file)
@@ -76,8 +76,8 @@
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                          MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index 4878d73..5730b87 100644 (file)
@@ -95,8 +95,8 @@
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
                          MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x1000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index 13cf69a..8af463f 100644 (file)
@@ -65,8 +65,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                        MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                       MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                       MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                       MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                       MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
                                nand-on-flash-bbt;
 
                                partitions {
+                                       compatible = "fixed-partitions";
                                        #address-cells = <1>;
                                        #size-cells = <1>;
 
index 6e9820e..b89e6cf 100644 (file)
@@ -70,8 +70,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index 6ab3383..6522b04 100644 (file)
@@ -68,8 +68,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                internal-regs {
                        serial@12000 {
index 62175a8..d19f44c 100644 (file)
@@ -64,8 +64,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index a5db177..853bd39 100644 (file)
@@ -65,9 +65,9 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xd0000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x01, 0x2f) 0 0 0xf0000000 0x8000000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x01, 0x2f) 0 0 0xe8000000 0x8000000
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                devbus-bootcs {
                        status = "okay";
index 2391b11..d17dab0 100644 (file)
@@ -78,8 +78,8 @@
        soc {
                ranges = <MBUS_ID(0xf0, 0x01) 0 0 0xf1000000 0x100000
                          MBUS_ID(0x01, 0x1d) 0 0 0xfff00000 0x100000
-                         MBUS_ID(0x09, 0x09) 0 0 0xf8100000 0x10000
-                         MBUS_ID(0x09, 0x05) 0 0 0xf8110000 0x10000>;
+                         MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000
+                         MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>;
 
                pcie-controller {
                        status = "okay";
index 77ddff0..e683856 100644 (file)
 
                        macb0: ethernet@f8008000 {
                                pinctrl-names = "default";
-                               pinctrl-0 = <&pinctrl_macb0_default>;
+                               pinctrl-0 = <&pinctrl_macb0_default &pinctrl_macb0_phy_irq>;
                                phy-mode = "rmii";
                                status = "okay";
+
+                               ethernet-phy@1 {
+                                       reg = <0x1>;
+                                       interrupt-parent = <&pioA>;
+                                       interrupts = <73 IRQ_TYPE_LEVEL_LOW>;
+                               };
                        };
 
                        pdmic@f8018000 {
                                        bias-disable;
                                };
 
+                               pinctrl_macb0_phy_irq: macb0_phy_irq {
+                                       pinmux = <PIN_PC9__GPIO>;
+                               };
+
                                pinctrl_pdmic_default: pdmic_default {
                                        pinmux = <PIN_PB26__PDMIC_DAT>,
                                                <PIN_PB27__PDMIC_CLK>;
index 131614f..569026e 100644 (file)
                        macb0: ethernet@f8020000 {
                                phy-mode = "rmii";
                                status = "okay";
+                               pinctrl-names = "default";
+                               pinctrl-0 = <&pinctrl_macb0_rmii &pinctrl_macb0_phy_irq>;
 
                                phy0: ethernet-phy@1 {
                                        interrupt-parent = <&pioE>;
-                                       interrupts = <1 IRQ_TYPE_EDGE_FALLING>;
+                                       interrupts = <1 IRQ_TYPE_LEVEL_LOW>;
                                        reg = <1>;
                                };
                        };
                                                atmel,pins =
                                                        <AT91_PIOE 8 AT91_PERIPH_GPIO AT91_PINCTRL_PULL_UP_DEGLITCH>;
                                        };
+                                       pinctrl_macb0_phy_irq: macb0_phy_irq_0 {
+                                               atmel,pins =
+                                                       <AT91_PIOE 1 AT91_PERIPH_GPIO AT91_PINCTRL_PULL_UP_DEGLITCH>;
+                                       };
                                };
                        };
                };
index 2d4a331..4e98cda 100644 (file)
                        };
 
                        macb0: ethernet@f8020000 {
+                               pinctrl-0 = <&pinctrl_macb0_rmii &pinctrl_macb0_phy_irq>;
                                phy-mode = "rmii";
                                status = "okay";
+
+                               ethernet-phy@1 {
+                                       reg = <0x1>;
+                                       interrupt-parent = <&pioE>;
+                                       interrupts = <1 IRQ_TYPE_LEVEL_LOW>;
+                               };
                        };
 
                        mmc1: mmc@fc000000 {
 
                        pinctrl@fc06a000 {
                                board {
+                                       pinctrl_macb0_phy_irq: macb0_phy_irq {
+                                               atmel,pins =
+                                                       <AT91_PIOE 1 AT91_PERIPH_GPIO AT91_PINCTRL_NONE>;
+                                       };
                                        pinctrl_mmc0_cd: mmc0_cd {
                                                atmel,pins =
                                                        <AT91_PIOE 5 AT91_PERIPH_GPIO AT91_PINCTRL_PULL_UP_DEGLITCH>;
index ca4ddf8..626c67d 100644 (file)
        };
 
        panel: panel {
-               compatible = "qd,qd43003c0-40", "simple-panel";
+               compatible = "qiaodian,qd43003c0-40", "simple-panel";
                backlight = <&backlight>;
                power-supply = <&panel_reg>;
                #address-cells = <1>;
index c4d9175..f82aa44 100644 (file)
                               0x48485200 0x2E00>;
                        #address-cells = <1>;
                        #size-cells = <1>;
+
+                       /*
+                        * Do not allow gating of cpsw clock as workaround
+                        * for errata i877. Keeping internal clock disabled
+                        * causes the device switching characteristics
+                        * to degrade over time and eventually fail to meet
+                        * the data manual delay time/skew specs.
+                        */
+                       ti,no-idle;
+
                        /*
                         * rx_thresh_pend
                         * rx_pend
index 4f6ae92..f74d3db 100644 (file)
                                #size-cells = <1>;
                                reg = <0x2100000 0x10000>;
                                ranges = <0 0x2100000 0x10000>;
-                               interrupt-parent = <&intc>;
                                clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
                                         <&clks IMX6QDL_CLK_CAAM_ACLK>,
                                         <&clks IMX6QDL_CLK_CAAM_IPG>,
index bf4143c..b84af3d 100644 (file)
@@ -14,7 +14,7 @@
 #include "kirkwood-synology.dtsi"
 
 / {
-       model = "Synology DS111";
+       model = "Synology DS112";
        compatible = "synology,ds111", "marvell,kirkwood";
 
        memory {
index 09eed3c..36eec73 100644 (file)
@@ -1,7 +1,8 @@
 /*
  * Device Tree file for Buffalo Linkstation LS-WVL/VL
  *
- * Copyright (C) 2015, rogershimizu@gmail.com
+ * Copyright (C) 2015, 2016
+ * Roger Shimizu <rogershimizu@gmail.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
                button@1 {
                        label = "Function Button";
                        linux,code = <KEY_OPTION>;
-                       gpios = <&gpio0 45 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 13 GPIO_ACTIVE_LOW>;
                };
 
                button@2 {
                        label = "Power-on Switch";
                        linux,code = <KEY_RESERVED>;
                        linux,input-type = <5>;
-                       gpios = <&gpio0 46 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 14 GPIO_ACTIVE_LOW>;
                };
 
                button@3 {
                        label = "Power-auto Switch";
                        linux,code = <KEY_ESC>;
                        linux,input-type = <5>;
-                       gpios = <&gpio0 47 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 15 GPIO_ACTIVE_LOW>;
                };
        };
 
 
                led@1 {
                        label = "lswvl:red:alarm";
-                       gpios = <&gpio0 36 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>;
                };
 
                led@2 {
                        label = "lswvl:red:func";
-                       gpios = <&gpio0 37 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 5 GPIO_ACTIVE_HIGH>;
                };
 
                led@3 {
                        label = "lswvl:amber:info";
-                       gpios = <&gpio0 38 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 6 GPIO_ACTIVE_HIGH>;
                };
 
                led@4 {
                        label = "lswvl:blue:func";
-                       gpios = <&gpio0 39 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>;
                };
 
                led@5 {
                        label = "lswvl:blue:power";
-                       gpios = <&gpio0 40 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 8 GPIO_ACTIVE_LOW>;
                        default-state = "keep";
                };
 
                led@6 {
                        label = "lswvl:red:hdderr0";
-                       gpios = <&gpio0 34 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>;
                };
 
                led@7 {
                        label = "lswvl:red:hdderr1";
-                       gpios = <&gpio0 35 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 3 GPIO_ACTIVE_HIGH>;
                };
        };
 
                                3250 1
                                5000 0>;
 
-               alarm-gpios = <&gpio0 43 GPIO_ACTIVE_HIGH>;
+               alarm-gpios = <&gpio1 11 GPIO_ACTIVE_HIGH>;
        };
 
        restart_poweroff {
index f5db16a..b13ec20 100644 (file)
@@ -1,7 +1,8 @@
 /*
  * Device Tree file for Buffalo Linkstation LS-WXL/WSXL
  *
- * Copyright (C) 2015, rogershimizu@gmail.com
+ * Copyright (C) 2015, 2016
+ * Roger Shimizu <rogershimizu@gmail.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
                button@1 {
                        label = "Function Button";
                        linux,code = <KEY_OPTION>;
-                       gpios = <&gpio1 41 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 9 GPIO_ACTIVE_LOW>;
                };
 
                button@2 {
                        label = "Power-on Switch";
                        linux,code = <KEY_RESERVED>;
                        linux,input-type = <5>;
-                       gpios = <&gpio1 42 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 10 GPIO_ACTIVE_LOW>;
                };
 
                button@3 {
                        label = "Power-auto Switch";
                        linux,code = <KEY_ESC>;
                        linux,input-type = <5>;
-                       gpios = <&gpio1 43 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 11 GPIO_ACTIVE_LOW>;
                };
        };
 
 
                led@1 {
                        label = "lswxl:blue:func";
-                       gpios = <&gpio1 36 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 4 GPIO_ACTIVE_LOW>;
                };
 
                led@2 {
                        label = "lswxl:red:alarm";
-                       gpios = <&gpio1 49 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 17 GPIO_ACTIVE_LOW>;
                };
 
                led@3 {
 
                led@4 {
                        label = "lswxl:blue:power";
-                       gpios = <&gpio1 8 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>;
+                       default-state = "keep";
                };
 
                led@5 {
                        label = "lswxl:red:func";
-                       gpios = <&gpio1 5 GPIO_ACTIVE_LOW>;
-                       default-state = "keep";
+                       gpios = <&gpio1 2 GPIO_ACTIVE_HIGH>;
                };
 
                led@6 {
                        label = "lswxl:red:hdderr0";
-                       gpios = <&gpio1 2 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio0 8 GPIO_ACTIVE_HIGH>;
                };
 
                led@7 {
                        label = "lswxl:red:hdderr1";
-                       gpios = <&gpio1 3 GPIO_ACTIVE_LOW>;
+                       gpios = <&gpio1 14 GPIO_ACTIVE_HIGH>;
                };
        };
 
                pinctrl-0 = <&pmx_fan_low &pmx_fan_high &pmx_fan_lock>;
                pinctrl-names = "default";
 
-               gpios = <&gpio0 47 GPIO_ACTIVE_LOW
-                        &gpio0 48 GPIO_ACTIVE_LOW>;
+               gpios = <&gpio1 16 GPIO_ACTIVE_LOW
+                        &gpio1 15 GPIO_ACTIVE_LOW>;
 
                gpio-fan,speed-map = <0 3
                                1500 2
                                3250 1
                                5000 0>;
 
-               alarm-gpios = <&gpio1 49 GPIO_ACTIVE_HIGH>;
+               alarm-gpios = <&gpio1 8 GPIO_ACTIVE_HIGH>;
        };
 
        restart_poweroff {
                        enable-active-high;
                        regulator-always-on;
                        regulator-boot-on;
-                       gpio = <&gpio0 37 GPIO_ACTIVE_HIGH>;
+                       gpio = <&gpio1 5 GPIO_ACTIVE_HIGH>;
                };
                hdd_power0: regulator@2 {
                        compatible = "regulator-fixed";
index 1db6f2c..8082d64 100644 (file)
        chip-delay = <40>;
        status = "okay";
        partitions {
+               compatible = "fixed-partitions";
                #address-cells = <1>;
                #size-cells = <1>;
 
index 7fed0bd..0080532 100644 (file)
        clock-frequency = <400000>;
 };
 
-&i2c2 {
-       clock-frequency = <400000>;
-};
-
-&i2c3 {
-       clock-frequency = <400000>;
-};
-
 /*
  * Only found on the wireless SOM. For the SOM without wireless, the pins for
  * MMC3 can be routed with jumpers to the second MMC slot on the devkit and
                interrupt-parent = <&gpio5>;
                interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; /* gpio 152 */
                ref-clock-frequency = <26000000>;
+               tcxo-clock-frequency = <26000000>;
        };
 };
 
index 888412c..902657d 100644 (file)
        };
 };
 
+&gpio8 {
+       /* TI trees use GPIO instead of msecure, see also muxing */
+       p234 {
+               gpio-hog;
+               gpios = <10 GPIO_ACTIVE_HIGH>;
+               output-high;
+               line-name = "gpio8_234/msecure";
+       };
+};
+
 &omap5_pmx_core {
        pinctrl-names = "default";
        pinctrl-0 = <
                >;
        };
 
+       /* TI trees use GPIO mode; msecure mode does not work reliably? */
+       palmas_msecure_pins: palmas_msecure_pins {
+               pinctrl-single,pins = <
+                       OMAP5_IOPAD(0x180, PIN_OUTPUT | MUX_MODE6) /* gpio8_234 */
+               >;
+       };
+
        usbhost_pins: pinmux_usbhost_pins {
                pinctrl-single,pins = <
                        OMAP5_IOPAD(0x0c4, PIN_INPUT | MUX_MODE0) /* usbb2_hsic_strobe */
                        &usbhost_wkup_pins
        >;
 
+       palmas_sys_nirq_pins: pinmux_palmas_sys_nirq_pins {
+               pinctrl-single,pins = <
+                       OMAP5_IOPAD(0x068, PIN_INPUT_PULLUP | MUX_MODE0) /* sys_nirq1 */
+               >;
+       };
+
        usbhost_wkup_pins: pinmux_usbhost_wkup_pins {
                pinctrl-single,pins = <
                        OMAP5_IOPAD(0x05a, PIN_OUTPUT | MUX_MODE0) /* fref_clk1_out, USB hub clk */
                interrupt-controller;
                #interrupt-cells = <2>;
                ti,system-power-controller;
+               pinctrl-names = "default";
+               pinctrl-0 = <&palmas_sys_nirq_pins &palmas_msecure_pins>;
 
                extcon_usb3: palmas_usb {
                        compatible = "ti,palmas-usb-vid";
                        #clock-cells = <0>;
                };
 
+               rtc {
+                       compatible = "ti,palmas-rtc";
+                       interrupt-parent = <&palmas>;
+                       interrupts = <8 IRQ_TYPE_NONE>;
+                       ti,backup-battery-chargeable;
+                       ti,backup-battery-charge-high-current;
+               };
+
                palmas_pmic {
                        compatible = "ti,palmas-pmic";
                        interrupt-parent = <&palmas>;
index 3daec91..aae8a7a 100644 (file)
@@ -1,7 +1,8 @@
 /*
  * Device Tree file for Buffalo Linkstation LS-WTGL
  *
- * Copyright (C) 2015, Roger Shimizu <rogershimizu@gmail.com>
+ * Copyright (C) 2015, 2016
+ * Roger Shimizu <rogershimizu@gmail.com>
  *
  * This file is dual-licensed: you can use it either under the terms
  * of the GPL or the X11 license, at your option. Note that this dual
@@ -69,8 +70,6 @@
 
                internal-regs {
                        pinctrl: pinctrl@10000 {
-                               pinctrl-0 = <&pmx_usb_power &pmx_power_hdd
-                                       &pmx_fan_low &pmx_fan_high &pmx_fan_lock>;
                                pinctrl-names = "default";
 
                                pmx_led_power: pmx-leds {
                led@1 {
                        label = "lswtgl:blue:power";
                        gpios = <&gpio0 0 GPIO_ACTIVE_LOW>;
+                       default-state = "keep";
                };
 
                led@2 {
                                3250 1
                                5000 0>;
 
-               alarm-gpios = <&gpio0 2 GPIO_ACTIVE_HIGH>;
+               alarm-gpios = <&gpio0 6 GPIO_ACTIVE_HIGH>;
        };
 
        restart_poweroff {
        };
 };
 
+&devbus_bootcs {
+       status = "okay";
+       devbus,keep-config;
+
+       flash@0 {
+               compatible = "jedec-flash";
+               reg = <0 0x40000>;
+               bank-width = <1>;
+
+               partitions {
+                       compatible = "fixed-partitions";
+                       #address-cells = <1>;
+                       #size-cells = <1>;
+
+                       header@0 {
+                               reg = <0 0x30000>;
+                               read-only;
+                       };
+
+                       uboot@30000 {
+                               reg = <0x30000 0xF000>;
+                               read-only;
+                       };
+
+                       uboot_env@3F000 {
+                               reg = <0x3F000 0x1000>;
+                       };
+               };
+       };
+};
+
 &mdio {
        status = "okay";
 
index 6713b1e..01d239c 100644 (file)
        pinctrl-names = "default";
 
        status = "okay";
-       renesas,enable-gpio = <&gpio5 31 GPIO_ACTIVE_HIGH>;
 };
 
 &usbphy {
index 1afe246..b0c912f 100644 (file)
@@ -90,7 +90,7 @@
 #define PIN_PA14__I2SC1_MCK            PINMUX_PIN(PIN_PA14, 4, 2)
 #define PIN_PA14__FLEXCOM3_IO2         PINMUX_PIN(PIN_PA14, 5, 1)
 #define PIN_PA14__D9                   PINMUX_PIN(PIN_PA14, 6, 2)
-#define PIN_PA15                       14
+#define PIN_PA15                       15
 #define PIN_PA15__GPIO                 PINMUX_PIN(PIN_PA15, 0, 0)
 #define PIN_PA15__SPI0_MOSI            PINMUX_PIN(PIN_PA15, 1, 1)
 #define PIN_PA15__TF1                  PINMUX_PIN(PIN_PA15, 2, 1)
index b8032bc..db1151c 100644 (file)
                        dbgu: serial@fc069000 {
                                compatible = "atmel,at91sam9260-dbgu", "atmel,at91sam9260-usart";
                                reg = <0xfc069000 0x200>;
-                               interrupts = <2 IRQ_TYPE_LEVEL_HIGH 7>;
+                               interrupts = <45 IRQ_TYPE_LEVEL_HIGH 7>;
                                pinctrl-names = "default";
                                pinctrl-0 = <&pinctrl_dbgu>;
                                clocks = <&dbgu_clk>;
index d0c7438..27a333e 100644 (file)
                        };
                        mmcsd_default_mode: mmcsd_default {
                                mmcsd_default_cfg1 {
-                                       /* MCCLK */
-                                       pins = "GPIO8_B10";
-                                       ste,output = <0>;
-                               };
-                               mmcsd_default_cfg2 {
-                                       /* MCCMDDIR, MCDAT0DIR, MCDAT31DIR, MCDATDIR2 */
-                                       pins = "GPIO10_C11", "GPIO15_A12",
-                                       "GPIO16_C13", "GPIO23_D15";
-                                       ste,output = <1>;
-                               };
-                               mmcsd_default_cfg3 {
-                                       /* MCCMD, MCDAT3-0, MCMSFBCLK */
-                                       pins = "GPIO9_A10", "GPIO11_B11",
-                                       "GPIO12_A11", "GPIO13_C12",
-                                       "GPIO14_B12", "GPIO24_C15";
-                                       ste,input = <1>;
+                                       /*
+                                        * MCCLK, MCCMDDIR, MCDAT0DIR, MCDAT31DIR, MCDATDIR2
+                                        * MCCMD, MCDAT3-0, MCMSFBCLK
+                                        */
+                                       pins = "GPIO8_B10", "GPIO9_A10", "GPIO10_C11", "GPIO11_B11",
+                                              "GPIO12_A11", "GPIO13_C12", "GPIO14_B12", "GPIO15_A12",
+                                              "GPIO16_C13", "GPIO23_D15", "GPIO24_C15";
+                                       ste,output = <2>;
                                };
                        };
                };
                        clock-names = "mclk", "apb_pclk";
                        interrupt-parent = <&vica>;
                        interrupts = <22>;
-                       max-frequency = <48000000>;
+                       max-frequency = <400000>;
                        bus-width = <4>;
                        cap-mmc-highspeed;
                        cap-sd-highspeed;
+                       full-pwr-cycle;
+                       /*
+                        * The STw4811 circuit used with the Nomadik strictly
+                        * requires that all of these signal direction pins be
+                        * routed and used for its 4-bit levelshifter.
+                        */
+                       st,sig-dir-dat0;
+                       st,sig-dir-dat2;
+                       st,sig-dir-dat31;
+                       st,sig-dir-cmd;
+                       st,sig-pin-fbclk;
                        pinctrl-names = "default";
                        pinctrl-0 = <&mmcsd_default_mux>, <&mmcsd_default_mode>;
                        vmmc-supply = <&vmmc_regulator>;
diff --git a/arch/arm/boot/dts/tps65217.dtsi b/arch/arm/boot/dts/tps65217.dtsi
new file mode 100644 (file)
index 0000000..a632724
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Integrated Power Management Chip
+ * http://www.ti.com/lit/ds/symlink/tps65217.pdf
+ */
+
+&tps {
+       compatible = "ti,tps65217";
+
+       regulators {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               dcdc1_reg: regulator@0 {
+                       reg = <0>;
+                       regulator-compatible = "dcdc1";
+               };
+
+               dcdc2_reg: regulator@1 {
+                       reg = <1>;
+                       regulator-compatible = "dcdc2";
+               };
+
+               dcdc3_reg: regulator@2 {
+                       reg = <2>;
+                       regulator-compatible = "dcdc3";
+               };
+
+               ldo1_reg: regulator@3 {
+                       reg = <3>;
+                       regulator-compatible = "ldo1";
+               };
+
+               ldo2_reg: regulator@4 {
+                       reg = <4>;
+                       regulator-compatible = "ldo2";
+               };
+
+               ldo3_reg: regulator@5 {
+                       reg = <5>;
+                       regulator-compatible = "ldo3";
+               };
+
+               ldo4_reg: regulator@6 {
+                       reg = <6>;
+                       regulator-compatible = "ldo4";
+               };
+       };
+};
index 2dc6da7..d7ed252 100644 (file)
@@ -16,7 +16,7 @@
  */
 #include <linux/module.h>
 #include <linux/kernel.h>
-
+#include <asm/div64.h>
 #include <asm/hardware/icst.h>
 
 /*
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(icst525_s2div);
 
 unsigned long icst_hz(const struct icst_params *p, struct icst_vco vco)
 {
-       return p->ref * 2 * (vco.v + 8) / ((vco.r + 2) * p->s2div[vco.s]);
+       u64 dividend = p->ref * 2 * (u64)(vco.v + 8);
+       u32 divisor = (vco.r + 2) * p->s2div[vco.s];
+
+       do_div(dividend, divisor);
+       return (unsigned long)dividend;
 }
 
 EXPORT_SYMBOL(icst_hz);
@@ -58,6 +62,7 @@ icst_hz_to_vco(const struct icst_params *p, unsigned long freq)
 
                if (f > p->vco_min && f <= p->vco_max)
                        break;
+               i++;
        } while (i < 8);
 
        if (i >= 8)
index 314f6be..8e8b2ac 100644 (file)
@@ -426,6 +426,7 @@ CONFIG_SUNXI_WATCHDOG=y
 CONFIG_IMX2_WDT=y
 CONFIG_TEGRA_WATCHDOG=m
 CONFIG_MESON_WATCHDOG=y
+CONFIG_DW_WATCHDOG=y
 CONFIG_DIGICOLOR_WATCHDOG=y
 CONFIG_MFD_AS3711=y
 CONFIG_MFD_AS3722=y
index c5e1943..d18d6b4 100644 (file)
@@ -50,6 +50,7 @@ CONFIG_SOC_AM33XX=y
 CONFIG_SOC_AM43XX=y
 CONFIG_SOC_DRA7XX=y
 CONFIG_ARM_THUMBEE=y
+CONFIG_ARM_KERNMEM_PERMS=y
 CONFIG_ARM_ERRATA_411920=y
 CONFIG_ARM_ERRATA_430973=y
 CONFIG_SMP=y
@@ -177,6 +178,7 @@ CONFIG_TI_CPTS=y
 CONFIG_AT803X_PHY=y
 CONFIG_SMSC_PHY=y
 CONFIG_USB_USBNET=m
+CONFIG_USB_NET_SMSC75XX=m
 CONFIG_USB_NET_SMSC95XX=m
 CONFIG_USB_ALI_M5632=y
 CONFIG_USB_AN2720=y
@@ -290,24 +292,23 @@ CONFIG_FB=y
 CONFIG_FIRMWARE_EDID=y
 CONFIG_FB_MODE_HELPERS=y
 CONFIG_FB_TILEBLITTING=y
-CONFIG_OMAP2_DSS=m
-CONFIG_OMAP5_DSS_HDMI=y
-CONFIG_OMAP2_DSS_SDI=y
-CONFIG_OMAP2_DSS_DSI=y
+CONFIG_FB_OMAP5_DSS_HDMI=y
+CONFIG_FB_OMAP2_DSS_SDI=y
+CONFIG_FB_OMAP2_DSS_DSI=y
 CONFIG_FB_OMAP2=m
-CONFIG_DISPLAY_ENCODER_TFP410=m
-CONFIG_DISPLAY_ENCODER_TPD12S015=m
-CONFIG_DISPLAY_CONNECTOR_DVI=m
-CONFIG_DISPLAY_CONNECTOR_HDMI=m
-CONFIG_DISPLAY_CONNECTOR_ANALOG_TV=m
-CONFIG_DISPLAY_PANEL_DPI=m
-CONFIG_DISPLAY_PANEL_DSI_CM=m
-CONFIG_DISPLAY_PANEL_SONY_ACX565AKM=m
-CONFIG_DISPLAY_PANEL_LGPHILIPS_LB035Q02=m
-CONFIG_DISPLAY_PANEL_SHARP_LS037V7DW01=m
-CONFIG_DISPLAY_PANEL_TPO_TD028TTEC1=m
-CONFIG_DISPLAY_PANEL_TPO_TD043MTEA1=m
-CONFIG_DISPLAY_PANEL_NEC_NL8048HL11=m
+CONFIG_FB_OMAP2_ENCODER_TFP410=m
+CONFIG_FB_OMAP2_ENCODER_TPD12S015=m
+CONFIG_FB_OMAP2_CONNECTOR_DVI=m
+CONFIG_FB_OMAP2_CONNECTOR_HDMI=m
+CONFIG_FB_OMAP2_CONNECTOR_ANALOG_TV=m
+CONFIG_FB_OMAP2_PANEL_DPI=m
+CONFIG_FB_OMAP2_PANEL_DSI_CM=m
+CONFIG_FB_OMAP2_PANEL_SONY_ACX565AKM=m
+CONFIG_FB_OMAP2_PANEL_LGPHILIPS_LB035Q02=m
+CONFIG_FB_OMAP2_PANEL_SHARP_LS037V7DW01=m
+CONFIG_FB_OMAP2_PANEL_TPO_TD028TTEC1=m
+CONFIG_FB_OMAP2_PANEL_TPO_TD043MTEA1=m
+CONFIG_FB_OMAP2_PANEL_NEC_NL8048HL11=m
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=y
@@ -354,6 +355,11 @@ CONFIG_USB_MUSB_DSPS=m
 CONFIG_USB_INVENTRA_DMA=y
 CONFIG_USB_TI_CPPI41_DMA=y
 CONFIG_USB_DWC3=m
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_SIMPLE=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_PL2303=m
 CONFIG_USB_TEST=m
 CONFIG_AM335X_PHY_USB=y
 CONFIG_USB_GADGET=m
@@ -387,6 +393,7 @@ CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=m
 CONFIG_LEDS_GPIO=m
 CONFIG_LEDS_PWM=m
+CONFIG_LEDS_PCA963X=m
 CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=m
 CONFIG_LEDS_TRIGGER_ONESHOT=m
@@ -449,6 +456,8 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_SPLIT=y
+CONFIG_DEBUG_INFO_DWARF4=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
index b445a5d..89a3a3e 100644 (file)
@@ -364,7 +364,7 @@ static struct crypto_alg aes_algs[] = { {
        .cra_blkcipher = {
                .min_keysize    = AES_MIN_KEY_SIZE,
                .max_keysize    = AES_MAX_KEY_SIZE,
-               .ivsize         = AES_BLOCK_SIZE,
+               .ivsize         = 0,
                .setkey         = ce_aes_setkey,
                .encrypt        = ecb_encrypt,
                .decrypt        = ecb_decrypt,
@@ -441,7 +441,7 @@ static struct crypto_alg aes_algs[] = { {
        .cra_ablkcipher = {
                .min_keysize    = AES_MIN_KEY_SIZE,
                .max_keysize    = AES_MAX_KEY_SIZE,
-               .ivsize         = AES_BLOCK_SIZE,
+               .ivsize         = 0,
                .setkey         = ablk_set_key,
                .encrypt        = ablk_encrypt,
                .decrypt        = ablk_decrypt,
index 7da5503..e08d151 100644 (file)
@@ -117,6 +117,7 @@ static inline u32 gic_read_iar(void)
        u32 irqstat;
 
        asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat));
+       dsb(sy);
        return irqstat;
 }
 
index d5525bf..9156fc3 100644 (file)
@@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
 #ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 #else
index 0375c8c..9408a99 100644 (file)
@@ -35,14 +35,21 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
             dma_addr_t dev_addr, unsigned long offset, size_t size,
             enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-       bool local = XEN_PFN_DOWN(dev_addr) == page_to_xen_pfn(page);
+       unsigned long page_pfn = page_to_xen_pfn(page);
+       unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr);
+       unsigned long compound_pages =
+               (1<<compound_order(page)) * XEN_PFN_PER_PAGE;
+       bool local = (page_pfn <= dev_pfn) &&
+               (dev_pfn - page_pfn < compound_pages);
+
        /*
-        * Dom0 is mapped 1:1, while the Linux page can be spanned accross
-        * multiple Xen page, it's not possible to have a mix of local and
-        * foreign Xen page. So if the first xen_pfn == mfn the page is local
-        * otherwise it's a foreign page grant-mapped in dom0. If the page is
-        * local we can safely call the native dma_ops function, otherwise we
-        * call the xen specific function.
+        * Dom0 is mapped 1:1, while the Linux page can span across
+        * multiple Xen pages, it's not possible for it to contain a
+        * mix of local and foreign Xen pages. So if the first xen_pfn
+        * == mfn the page is local otherwise it's a foreign page
+        * grant-mapped in dom0. If the page is local we can safely
+        * call the native dma_ops function, otherwise we call the xen
+        * specific function.
         */
        if (local)
                __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs);
index ede692f..5dd2528 100644 (file)
 #define __NR_userfaultfd               (__NR_SYSCALL_BASE+388)
 #define __NR_membarrier                        (__NR_SYSCALL_BASE+389)
 #define __NR_mlock2                    (__NR_SYSCALL_BASE+390)
+#define __NR_copy_file_range           (__NR_SYSCALL_BASE+391)
 
 /*
  * The following SWIs are ARM private.
index 2c5f160..ad325a8 100644 (file)
@@ -88,6 +88,7 @@ obj-$(CONFIG_DEBUG_LL)        += debug.o
 obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
 
 obj-$(CONFIG_ARM_VIRT_EXT)     += hyp-stub.o
+AFLAGS_hyp-stub.o              :=-Wa,-march=armv7-a
 ifeq ($(CONFIG_ARM_PSCI),y)
 obj-$(CONFIG_SMP)              += psci_smp.o
 endif
index ac368bb..dfc7cd6 100644 (file)
                CALL(sys_userfaultfd)
                CALL(sys_membarrier)
                CALL(sys_mlock2)
+               CALL(sys_copy_file_range)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
index 7d0cba6..139791e 100644 (file)
@@ -176,13 +176,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -851,7 +851,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
                res->name  = "System RAM";
                res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index 37312f6..baee702 100644 (file)
@@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void)
        /*
         * OK, it's off to the idle thread for us
         */
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
index dda1959..08e49c4 100644 (file)
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
                vcpu->arch.pause = false;
-               wake_up_interruptible(wq);
+               swake_up(wq);
        }
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
 {
-       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
 
-       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
                                       (!vcpu->arch.pause)));
 }
 
index 5fa69d7..99361f1 100644 (file)
@@ -161,7 +161,7 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
        u64 val;
 
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
-       return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
+       return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
 }
 
 static unsigned long num_core_regs(void)
index 7f33b20..0f6600f 100644 (file)
@@ -206,7 +206,8 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
        run->mmio.is_write      = is_write;
        run->mmio.phys_addr     = fault_ipa;
        run->mmio.len           = len;
-       memcpy(run->mmio.data, data_buf, len);
+       if (is_write)
+               memcpy(run->mmio.data, data_buf, len);
 
        if (!ret) {
                /* We handled the access successfully in the kernel. */
index a9b3b90..c2b1315 100644 (file)
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
        struct kvm *kvm = source_vcpu->kvm;
        struct kvm_vcpu *vcpu = NULL;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
        unsigned long cpu_id;
        unsigned long context_id;
        phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
        smp_mb();               /* Make sure the above is visible */
 
        wq = kvm_arch_vcpu_wq(vcpu);
-       wake_up_interruptible(wq);
+       swake_up(wq);
 
        return PSCI_RET_SUCCESS;
 }
index 77d6b1b..ee06fab 100644 (file)
@@ -86,8 +86,8 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
 {
        dma_addr_t dma;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               PANEL_SIZE, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, PANEL_SIZE, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -116,15 +116,14 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb)
 
 static int lpc32xx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 static void lpc32xx_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
 
 /*
index 64e3d2c..b003e3a 100644 (file)
@@ -3,7 +3,6 @@ menuconfig ARCH_MVEBU
        depends on ARCH_MULTI_V7 || ARCH_MULTI_V5
        select ARCH_SUPPORTS_BIG_ENDIAN
        select CLKSRC_MMIO
-       select GENERIC_IRQ_CHIP
        select PINCTRL
        select PLAT_ORION
        select SOC_BUS
@@ -29,6 +28,7 @@ config MACH_ARMADA_370
        bool "Marvell Armada 370 boards"
        depends on ARCH_MULTI_V7
        select ARMADA_370_CLK
+       select ARMADA_370_XP_IRQ
        select CPU_PJ4B
        select MACH_MVEBU_V7
        select PINCTRL_ARMADA_370
@@ -39,6 +39,7 @@ config MACH_ARMADA_370
 config MACH_ARMADA_375
        bool "Marvell Armada 375 boards"
        depends on ARCH_MULTI_V7
+       select ARMADA_370_XP_IRQ
        select ARM_ERRATA_720789
        select ARM_ERRATA_753970
        select ARM_GIC
@@ -58,6 +59,7 @@ config MACH_ARMADA_38X
        select ARM_ERRATA_720789
        select ARM_ERRATA_753970
        select ARM_GIC
+       select ARMADA_370_XP_IRQ
        select ARMADA_38X_CLK
        select HAVE_ARM_SCU
        select HAVE_ARM_TWD if SMP
@@ -72,6 +74,7 @@ config MACH_ARMADA_39X
        bool "Marvell Armada 39x boards"
        depends on ARCH_MULTI_V7
        select ARM_GIC
+       select ARMADA_370_XP_IRQ
        select ARMADA_39X_CLK
        select CACHE_L2X0
        select HAVE_ARM_SCU
@@ -86,6 +89,7 @@ config MACH_ARMADA_39X
 config MACH_ARMADA_XP
        bool "Marvell Armada XP boards"
        depends on ARCH_MULTI_V7
+       select ARMADA_370_XP_IRQ
        select ARMADA_XP_CLK
        select CPU_PJ4B
        select MACH_MVEBU_V7
index d122ee6..8814ee5 100644 (file)
@@ -42,8 +42,8 @@ int netx_clcd_setup(struct clcd_fb *fb)
 
        fb->panel = netx_panel;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, 1024*1024,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, 1024 * 1024, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                printk(KERN_ERR "CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -57,16 +57,14 @@ int netx_clcd_setup(struct clcd_fb *fb)
 
 int netx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void netx_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
 
 static AMBA_AHB_DEVICE(fb, "fb", 0, 0x00104000, { NETX_IRQ_LCD }, NULL);
index abea126..ea0e5b2 100644 (file)
@@ -90,8 +90,8 @@ int nspire_clcd_setup(struct clcd_fb *fb)
        panel_size = ((panel->mode.xres * panel->mode.yres) * panel->bpp) / 8;
        panel_size = ALIGN(panel_size, PAGE_SIZE);
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev,
-               panel_size, &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, panel_size, &dma,
+                                         GFP_KERNEL);
 
        if (!fb->fb.screen_base) {
                pr_err("CLCD: unable to map framebuffer\n");
@@ -107,13 +107,12 @@ int nspire_clcd_setup(struct clcd_fb *fb)
 
 int nspire_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-               fb->fb.screen_base, fb->fb.fix.smem_start,
-               fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void nspire_clcd_remove(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-               fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
index 8098272..bab814d 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <asm/setup.h>
 #include <asm/mach/arch.h>
+#include <asm/system_info.h>
 
 #include "common.h"
 
@@ -77,12 +78,31 @@ static const char *const n900_boards_compat[] __initconst = {
        NULL,
 };
 
+/* Set system_rev from atags */
+static void __init rx51_set_system_rev(const struct tag *tags)
+{
+       const struct tag *tag;
+
+       if (tags->hdr.tag != ATAG_CORE)
+               return;
+
+       for_each_tag(tag, tags) {
+               if (tag->hdr.tag == ATAG_REVISION) {
+                       system_rev = tag->u.revision.rev;
+                       break;
+               }
+       }
+}
+
 /* Legacy userspace on Nokia N900 needs ATAGS exported in /proc/atags,
  * save them while the data is still not overwritten
  */
 static void __init rx51_reserve(void)
 {
-       save_atags((const struct tag *)(PAGE_OFFSET + 0x100));
+       const struct tag *tags = (const struct tag *)(PAGE_OFFSET + 0x100);
+
+       save_atags(tags);
+       rx51_set_system_rev(tags);
        omap_reserve();
 }
 
index 9cda974..d7f1d69 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/pinctrl/machine.h>
-#include <linux/platform_data/mailbox-omap.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/map.h>
@@ -66,32 +65,6 @@ static int __init omap3_l3_init(void)
 }
 omap_postcore_initcall(omap3_l3_init);
 
-#if defined(CONFIG_OMAP2PLUS_MBOX) || defined(CONFIG_OMAP2PLUS_MBOX_MODULE)
-static inline void __init omap_init_mbox(void)
-{
-       struct omap_hwmod *oh;
-       struct platform_device *pdev;
-       struct omap_mbox_pdata *pdata;
-
-       oh = omap_hwmod_lookup("mailbox");
-       if (!oh) {
-               pr_err("%s: unable to find hwmod\n", __func__);
-               return;
-       }
-       if (!oh->dev_attr) {
-               pr_err("%s: hwmod doesn't have valid attrs\n", __func__);
-               return;
-       }
-
-       pdata = (struct omap_mbox_pdata *)oh->dev_attr;
-       pdev = omap_device_build("omap-mailbox", -1, oh, pdata, sizeof(*pdata));
-       WARN(IS_ERR(pdev), "%s: could not build device, err %ld\n",
-                                               __func__, PTR_ERR(pdev));
-}
-#else
-static inline void omap_init_mbox(void) { }
-#endif /* CONFIG_OMAP2PLUS_MBOX */
-
 static inline void omap_init_sti(void) {}
 
 #if defined(CONFIG_SPI_OMAP24XX) || defined(CONFIG_SPI_OMAP24XX_MODULE)
@@ -229,7 +202,6 @@ static int __init omap2_init_devices(void)
                 * please keep these calls, and their implementations above,
                 * in alphabetical order so they're easier to sort through.
                 */
-               omap_init_mbox();
                omap_init_mcspi();
                omap_init_sham();
                omap_init_aes();
index 7b76ce0..8633c70 100644 (file)
@@ -101,10 +101,8 @@ static void omap2_onenand_set_async_mode(void __iomem *onenand_base)
 
 static void set_onenand_cfg(void __iomem *onenand_base)
 {
-       u32 reg;
+       u32 reg = ONENAND_SYS_CFG1_RDY | ONENAND_SYS_CFG1_INT;
 
-       reg = readw(onenand_base + ONENAND_REG_SYS_CFG1);
-       reg &= ~((0x7 << ONENAND_SYS_CFG1_BRL_SHIFT) | (0x7 << 9));
        reg |=  (latency << ONENAND_SYS_CFG1_BRL_SHIFT) |
                ONENAND_SYS_CFG1_BL_16;
        if (onenand_flags & ONENAND_FLAG_SYNCREAD)
@@ -123,6 +121,7 @@ static void set_onenand_cfg(void __iomem *onenand_base)
                reg |= ONENAND_SYS_CFG1_VHF;
        else
                reg &= ~ONENAND_SYS_CFG1_VHF;
+
        writew(reg, onenand_base + ONENAND_REG_SYS_CFG1);
 }
 
@@ -289,6 +288,7 @@ static int omap2_onenand_setup_async(void __iomem *onenand_base)
                }
        }
 
+       onenand_async.sync_write = true;
        omap2_onenand_calc_async_timings(&t);
 
        ret = gpmc_cs_program_settings(gpmc_onenand_data->cs, &onenand_async);
index 0437537..f7ff3b9 100644 (file)
@@ -191,12 +191,22 @@ static int _omap_device_notifier_call(struct notifier_block *nb,
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct omap_device *od;
+       int err;
 
        switch (event) {
        case BUS_NOTIFY_DEL_DEVICE:
                if (pdev->archdata.od)
                        omap_device_delete(pdev->archdata.od);
                break;
+       case BUS_NOTIFY_UNBOUND_DRIVER:
+               od = to_omap_device(pdev);
+               if (od && (od->_state == OMAP_DEVICE_STATE_ENABLED)) {
+                       dev_info(dev, "enabled after unload, idling\n");
+                       err = omap_device_idle(pdev);
+                       if (err)
+                               dev_err(dev, "failed to idle\n");
+               }
+               break;
        case BUS_NOTIFY_ADD_DEVICE:
                if (pdev->dev.of_node)
                        omap_device_build_from_dt(pdev);
@@ -602,8 +612,10 @@ static int _od_runtime_resume(struct device *dev)
        int ret;
 
        ret = omap_device_enable(pdev);
-       if (ret)
+       if (ret) {
+               dev_err(dev, "use pm_runtime_put_sync_suspend() in driver?\n");
                return ret;
+       }
 
        return pm_generic_runtime_resume(dev);
 }
index e9f65fe..b6d62e4 100644 (file)
@@ -2200,6 +2200,11 @@ static int _enable(struct omap_hwmod *oh)
  */
 static int _idle(struct omap_hwmod *oh)
 {
+       if (oh->flags & HWMOD_NO_IDLE) {
+               oh->_int_flags |= _HWMOD_SKIP_ENABLE;
+               return 0;
+       }
+
        pr_debug("omap_hwmod: %s: idling\n", oh->name);
 
        if (oh->_state != _HWMOD_STATE_ENABLED) {
@@ -2504,6 +2509,8 @@ static int __init _init(struct omap_hwmod *oh, void *data)
                        oh->flags |= HWMOD_INIT_NO_RESET;
                if (of_find_property(np, "ti,no-idle-on-init", NULL))
                        oh->flags |= HWMOD_INIT_NO_IDLE;
+               if (of_find_property(np, "ti,no-idle", NULL))
+                       oh->flags |= HWMOD_NO_IDLE;
        }
 
        oh->_state = _HWMOD_STATE_INITIALIZED;
@@ -2630,7 +2637,7 @@ static void __init _setup_postsetup(struct omap_hwmod *oh)
         * XXX HWMOD_INIT_NO_IDLE does not belong in hwmod data -
         * it should be set by the core code as a runtime flag during startup
         */
-       if ((oh->flags & HWMOD_INIT_NO_IDLE) &&
+       if ((oh->flags & (HWMOD_INIT_NO_IDLE | HWMOD_NO_IDLE)) &&
            (postsetup_state == _HWMOD_STATE_IDLE)) {
                oh->_int_flags |= _HWMOD_SKIP_ENABLE;
                postsetup_state = _HWMOD_STATE_ENABLED;
index 76bce11..7c7a311 100644 (file)
@@ -525,6 +525,8 @@ struct omap_hwmod_omap4_prcm {
  *     or idled.
  * HWMOD_OPT_CLKS_NEEDED: The optional clocks are needed for the module to
  *     operate and they need to be handled at the same time as the main_clk.
+ * HWMOD_NO_IDLE: Do not idle the hwmod at all. Useful to handle certain
+ *     IPs like CPSW on DRA7, where clocks to this module cannot be disabled.
  */
 #define HWMOD_SWSUP_SIDLE                      (1 << 0)
 #define HWMOD_SWSUP_MSTANDBY                   (1 << 1)
@@ -541,6 +543,7 @@ struct omap_hwmod_omap4_prcm {
 #define HWMOD_SWSUP_SIDLE_ACT                  (1 << 12)
 #define HWMOD_RECONFIG_IO_CHAIN                        (1 << 13)
 #define HWMOD_OPT_CLKS_NEEDED                  (1 << 14)
+#define HWMOD_NO_IDLE                          (1 << 15)
 
 /*
  * omap_hwmod._int_flags definitions
index e781e4f..a935d28 100644 (file)
@@ -23,6 +23,8 @@
 #include <linux/platform_data/pinctrl-single.h>
 #include <linux/platform_data/iommu-omap.h>
 #include <linux/platform_data/wkup_m3.h>
+#include <linux/platform_data/pwm_omap_dmtimer.h>
+#include <plat/dmtimer.h>
 
 #include "common.h"
 #include "common-board-devices.h"
@@ -449,6 +451,24 @@ void omap_auxdata_legacy_init(struct device *dev)
        dev->platform_data = &twl_gpio_auxdata;
 }
 
+/* Dual mode timer PWM callbacks platdata */
+#if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
+struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = {
+       .request_by_node = omap_dm_timer_request_by_node,
+       .free = omap_dm_timer_free,
+       .enable = omap_dm_timer_enable,
+       .disable = omap_dm_timer_disable,
+       .get_fclk = omap_dm_timer_get_fclk,
+       .start = omap_dm_timer_start,
+       .stop = omap_dm_timer_stop,
+       .set_load = omap_dm_timer_set_load,
+       .set_match = omap_dm_timer_set_match,
+       .set_pwm = omap_dm_timer_set_pwm,
+       .set_prescaler = omap_dm_timer_set_prescaler,
+       .write_counter = omap_dm_timer_write_counter,
+};
+#endif
+
 /*
  * Few boards still need auxdata populated before we populate
  * the dev entries in of_platform_populate().
@@ -502,6 +522,9 @@ static struct of_dev_auxdata omap_auxdata_lookup[] __initdata = {
        OF_DEV_AUXDATA("ti,am4372-wkup-m3", 0x44d00000, "44d00000.wkup_m3",
                       &wkup_m3_data),
 #endif
+#if IS_ENABLED(CONFIG_OMAP_DM_TIMER)
+       OF_DEV_AUXDATA("ti,omap-dmtimer-pwm", 0, NULL, &pwm_dmtimer_pdata),
+#endif
 #if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5)
        OF_DEV_AUXDATA("ti,omap4-iommu", 0x4a066000, "4a066000.mmu",
                       &omap4_iommu_pdata),
index eafd120..1b9f052 100644 (file)
@@ -86,13 +86,18 @@ ENTRY(enable_omap3630_toggle_l2_on_restore)
        stmfd   sp!, {lr}       @ save registers on stack
        /* Setup so that we will disable and enable l2 */
        mov     r1, #0x1
-       adrl    r2, l2dis_3630  @ may be too distant for plain adr
-       str     r1, [r2]
+       adrl    r3, l2dis_3630_offset   @ may be too distant for plain adr
+       ldr     r2, [r3]                @ value for offset
+       str     r1, [r2, r3]            @ write to l2dis_3630
        ldmfd   sp!, {pc}       @ restore regs and return
 ENDPROC(enable_omap3630_toggle_l2_on_restore)
 
-       .text
-/* Function to call rom code to save secure ram context */
+/*
+ * Function to call rom code to save secure ram context. This gets
+ * relocated to SRAM, so it can be all in .data section. Otherwise
+ * we need to initialize api_params separately.
+ */
+       .data
        .align  3
 ENTRY(save_secure_ram_context)
        stmfd   sp!, {r4 - r11, lr}     @ save registers on stack
@@ -126,6 +131,8 @@ ENDPROC(save_secure_ram_context)
 ENTRY(save_secure_ram_context_sz)
        .word   . - save_secure_ram_context
 
+       .text
+
 /*
  * ======================
  * == Idle entry point ==
@@ -289,12 +296,6 @@ wait_sdrc_ready:
        bic     r5, r5, #0x40
        str     r5, [r4]
 
-/*
- * PC-relative stores lead to undefined behaviour in Thumb-2: use a r7 as a
- * base instead.
- * Be careful not to clobber r7 when maintaing this code.
- */
-
 is_dll_in_lock_mode:
        /* Is dll in lock mode? */
        ldr     r4, sdrc_dlla_ctrl
@@ -302,11 +303,7 @@ is_dll_in_lock_mode:
        tst     r5, #0x4
        bne     exit_nonoff_modes       @ Return if locked
        /* wait till dll locks */
-       adr     r7, kick_counter
 wait_dll_lock_timed:
-       ldr     r4, wait_dll_lock_counter
-       add     r4, r4, #1
-       str     r4, [r7, #wait_dll_lock_counter - kick_counter]
        ldr     r4, sdrc_dlla_status
        /* Wait 20uS for lock */
        mov     r6, #8
@@ -330,9 +327,6 @@ kick_dll:
        orr     r6, r6, #(1<<3)         @ enable dll
        str     r6, [r4]
        dsb
-       ldr     r4, kick_counter
-       add     r4, r4, #1
-       str     r4, [r7]                @ kick_counter
        b       wait_dll_lock_timed
 
 exit_nonoff_modes:
@@ -360,15 +354,6 @@ sdrc_dlla_status:
        .word   SDRC_DLLA_STATUS_V
 sdrc_dlla_ctrl:
        .word   SDRC_DLLA_CTRL_V
-       /*
-        * When exporting to userspace while the counters are in SRAM,
-        * these 2 words need to be at the end to facilitate retrival!
-        */
-kick_counter:
-       .word   0
-wait_dll_lock_counter:
-       .word   0
-
 ENTRY(omap3_do_wfi_sz)
        .word   . - omap3_do_wfi
 
@@ -437,7 +422,9 @@ ENTRY(omap3_restore)
        cmp     r2, #0x0        @ Check if target power state was OFF or RET
        bne     logic_l1_restore
 
-       ldr     r0, l2dis_3630
+       adr     r1, l2dis_3630_offset   @ address for offset
+       ldr     r0, [r1]                @ value for offset
+       ldr     r0, [r1, r0]            @ value at l2dis_3630
        cmp     r0, #0x1        @ should we disable L2 on 3630?
        bne     skipl2dis
        mrc     p15, 0, r0, c1, c0, 1
@@ -449,12 +436,14 @@ skipl2dis:
        and     r1, #0x700
        cmp     r1, #0x300
        beq     l2_inv_gp
+       adr     r0, l2_inv_api_params_offset
+       ldr     r3, [r0]
+       add     r3, r3, r0              @ r3 points to dummy parameters
        mov     r0, #40                 @ set service ID for PPA
        mov     r12, r0                 @ copy secure Service ID in r12
        mov     r1, #0                  @ set task id for ROM code in r1
        mov     r2, #4                  @ set some flags in r2, r6
        mov     r6, #0xff
-       adr     r3, l2_inv_api_params   @ r3 points to dummy parameters
        dsb                             @ data write barrier
        dmb                             @ data memory barrier
        smc     #1                      @ call SMI monitor (smi #1)
@@ -488,8 +477,8 @@ skipl2dis:
        b       logic_l1_restore
 
        .align
-l2_inv_api_params:
-       .word   0x1, 0x00
+l2_inv_api_params_offset:
+       .long   l2_inv_api_params - .
 l2_inv_gp:
        /* Execute smi to invalidate L2 cache */
        mov r12, #0x1                   @ set up to invalidate L2
@@ -506,7 +495,9 @@ l2_inv_gp:
        mov     r12, #0x2
        smc     #0                      @ Call SMI monitor (smieq)
 logic_l1_restore:
-       ldr     r1, l2dis_3630
+       adr     r0, l2dis_3630_offset   @ adress for offset
+       ldr     r1, [r0]                @ value for offset
+       ldr     r1, [r0, r1]            @ value at l2dis_3630
        cmp     r1, #0x1                @ Test if L2 re-enable needed on 3630
        bne     skipl2reen
        mrc     p15, 0, r1, c1, c0, 1
@@ -535,9 +526,17 @@ control_stat:
        .word   CONTROL_STAT
 control_mem_rta:
        .word   CONTROL_MEM_RTA_CTRL
+l2dis_3630_offset:
+       .long   l2dis_3630 - .
+
+       .data
 l2dis_3630:
        .word   0
 
+       .data
+l2_inv_api_params:
+       .word   0x1, 0x00
+
 /*
  * Internal functions
  */
index 9b09d85..c7a3b4a 100644 (file)
        dsb
 .endm
 
-ppa_zero_params:
-       .word           0x0
-
-ppa_por_params:
-       .word           1, 0
-
 #ifdef CONFIG_ARCH_OMAP4
 
 /*
@@ -266,7 +260,9 @@ ENTRY(omap4_cpu_resume)
        beq     skip_ns_smp_enable
 ppa_actrl_retry:
        mov     r0, #OMAP4_PPA_CPU_ACTRL_SMP_INDEX
-       adr     r3, ppa_zero_params             @ Pointer to parameters
+       adr     r1, ppa_zero_params_offset
+       ldr     r3, [r1]
+       add     r3, r3, r1                      @ Pointer to ppa_zero_params
        mov     r1, #0x0                        @ Process ID
        mov     r2, #0x4                        @ Flag
        mov     r6, #0xff
@@ -303,7 +299,9 @@ skip_ns_smp_enable:
        ldr     r0, =OMAP4_PPA_L2_POR_INDEX
        ldr     r1, =OMAP44XX_SAR_RAM_BASE
        ldr     r4, [r1, #L2X0_PREFETCH_CTRL_OFFSET]
-       adr     r3, ppa_por_params
+       adr     r1, ppa_por_params_offset
+       ldr     r3, [r1]
+       add     r3, r3, r1                      @ Pointer to ppa_por_params
        str     r4, [r3, #0x04]
        mov     r1, #0x0                        @ Process ID
        mov     r2, #0x4                        @ Flag
@@ -328,6 +326,8 @@ skip_l2en:
 #endif
 
        b       cpu_resume                      @ Jump to generic resume
+ppa_por_params_offset:
+       .long   ppa_por_params - .
 ENDPROC(omap4_cpu_resume)
 #endif /* CONFIG_ARCH_OMAP4 */
 
@@ -380,4 +380,13 @@ ENTRY(omap_do_wfi)
        nop
 
        ldmfd   sp!, {pc}
+ppa_zero_params_offset:
+       .long   ppa_zero_params - .
 ENDPROC(omap_do_wfi)
+
+       .data
+ppa_zero_params:
+       .word           0
+
+ppa_por_params:
+       .word           1, 0
index def40a0..70ab4a2 100644 (file)
@@ -1,5 +1,6 @@
 menuconfig ARCH_REALVIEW
-       bool "ARM Ltd. RealView family" if ARCH_MULTI_V5 || ARCH_MULTI_V6 || ARCH_MULTI_V7
+       bool "ARM Ltd. RealView family"
+       depends on ARCH_MULTI_V5 || ARCH_MULTI_V6 || ARCH_MULTI_V7
        select ARM_AMBA
        select ARM_TIMER_SP804
        select COMMON_CLK_VERSATILE
index 6558539..6964e88 100644 (file)
@@ -80,7 +80,7 @@ static void __init realview_smp_prepare_cpus(unsigned int max_cpus)
                     virt_to_phys(versatile_secondary_startup));
 }
 
-struct smp_operations realview_dt_smp_ops __initdata = {
+static const struct smp_operations realview_dt_smp_ops __initconst = {
        .smp_prepare_cpus       = realview_smp_prepare_cpus,
        .smp_secondary_init     = versatile_secondary_init,
        .smp_boot_secondary     = versatile_boot_secondary,
index 9cb1121..b3a4ed5 100644 (file)
@@ -4,7 +4,6 @@
 extern void shmobile_init_delay(void);
 extern void shmobile_boot_vector(void);
 extern unsigned long shmobile_boot_fn;
-extern unsigned long shmobile_boot_arg;
 extern unsigned long shmobile_boot_size;
 extern void shmobile_smp_boot(void);
 extern void shmobile_smp_sleep(void);
index fa5248c..5e503d9 100644 (file)
@@ -38,9 +38,3 @@ ENTRY(shmobile_boot_scu)
 
        b       secondary_startup
 ENDPROC(shmobile_boot_scu)
-
-       .text
-       .align  2
-       .globl  shmobile_scu_base
-shmobile_scu_base:
-       .space  4
index 330c1fc..32e0bf6 100644 (file)
@@ -24,7 +24,6 @@
        .arm
        .align  12
 ENTRY(shmobile_boot_vector)
-       ldr     r0, 2f
        ldr     r1, 1f
        bx      r1
 
@@ -34,9 +33,6 @@ ENDPROC(shmobile_boot_vector)
        .globl  shmobile_boot_fn
 shmobile_boot_fn:
 1:     .space  4
-       .globl  shmobile_boot_arg
-shmobile_boot_arg:
-2:     .space  4
        .globl  shmobile_boot_size
 shmobile_boot_size:
        .long   . - shmobile_boot_vector
@@ -46,13 +42,15 @@ shmobile_boot_size:
  */
 
 ENTRY(shmobile_smp_boot)
-                                               @ r0 = MPIDR_HWID_BITMASK
        mrc     p15, 0, r1, c0, c0, 5           @ r1 = MPIDR
-       and     r0, r1, r0                      @ r0 = cpu_logical_map() value
+       and     r0, r1, #0xffffff               @ MPIDR_HWID_BITMASK
+                                               @ r0 = cpu_logical_map() value
        mov     r1, #0                          @ r1 = CPU index
-       adr     r5, 1f                          @ array of per-cpu mpidr values
-       adr     r6, 2f                          @ array of per-cpu functions
-       adr     r7, 3f                          @ array of per-cpu arguments
+       adr     r2, 1f
+       ldmia   r2, {r5, r6, r7}
+       add     r5, r5, r2                      @ array of per-cpu mpidr values
+       add     r6, r6, r2                      @ array of per-cpu functions
+       add     r7, r7, r2                      @ array of per-cpu arguments
 
 shmobile_smp_boot_find_mpidr:
        ldr     r8, [r5, r1, lsl #2]
@@ -80,12 +78,18 @@ ENTRY(shmobile_smp_sleep)
        b       shmobile_smp_boot
 ENDPROC(shmobile_smp_sleep)
 
+       .align  2
+1:     .long   shmobile_smp_mpidr - .
+       .long   shmobile_smp_fn - 1b
+       .long   shmobile_smp_arg - 1b
+
+       .bss
        .globl  shmobile_smp_mpidr
 shmobile_smp_mpidr:
-1:     .space  NR_CPUS * 4
+       .space  NR_CPUS * 4
        .globl  shmobile_smp_fn
 shmobile_smp_fn:
-2:     .space  NR_CPUS * 4
+       .space  NR_CPUS * 4
        .globl  shmobile_smp_arg
 shmobile_smp_arg:
-3:     .space  NR_CPUS * 4
+       .space  NR_CPUS * 4
index 911884f..aba75c8 100644 (file)
@@ -123,7 +123,6 @@ void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus,
 {
        /* install boot code shared by all CPUs */
        shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
-       shmobile_boot_arg = MPIDR_HWID_BITMASK;
 
        /* perform per-cpu setup */
        apmu_parse_cfg(apmu_init_cpu, apmu_config, num);
index 6466311..081a097 100644 (file)
@@ -17,6 +17,9 @@
 #include <asm/smp_scu.h>
 #include "common.h"
 
+
+void __iomem *shmobile_scu_base;
+
 static int shmobile_smp_scu_notifier_call(struct notifier_block *nfb,
                                          unsigned long action, void *hcpu)
 {
@@ -41,7 +44,6 @@ void __init shmobile_smp_scu_prepare_cpus(unsigned int max_cpus)
 {
        /* install boot code shared by all CPUs */
        shmobile_boot_fn = virt_to_phys(shmobile_smp_boot);
-       shmobile_boot_arg = MPIDR_HWID_BITMASK;
 
        /* enable SCU and cache coherency on booting CPU */
        scu_enable(shmobile_scu_base);
index b854fe2..0b024a9 100644 (file)
@@ -92,8 +92,6 @@ static void __init r8a7779_smp_prepare_cpus(unsigned int max_cpus)
 {
        /* Map the reset vector (in headsmp-scu.S, headsmp.S) */
        __raw_writel(__pa(shmobile_boot_vector), AVECR);
-       shmobile_boot_fn = virt_to_phys(shmobile_boot_scu);
-       shmobile_boot_arg = (unsigned long)shmobile_scu_base;
 
        /* setup r8a7779 specific SCU bits */
        shmobile_scu_base = IOMEM(R8A7779_SCU_BASE);
index d6a3714..ebe15b9 100644 (file)
@@ -1,5 +1,6 @@
 config ARCH_TANGO
-       bool "Sigma Designs Tango4 (SMP87xx)" if ARCH_MULTI_V7
+       bool "Sigma Designs Tango4 (SMP87xx)"
+       depends on ARCH_MULTI_V7
        # Cortex-A9 MPCore r3p0, PL310 r3p2
        select ARCH_HAS_HOLES_MEMORYMODEL
        select ARM_ERRATA_754322
index a18d5a3..a21f55e 100644 (file)
@@ -9,7 +9,7 @@ static int tango_boot_secondary(unsigned int cpu, struct task_struct *idle)
        return 0;
 }
 
-static struct smp_operations tango_smp_ops __initdata = {
+static const struct smp_operations tango_smp_ops __initconst = {
        .smp_boot_secondary     = tango_boot_secondary,
 };
 
index 4b4058d..66353ca 100644 (file)
@@ -173,7 +173,7 @@ unsigned long arch_mmap_rnd(void)
 {
        unsigned long rnd;
 
-       rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+       rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
 
        return rnd << PAGE_SHIFT;
 }
index cf30daf..d19b1ad 100644 (file)
@@ -49,6 +49,9 @@ static int change_memory_common(unsigned long addr, int numpages,
                WARN_ON_ONCE(1);
        }
 
+       if (!numpages)
+               return 0;
+
        if (start < MODULES_VADDR || start >= MODULES_END)
                return -EINVAL;
 
index 04aff2c..70f2f69 100644 (file)
@@ -53,8 +53,8 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
                if (ptr->child != NULL)
                        s3c_pm_run_res(ptr->child, fn, arg);
 
-               if ((ptr->flags & IORESOURCE_MEM) &&
-                   strcmp(ptr->name, "System RAM") == 0) {
+               if ((ptr->flags & IORESOURCE_SYSTEM_RAM)
+                               == IORESOURCE_SYSTEM_RAM) {
                        S3C_PMDBG("Found system RAM at %08lx..%08lx\n",
                                  (unsigned long)ptr->start,
                                  (unsigned long)ptr->end);
index b2b97e3..a62a7b6 100644 (file)
@@ -23,9 +23,8 @@
 #include <linux/const.h>
 #include <asm/page.h>
 
-       __PAGE_ALIGNED_DATA
-
        .globl vdso_start, vdso_end
+       .section .data..ro_after_init
        .balign PAGE_SIZE
 vdso_start:
        .incbin "arch/arm/vdso/vdso.so"
index cd822d8..b5e3f6d 100644 (file)
@@ -27,6 +27,8 @@ $(warning LSE atomics not supported by binutils)
 endif
 
 KBUILD_CFLAGS  += -mgeneral-regs-only $(lseinstr)
+KBUILD_CFLAGS  += -fno-asynchronous-unwind-tables
+KBUILD_CFLAGS  += $(call cc-option, -mpc-relative-literal-loads)
 KBUILD_AFLAGS  += $(lseinstr)
 
 ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -86,7 +88,7 @@ Image: vmlinux
 Image.%: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-zinstall install: vmlinux
+zinstall install:
        $(Q)$(MAKE) $(build)=$(boot) $@
 
 %.dtb: scripts
index abcbba2..305c552 100644 (file)
@@ -34,10 +34,10 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
 $(obj)/Image.lzo: $(obj)/Image FORCE
        $(call if_changed,lzo)
 
-install: $(obj)/Image
+install:
        $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
        $(obj)/Image System.map "$(INSTALL_PATH)"
 
-zinstall: $(obj)/Image.gz
+zinstall:
        $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
        $(obj)/Image.gz System.map "$(INSTALL_PATH)"
index dd5158e..e5b59ca 100644 (file)
                             <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>,
index da7b6e6..933cba3 100644 (file)
@@ -23,9 +23,8 @@ soc0: soc@000000000 {
                };
        };
 
-       dsa: dsa@c7000000 {
+       dsaf0: dsa@c7000000 {
                compatible = "hisilicon,hns-dsaf-v1";
-               dsa_name = "dsaf0";
                mode = "6port-16rss";
                interrupt-parent = <&mbigen_dsa>;
 
@@ -127,7 +126,7 @@ soc0: soc@000000000 {
 
        eth0: ethernet@0{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <0>;
                local-mac-address = [00 00 00 01 00 58];
                status = "disabled";
@@ -135,14 +134,14 @@ soc0: soc@000000000 {
        };
        eth1: ethernet@1{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <1>;
                status = "disabled";
                dma-coherent;
        };
        eth2: ethernet@2{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <2>;
                local-mac-address = [00 00 00 01 00 5a];
                status = "disabled";
@@ -150,7 +149,7 @@ soc0: soc@000000000 {
        };
        eth3: ethernet@3{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <3>;
                local-mac-address = [00 00 00 01 00 5b];
                status = "disabled";
@@ -158,7 +157,7 @@ soc0: soc@000000000 {
        };
        eth4: ethernet@4{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <4>;
                local-mac-address = [00 00 00 01 00 5c];
                status = "disabled";
@@ -166,7 +165,7 @@ soc0: soc@000000000 {
        };
        eth5: ethernet@5{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <5>;
                local-mac-address = [00 00 00 01 00 5d];
                status = "disabled";
@@ -174,7 +173,7 @@ soc0: soc@000000000 {
        };
        eth6: ethernet@6{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <6>;
                local-mac-address = [00 00 00 01 00 5e];
                status = "disabled";
@@ -182,7 +181,7 @@ soc0: soc@000000000 {
        };
        eth7: ethernet@7{
                compatible = "hisilicon,hns-nic-v1";
-               ae-name = "dsaf0";
+               ae-handle = <&dsaf0>;
                port-id = <7>;
                local-mac-address = [00 00 00 01 00 5f];
                status = "disabled";
index 7dfe1c0..62f33fc 100644 (file)
@@ -12,6 +12,8 @@
                rtc1 = "/rtc@0,7000e000";
        };
 
+       chosen { };
+
        memory {
                device_type = "memory";
                reg = <0x0 0x80000000 0x0 0x80000000>;
index 12ed78a..d91e1f0 100644 (file)
 #   $4 - default install path (blank if root directory)
 #
 
+verify () {
+       if [ ! -f "$1" ]; then
+               echo ""                                                   1>&2
+               echo " *** Missing file: $1"                              1>&2
+               echo ' *** You need to run "make" before "make install".' 1>&2
+               echo ""                                                   1>&2
+               exit 1
+       fi
+}
+
+# Make sure the files actually exist
+verify "$2"
+verify "$3"
+
 # User may have a custom install script
 if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
 if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
index 18ca9fb..86581f7 100644 (file)
@@ -16,7 +16,6 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
-CONFIG_MEMCG_KMEM=y
 CONFIG_CGROUP_HUGETLB=y
 # CONFIG_UTS_NS is not set
 # CONFIG_IPC_NS is not set
@@ -37,15 +36,13 @@ CONFIG_ARCH_EXYNOS7=y
 CONFIG_ARCH_LAYERSCAPE=y
 CONFIG_ARCH_HISI=y
 CONFIG_ARCH_MEDIATEK=y
+CONFIG_ARCH_QCOM=y
 CONFIG_ARCH_ROCKCHIP=y
 CONFIG_ARCH_SEATTLE=y
 CONFIG_ARCH_RENESAS=y
 CONFIG_ARCH_R8A7795=y
 CONFIG_ARCH_STRATIX10=y
 CONFIG_ARCH_TEGRA=y
-CONFIG_ARCH_TEGRA_132_SOC=y
-CONFIG_ARCH_TEGRA_210_SOC=y
-CONFIG_ARCH_QCOM=y
 CONFIG_ARCH_SPRD=y
 CONFIG_ARCH_THUNDER=y
 CONFIG_ARCH_UNIPHIER=y
@@ -54,14 +51,19 @@ CONFIG_ARCH_XGENE=y
 CONFIG_ARCH_ZYNQMP=y
 CONFIG_PCI=y
 CONFIG_PCI_MSI=y
+CONFIG_PCI_IOV=y
+CONFIG_PCI_RCAR_GEN2_PCIE=y
 CONFIG_PCI_HOST_GENERIC=y
 CONFIG_PCI_XGENE=y
-CONFIG_SMP=y
+CONFIG_PCI_LAYERSCAPE=y
+CONFIG_PCI_HISI=y
+CONFIG_PCIE_QCOM=y
 CONFIG_SCHED_MC=y
 CONFIG_PREEMPT=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_CMA=y
+CONFIG_XEN=y
 CONFIG_CMDLINE="console=ttyAMA0"
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_COMPAT=y
@@ -100,7 +102,11 @@ CONFIG_PATA_OF_PLATFORM=y
 CONFIG_NETDEVICES=y
 CONFIG_TUN=y
 CONFIG_VIRTIO_NET=y
+CONFIG_AMD_XGBE=y
 CONFIG_NET_XGENE=y
+CONFIG_E1000E=y
+CONFIG_IGB=y
+CONFIG_IGBVF=y
 CONFIG_SKY2=y
 CONFIG_RAVB=y
 CONFIG_SMC91X=y
@@ -117,25 +123,23 @@ CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_DW=y
 CONFIG_SERIAL_8250_MT6577=y
 CONFIG_SERIAL_8250_UNIPHIER=y
+CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_AMBA_PL011=y
 CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
 CONFIG_SERIAL_SAMSUNG=y
-CONFIG_SERIAL_SAMSUNG_UARTS_4=y
-CONFIG_SERIAL_SAMSUNG_UARTS=4
 CONFIG_SERIAL_SAMSUNG_CONSOLE=y
+CONFIG_SERIAL_TEGRA=y
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=11
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
-CONFIG_SERIAL_TEGRA=y
 CONFIG_SERIAL_MSM=y
 CONFIG_SERIAL_MSM_CONSOLE=y
-CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_XILINX_PS_UART=y
 CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y
 CONFIG_VIRTIO_CONSOLE=y
 # CONFIG_HW_RANDOM is not set
-CONFIG_I2C=y
 CONFIG_I2C_QUP=y
+CONFIG_I2C_UNIPHIER_F=y
 CONFIG_I2C_RCAR=y
 CONFIG_SPI=y
 CONFIG_SPI_PL022=y
@@ -176,8 +180,6 @@ CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_SDHCI_TEGRA=y
 CONFIG_MMC_SPI=y
 CONFIG_MMC_DW=y
-CONFIG_MMC_DW_IDMAC=y
-CONFIG_MMC_DW_PLTFM=y
 CONFIG_MMC_DW_EXYNOS=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -187,28 +189,33 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_LEDS_TRIGGER_CPU=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_EFI=y
+CONFIG_RTC_DRV_PL031=y
 CONFIG_RTC_DRV_XGENE=y
 CONFIG_DMADEVICES=y
-CONFIG_RCAR_DMAC=y
 CONFIG_QCOM_BAM_DMA=y
 CONFIG_TEGRA20_APB_DMA=y
+CONFIG_RCAR_DMAC=y
+CONFIG_VFIO=y
+CONFIG_VFIO_PCI=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BALLOON=y
 CONFIG_VIRTIO_MMIO=y
+CONFIG_XEN_GNTDEV=y
+CONFIG_XEN_GRANT_DEV_ALLOC=y
 CONFIG_COMMON_CLK_CS2000_CP=y
 CONFIG_COMMON_CLK_QCOM=y
 CONFIG_MSM_GCC_8916=y
 CONFIG_HWSPINLOCK_QCOM=y
-# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ARM_SMMU=y
 CONFIG_QCOM_SMEM=y
 CONFIG_QCOM_SMD=y
 CONFIG_QCOM_SMD_RPM=y
+CONFIG_ARCH_TEGRA_132_SOC=y
+CONFIG_ARCH_TEGRA_210_SOC=y
+CONFIG_HISILICON_IRQ_MBIGEN=y
 CONFIG_PHY_XGENE=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
 CONFIG_FANOTIFY=y
 CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
 CONFIG_QUOTA=y
@@ -239,6 +246,7 @@ CONFIG_LOCKUP_DETECTOR=y
 # CONFIG_FTRACE is not set
 CONFIG_MEMTEST=y
 CONFIG_SECURITY=y
+CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_ARM64_CRYPTO=y
 CONFIG_CRYPTO_SHA1_ARM64_CE=y
index 05d9e16..7a3d22a 100644 (file)
@@ -294,7 +294,7 @@ static struct crypto_alg aes_algs[] = { {
        .cra_blkcipher = {
                .min_keysize    = AES_MIN_KEY_SIZE,
                .max_keysize    = AES_MAX_KEY_SIZE,
-               .ivsize         = AES_BLOCK_SIZE,
+               .ivsize         = 0,
                .setkey         = aes_setkey,
                .encrypt        = ecb_encrypt,
                .decrypt        = ecb_decrypt,
@@ -371,7 +371,7 @@ static struct crypto_alg aes_algs[] = { {
        .cra_ablkcipher = {
                .min_keysize    = AES_MIN_KEY_SIZE,
                .max_keysize    = AES_MAX_KEY_SIZE,
-               .ivsize         = AES_BLOCK_SIZE,
+               .ivsize         = 0,
                .setkey         = ablk_set_key,
                .encrypt        = ablk_encrypt,
                .decrypt        = ablk_decrypt,
index 2731d3b..8ec88e5 100644 (file)
@@ -103,6 +103,7 @@ static inline u64 gic_read_iar_common(void)
        u64 irqstat;
 
        asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat));
+       dsb(sy);
        return irqstat;
 }
 
index 7fc294c..22dda61 100644 (file)
@@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages);
 int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
 #endif
index 007a69f..5f3ab8c 100644 (file)
@@ -121,6 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                return -EFAULT;
 
        asm volatile("// futex_atomic_cmpxchg_inatomic\n"
+ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
 "      prfm    pstl1strm, %2\n"
 "1:    ldxr    %w1, %2\n"
 "      sub     %w3, %w1, %w4\n"
@@ -137,6 +138,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 "      .align  3\n"
 "      .quad   1b, 4b, 2b, 4b\n"
 "      .popsection\n"
+ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
        : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
        : "r" (oldval), "r" (newval), "Ir" (-EFAULT)
        : "memory");
index 738a95f..d201d4b 100644 (file)
 #define TCR_EL2_MASK   (TCR_EL2_TG0 | TCR_EL2_SH0 | \
                         TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
 
-#define TCR_EL2_FLAGS  (TCR_EL2_RES1 | TCR_EL2_PS_40B)
-
 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_RES1          (1 << 31)
 #define VTCR_EL2_PS_MASK       (7 << 16)
 #define CPTR_EL2_TCPAC (1 << 31)
 #define CPTR_EL2_TTA   (1 << 20)
 #define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
+#define CPTR_EL2_DEFAULT       0x000033ff
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA          (1 << 11)
index 3066328..779a587 100644 (file)
@@ -127,10 +127,14 @@ static inline unsigned long *vcpu_spsr(const struct kvm_vcpu *vcpu)
 
 static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
 {
-       u32 mode = *vcpu_cpsr(vcpu) & PSR_MODE_MASK;
+       u32 mode;
 
-       if (vcpu_mode_is_32bit(vcpu))
+       if (vcpu_mode_is_32bit(vcpu)) {
+               mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK;
                return mode > COMPAT_PSR_MODE_USR;
+       }
+
+       mode = *vcpu_cpsr(vcpu) & PSR_MODE_MASK;
 
        return mode != PSR_MODE_EL0t;
 }
index 9b2f5a9..ae615b9 100644 (file)
@@ -39,6 +39,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/personality.h> /* for READ_IMPLIES_EXEC */
 #include <asm/pgtable-types.h>
 
 extern void __cpu_clear_user_page(void *p, unsigned long user);
index 2d545d7..819aff5 100644 (file)
@@ -34,7 +34,7 @@
 /*
  * VMALLOC and SPARSEMEM_VMEMMAP ranges.
  *
- * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array
+ * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array
  *     (rounded up to PUD_SIZE).
  * VMALLOC_START: beginning of the kernel VA space
  * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
@@ -51,7 +51,9 @@
 
 #define VMALLOC_END            (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
 
-#define vmemmap                        ((struct page *)(VMALLOC_END + SZ_64K))
+#define VMEMMAP_START          (VMALLOC_END + SZ_64K)
+#define vmemmap                        ((struct page *)VMEMMAP_START - \
+                                SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT))
 
 #define FIRST_USER_ADDRESS     0UL
 
@@ -67,11 +69,11 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
 #define PROT_DEFAULT           (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
 #define PROT_SECT_DEFAULT      (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
 
-#define PROT_DEVICE_nGnRnE     (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
-#define PROT_DEVICE_nGnRE      (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
-#define PROT_NORMAL_NC         (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
-#define PROT_NORMAL_WT         (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_WT))
-#define PROT_NORMAL            (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL))
+#define PROT_DEVICE_nGnRnE     (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
+#define PROT_DEVICE_nGnRE      (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
+#define PROT_NORMAL_NC         (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC))
+#define PROT_NORMAL_WT         (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT))
+#define PROT_NORMAL            (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL))
 
 #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
 #define PROT_SECT_NORMAL       (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
@@ -81,7 +83,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
 
 #define PAGE_KERNEL            __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_RO         __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY)
-#define PAGE_KERNEL_ROX        __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY)
+#define PAGE_KERNEL_ROX                __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY)
 #define PAGE_KERNEL_EXEC       __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
 
@@ -153,6 +155,7 @@ extern struct page *empty_zero_page;
 #define pte_write(pte)         (!!(pte_val(pte) & PTE_WRITE))
 #define pte_exec(pte)          (!(pte_val(pte) & PTE_UXN))
 #define pte_cont(pte)          (!!(pte_val(pte) & PTE_CONT))
+#define pte_user(pte)          (!!(pte_val(pte) & PTE_USER))
 
 #ifdef CONFIG_ARM64_HW_AFDBM
 #define pte_hw_dirty(pte)      (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
@@ -163,8 +166,6 @@ extern struct page *empty_zero_page;
 #define pte_dirty(pte)         (pte_sw_dirty(pte) || pte_hw_dirty(pte))
 
 #define pte_valid(pte)         (!!(pte_val(pte) & PTE_VALID))
-#define pte_valid_user(pte) \
-       ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER))
 #define pte_valid_not_user(pte) \
        ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID)
 #define pte_valid_young(pte) \
@@ -278,13 +279,13 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pte)
 {
-       if (pte_valid_user(pte)) {
-               if (!pte_special(pte) && pte_exec(pte))
-                       __sync_icache_dcache(pte, addr);
+       if (pte_valid(pte)) {
                if (pte_sw_dirty(pte) && pte_write(pte))
                        pte_val(pte) &= ~PTE_RDONLY;
                else
                        pte_val(pte) |= PTE_RDONLY;
+               if (pte_user(pte) && pte_exec(pte) && !pte_special(pte))
+                       __sync_icache_dcache(pte, addr);
        }
 
        /*
index 8aee3ae..c536c9e 100644 (file)
@@ -226,11 +226,28 @@ static int call_step_hook(struct pt_regs *regs, unsigned int esr)
        return retval;
 }
 
+static void send_user_sigtrap(int si_code)
+{
+       struct pt_regs *regs = current_pt_regs();
+       siginfo_t info = {
+               .si_signo       = SIGTRAP,
+               .si_errno       = 0,
+               .si_code        = si_code,
+               .si_addr        = (void __user *)instruction_pointer(regs),
+       };
+
+       if (WARN_ON(!user_mode(regs)))
+               return;
+
+       if (interrupts_enabled(regs))
+               local_irq_enable();
+
+       force_sig_info(SIGTRAP, &info, current);
+}
+
 static int single_step_handler(unsigned long addr, unsigned int esr,
                               struct pt_regs *regs)
 {
-       siginfo_t info;
-
        /*
         * If we are stepping a pending breakpoint, call the hw_breakpoint
         * handler first.
@@ -239,11 +256,7 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
                return 0;
 
        if (user_mode(regs)) {
-               info.si_signo = SIGTRAP;
-               info.si_errno = 0;
-               info.si_code  = TRAP_HWBKPT;
-               info.si_addr  = (void __user *)instruction_pointer(regs);
-               force_sig_info(SIGTRAP, &info, current);
+               send_user_sigtrap(TRAP_HWBKPT);
 
                /*
                 * ptrace will disable single step unless explicitly
@@ -307,17 +320,8 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr)
 static int brk_handler(unsigned long addr, unsigned int esr,
                       struct pt_regs *regs)
 {
-       siginfo_t info;
-
        if (user_mode(regs)) {
-               info = (siginfo_t) {
-                       .si_signo = SIGTRAP,
-                       .si_errno = 0,
-                       .si_code  = TRAP_BRKPT,
-                       .si_addr  = (void __user *)instruction_pointer(regs),
-               };
-
-               force_sig_info(SIGTRAP, &info, current);
+               send_user_sigtrap(TRAP_BRKPT);
        } else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
                pr_warning("Unexpected kernel BRK exception at EL1\n");
                return -EFAULT;
@@ -328,7 +332,6 @@ static int brk_handler(unsigned long addr, unsigned int esr,
 
 int aarch32_break_handler(struct pt_regs *regs)
 {
-       siginfo_t info;
        u32 arm_instr;
        u16 thumb_instr;
        bool bp = false;
@@ -359,14 +362,7 @@ int aarch32_break_handler(struct pt_regs *regs)
        if (!bp)
                return -EFAULT;
 
-       info = (siginfo_t) {
-               .si_signo = SIGTRAP,
-               .si_errno = 0,
-               .si_code  = TRAP_BRKPT,
-               .si_addr  = pc,
-       };
-
-       force_sig_info(SIGTRAP, &info, current);
+       send_user_sigtrap(TRAP_BRKPT);
        return 0;
 }
 
index ffe9c2b..917d981 100644 (file)
@@ -514,9 +514,14 @@ CPU_LE(    movk    x0, #0x30d0, lsl #16    )       // Clear EE and E0E on LE systems
 #endif
 
        /* EL2 debug */
+       mrs     x0, id_aa64dfr0_el1             // Check ID_AA64DFR0_EL1 PMUVer
+       sbfx    x0, x0, #8, #4
+       cmp     x0, #1
+       b.lt    4f                              // Skip if no PMU present
        mrs     x0, pmcr_el0                    // Disable debug access traps
        ubfx    x0, x0, #11, #5                 // to EL2 and allow access to
        msr     mdcr_el2, x0                    // all PMU counters from EL1
+4:
 
        /* Stage-2 translation */
        msr     vttbr_el2, xzr
index bc2abb8..352f7ab 100644 (file)
 
 #ifdef CONFIG_EFI
 
+/*
+ * Prevent the symbol aliases below from being emitted into the kallsyms
+ * table, by forcing them to be absolute symbols (which are conveniently
+ * ignored by scripts/kallsyms) rather than section relative symbols.
+ * The distinction is only relevant for partial linking, and only for symbols
+ * that are defined within a section declaration (which is not the case for
+ * the definitions below) so the resulting values will be identical.
+ */
+#define KALLSYMS_HIDE(sym)     ABSOLUTE(sym)
+
 /*
  * The EFI stub has its own symbol namespace prefixed by __efistub_, to
  * isolate it from the kernel proper. The following symbols are legally
  * linked at. The routines below are all implemented in assembler in a
  * position independent manner
  */
-__efistub_memcmp               = __pi_memcmp;
-__efistub_memchr               = __pi_memchr;
-__efistub_memcpy               = __pi_memcpy;
-__efistub_memmove              = __pi_memmove;
-__efistub_memset               = __pi_memset;
-__efistub_strlen               = __pi_strlen;
-__efistub_strcmp               = __pi_strcmp;
-__efistub_strncmp              = __pi_strncmp;
-__efistub___flush_dcache_area  = __pi___flush_dcache_area;
+__efistub_memcmp               = KALLSYMS_HIDE(__pi_memcmp);
+__efistub_memchr               = KALLSYMS_HIDE(__pi_memchr);
+__efistub_memcpy               = KALLSYMS_HIDE(__pi_memcpy);
+__efistub_memmove              = KALLSYMS_HIDE(__pi_memmove);
+__efistub_memset               = KALLSYMS_HIDE(__pi_memset);
+__efistub_strlen               = KALLSYMS_HIDE(__pi_strlen);
+__efistub_strnlen              = KALLSYMS_HIDE(__pi_strnlen);
+__efistub_strcmp               = KALLSYMS_HIDE(__pi_strcmp);
+__efistub_strncmp              = KALLSYMS_HIDE(__pi_strncmp);
+__efistub___flush_dcache_area  = KALLSYMS_HIDE(__pi___flush_dcache_area);
 
 #ifdef CONFIG_KASAN
-__efistub___memcpy             = __pi_memcpy;
-__efistub___memmove            = __pi_memmove;
-__efistub___memset             = __pi_memset;
+__efistub___memcpy             = KALLSYMS_HIDE(__pi_memcpy);
+__efistub___memmove            = KALLSYMS_HIDE(__pi_memmove);
+__efistub___memset             = KALLSYMS_HIDE(__pi_memset);
 #endif
 
-__efistub__text                        = _text;
-__efistub__end                 = _end;
-__efistub__edata               = _edata;
+__efistub__text                        = KALLSYMS_HIDE(_text);
+__efistub__end                 = KALLSYMS_HIDE(_end);
+__efistub__edata               = KALLSYMS_HIDE(_edata);
 
 #endif
 
index 8119479..450987d 100644 (file)
@@ -73,13 +73,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -210,7 +210,7 @@ static void __init request_standard_resources(void)
                res->name  = "System RAM";
                res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index e33fe33..fd10eb6 100644 (file)
@@ -145,6 +145,10 @@ ENTRY(cpu_resume_mmu)
 ENDPROC(cpu_resume_mmu)
        .popsection
 cpu_resume_after_mmu:
+#ifdef CONFIG_KASAN
+       mov     x0, sp
+       bl      kasan_unpoison_remaining_stack
+#endif
        mov     x0, #0                  // return zero on success
        ldp     x19, x20, [sp, #16]
        ldp     x21, x22, [sp, #32]
index b1adc51..4607657 100644 (file)
@@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void)
        /*
         * OK, it's off to the idle thread for us
         */
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
index 4fad978..d9751a4 100644 (file)
@@ -44,14 +44,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
        unsigned long irq_stack_ptr;
 
        /*
-        * Use raw_smp_processor_id() to avoid false-positives from
-        * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping
-        * task stacks, we can be pre-empted in this case, so
-        * {raw_,}smp_processor_id() may give us the wrong value. Sleeping
-        * tasks can't ever be on an interrupt stack, so regardless of cpu,
-        * the checks will always fail.
+        * Switching between stacks is valid when tracing current and in
+        * non-preemptible context.
         */
-       irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id());
+       if (tsk == current && !preemptible())
+               irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+       else
+               irq_stack_ptr = 0;
 
        low  = frame->sp;
        /* irq stacks are not THREAD_SIZE aligned */
@@ -64,8 +63,8 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
                return -EINVAL;
 
        frame->sp = fp + 0x10;
-       frame->fp = *(unsigned long *)(fp);
-       frame->pc = *(unsigned long *)(fp + 8);
+       frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
+       frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8));
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        if (tsk && tsk->ret_stack &&
index cbedd72..c539208 100644 (file)
@@ -146,9 +146,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
 static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 {
        struct stackframe frame;
-       unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+       unsigned long irq_stack_ptr;
        int skip;
 
+       /*
+        * Switching between stacks is valid when tracing current and in
+        * non-preemptible context.
+        */
+       if (tsk == current && !preemptible())
+               irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+       else
+               irq_stack_ptr = 0;
+
        pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
 
        if (!tsk)
index fcb7788..9e54ad7 100644 (file)
@@ -194,7 +194,7 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
        u64 val;
 
        val = kvm_arm_timer_get_reg(vcpu, reg->id);
-       return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id));
+       return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
 }
 
 /**
index 3e568dc..d073b5a 100644 (file)
@@ -64,7 +64,7 @@ __do_hyp_init:
        mrs     x4, tcr_el1
        ldr     x5, =TCR_EL2_MASK
        and     x4, x4, x5
-       ldr     x5, =TCR_EL2_FLAGS
+       mov     x5, #TCR_EL2_RES1
        orr     x4, x4, x5
 
 #ifndef CONFIG_ARM64_VA_BITS_48
@@ -85,15 +85,17 @@ __do_hyp_init:
        ldr_l   x5, idmap_t0sz
        bfi     x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
 #endif
-       msr     tcr_el2, x4
-
-       ldr     x4, =VTCR_EL2_FLAGS
        /*
         * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
-        * VTCR_EL2.
+        * TCR_EL2 and VTCR_EL2.
         */
        mrs     x5, ID_AA64MMFR0_EL1
        bfi     x4, x5, #16, #3
+
+       msr     tcr_el2, x4
+
+       ldr     x4, =VTCR_EL2_FLAGS
+       bfi     x4, x5, #16, #3
        /*
         * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS bit in
         * VTCR_EL2.
index ca8f5a5..f0e7bdf 100644 (file)
@@ -36,7 +36,11 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
        write_sysreg(val, hcr_el2);
        /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
        write_sysreg(1 << 15, hstr_el2);
-       write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
+
+       val = CPTR_EL2_DEFAULT;
+       val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
+       write_sysreg(val, cptr_el2);
+
        write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 }
 
@@ -45,7 +49,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
        write_sysreg(HCR_RW, hcr_el2);
        write_sysreg(0, hstr_el2);
        write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
-       write_sysreg(0, cptr_el2);
+       write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
 }
 
 static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
index 9142e08..5dd2a26 100644 (file)
@@ -147,16 +147,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
        max_lr_idx = vtr_to_max_lr_idx(val);
        nr_pri_bits = vtr_to_nr_pri_bits(val);
 
-       switch (nr_pri_bits) {
-       case 7:
-                write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
-                write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
-       case 6:
-                write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
-       default:
-                write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
-       }                                          
-                                                  
        switch (nr_pri_bits) {
        case 7:
                 write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
@@ -167,6 +157,16 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
                 write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
        }
 
+       switch (nr_pri_bits) {
+       case 7:
+                write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
+                write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+       case 6:
+                write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+       default:
+                write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+       }
+
        switch (max_lr_idx) {
        case 15:
                write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(15)], ICH_LR15_EL2);
index 648112e..4d1ac81 100644 (file)
 
 #define PSTATE_FAULT_BITS_64   (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \
                                 PSR_I_BIT | PSR_D_BIT)
-#define EL1_EXCEPT_SYNC_OFFSET 0x200
+
+#define CURRENT_EL_SP_EL0_VECTOR       0x0
+#define CURRENT_EL_SP_ELx_VECTOR       0x200
+#define LOWER_EL_AArch64_VECTOR                0x400
+#define LOWER_EL_AArch32_VECTOR                0x600
 
 static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
 {
@@ -97,6 +101,34 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
                *fsr = 0x14;
 }
 
+enum exception_type {
+       except_type_sync        = 0,
+       except_type_irq         = 0x80,
+       except_type_fiq         = 0x100,
+       except_type_serror      = 0x180,
+};
+
+static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
+{
+       u64 exc_offset;
+
+       switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
+       case PSR_MODE_EL1t:
+               exc_offset = CURRENT_EL_SP_EL0_VECTOR;
+               break;
+       case PSR_MODE_EL1h:
+               exc_offset = CURRENT_EL_SP_ELx_VECTOR;
+               break;
+       case PSR_MODE_EL0t:
+               exc_offset = LOWER_EL_AArch64_VECTOR;
+               break;
+       default:
+               exc_offset = LOWER_EL_AArch32_VECTOR;
+       }
+
+       return vcpu_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
+}
+
 static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
 {
        unsigned long cpsr = *vcpu_cpsr(vcpu);
@@ -108,8 +140,8 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
        *vcpu_spsr(vcpu) = cpsr;
        *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
 
+       *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
        *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
-       *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET;
 
        vcpu_sys_reg(vcpu, FAR_EL1) = addr;
 
@@ -143,8 +175,8 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
        *vcpu_spsr(vcpu) = cpsr;
        *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
 
+       *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
        *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
-       *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET;
 
        /*
         * Build an unknown exception, depending on the instruction
index eec3598..2e90371 100644 (file)
@@ -1007,10 +1007,9 @@ static int emulate_cp(struct kvm_vcpu *vcpu,
                if (likely(r->access(vcpu, params, r))) {
                        /* Skip instruction, since it was emulated */
                        kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+                       /* Handled */
+                       return 0;
                }
-
-               /* Handled */
-               return 0;
        }
 
        /* Not handled */
@@ -1043,7 +1042,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
 }
 
 /**
- * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP15 access
+ * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access
  * @vcpu: The VCPU pointer
  * @run:  The kvm_run struct
  */
@@ -1095,7 +1094,7 @@ out:
 }
 
 /**
- * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access
+ * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access
  * @vcpu: The VCPU pointer
  * @run:  The kvm_run struct
  */
index 2ca6657..eae38da 100644 (file)
@@ -168,4 +168,4 @@ CPU_LE( lsr tmp2, tmp2, tmp4 )      /* Shift (tmp1 & 63).  */
 .Lhit_limit:
        mov     len, limit
        ret
-ENDPROC(strnlen)
+ENDPIPROC(strnlen)
index 331c4ca..a6e757c 100644 (file)
@@ -933,6 +933,10 @@ static int __init __iommu_dma_init(void)
                ret = register_iommu_dma_ops_notifier(&platform_bus_type);
        if (!ret)
                ret = register_iommu_dma_ops_notifier(&amba_bustype);
+
+       /* handle devices queued before this arch_initcall */
+       if (!ret)
+               __iommu_attach_notifier(NULL, BUS_NOTIFY_ADD_DEVICE, NULL);
        return ret;
 }
 arch_initcall(__iommu_dma_init);
index 5a22a11..0adbebb 100644 (file)
@@ -46,7 +46,7 @@ enum address_markers_idx {
        PCI_START_NR,
        PCI_END_NR,
        MODULES_START_NR,
-       MODUELS_END_NR,
+       MODULES_END_NR,
        KERNEL_SPACE_NR,
 };
 
index 92ddac1..abe2a95 100644 (file)
@@ -371,6 +371,13 @@ static int __kprobes do_translation_fault(unsigned long addr,
        return 0;
 }
 
+static int do_alignment_fault(unsigned long addr, unsigned int esr,
+                             struct pt_regs *regs)
+{
+       do_bad_area(addr, esr, regs);
+       return 0;
+}
+
 /*
  * This abort handler always returns "fault".
  */
@@ -418,7 +425,7 @@ static struct fault_info {
        { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
        { do_bad,               SIGBUS,  0,             "synchronous parity error (translation table walk)" },
        { do_bad,               SIGBUS,  0,             "unknown 32"                    },
-       { do_bad,               SIGBUS,  BUS_ADRALN,    "alignment fault"               },
+       { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"               },
        { do_bad,               SIGBUS,  0,             "unknown 34"                    },
        { do_bad,               SIGBUS,  0,             "unknown 35"                    },
        { do_bad,               SIGBUS,  0,             "unknown 36"                    },
index 82d607c..da30529 100644 (file)
@@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt)
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-       } else if (ps == (PAGE_SIZE * CONT_PTES)) {
-               hugetlb_add_hstate(CONT_PTE_SHIFT);
-       } else if (ps == (PMD_SIZE * CONT_PMDS)) {
-               hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
        } else {
                pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
                return 0;
@@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt)
        return 1;
 }
 __setup("hugepagesz=", setup_hugepagesz);
-
-#ifdef CONFIG_ARM64_64K_PAGES
-static __init int add_default_hugepagesz(void)
-{
-       if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
-               hugetlb_add_hstate(CONT_PMD_SHIFT);
-       return 0;
-}
-arch_initcall(add_default_hugepagesz);
-#endif
index f3b061e..7802f21 100644 (file)
@@ -319,8 +319,8 @@ void __init mem_init(void)
 #endif
                  MLG(VMALLOC_START, VMALLOC_END),
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-                 MLG((unsigned long)vmemmap,
-                     (unsigned long)vmemmap + VMEMMAP_SIZE),
+                 MLG(VMEMMAP_START,
+                     VMEMMAP_START + VMEMMAP_SIZE),
                  MLM((unsigned long)virt_to_page(PAGE_OFFSET),
                      (unsigned long)virt_to_page(high_memory)),
 #endif
index cf038c7..cab7a5b 100644 (file)
@@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1)
 void __init kasan_init(void)
 {
        struct memblock_region *reg;
+       int i;
 
        /*
         * We are going to perform proper setup of shadow memory.
@@ -155,6 +156,14 @@ void __init kasan_init(void)
                                pfn_to_nid(virt_to_pfn(start)));
        }
 
+       /*
+        * KAsan may reuse the contents of kasan_zero_pte directly, so we
+        * should make sure that it maps the zero page read-only.
+        */
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               set_pte(&kasan_zero_pte[i],
+                       pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
+
        memset(kasan_zero_page, 0, PAGE_SIZE);
        cpu_set_ttbr1(__pa(swapper_pg_dir));
        flush_tlb_all();
index 4c893b5..232f787 100644 (file)
@@ -53,10 +53,10 @@ unsigned long arch_mmap_rnd(void)
 
 #ifdef CONFIG_COMPAT
        if (test_thread_flag(TIF_32BIT))
-               rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+               rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
        else
 #endif
-               rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+               rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
        return rnd << PAGE_SHIFT;
 }
 
index 3571c73..0795c3a 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -44,6 +45,7 @@ static int change_memory_common(unsigned long addr, int numpages,
        unsigned long end = start + size;
        int ret;
        struct page_change_data data;
+       struct vm_struct *area;
 
        if (!PAGE_ALIGNED(addr)) {
                start &= PAGE_MASK;
@@ -51,11 +53,27 @@ static int change_memory_common(unsigned long addr, int numpages,
                WARN_ON_ONCE(1);
        }
 
-       if (start < MODULES_VADDR || start >= MODULES_END)
+       /*
+        * Kernel VA mappings are always live, and splitting live section
+        * mappings into page mappings may cause TLB conflicts. This means
+        * we have to ensure that changing the permission bits of the range
+        * we are operating on does not result in such splitting.
+        *
+        * Let's restrict ourselves to mappings created by vmalloc (or vmap).
+        * Those are guaranteed to consist entirely of page mappings, and
+        * splitting is never needed.
+        *
+        * So check whether the [addr, addr + size) interval is entirely
+        * covered by precisely one VM area that has the VM_ALLOC flag set.
+        */
+       area = find_vm_area((void *)addr);
+       if (!area ||
+           end > (unsigned long)area->addr + area->size ||
+           !(area->flags & VM_ALLOC))
                return -EINVAL;
 
-       if (end < MODULES_VADDR || end >= MODULES_END)
-               return -EINVAL;
+       if (!numpages)
+               return 0;
 
        data.set_mask = set_mask;
        data.clear_mask = clear_mask;
index 146bd99..e6a30e1 100644 (file)
        b.lo    9998b
        dsb     \domain
        .endm
+
+/*
+ * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present
+ */
+       .macro  reset_pmuserenr_el0, tmpreg
+       mrs     \tmpreg, id_aa64dfr0_el1        // Check ID_AA64DFR0_EL1 PMUVer
+       sbfx    \tmpreg, \tmpreg, #8, #4
+       cmp     \tmpreg, #1                     // Skip if no PMU present
+       b.lt    9000f
+       msr     pmuserenr_el0, xzr              // Disable PMU access from EL0
+9000:
+       .endm
index a3d867e..c164d2c 100644 (file)
@@ -117,7 +117,7 @@ ENTRY(cpu_do_resume)
         */
        ubfx    x11, x11, #1, #1
        msr     oslar_el1, x11
-       msr     pmuserenr_el0, xzr              // Disable PMU access from EL0
+       reset_pmuserenr_el0 x0                  // Disable PMU access from EL0
        mov     x0, x12
        dsb     nsh             // Make sure local tlb invalidation completed
        isb
@@ -154,7 +154,7 @@ ENTRY(__cpu_setup)
        msr     cpacr_el1, x0                   // Enable FP/ASIMD
        mov     x0, #1 << 12                    // Reset mdscr_el1 and disable
        msr     mdscr_el1, x0                   // access to the DCC from EL0
-       msr     pmuserenr_el0, xzr              // Disable PMU access from EL0
+       reset_pmuserenr_el0 x0                  // Disable PMU access from EL0
        /*
         * Memory region attributes for LPAE:
         *
index 209ae5a..e692889 100644 (file)
@@ -49,13 +49,13 @@ static struct resource __initdata kernel_data = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
 };
 static struct resource __initdata kernel_code = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_MEM,
+       .flags  = IORESOURCE_SYSTEM_RAM,
        .sibling = &kernel_data,
 };
 
@@ -134,7 +134,7 @@ add_physical_memory(resource_size_t start, resource_size_t end)
        new->start = start;
        new->end = end;
        new->name = "System RAM";
-       new->flags = IORESOURCE_MEM;
+       new->flags = IORESOURCE_SYSTEM_RAM;
 
        *pprev = new;
 }
index 0030e21..23c4ef5 100644 (file)
@@ -333,7 +333,7 @@ void secondary_start_kernel(void)
 
        /* We are done with local CPU inits, unblock the boot CPU. */
        set_cpu_online(cpu, true);
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_prepare_boot_cpu(void)
index 72e17f7..786e36e 100644 (file)
@@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr)
         */
        set_ist(_vectors_start);
 
-       lockdep_init();
-
        /*
         * dtb is passed in from bootloader.
         * fdt is linked in blob.
index ff759f2..983bae7 100644 (file)
@@ -180,7 +180,7 @@ void start_secondary(void)
 
        local_irq_enable();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 
index caae3f4..300dac3 100644 (file)
@@ -1178,7 +1178,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
        efi_memory_desc_t *md;
        u64 efi_desc_size;
        char *name;
-       unsigned long flags;
+       unsigned long flags, desc;
 
        efi_map_start = __va(ia64_boot_param->efi_memmap);
        efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
@@ -1193,6 +1193,8 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                        continue;
 
                flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               desc = IORES_DESC_NONE;
+
                switch (md->type) {
 
                        case EFI_MEMORY_MAPPED_IO:
@@ -1207,14 +1209,17 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                                if (md->attribute & EFI_MEMORY_WP) {
                                        name = "System ROM";
                                        flags |= IORESOURCE_READONLY;
-                               } else if (md->attribute == EFI_MEMORY_UC)
+                               } else if (md->attribute == EFI_MEMORY_UC) {
                                        name = "Uncached RAM";
-                               else
+                               } else {
                                        name = "System RAM";
+                                       flags |= IORESOURCE_SYSRAM;
+                               }
                                break;
 
                        case EFI_ACPI_MEMORY_NVS:
                                name = "ACPI Non-volatile Storage";
+                               desc = IORES_DESC_ACPI_NV_STORAGE;
                                break;
 
                        case EFI_UNUSABLE_MEMORY:
@@ -1224,6 +1229,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 
                        case EFI_PERSISTENT_MEMORY:
                                name = "Persistent Memory";
+                               desc = IORES_DESC_PERSISTENT_MEMORY;
                                break;
 
                        case EFI_RESERVED_TYPE:
@@ -1246,6 +1252,7 @@ efi_initialize_iomem_resources(struct resource *code_resource,
                res->start = md->phys_addr;
                res->end = md->phys_addr + efi_md_size(md) - 1;
                res->flags = flags;
+               res->desc = desc;
 
                if (insert_resource(&iomem_resource, res) < 0)
                        kfree(res);
index 4f118b0..2029a38 100644 (file)
@@ -80,17 +80,17 @@ unsigned long vga_console_membase;
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 unsigned long ia64_max_cacheline_size;
index 0e76fad..74fe317 100644 (file)
@@ -454,7 +454,7 @@ start_secondary (void *unused)
        preempt_disable();
        smp_callin();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
        return 0;
 }
 
index 836ac5a..2841c0a 100644 (file)
@@ -276,6 +276,7 @@ source "kernel/Kconfig.preempt"
 
 config SMP
        bool "Symmetric multi-processing support"
+       depends on MMU
        ---help---
          This enables support for systems with more than one CPU. If you have
          a system with only one CPU, say N. If you have a system with more
index a5ecef7..136c69f 100644 (file)
@@ -70,14 +70,14 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 unsigned long memory_start;
index a468467..f98d2f6 100644 (file)
@@ -432,7 +432,7 @@ int __init start_secondary(void *unused)
         */
        local_flush_tlb_all();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
        return 0;
 }
 
index fc96e81..d1fc479 100644 (file)
@@ -108,6 +108,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -266,6 +268,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -366,6 +374,7 @@ CONFIG_ARIADNE=y
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 CONFIG_HYDRA=y
 CONFIG_APNE=y
 CONFIG_ZORRO8390=y
index 05c904f..9bfe8be 100644 (file)
@@ -106,6 +106,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -264,6 +266,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -344,6 +352,7 @@ CONFIG_VETH=m
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index d572b73..ebdcfae 100644 (file)
@@ -106,6 +106,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -264,6 +266,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -353,6 +361,7 @@ CONFIG_ATARILANCE=y
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 CONFIG_NE2000=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
index 11a30c6..8acc65e 100644 (file)
@@ -104,6 +104,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -262,6 +264,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -343,6 +351,7 @@ CONFIG_BVME6000_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index 6630a51..0c6a3d5 100644 (file)
@@ -106,6 +106,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -264,6 +266,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -345,6 +353,7 @@ CONFIG_HPLANCE=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index 1d90b71..12a8a6c 100644 (file)
@@ -105,6 +105,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -266,6 +268,12 @@ CONFIG_DEV_APPLETALK=m
 CONFIG_IPDDP=m
 CONFIG_IPDDP_ENCAP=y
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -362,6 +370,7 @@ CONFIG_MAC89x0=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 CONFIG_MACSONIC=y
+# CONFIG_NET_VENDOR_NETRONOME is not set
 CONFIG_MAC8390=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
index 1fd21c1..64ff2dc 100644 (file)
@@ -115,6 +115,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -276,6 +278,12 @@ CONFIG_DEV_APPLETALK=m
 CONFIG_IPDDP=m
 CONFIG_IPDDP_ENCAP=y
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -404,6 +412,7 @@ CONFIG_MVME16x_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 CONFIG_MACSONIC=y
+# CONFIG_NET_VENDOR_NETRONOME is not set
 CONFIG_HYDRA=y
 CONFIG_MAC8390=y
 CONFIG_NE2000=y
index 74e10f7..07fc6ab 100644 (file)
@@ -103,6 +103,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -261,6 +263,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -343,6 +351,7 @@ CONFIG_MVME147_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index 7034e71..69903de 100644 (file)
@@ -104,6 +104,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -262,6 +264,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -343,6 +351,7 @@ CONFIG_MVME16x_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index f7deb5f..bd84016 100644 (file)
@@ -104,6 +104,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -262,6 +264,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -352,6 +360,7 @@ CONFIG_VETH=m
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 CONFIG_NE2000=y
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
index 0ce79eb..5f9fb3a 100644 (file)
@@ -101,6 +101,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -259,6 +261,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -340,6 +348,7 @@ CONFIG_SUN3_82586=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index 4cb787e..5d1c674 100644 (file)
@@ -101,6 +101,8 @@ CONFIG_NFT_NAT=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_REJECT=m
 CONFIG_NFT_COMPAT=m
+CONFIG_NFT_DUP_NETDEV=m
+CONFIG_NFT_FWD_NETDEV=m
 CONFIG_NETFILTER_XT_SET=m
 CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -259,6 +261,12 @@ CONFIG_L2TP=m
 CONFIG_BRIDGE=m
 CONFIG_ATALK=m
 CONFIG_6LOWPAN=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m
+CONFIG_6LOWPAN_GHC_UDP=m
+CONFIG_6LOWPAN_GHC_ICMPV6=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m
+CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_BATMAN_ADV_DAT=y
@@ -341,6 +349,7 @@ CONFIG_SUN3LANCE=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
 # CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
 # CONFIG_NET_VENDOR_QUALCOMM is not set
 # CONFIG_NET_VENDOR_RENESAS is not set
 # CONFIG_NET_VENDOR_ROCKER is not set
index f9d96bf..bafaff6 100644 (file)
@@ -4,7 +4,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls            376
+#define NR_syscalls            377
 
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
index 36cf129..0ca7296 100644 (file)
 #define __NR_userfaultfd       373
 #define __NR_membarrier                374
 #define __NR_mlock2            375
+#define __NR_copy_file_range   376
 
 #endif /* _UAPI_ASM_M68K_UNISTD_H_ */
index 282cd90..8bb9426 100644 (file)
@@ -396,3 +396,4 @@ ENTRY(sys_call_table)
        .long sys_userfaultfd
        .long sys_membarrier
        .long sys_mlock2                /* 375 */
+       .long sys_copy_file_range
index c3c6f08..bad1323 100644 (file)
@@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void)
        /*
         * OK, it's off to the idle thread for us
         */
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
index 89a2a93..f31ebb5 100644 (file)
@@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
        memset(__bss_start, 0, __bss_stop-__bss_start);
        memset(_ssbss, 0, _esbss-_ssbss);
 
-       lockdep_init();
-
 /* initialize device tree for usage in early_printk */
        early_init_devtree(_fdt_start);
 
index 57a945e..a65eacf 100644 (file)
@@ -151,6 +151,7 @@ config BMIPS_GENERIC
        select CSRC_R4K
        select SYNC_R4K
        select COMMON_CLK
+       select BCM6345_L1_IRQ
        select BCM7038_L1_IRQ
        select BCM7120_L2_IRQ
        select BRCMSTB_L2_IRQ
@@ -2085,7 +2086,7 @@ config PAGE_SIZE_32KB
 
 config PAGE_SIZE_64KB
        bool "64kB"
-       depends on !CPU_R3000 && !CPU_TX39XX
+       depends on !CPU_R3000 && !CPU_TX39XX && !CPU_R6000
        help
          Using 64kB page size will result in higher performance kernel at
          the price of higher memory consumption.  This option is available on
@@ -2169,7 +2170,6 @@ config MIPS_MT_SMP
        select CPU_MIPSR2_IRQ_VI
        select CPU_MIPSR2_IRQ_EI
        select SYNC_R4K
-       select MIPS_GIC_IPI
        select MIPS_MT
        select SMP
        select SMP_UP
@@ -2267,7 +2267,6 @@ config MIPS_VPE_APSP_API_MT
 config MIPS_CMP
        bool "MIPS CMP framework support (DEPRECATED)"
        depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6
-       select MIPS_GIC_IPI
        select SMP
        select SYNC_R4K
        select SYS_SUPPORTS_SMP
@@ -2287,7 +2286,6 @@ config MIPS_CPS
        select MIPS_CM
        select MIPS_CPC
        select MIPS_CPS_PM if HOTPLUG_CPU
-       select MIPS_GIC_IPI
        select SMP
        select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
        select SYS_SUPPORTS_HOTPLUG_CPU
@@ -2305,9 +2303,6 @@ config MIPS_CPS_PM
        select MIPS_CPC
        bool
 
-config MIPS_GIC_IPI
-       bool
-
 config MIPS_CM
        bool
 
index 511c065..2dfff1f 100644 (file)
 #include "common.h"
 #include "machtypes.h"
 
-static void __init ath79_misc_intc_domain_init(
-       struct device_node *node, int irq);
-
-static void ath79_misc_irq_handler(struct irq_desc *desc)
-{
-       struct irq_domain *domain = irq_desc_get_handler_data(desc);
-       void __iomem *base = domain->host_data;
-       u32 pending;
-
-       pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) &
-                 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-
-       if (!pending) {
-               spurious_interrupt();
-               return;
-       }
-
-       while (pending) {
-               int bit = __ffs(pending);
-
-               generic_handle_irq(irq_linear_revmap(domain, bit));
-               pending &= ~BIT(bit);
-       }
-}
-
-static void ar71xx_misc_irq_unmask(struct irq_data *d)
-{
-       void __iomem *base = irq_data_get_irq_chip_data(d);
-       unsigned int irq = d->hwirq;
-       u32 t;
-
-       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-       __raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-
-       /* flush write */
-       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-}
-
-static void ar71xx_misc_irq_mask(struct irq_data *d)
-{
-       void __iomem *base = irq_data_get_irq_chip_data(d);
-       unsigned int irq = d->hwirq;
-       u32 t;
-
-       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-       __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-
-       /* flush write */
-       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-}
-
-static void ar724x_misc_irq_ack(struct irq_data *d)
-{
-       void __iomem *base = irq_data_get_irq_chip_data(d);
-       unsigned int irq = d->hwirq;
-       u32 t;
-
-       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
-       __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_STATUS);
-
-       /* flush write */
-       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
-}
-
-static struct irq_chip ath79_misc_irq_chip = {
-       .name           = "MISC",
-       .irq_unmask     = ar71xx_misc_irq_unmask,
-       .irq_mask       = ar71xx_misc_irq_mask,
-};
-
-static void __init ath79_misc_irq_init(void)
-{
-       if (soc_is_ar71xx() || soc_is_ar913x())
-               ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
-       else if (soc_is_ar724x() ||
-                soc_is_ar933x() ||
-                soc_is_ar934x() ||
-                soc_is_qca955x())
-               ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
-       else
-               BUG();
-
-       ath79_misc_intc_domain_init(NULL, ATH79_CPU_IRQ(6));
-}
 
 static void ar934x_ip2_irq_dispatch(struct irq_desc *desc)
 {
@@ -212,142 +128,12 @@ static void qca955x_irq_init(void)
        irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch);
 }
 
-/*
- * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for
- * these devices typically allocate coherent DMA memory, however the
- * DMA controller may still have some unsynchronized data in the FIFO.
- * Issue a flush in the handlers to ensure that the driver sees
- * the update.
- *
- * This array map the interrupt lines to the DDR write buffer channels.
- */
-
-static unsigned irq_wb_chan[8] = {
-       -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-asmlinkage void plat_irq_dispatch(void)
-{
-       unsigned long pending;
-       int irq;
-
-       pending = read_c0_status() & read_c0_cause() & ST0_IM;
-
-       if (!pending) {
-               spurious_interrupt();
-               return;
-       }
-
-       pending >>= CAUSEB_IP;
-       while (pending) {
-               irq = fls(pending) - 1;
-               if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1)
-                       ath79_ddr_wb_flush(irq_wb_chan[irq]);
-               do_IRQ(MIPS_CPU_IRQ_BASE + irq);
-               pending &= ~BIT(irq);
-       }
-}
-
-static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
-{
-       irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq);
-       irq_set_chip_data(irq, d->host_data);
-       return 0;
-}
-
-static const struct irq_domain_ops misc_irq_domain_ops = {
-       .xlate = irq_domain_xlate_onecell,
-       .map = misc_map,
-};
-
-static void __init ath79_misc_intc_domain_init(
-       struct device_node *node, int irq)
-{
-       void __iomem *base = ath79_reset_base;
-       struct irq_domain *domain;
-
-       domain = irq_domain_add_legacy(node, ATH79_MISC_IRQ_COUNT,
-                       ATH79_MISC_IRQ_BASE, 0, &misc_irq_domain_ops, base);
-       if (!domain)
-               panic("Failed to add MISC irqdomain");
-
-       /* Disable and clear all interrupts */
-       __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE);
-       __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS);
-
-       irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain);
-}
-
-static int __init ath79_misc_intc_of_init(
-       struct device_node *node, struct device_node *parent)
-{
-       int irq;
-
-       irq = irq_of_parse_and_map(node, 0);
-       if (!irq)
-               panic("Failed to get MISC IRQ");
-
-       ath79_misc_intc_domain_init(node, irq);
-       return 0;
-}
-
-static int __init ar7100_misc_intc_of_init(
-       struct device_node *node, struct device_node *parent)
-{
-       ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
-       return ath79_misc_intc_of_init(node, parent);
-}
-
-IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc",
-               ar7100_misc_intc_of_init);
-
-static int __init ar7240_misc_intc_of_init(
-       struct device_node *node, struct device_node *parent)
-{
-       ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
-       return ath79_misc_intc_of_init(node, parent);
-}
-
-IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc",
-               ar7240_misc_intc_of_init);
-
-static int __init ar79_cpu_intc_of_init(
-       struct device_node *node, struct device_node *parent)
-{
-       int err, i, count;
-
-       /* Fill the irq_wb_chan table */
-       count = of_count_phandle_with_args(
-               node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells");
-
-       for (i = 0; i < count; i++) {
-               struct of_phandle_args args;
-               u32 irq = i;
-
-               of_property_read_u32_index(
-                       node, "qca,ddr-wb-channel-interrupts", i, &irq);
-               if (irq >= ARRAY_SIZE(irq_wb_chan))
-                       continue;
-
-               err = of_parse_phandle_with_args(
-                       node, "qca,ddr-wb-channels",
-                       "#qca,ddr-wb-channel-cells",
-                       i, &args);
-               if (err)
-                       return err;
-
-               irq_wb_chan[irq] = args.args[0];
-               pr_info("IRQ: Set flush channel of IRQ%d to %d\n",
-                       irq, args.args[0]);
-       }
-
-       return mips_cpu_irq_of_init(node, parent);
-}
-IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc",
-               ar79_cpu_intc_of_init);
-
 void __init arch_init_irq(void)
 {
+       unsigned irq_wb_chan2 = -1;
+       unsigned irq_wb_chan3 = -1;
+       bool misc_is_ar71xx;
+
        if (mips_machtype == ATH79_MACH_GENERIC_OF) {
                irqchip_init();
                return;
@@ -355,14 +141,26 @@ void __init arch_init_irq(void)
 
        if (soc_is_ar71xx() || soc_is_ar724x() ||
            soc_is_ar913x() || soc_is_ar933x()) {
-               irq_wb_chan[2] = 3;
-               irq_wb_chan[3] = 2;
+               irq_wb_chan2 = 3;
+               irq_wb_chan3 = 2;
        } else if (soc_is_ar934x()) {
-               irq_wb_chan[3] = 2;
+               irq_wb_chan3 = 2;
        }
 
-       mips_cpu_irq_init();
-       ath79_misc_irq_init();
+       ath79_cpu_irq_init(irq_wb_chan2, irq_wb_chan3);
+
+       if (soc_is_ar71xx() || soc_is_ar913x())
+               misc_is_ar71xx = true;
+       else if (soc_is_ar724x() ||
+                soc_is_ar933x() ||
+                soc_is_ar934x() ||
+                soc_is_qca955x())
+               misc_is_ar71xx = false;
+       else
+               BUG();
+       ath79_misc_irq_init(
+               ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS,
+               ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE, misc_is_ar71xx);
 
        if (soc_is_ar934x())
                ar934x_ip2_irq_init();
index 5f2bc1e..05757ae 100644 (file)
@@ -19,6 +19,8 @@
 
 #include <bcm63xx_nvram.h>
 
+#define BCM63XX_DEFAULT_PSI_SIZE       64
+
 static struct bcm963xx_nvram nvram;
 static int mac_addr_used;
 
@@ -85,3 +87,12 @@ int bcm63xx_nvram_get_mac_address(u8 *mac)
        return 0;
 }
 EXPORT_SYMBOL(bcm63xx_nvram_get_mac_address);
+
+int bcm63xx_nvram_get_psi_size(void)
+{
+       if (nvram.psi_size > 0)
+               return nvram.psi_size;
+
+       return BCM63XX_DEFAULT_PSI_SIZE;
+}
+EXPORT_SYMBOL(bcm63xx_nvram_get_psi_size);
index e7fc6f9..7efefcf 100644 (file)
 #include <asm/irq_cpu.h>
 #include <asm/time.h>
 
+static const struct of_device_id smp_intc_dt_match[] = {
+       { .compatible = "brcm,bcm7038-l1-intc" },
+       { .compatible = "brcm,bcm6345-l1-intc" },
+       {}
+};
+
 unsigned int get_c0_compare_int(void)
 {
        return CP0_LEGACY_COMPARE_IRQ;
@@ -24,8 +30,8 @@ void __init arch_init_irq(void)
 {
        struct device_node *dn;
 
-       /* Only the STB (bcm7038) controller supports SMP IRQ affinity */
-       dn = of_find_compatible_node(NULL, NULL, "brcm,bcm7038-l1-intc");
+       /* Only these controllers support SMP IRQ affinity */
+       dn = of_find_matching_node(NULL, smp_intc_dt_match);
        if (dn)
                of_node_put(dn);
        else
index 408799a..f752114 100644 (file)
@@ -17,7 +17,7 @@
 #define PORT(offset) (CKSEG1ADDR(AR7_REGS_UART0) + (4 * offset))
 #endif
 
-#ifdef CONFIG_MACH_JZ4740
+#if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780)
 #include <asm/mach-jz4740/base.h>
 #define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset))
 #endif
index 459b9b2..d61b161 100644 (file)
@@ -74,6 +74,7 @@
                timer: timer@10000040 {
                        compatible = "syscon";
                        reg = <0x10000040 0x2c>;
+                       little-endian;
                };
 
                reboot {
index 4fc7ece..1a7efa8 100644 (file)
@@ -98,6 +98,7 @@
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7125-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x60c>;
+                       little-endian;
                };
 
                reboot {
index a3039bb..d4bf52c 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7346-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index 4274ff4..8e25016 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7358-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index 0dcc916..7e5f760 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7360-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index 2f3f9fc..c739ea7 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7362-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index bee221b..5f55d0a 100644 (file)
@@ -99,6 +99,7 @@
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7420-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x60c>;
+                       little-endian;
                };
 
                reboot {
index 571f30f..e24d41a 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7425-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index 614ee21..8b9432c 100644 (file)
                sun_top_ctrl: syscon@404000 {
                        compatible = "brcm,bcm7425-sun-top-ctrl", "syscon";
                        reg = <0x404000 0x51c>;
+                       little-endian;
                };
 
                reboot {
index cefb7a5..e090fc3 100644 (file)
@@ -227,7 +227,7 @@ struct mips_elf_abiflags_v0 {
        int __res = 1;                                                  \
        struct elfhdr *__h = (hdr);                                     \
                                                                        \
-       if (__h->e_machine != EM_MIPS)                                  \
+       if (!mips_elf_check_machine(__h))                               \
                __res = 0;                                              \
        if (__h->e_ident[EI_CLASS] != ELFCLASS32)                       \
                __res = 0;                                              \
@@ -258,7 +258,7 @@ struct mips_elf_abiflags_v0 {
        int __res = 1;                                                  \
        struct elfhdr *__h = (hdr);                                     \
                                                                        \
-       if (__h->e_machine != EM_MIPS)                                  \
+       if (!mips_elf_check_machine(__h))                               \
                __res = 0;                                              \
        if (__h->e_ident[EI_CLASS] != ELFCLASS64)                       \
                __res = 0;                                              \
@@ -285,6 +285,11 @@ struct mips_elf_abiflags_v0 {
 
 #endif /* !defined(ELF_ARCH) */
 
+#define mips_elf_check_machine(x) ((x)->e_machine == EM_MIPS)
+
+#define vmcore_elf32_check_arch mips_elf_check_machine
+#define vmcore_elf64_check_arch mips_elf_check_machine
+
 struct mips_abi;
 
 extern struct mips_abi mips_abi;
index 9cbf383..f06f97b 100644 (file)
@@ -179,6 +179,10 @@ static inline void lose_fpu_inatomic(int save, struct task_struct *tsk)
                if (save)
                        _save_fp(tsk);
                __disable_fpu();
+       } else {
+               /* FPU should not have been left enabled with no owner */
+               WARN(read_c0_status() & ST0_CU1,
+                    "Orphaned FPU left enabled");
        }
        KSTK_STATUS(tsk) &= ~ST0_CU1;
        clear_tsk_thread_flag(tsk, TIF_USEDFPU);
index 2b34872..441faa9 100644 (file)
@@ -144,4 +144,8 @@ static inline u32 ath79_reset_rr(unsigned reg)
 void ath79_device_reset_set(u32 mask);
 void ath79_device_reset_clear(u32 mask);
 
+void ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3);
+void ath79_misc_irq_init(void __iomem *regs, int irq,
+                       int irq_base, bool is_ar71xx);
+
 #endif /* __ASM_MACH_ATH79_H */
index 4e0b6bc..348df49 100644 (file)
@@ -30,4 +30,6 @@ u8 *bcm63xx_nvram_get_name(void);
  */
 int bcm63xx_nvram_get_mac_address(u8 *mac);
 
+int bcm63xx_nvram_get_psi_size(void);
+
 #endif /* BCM63XX_NVRAM_H */
index 8ebd3f5..3ed10a8 100644 (file)
@@ -128,7 +128,8 @@ static inline int octeon_has_feature(enum octeon_feature feature)
        case OCTEON_FEATURE_PCIE:
                return OCTEON_IS_MODEL(OCTEON_CN56XX)
                        || OCTEON_IS_MODEL(OCTEON_CN52XX)
-                       || OCTEON_IS_MODEL(OCTEON_CN6XXX);
+                       || OCTEON_IS_MODEL(OCTEON_CN6XXX)
+                       || OCTEON_IS_MODEL(OCTEON_CN7XXX);
 
        case OCTEON_FEATURE_SRIO:
                return OCTEON_IS_MODEL(OCTEON_CN63XX)
index 3f832c3..041153f 100644 (file)
@@ -45,7 +45,7 @@ extern unsigned int vced_count, vcei_count;
  * User space process size: 2GB. This is hardcoded into a few places,
  * so don't change it unless you know what you are doing.
  */
-#define TASK_SIZE      0x7fff8000UL
+#define TASK_SIZE      0x80000000UL
 #endif
 
 #define STACK_TOP_MAX  TASK_SIZE
index 6ba1fb8..db7c322 100644 (file)
@@ -44,8 +44,9 @@ static inline void plat_smp_setup(void)
        mp_ops->smp_setup();
 }
 
-extern void gic_send_ipi_single(int cpu, unsigned int action);
-extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action);
+extern void mips_smp_send_ipi_single(int cpu, unsigned int action);
+extern void mips_smp_send_ipi_mask(const struct cpumask *mask,
+                                     unsigned int action);
 
 #else /* !CONFIG_SMP */
 
index a71da57..eebf395 100644 (file)
                .set    reorder
                .set    noat
                mfc0    a0, CP0_STATUS
-               li      v1, 0xff00
+               li      v1, ST0_CU1 | ST0_IM
                ori     a0, STATMASK
                xori    a0, STATMASK
                mtc0    a0, CP0_STATUS
                ori     a0, STATMASK
                xori    a0, STATMASK
                mtc0    a0, CP0_STATUS
-               li      v1, 0xff00
+               li      v1, ST0_CU1 | ST0_FR | ST0_IM
                and     a0, v1
                LONG_L  v0, PT_STATUS(sp)
                nor     v1, $0, v1
index 6499d93..47bc45a 100644 (file)
@@ -101,10 +101,8 @@ static inline void syscall_get_arguments(struct task_struct *task,
        /* O32 ABI syscall() - Either 64-bit with O32 or 32-bit */
        if ((config_enabled(CONFIG_32BIT) ||
            test_tsk_thread_flag(task, TIF_32BIT_REGS)) &&
-           (regs->regs[2] == __NR_syscall)) {
+           (regs->regs[2] == __NR_syscall))
                i++;
-               n++;
-       }
 
        while (n--)
                ret |= mips_get_syscall_arg(args++, task, regs, i++);
index 90f03a7..3129795 100644 (file)
 #define __NR_userfaultfd               (__NR_Linux + 357)
 #define __NR_membarrier                        (__NR_Linux + 358)
 #define __NR_mlock2                    (__NR_Linux + 359)
+#define __NR_copy_file_range           (__NR_Linux + 360)
 
 /*
  * Offset of the last Linux o32 flavoured syscall
  */
-#define __NR_Linux_syscalls            359
+#define __NR_Linux_syscalls            360
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
 
 #define __NR_O32_Linux                 4000
-#define __NR_O32_Linux_syscalls                359
+#define __NR_O32_Linux_syscalls                360
 
 #if _MIPS_SIM == _MIPS_SIM_ABI64
 
 #define __NR_userfaultfd               (__NR_Linux + 317)
 #define __NR_membarrier                        (__NR_Linux + 318)
 #define __NR_mlock2                    (__NR_Linux + 319)
+#define __NR_copy_file_range           (__NR_Linux + 320)
 
 /*
  * Offset of the last Linux 64-bit flavoured syscall
  */
-#define __NR_Linux_syscalls            319
+#define __NR_Linux_syscalls            320
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
 
 #define __NR_64_Linux                  5000
-#define __NR_64_Linux_syscalls         319
+#define __NR_64_Linux_syscalls         320
 
 #if _MIPS_SIM == _MIPS_SIM_NABI32
 
 #define __NR_userfaultfd               (__NR_Linux + 321)
 #define __NR_membarrier                        (__NR_Linux + 322)
 #define __NR_mlock2                    (__NR_Linux + 323)
+#define __NR_copy_file_range           (__NR_Linux + 324)
 
 /*
  * Offset of the last N32 flavoured syscall
  */
-#define __NR_Linux_syscalls            323
+#define __NR_Linux_syscalls            324
 
 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
 
 #define __NR_N32_Linux                 6000
-#define __NR_N32_Linux_syscalls                323
+#define __NR_N32_Linux_syscalls                324
 
 #endif /* _UAPI_ASM_UNISTD_H */
index 8c6d76c..d9907e5 100644 (file)
@@ -270,7 +270,7 @@ uint32_t jz_gpio_port_get_value(int port, uint32_t mask)
 }
 EXPORT_SYMBOL(jz_gpio_port_get_value);
 
-#define IRQ_TO_BIT(irq) BIT(irq_to_gpio(irq) & 0x1f)
+#define IRQ_TO_BIT(irq) BIT((irq - JZ4740_IRQ_GPIO(0)) & 0x1f)
 
 static void jz_gpio_check_trigger_both(struct jz_gpio_chip *chip, unsigned int irq)
 {
index 68e2b7d..b0988fd 100644 (file)
@@ -52,7 +52,6 @@ obj-$(CONFIG_MIPS_MT_SMP)     += smp-mt.o
 obj-$(CONFIG_MIPS_CMP)         += smp-cmp.o
 obj-$(CONFIG_MIPS_CPS)         += smp-cps.o cps-vec.o
 obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o
-obj-$(CONFIG_MIPS_GIC_IPI)     += smp-gic.o
 obj-$(CONFIG_MIPS_SPRAM)       += spram.o
 
 obj-$(CONFIG_MIPS_VPE_LOADER)  += vpe.o
index 1188e00..1b992c6 100644 (file)
@@ -35,7 +35,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
        int __res = 1;                                                  \
        struct elfhdr *__h = (hdr);                                     \
                                                                        \
-       if (__h->e_machine != EM_MIPS)                                  \
+       if (!mips_elf_check_machine(__h))                               \
                __res = 0;                                              \
        if (__h->e_ident[EI_CLASS] != ELFCLASS32)                       \
                __res = 0;                                              \
index 9287678..abd3aff 100644 (file)
@@ -47,7 +47,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
        int __res = 1;                                                  \
        struct elfhdr *__h = (hdr);                                     \
                                                                        \
-       if (__h->e_machine != EM_MIPS)                                  \
+       if (!mips_elf_check_machine(__h))                               \
                __res = 0;                                              \
        if (__h->e_ident[EI_CLASS] != ELFCLASS32)                       \
                __res = 0;                                              \
index f2975d4..eddd5fd 100644 (file)
@@ -65,12 +65,10 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
        status = regs->cp0_status & ~(ST0_CU0|ST0_CU1|ST0_FR|KU_MASK);
        status |= KU_USER;
        regs->cp0_status = status;
+       lose_fpu(0);
+       clear_thread_flag(TIF_MSA_CTX_LIVE);
        clear_used_math();
-       clear_fpu_owner();
        init_dsp();
-       clear_thread_flag(TIF_USEDMSA);
-       clear_thread_flag(TIF_MSA_CTX_LIVE);
-       disable_msa();
        regs->cp0_epc = pc;
        regs->regs[29] = sp;
 }
index 5ce3b74..b4ac637 100644 (file)
@@ -125,7 +125,7 @@ LEAF(_restore_fp_context)
        END(_restore_fp_context)
        .set    reorder
 
-       .type   fault@function
+       .type   fault@function
        .ent    fault
 fault: li      v0, -EFAULT
        jr      ra
index f09546e..17732f8 100644 (file)
@@ -358,7 +358,7 @@ LEAF(_restore_msa_all_upper)
 
        .set    reorder
 
-       .type   fault@function
+       .type   fault@function
        .ent    fault
 fault: li      v0, -EFAULT                             # failure
        jr      ra
index 2d23c83..a563174 100644 (file)
@@ -595,3 +595,4 @@ EXPORT(sys_call_table)
        PTR     sys_userfaultfd
        PTR     sys_membarrier
        PTR     sys_mlock2
+       PTR     sys_copy_file_range             /* 4360 */
index deac633..2b2dc14 100644 (file)
@@ -433,4 +433,5 @@ EXPORT(sys_call_table)
        PTR     sys_userfaultfd
        PTR     sys_membarrier
        PTR     sys_mlock2
+       PTR     sys_copy_file_range             /* 5320 */
        .size   sys_call_table,.-sys_call_table
index 5a69eb4..2bf5c85 100644 (file)
@@ -423,4 +423,5 @@ EXPORT(sysn32_call_table)
        PTR     sys_userfaultfd
        PTR     sys_membarrier
        PTR     sys_mlock2
+       PTR     sys_copy_file_range
        .size   sysn32_call_table,.-sysn32_call_table
index e4b6d7c..c5b759e 100644 (file)
@@ -578,4 +578,5 @@ EXPORT(sys32_call_table)
        PTR     sys_userfaultfd
        PTR     sys_membarrier
        PTR     sys_mlock2
+       PTR     sys_copy_file_range             /* 4360 */
        .size   sys32_call_table,.-sys32_call_table
index 569a7d5..4f60734 100644 (file)
@@ -732,21 +732,23 @@ static void __init resource_init(void)
                        end = HIGHMEM_START - 1;
 
                res = alloc_bootmem(sizeof(struct resource));
+
+               res->start = start;
+               res->end = end;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
                switch (boot_mem_map.map[i].type) {
                case BOOT_MEM_RAM:
                case BOOT_MEM_INIT_RAM:
                case BOOT_MEM_ROM_DATA:
                        res->name = "System RAM";
+                       res->flags |= IORESOURCE_SYSRAM;
                        break;
                case BOOT_MEM_RESERVED:
                default:
                        res->name = "reserved";
                }
 
-               res->start = start;
-               res->end = end;
-
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
 
                /*
@@ -782,6 +784,7 @@ static inline void prefill_possible_map(void) {}
 void __init setup_arch(char **cmdline_p)
 {
        cpu_probe();
+       mips_cm_probe();
        prom_init();
 
        setup_early_fdc_console();
index d5e0f94..7692334 100644 (file)
@@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus)
 }
 
 struct plat_smp_ops cmp_smp_ops = {
-       .send_ipi_single        = gic_send_ipi_single,
-       .send_ipi_mask          = gic_send_ipi_mask,
+       .send_ipi_single        = mips_smp_send_ipi_single,
+       .send_ipi_mask          = mips_smp_send_ipi_mask,
        .init_secondary         = cmp_init_secondary,
        .smp_finish             = cmp_smp_finish,
        .boot_secondary         = cmp_boot_secondary,
index 2ad4e4c..253e140 100644 (file)
@@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = {
        .boot_secondary         = cps_boot_secondary,
        .init_secondary         = cps_init_secondary,
        .smp_finish             = cps_smp_finish,
-       .send_ipi_single        = gic_send_ipi_single,
-       .send_ipi_mask          = gic_send_ipi_mask,
+       .send_ipi_single        = mips_smp_send_ipi_single,
+       .send_ipi_mask          = mips_smp_send_ipi_mask,
 #ifdef CONFIG_HOTPLUG_CPU
        .cpu_disable            = cps_cpu_disable,
        .cpu_die                = cps_cpu_die,
index 86311a1..4f9570a 100644 (file)
@@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action)
 
 #ifdef CONFIG_MIPS_GIC
        if (gic_present) {
-               gic_send_ipi_single(cpu, action);
+               mips_smp_send_ipi_single(cpu, action);
                return;
        }
 #endif
index bd4385a..37708d9 100644 (file)
 #include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/ftrace.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
 
 #include <linux/atomic.h>
 #include <asm/cpu.h>
 #include <asm/processor.h>
 #include <asm/idle.h>
 #include <asm/r4k-timer.h>
+#include <asm/mips-cpc.h>
 #include <asm/mmu_context.h>
 #include <asm/time.h>
 #include <asm/setup.h>
@@ -79,6 +83,11 @@ static cpumask_t cpu_core_setup_map;
 
 cpumask_t cpu_coherent_mask;
 
+#ifdef CONFIG_GENERIC_IRQ_IPI
+static struct irq_desc *call_desc;
+static struct irq_desc *sched_desc;
+#endif
+
 static inline void set_cpu_sibling_map(int cpu)
 {
        int i;
@@ -121,6 +130,7 @@ static inline void calculate_cpu_foreign_map(void)
        cpumask_t temp_foreign_map;
 
        /* Re-calculate the mask */
+       cpumask_clear(&temp_foreign_map);
        for_each_online_cpu(i) {
                core_present = 0;
                for_each_cpu(k, &temp_foreign_map)
@@ -145,6 +155,133 @@ void register_smp_ops(struct plat_smp_ops *ops)
        mp_ops = ops;
 }
 
+#ifdef CONFIG_GENERIC_IRQ_IPI
+void mips_smp_send_ipi_single(int cpu, unsigned int action)
+{
+       mips_smp_send_ipi_mask(cpumask_of(cpu), action);
+}
+
+void mips_smp_send_ipi_mask(const struct cpumask *mask, unsigned int action)
+{
+       unsigned long flags;
+       unsigned int core;
+       int cpu;
+
+       local_irq_save(flags);
+
+       switch (action) {
+       case SMP_CALL_FUNCTION:
+               __ipi_send_mask(call_desc, mask);
+               break;
+
+       case SMP_RESCHEDULE_YOURSELF:
+               __ipi_send_mask(sched_desc, mask);
+               break;
+
+       default:
+               BUG();
+       }
+
+       if (mips_cpc_present()) {
+               for_each_cpu(cpu, mask) {
+                       core = cpu_data[cpu].core;
+
+                       if (core == current_cpu_data.core)
+                               continue;
+
+                       while (!cpumask_test_cpu(cpu, &cpu_coherent_mask)) {
+                               mips_cpc_lock_other(core);
+                               write_cpc_co_cmd(CPC_Cx_CMD_PWRUP);
+                               mips_cpc_unlock_other();
+                       }
+               }
+       }
+
+       local_irq_restore(flags);
+}
+
+
+static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
+{
+       scheduler_ipi();
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
+{
+       generic_smp_call_function_interrupt();
+
+       return IRQ_HANDLED;
+}
+
+static struct irqaction irq_resched = {
+       .handler        = ipi_resched_interrupt,
+       .flags          = IRQF_PERCPU,
+       .name           = "IPI resched"
+};
+
+static struct irqaction irq_call = {
+       .handler        = ipi_call_interrupt,
+       .flags          = IRQF_PERCPU,
+       .name           = "IPI call"
+};
+
+static __init void smp_ipi_init_one(unsigned int virq,
+                                   struct irqaction *action)
+{
+       int ret;
+
+       irq_set_handler(virq, handle_percpu_irq);
+       ret = setup_irq(virq, action);
+       BUG_ON(ret);
+}
+
+static int __init mips_smp_ipi_init(void)
+{
+       unsigned int call_virq, sched_virq;
+       struct irq_domain *ipidomain;
+       struct device_node *node;
+
+       node = of_irq_find_parent(of_root);
+       ipidomain = irq_find_matching_host(node, DOMAIN_BUS_IPI);
+
+       /*
+        * Some platforms have half DT setup. So if we found irq node but
+        * didn't find an ipidomain, try to search for one that is not in the
+        * DT.
+        */
+       if (node && !ipidomain)
+               ipidomain = irq_find_matching_host(NULL, DOMAIN_BUS_IPI);
+
+       BUG_ON(!ipidomain);
+
+       call_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
+       BUG_ON(!call_virq);
+
+       sched_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
+       BUG_ON(!sched_virq);
+
+       if (irq_domain_is_ipi_per_cpu(ipidomain)) {
+               int cpu;
+
+               for_each_cpu(cpu, cpu_possible_mask) {
+                       smp_ipi_init_one(call_virq + cpu, &irq_call);
+                       smp_ipi_init_one(sched_virq + cpu, &irq_resched);
+               }
+       } else {
+               smp_ipi_init_one(call_virq, &irq_call);
+               smp_ipi_init_one(sched_virq, &irq_resched);
+       }
+
+       call_desc = irq_to_desc(call_virq);
+       sched_desc = irq_to_desc(sched_virq);
+
+       return 0;
+}
+early_initcall(mips_smp_ipi_init);
+#endif
+
 /*
  * First C code run on the secondary CPUs after being started up by
  * the master.
@@ -191,7 +328,7 @@ asmlinkage void start_secondary(void)
        WARN_ON_ONCE(!irqs_disabled());
        mp_ops->smp_finish();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void stop_this_cpu(void *dummy)
index bafcb7a..bf14da9 100644 (file)
@@ -663,7 +663,7 @@ static int simulate_rdhwr_normal(struct pt_regs *regs, unsigned int opcode)
        return -1;
 }
 
-static int simulate_rdhwr_mm(struct pt_regs *regs, unsigned short opcode)
+static int simulate_rdhwr_mm(struct pt_regs *regs, unsigned int opcode)
 {
        if ((opcode & MM_POOL32A_FUNC) == MM_RDHWR) {
                int rd = (opcode & MM_RS) >> 16;
@@ -690,15 +690,15 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
 asmlinkage void do_ov(struct pt_regs *regs)
 {
        enum ctx_state prev_state;
-       siginfo_t info;
+       siginfo_t info = {
+               .si_signo = SIGFPE,
+               .si_code = FPE_INTOVF,
+               .si_addr = (void __user *)regs->cp0_epc,
+       };
 
        prev_state = exception_enter();
        die_if_kernel("Integer overflow", regs);
 
-       info.si_code = FPE_INTOVF;
-       info.si_signo = SIGFPE;
-       info.si_errno = 0;
-       info.si_addr = (void __user *) regs->cp0_epc;
        force_sig_info(SIGFPE, &info, current);
        exception_exit(prev_state);
 }
@@ -874,7 +874,7 @@ out:
 void do_trap_or_bp(struct pt_regs *regs, unsigned int code,
        const char *str)
 {
-       siginfo_t info;
+       siginfo_t info = { 0 };
        char b[40];
 
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
@@ -903,7 +903,6 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code,
                else
                        info.si_code = FPE_INTOVF;
                info.si_signo = SIGFPE;
-               info.si_errno = 0;
                info.si_addr = (void __user *) regs->cp0_epc;
                force_sig_info(SIGFPE, &info, current);
                break;
@@ -1119,11 +1118,12 @@ no_r2_instr:
        if (get_isa16_mode(regs->cp0_epc)) {
                unsigned short mmop[2] = { 0 };
 
-               if (unlikely(get_user(mmop[0], epc) < 0))
+               if (unlikely(get_user(mmop[0], (u16 __user *)epc + 0) < 0))
                        status = SIGSEGV;
-               if (unlikely(get_user(mmop[1], epc) < 0))
+               if (unlikely(get_user(mmop[1], (u16 __user *)epc + 1) < 0))
                        status = SIGSEGV;
-               opcode = (mmop[0] << 16) | mmop[1];
+               opcode = mmop[0];
+               opcode = (opcode << 16) | mmop[1];
 
                if (status < 0)
                        status = simulate_rdhwr_mm(regs, opcode);
@@ -1369,26 +1369,12 @@ asmlinkage void do_cpu(struct pt_regs *regs)
                if (unlikely(compute_return_epc(regs) < 0))
                        break;
 
-               if (get_isa16_mode(regs->cp0_epc)) {
-                       unsigned short mmop[2] = { 0 };
-
-                       if (unlikely(get_user(mmop[0], epc) < 0))
-                               status = SIGSEGV;
-                       if (unlikely(get_user(mmop[1], epc) < 0))
-                               status = SIGSEGV;
-                       opcode = (mmop[0] << 16) | mmop[1];
-
-                       if (status < 0)
-                               status = simulate_rdhwr_mm(regs, opcode);
-               } else {
+               if (!get_isa16_mode(regs->cp0_epc)) {
                        if (unlikely(get_user(opcode, epc) < 0))
                                status = SIGSEGV;
 
                        if (!cpu_has_llsc && status < 0)
                                status = simulate_llsc(regs, opcode);
-
-                       if (status < 0)
-                               status = simulate_rdhwr_normal(regs, opcode);
                }
 
                if (status < 0)
index 8bc3977..70ef1a4 100644 (file)
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
        dvcpu->arch.wait = 0;
 
-       if (waitqueue_active(&dvcpu->wq))
-               wake_up_interruptible(&dvcpu->wq);
+       if (swait_active(&dvcpu->wq))
+               swake_up(&dvcpu->wq);
 
        return 0;
 }
@@ -702,7 +702,7 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) {
                void __user *uaddr = (void __user *)(long)reg->addr;
 
-               return copy_to_user(uaddr, vs, 16);
+               return copy_to_user(uaddr, vs, 16) ? -EFAULT : 0;
        } else {
                return -EINVAL;
        }
@@ -732,7 +732,7 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) {
                void __user *uaddr = (void __user *)(long)reg->addr;
 
-               return copy_from_user(vs, uaddr, 16);
+               return copy_from_user(vs, uaddr, 16) ? -EFAULT : 0;
        } else {
                return -EINVAL;
        }
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
        kvm_mips_callbacks->queue_timer_int(vcpu);
 
        vcpu->arch.wait = 0;
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
 }
 
 /* low level hrtimer wake routine */
index 5c81fdd..3530376 100644 (file)
@@ -146,7 +146,7 @@ unsigned long arch_mmap_rnd(void)
 {
        unsigned long rnd;
 
-       rnd = (unsigned long)get_random_int();
+       rnd = get_random_long();
        rnd <<= PAGE_SHIFT;
        if (TASK_IS_32BIT_ADDR)
                rnd &= 0xfffffful;
@@ -174,7 +174,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 
 static inline unsigned long brk_rnd(void)
 {
-       unsigned long rnd = get_random_int();
+       unsigned long rnd = get_random_long();
 
        rnd = rnd << PAGE_SHIFT;
        /* 8MB for 32bit, 256MB for 64bit */
index 3bd0597..91dec32 100644 (file)
@@ -164,11 +164,13 @@ static int __init mips_sc_probe_cm3(void)
 
        sets = cfg & CM_GCR_L2_CONFIG_SET_SIZE_MSK;
        sets >>= CM_GCR_L2_CONFIG_SET_SIZE_SHF;
-       c->scache.sets = 64 << sets;
+       if (sets)
+               c->scache.sets = 64 << sets;
 
        line_sz = cfg & CM_GCR_L2_CONFIG_LINE_SIZE_MSK;
        line_sz >>= CM_GCR_L2_CONFIG_LINE_SIZE_SHF;
-       c->scache.linesz = 2 << line_sz;
+       if (line_sz)
+               c->scache.linesz = 2 << line_sz;
 
        assoc = cfg & CM_GCR_L2_CONFIG_ASSOC_MSK;
        assoc >>= CM_GCR_L2_CONFIG_ASSOC_SHF;
@@ -176,13 +178,12 @@ static int __init mips_sc_probe_cm3(void)
        c->scache.waysize = c->scache.sets * c->scache.linesz;
        c->scache.waybit = __ffs(c->scache.waysize);
 
-       c->scache.flags &= ~MIPS_CACHE_NOT_PRESENT;
+       if (c->scache.linesz) {
+               c->scache.flags &= ~MIPS_CACHE_NOT_PRESENT;
+               return 1;
+       }
 
-       return 1;
-}
-
-void __weak platform_early_l2_init(void)
-{
+       return 0;
 }
 
 static inline int __init mips_sc_probe(void)
@@ -194,12 +195,6 @@ static inline int __init mips_sc_probe(void)
        /* Mark as not present until probe completed */
        c->scache.flags |= MIPS_CACHE_NOT_PRESENT;
 
-       /*
-        * Do we need some platform specific probing before
-        * we configure L2?
-        */
-       platform_early_l2_init();
-
        if (mips_cm_revision() >= CM_REV_CM3)
                return mips_sc_probe_cm3();
 
index 571148c..dc2c521 100644 (file)
@@ -293,7 +293,6 @@ mips_pci_controller:
        console_config();
 #endif
        /* Early detection of CMP support */
-       mips_cm_probe();
        mips_cpc_probe();
 
        if (!register_cps_smp_ops())
@@ -304,10 +303,3 @@ mips_pci_controller:
                return;
        register_up_smp_ops();
 }
-
-void platform_early_l2_init(void)
-{
-       /* L2 configuration lives in the CM3 */
-       if (mips_cm_revision() >= CM_REV_CM3)
-               mips_cm_probe();
-}
index a009ee4..1ae932c 100644 (file)
@@ -297,12 +297,12 @@ static int mt7620_pci_probe(struct platform_device *pdev)
                return PTR_ERR(rstpcie0);
 
        bridge_base = devm_ioremap_resource(&pdev->dev, bridge_res);
-       if (!bridge_base)
-               return -ENOMEM;
+       if (IS_ERR(bridge_base))
+               return PTR_ERR(bridge_base);
 
        pcie_base = devm_ioremap_resource(&pdev->dev, pcie_res);
-       if (!pcie_base)
-               return -ENOMEM;
+       if (IS_ERR(pcie_base))
+               return PTR_ERR(pcie_base);
 
        iomem_resource.start = 0;
        iomem_resource.end = ~0;
index f984193..426173c 100644 (file)
@@ -675,7 +675,7 @@ int __init start_secondary(void *unused)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
        init_clockevents();
 #endif
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
        return 0;
 }
 
index 3d0e17b..df0f52b 100644 (file)
@@ -22,6 +22,9 @@
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init        __read_mostly
+
 void parisc_cache_init(void);  /* initializes cache-flushing */
 void disable_sr_hashing_asm(int); /* low level support for above */
 void disable_sr_hashing(void);   /* turns off space register hashing */
index 845272c..7bd69bd 100644 (file)
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
        }
 }
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
 #include <asm/kmap_types.h>
 
 #define ARCH_HAS_KMAP
index f84ff12..6d8276c 100644 (file)
@@ -33,7 +33,7 @@
  * floppy accesses go through the track buffer.
  */
 #define _CROSS_64KB(a,s,vdma) \
-(!vdma && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
+(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
 
 #define CROSS_64KB(a,s) _CROSS_64KB(a,s,use_virtual_dma & 1)
 
index 35bdccb..b75039f 100644 (file)
 #define __NR_membarrier                (__NR_Linux + 343)
 #define __NR_userfaultfd       (__NR_Linux + 344)
 #define __NR_mlock2            (__NR_Linux + 345)
+#define __NR_copy_file_range   (__NR_Linux + 346)
 
-#define __NR_Linux_syscalls    (__NR_mlock2 + 1)
+#define __NR_Linux_syscalls    (__NR_copy_file_range + 1)
 
 
 #define __IGNORE_select                /* newselect */
index 9585c81..ce0b2b4 100644 (file)
@@ -269,14 +269,19 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 long do_syscall_trace_enter(struct pt_regs *regs)
 {
-       long ret = 0;
-
        /* Do the secure computing check first. */
        secure_computing_strict(regs->gr[20]);
 
        if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-           tracehook_report_syscall_entry(regs))
-               ret = -1L;
+           tracehook_report_syscall_entry(regs)) {
+               /*
+                * Tracing decided this syscall should not happen or the
+                * debugger stored an invalid system call number. Skip
+                * the system call and the system call restart handling.
+                */
+               regs->gr[20] = -1UL;
+               goto out;
+       }
 
 #ifdef CONFIG_64BIT
        if (!is_compat_task())
@@ -290,7 +295,8 @@ long do_syscall_trace_enter(struct pt_regs *regs)
                        regs->gr[24] & 0xffffffff,
                        regs->gr[23] & 0xffffffff);
 
-       return ret ? : regs->gr[20];
+out:
+       return regs->gr[20];
 }
 
 void do_syscall_trace_exit(struct pt_regs *regs)
index 52e8597..c2a9cc5 100644 (file)
@@ -305,7 +305,7 @@ void __init smp_callin(void)
 
        local_irq_enable();  /* Interrupts have been off until now */
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
        /* NOTREACHED */
        panic("smp_callin() AAAAaaaaahhhh....\n");
index 3fbd725..fbafa0d 100644 (file)
@@ -343,7 +343,7 @@ tracesys_next:
 #endif
 
        comiclr,>>=     __NR_Linux_syscalls, %r20, %r0
-       b,n     .Lsyscall_nosys
+       b,n     .Ltracesys_nosys
 
        LDREGX  %r20(%r19), %r19
 
@@ -359,6 +359,9 @@ tracesys_next:
        be      0(%sr7,%r19)
        ldo     R%tracesys_exit(%r2),%r2
 
+.Ltracesys_nosys:
+       ldo     -ENOSYS(%r0),%r28               /* set errno */
+
        /* Do *not* call this function on the gateway page, because it
        makes a direct call to syscall_trace. */
        
index d4ffcfb..585d50f 100644 (file)
        ENTRY_SAME(membarrier)
        ENTRY_SAME(userfaultfd)
        ENTRY_SAME(mlock2)              /* 345 */
+       ENTRY_SAME(copy_file_range)
 
 
 .ifne (. - 90b) - (__NR_Linux_syscalls * (91b - 90b))
index 1b366c4..3c07d6b 100644 (file)
@@ -55,12 +55,12 @@ signed char pfnnid_map[PFNNID_MAP_MAX] __read_mostly;
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource pdcdata_resource = {
@@ -201,7 +201,7 @@ static void __init setup_bootmem(void)
                res->name = "System RAM";
                res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT;
                res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
        }
 
index e4824fd..9faa18c 100644 (file)
@@ -557,7 +557,7 @@ choice
 
 config PPC_4K_PAGES
        bool "4k page size"
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_16K_PAGES
        bool "16k page size"
@@ -566,7 +566,7 @@ config PPC_16K_PAGES
 config PPC_64K_PAGES
        bool "64k page size"
        depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
        bool "256k page size"
index 06f17e7..8d1c816 100644 (file)
@@ -50,7 +50,9 @@
  * set of bits not changed in pmd_modify.
  */
 #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
-                        _PAGE_ACCESSED | _PAGE_THP_HUGE)
+                        _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \
+                        _PAGE_SOFT_DIRTY)
+
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
index 8204b0c..ac07a30 100644 (file)
@@ -223,7 +223,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
-#define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
@@ -282,6 +281,10 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
 
+#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
+extern void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                   unsigned long address, pmd_t *pmdp);
+
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
index c5eb86f..867c39b 100644 (file)
@@ -81,6 +81,7 @@ struct pci_dn;
 #define EEH_PE_KEEP            (1 << 8)        /* Keep PE on hotplug   */
 #define EEH_PE_CFG_RESTRICTED  (1 << 9)        /* Block config on error */
 #define EEH_PE_REMOVED         (1 << 10)       /* Removed permanently  */
+#define EEH_PE_PRI_BUS         (1 << 11)       /* Cached primary bus   */
 
 struct eeh_pe {
        int type;                       /* PE type: PHB/Bus/Device      */
index 271fefb..c98afa5 100644 (file)
@@ -38,8 +38,7 @@
 
 #define KVM_MAX_VCPUS          NR_CPUS
 #define KVM_MAX_VCORES         NR_CPUS
-#define KVM_USER_MEM_SLOTS 32
-#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
+#define KVM_USER_MEM_SLOTS     512
 
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -290,7 +289,7 @@ struct kvmppc_vcore {
        struct list_head runnable_threads;
        struct list_head preempt_list;
        spinlock_t lock;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
        u64 stolen_tb;
        u64 preempt_tb;
@@ -630,7 +629,7 @@ struct kvm_vcpu_arch {
        u8 prodded;
        u32 last_inst;
 
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
index 5654ece..3fa9df7 100644 (file)
@@ -383,3 +383,4 @@ SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(mlock2)
+SYSCALL(copy_file_range)
index 8e86b48..32e36b1 100644 (file)
@@ -57,12 +57,14 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
 extern void hcall_tracepoint_regfunc(void);
 extern void hcall_tracepoint_unregfunc(void);
 
-TRACE_EVENT_FN(hcall_entry,
+TRACE_EVENT_FN_COND(hcall_entry,
 
        TP_PROTO(unsigned long opcode, unsigned long *args),
 
        TP_ARGS(opcode, args),
 
+       TP_CONDITION(cpu_online(raw_smp_processor_id())),
+
        TP_STRUCT__entry(
                __field(unsigned long, opcode)
        ),
@@ -76,13 +78,15 @@ TRACE_EVENT_FN(hcall_entry,
        hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
 );
 
-TRACE_EVENT_FN(hcall_exit,
+TRACE_EVENT_FN_COND(hcall_exit,
 
        TP_PROTO(unsigned long opcode, unsigned long retval,
                unsigned long *retbuf),
 
        TP_ARGS(opcode, retval, retbuf),
 
+       TP_CONDITION(cpu_online(raw_smp_processor_id())),
+
        TP_STRUCT__entry(
                __field(unsigned long, opcode)
                __field(unsigned long, retval)
index 6a5ace5..1f2594d 100644 (file)
@@ -12,7 +12,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls            379
+#define NR_syscalls            380
 
 #define __NR__exit __NR_exit
 
index 12a0565..940290d 100644 (file)
 #define __NR_userfaultfd       364
 #define __NR_membarrier                365
 #define __NR_mlock2            378
+#define __NR_copy_file_range   379
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
index 9387421..650cfb3 100644 (file)
@@ -418,8 +418,7 @@ static void *eeh_rmv_device(void *data, void *userdata)
                eeh_pcid_put(dev);
                if (driver->err_handler &&
                    driver->err_handler->error_detected &&
-                   driver->err_handler->slot_reset &&
-                   driver->err_handler->resume)
+                   driver->err_handler->slot_reset)
                        return NULL;
        }
 
@@ -564,6 +563,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
         */
        eeh_pe_state_mark(pe, EEH_PE_KEEP);
        if (bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                pci_lock_rescan_remove();
                pcibios_remove_pci_devices(bus);
                pci_unlock_rescan_remove();
@@ -803,6 +803,7 @@ perm_error:
         * the their PCI config any more.
         */
        if (frozen_bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
 
                pci_lock_rescan_remove();
@@ -886,6 +887,7 @@ static void eeh_handle_special_event(void)
                                        continue;
 
                                /* Notify all devices to be down */
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                                bus = eeh_pe_bus_get(phb_pe);
                                eeh_pe_dev_traverse(pe,
                                        eeh_report_failure, NULL);
index 8654cb1..98f8180 100644 (file)
@@ -883,32 +883,29 @@ void eeh_pe_restore_bars(struct eeh_pe *pe)
 const char *eeh_pe_loc_get(struct eeh_pe *pe)
 {
        struct pci_bus *bus = eeh_pe_bus_get(pe);
-       struct device_node *dn = pci_bus_to_OF_node(bus);
+       struct device_node *dn;
        const char *loc = NULL;
 
-       if (!dn)
-               goto out;
+       while (bus) {
+               dn = pci_bus_to_OF_node(bus);
+               if (!dn) {
+                       bus = bus->parent;
+                       continue;
+               }
 
-       /* PHB PE or root PE ? */
-       if (pci_is_root_bus(bus)) {
-               loc = of_get_property(dn, "ibm,loc-code", NULL);
-               if (!loc)
+               if (pci_is_root_bus(bus))
                        loc = of_get_property(dn, "ibm,io-base-loc-code", NULL);
+               else
+                       loc = of_get_property(dn, "ibm,slot-location-code",
+                                             NULL);
+
                if (loc)
-                       goto out;
+                       return loc;
 
-               /* Check the root port */
-               dn = dn->child;
-               if (!dn)
-                       goto out;
+               bus = bus->parent;
        }
 
-       loc = of_get_property(dn, "ibm,loc-code", NULL);
-       if (!loc)
-               loc = of_get_property(dn, "ibm,slot-location-code", NULL);
-
-out:
-       return loc ? loc : "N/A";
+       return "N/A";
 }
 
 /**
@@ -931,7 +928,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
                bus = pe->phb->bus;
        } else if (pe->type & EEH_PE_BUS ||
                   pe->type & EEH_PE_DEVICE) {
-               if (pe->bus) {
+               if (pe->state & EEH_PE_PRI_BUS) {
                        bus = pe->bus;
                        goto out;
                }
index 05e804c..aec9a1b 100644 (file)
@@ -109,8 +109,9 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp)
         * If the breakpoint is unregistered between a hw_breakpoint_handler()
         * and the single_step_dabr_instruction(), then cleanup the breakpoint
         * restoration variables to prevent dangling pointers.
+        * FIXME, this should not be using bp->ctx at all! Sayeth peterz.
         */
-       if (bp->ctx && bp->ctx->task)
+       if (bp->ctx && bp->ctx->task && bp->ctx->task != ((void *)-1L))
                bp->ctx->task->thread.last_hit_ubp = NULL;
 }
 
index db475d4..f28754c 100644 (file)
@@ -701,31 +701,3 @@ _GLOBAL(kexec_sequence)
        li      r5,0
        blr     /* image->start(physid, image->start, 0); */
 #endif /* CONFIG_KEXEC */
-
-#ifdef CONFIG_MODULES
-#if defined(_CALL_ELF) && _CALL_ELF == 2
-
-#ifdef CONFIG_MODVERSIONS
-.weak __crc_TOC.
-.section "___kcrctab+TOC.","a"
-.globl __kcrctab_TOC.
-__kcrctab_TOC.:
-       .llong  __crc_TOC.
-#endif
-
-/*
- * Export a fake .TOC. since both modpost and depmod will complain otherwise.
- * Both modpost and depmod strip the leading . so we do the same here.
- */
-.section "__ksymtab_strings","a"
-__kstrtab_TOC.:
-       .asciz "TOC."
-
-.section "___ksymtab+TOC.","a"
-/* This symbol name is important: it's used by modpost to find exported syms */
-.globl __ksymtab_TOC.
-__ksymtab_TOC.:
-       .llong 0 /* .value */
-       .llong __kstrtab_TOC.
-#endif /* ELFv2 */
-#endif /* MODULES */
index 59663af..08b7a40 100644 (file)
@@ -326,7 +326,10 @@ static void dedotify_versions(struct modversion_info *vers,
                }
 }
 
-/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */
+/*
+ * Undefined symbols which refer to .funcname, hack to funcname. Make .TOC.
+ * seem to be defined (value set later).
+ */
 static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
 {
        unsigned int i;
@@ -334,8 +337,11 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
        for (i = 1; i < numsyms; i++) {
                if (syms[i].st_shndx == SHN_UNDEF) {
                        char *name = strtab + syms[i].st_name;
-                       if (name[0] == '.')
-                               memmove(name, name+1, strlen(name));
+                       if (name[0] == '.') {
+                               if (strcmp(name+1, "TOC.") == 0)
+                                       syms[i].st_shndx = SHN_ABS;
+                               syms[i].st_name++;
+                       }
                }
        }
 }
@@ -351,7 +357,7 @@ static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs,
        numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym);
 
        for (i = 1; i < numsyms; i++) {
-               if (syms[i].st_shndx == SHN_UNDEF
+               if (syms[i].st_shndx == SHN_ABS
                    && strcmp(strtab + syms[i].st_name, "TOC.") == 0)
                        return &syms[i];
        }
index dccc87e..3c5736e 100644 (file)
@@ -1768,9 +1768,9 @@ static inline unsigned long brk_rnd(void)
 
        /* 8MB for 32bit, 1GB for 64bit */
        if (is_32bit_task())
-               rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+               rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
        else
-               rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+               rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
 
        return rnd << PAGE_SHIFT;
 }
index ad8c9db..d544fa3 100644 (file)
@@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
 
 notrace void __init machine_init(u64 dt_ptr)
 {
-       lockdep_init();
-
        /* Enable early debugging if any specified (see udbg.h) */
        udbg_early_init();
 
index 5c03a6a..f98be83 100644 (file)
@@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr)
        setup_paca(&boot_paca);
        fixup_boot_paca();
 
-       /* Initialize lockdep early or else spinlocks will blow */
-       lockdep_init();
-
        /* -------- printk is now safe to use ------- */
 
        /* Enable early debugging if any specified (see udbg.h) */
index ec9ec20..cc13d4c 100644 (file)
@@ -727,7 +727,7 @@ void start_secondary(void *unused)
 
        local_irq_enable();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
        BUG();
 }
index 774a253..9bf7031 100644 (file)
@@ -377,15 +377,12 @@ no_seg_found:
 
 static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s;
        u64 esid, esid_1t;
        int slb_nr;
        struct kvmppc_slb *slbe;
 
        dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb);
 
-       vcpu_book3s = to_book3s(vcpu);
-
        esid = GET_ESID(rb);
        esid_1t = GET_ESID_1T(rb);
        slb_nr = rb & 0xfff;
index cff207b..f1187bb 100644 (file)
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
        int cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
 
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                tvcpu->arch.prodded = 1;
                smp_mb();
                if (vcpu->arch.ceded) {
-                       if (waitqueue_active(&vcpu->wq)) {
-                               wake_up_interruptible(&vcpu->wq);
+                       if (swait_active(&vcpu->wq)) {
+                               swake_up(&vcpu->wq);
                                vcpu->stat.halt_wakeup++;
                        }
                }
@@ -833,6 +833,24 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        vcpu->stat.sum_exits++;
 
+       /*
+        * This can happen if an interrupt occurs in the last stages
+        * of guest entry or the first stages of guest exit (i.e. after
+        * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+        * and before setting it to KVM_GUEST_MODE_HOST_HV).
+        * That can happen due to a bug, or due to a machine check
+        * occurring at just the wrong time.
+        */
+       if (vcpu->arch.shregs.msr & MSR_HV) {
+               printk(KERN_EMERG "KVM trap in HV mode!\n");
+               printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+                       vcpu->arch.trap, kvmppc_get_pc(vcpu),
+                       vcpu->arch.shregs.msr);
+               kvmppc_dump_regs(vcpu);
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               run->hw.hardware_exit_reason = vcpu->arch.trap;
+               return RESUME_HOST;
+       }
        run->exit_reason = KVM_EXIT_UNKNOWN;
        run->ready_for_interrupt_injection = 1;
        switch (vcpu->arch.trap) {
@@ -1441,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        INIT_LIST_HEAD(&vcore->runnable_threads);
        spin_lock_init(&vcore->lock);
        spin_lock_init(&vcore->stoltb_lock);
-       init_waitqueue_head(&vcore->wq);
+       init_swait_queue_head(&vcore->wq);
        vcore->preempt_tb = TB_NIL;
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
@@ -2513,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
        struct kvm_vcpu *vcpu;
        int do_sleep = 1;
+       DECLARE_SWAITQUEUE(wait);
 
-       DEFINE_WAIT(wait);
-
-       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
 
        /*
         * Check one last time for pending exceptions and ceded state after
@@ -2530,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        }
 
        if (!do_sleep) {
-               finish_wait(&vc->wq, &wait);
+               finish_swait(&vc->wq, &wait);
                return;
        }
 
@@ -2538,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
        trace_kvmppc_vcore_blocked(vc, 0);
        spin_unlock(&vc->lock);
        schedule();
-       finish_wait(&vc->wq, &wait);
+       finish_swait(&vc->wq, &wait);
        spin_lock(&vc->lock);
        vc->vcore_state = VCORE_INACTIVE;
        trace_kvmppc_vcore_blocked(vc, 1);
@@ -2594,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
-                       wake_up(&vc->wq);
+                       swake_up(&vc->wq);
                }
 
        }
index 3c6badc..25ae2c9 100644 (file)
@@ -1370,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        std     r6, VCPU_ACOP(r9)
        stw     r7, VCPU_GUEST_PID(r9)
        std     r8, VCPU_WORT(r9)
+       /*
+        * Restore various registers to 0, where non-zero values
+        * set by the guest could disrupt the host.
+        */
+       li      r0, 0
+       mtspr   SPRN_IAMR, r0
+       mtspr   SPRN_CIABR, r0
+       mtspr   SPRN_DAWRX, r0
+       mtspr   SPRN_TCSCR, r0
+       mtspr   SPRN_WORT, r0
+       /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
+       li      r0, 1
+       sldi    r0, r0, 31
+       mtspr   SPRN_MMCRS, r0
 8:
 
        /* Save and reset AMR and UAMOR before turning on the MMU */
@@ -2153,7 +2167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
        /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
 2:     rlwimi  r5, r4, 5, DAWRX_DR | DAWRX_DW
-       rlwimi  r5, r4, 1, DAWRX_WT
+       rlwimi  r5, r4, 2, DAWRX_WT
        clrrdi  r4, r4, 3
        std     r4, VCPU_DAWR(r3)
        std     r5, VCPU_DAWRX(r3)
@@ -2404,6 +2418,8 @@ machine_check_realmode:
         * guest as machine check causing guest to crash.
         */
        ld      r11, VCPU_MSR(r9)
+       rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
+       bne     mc_cont                 /* if so, exit to host */
        andi.   r10, r11, MSR_RI        /* check for unrecoverable exception */
        beq     1f                      /* Deliver a machine check to guest */
        ld      r10, VCPU_PC(r9)
index 6fd2405..a3b182d 100644 (file)
@@ -919,21 +919,17 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                                r = -ENXIO;
                                break;
                        }
-                       vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+                       val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0];
                        break;
                case KVM_REG_PPC_VSCR:
                        if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
                                r = -ENXIO;
                                break;
                        }
-                       vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val);
+                       val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]);
                        break;
                case KVM_REG_PPC_VRSAVE:
-                       if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
-                               r = -ENXIO;
-                               break;
-                       }
-                       vcpu->arch.vrsave = set_reg_val(reg->id, val);
+                       val = get_reg_val(reg->id, vcpu->arch.vrsave);
                        break;
 #endif /* CONFIG_ALTIVEC */
                default:
@@ -974,17 +970,21 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                                r = -ENXIO;
                                break;
                        }
-                       val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0];
+                       vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
                        break;
                case KVM_REG_PPC_VSCR:
                        if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
                                r = -ENXIO;
                                break;
                        }
-                       val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]);
+                       vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val);
                        break;
                case KVM_REG_PPC_VRSAVE:
-                       val = get_reg_val(reg->id, vcpu->arch.vrsave);
+                       if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+                               r = -ENXIO;
+                               break;
+                       }
+                       vcpu->arch.vrsave = set_reg_val(reg->id, val);
                        break;
 #endif /* CONFIG_ALTIVEC */
                default:
index 0762c1e..edb0991 100644 (file)
@@ -111,7 +111,13 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
         */
        if (!(old_pte & _PAGE_COMBO)) {
                flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
-               old_pte &= ~_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND;
+               /*
+                * clear the old slot details from the old and new pte.
+                * On hash insert failure we use old pte value and we don't
+                * want slot information there if we have a insert failure.
+                */
+               old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
+               new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
                goto htab_insert_hpte;
        }
        /*
index 49b152b..eb2accd 100644 (file)
@@ -78,9 +78,19 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                 * base page size. This is because demote_segment won't flush
                 * hash page table entries.
                 */
-               if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
+               if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) {
                        flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
                                            ssize, flags);
+                       /*
+                        * With THP, we also clear the slot information with
+                        * respect to all the 64K hash pte mapping the 16MB
+                        * page. They are all invalid now. This make sure we
+                        * don't find the slot valid when we fault with 4k
+                        * base page size.
+                        *
+                        */
+                       memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+               }
        }
 
        valid = hpte_valid(hpte_slot_array, index);
index 7e6d088..83a8be7 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 
+#include <asm/mmu.h>
+
 #ifdef CONFIG_PPC_FSL_BOOK3E
 #ifdef CONFIG_PPC64
 static inline int tlb1_next(void)
@@ -60,6 +62,14 @@ static inline void book3e_tlb_lock(void)
        unsigned long tmp;
        int token = smp_processor_id() + 1;
 
+       /*
+        * Besides being unnecessary in the absence of SMT, this
+        * check prevents trying to do lbarx/stbcx. on e5500 which
+        * doesn't implement either feature.
+        */
+       if (!cpu_has_feature(CPU_FTR_SMT))
+               return;
+
        asm volatile("1: lbarx %0, 0, %1;"
                     "cmpwi %0, 0;"
                     "bne 2f;"
@@ -80,6 +90,9 @@ static inline void book3e_tlb_unlock(void)
 {
        struct paca_struct *paca = get_paca();
 
+       if (!cpu_has_feature(CPU_FTR_SMT))
+               return;
+
        isync();
        paca->tcd_ptr->lock = 0;
 }
index 22d94c3..f078a1f 100644 (file)
@@ -541,7 +541,7 @@ static int __init add_system_ram_resources(void)
                        res->name = "System RAM";
                        res->start = base;
                        res->end = base + size - 1;
-                       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                        WARN_ON(request_resource(&iomem_resource, res) < 0);
                }
        }
@@ -560,12 +560,12 @@ subsys_initcall(add_system_ram_resources);
  */
 int devmem_is_allowed(unsigned long pfn)
 {
+       if (page_is_rtas_user_buf(pfn))
+               return 1;
        if (iomem_is_exclusive(PFN_PHYS(pfn)))
                return 0;
        if (!page_is_ram(pfn))
                return 1;
-       if (page_is_rtas_user_buf(pfn))
-               return 1;
        return 0;
 }
 #endif /* CONFIG_STRICT_DEVMEM */
index 0f0502e..4087705 100644 (file)
@@ -59,9 +59,9 @@ unsigned long arch_mmap_rnd(void)
 
        /* 8MB for 32bit, 1GB for 64bit */
        if (is_32bit_task())
-               rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT));
+               rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
        else
-               rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT));
+               rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
 
        return rnd << PAGE_SHIFT;
 }
index 3124a20..cdf2123 100644 (file)
@@ -646,6 +646,28 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
        return pgtable;
 }
 
+void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                            unsigned long address, pmd_t *pmdp)
+{
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
+
+       /*
+        * We can't mark the pmd none here, because that will cause a race
+        * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
+        * we spilt, but at the same time we wan't rest of the ppc64 code
+        * not to insert hash pte on this, because we will be modifying
+        * the deposited pgtable in the caller of this function. Hence
+        * clear the _PAGE_USER so that we move the fault handling to
+        * higher level function and that will serialize against ptl.
+        * We need to flush existing hash pte entries here even though,
+        * the translation is still valid, because we will withdraw
+        * pgtable_t after this.
+        */
+       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
+}
+
+
 /*
  * set a new huge pmd. We should not be called for updating
  * an existing pmd entry. That should go via pmd_hugepage_update.
@@ -663,10 +685,20 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
 
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
 {
        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+
+       /*
+        * This ensures that generic code that rely on IRQ disabling
+        * to prevent a parallel THP split work as expected.
+        */
+       kick_all_cpus_sync();
 }
 
 /*
index 7d5e295..9958ba8 100644 (file)
@@ -816,7 +816,7 @@ static struct power_pmu power8_pmu = {
        .get_constraint         = power8_get_constraint,
        .get_alternatives       = power8_get_alternatives,
        .disable_pmc            = power8_disable_pmc,
-       .flags                  = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_ARCH_207S,
+       .flags                  = PPMU_HAS_SIER | PPMU_ARCH_207S,
        .n_generic              = ARRAY_SIZE(power8_generic_events),
        .generic_events         = power8_generic_events,
        .cache_events           = &power8_cache_events,
index 5f152b9..87f47e5 100644 (file)
@@ -444,9 +444,12 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
         * PCI devices of the PE are expected to be removed prior
         * to PE reset.
         */
-       if (!edev->pe->bus)
+       if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
                edev->pe->bus = pci_find_bus(hose->global_number,
                                             pdn->busno);
+               if (edev->pe->bus)
+                       edev->pe->state |= EEH_PE_PRI_BUS;
+       }
 
        /*
         * Enable EEH explicitly so that we will do EEH check
index 573ae19..f90dc04 100644 (file)
@@ -3180,6 +3180,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .dma_dev_setup = pnv_pci_dma_dev_setup,
+       .dma_bus_setup = pnv_pci_dma_bus_setup,
 #ifdef CONFIG_PCI_MSI
        .setup_msi_irqs = pnv_setup_msi_irqs,
        .teardown_msi_irqs = pnv_teardown_msi_irqs,
index 2f55c86..b1ef84a 100644 (file)
@@ -599,6 +599,9 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
        u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
        long i;
 
+       if (proto_tce & TCE_PCI_WRITE)
+               proto_tce |= TCE_PCI_READ;
+
        for (i = 0; i < npages; i++) {
                unsigned long newtce = proto_tce |
                        ((rpn + i) << tbl->it_page_shift);
@@ -620,6 +623,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
 
        BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
 
+       if (newtce & TCE_PCI_WRITE)
+               newtce |= TCE_PCI_READ;
+
        oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
        *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        *direction = iommu_tce_direction(oldtce);
@@ -760,6 +766,26 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
                phb->dma_dev_setup(phb, pdev);
 }
 
+void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+{
+       struct pci_controller *hose = bus->sysdata;
+       struct pnv_phb *phb = hose->private_data;
+       struct pnv_ioda_pe *pe;
+
+       list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+               if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+                       continue;
+
+               if (!pe->pbus)
+                       continue;
+
+               if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+                       pe->pbus = bus;
+                       break;
+               }
+       }
+}
+
 void pnv_pci_shutdown(void)
 {
        struct pci_controller *hose;
index 7f56313..00691a9 100644 (file)
@@ -242,6 +242,7 @@ extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
 extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 
index ea91ddf..629c908 100644 (file)
@@ -40,6 +40,7 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
 static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
 {
        fpregs->pad = 0;
+       fpregs->fpc = fpu->fpc;
        if (MACHINE_HAS_VX)
                convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
        else
@@ -49,6 +50,7 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
 
 static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
 {
+       fpu->fpc = fpregs->fpc;
        if (MACHINE_HAS_VX)
                convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
        else
index 16aa0c7..595a275 100644 (file)
@@ -8,6 +8,8 @@
 
 #include <linux/types.h>
 
+#define ARCH_IRQ_ENABLED       (3UL << (BITS_PER_LONG - 8))
+
 /* store then OR system mask. */
 #define __arch_local_irq_stosm(__or)                                   \
 ({                                                                     \
@@ -54,14 +56,17 @@ static inline notrace void arch_local_irq_enable(void)
        __arch_local_irq_stosm(0x03);
 }
 
+/* This only restores external and I/O interrupt state */
 static inline notrace void arch_local_irq_restore(unsigned long flags)
 {
-       __arch_local_irq_ssm(flags);
+       /* only disabled->disabled and disabled->enabled is valid */
+       if (flags & ARCH_IRQ_ENABLED)
+               arch_local_irq_enable();
 }
 
 static inline notrace bool arch_irqs_disabled_flags(unsigned long flags)
 {
-       return !(flags & (3UL << (BITS_PER_LONG - 8)));
+       return !(flags & ARCH_IRQ_ENABLED);
 }
 
 static inline notrace bool arch_irqs_disabled(void)
index 6742414..b0c8ad0 100644 (file)
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
 struct kvm_s390_local_interrupt {
        spinlock_t lock;
        struct kvm_s390_float_interrupt *float_int;
-       wait_queue_head_t *wq;
+       struct swait_queue_head *wq;
        atomic_t *cpuflags;
        DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
        struct kvm_s390_irq_payload irq;
@@ -546,7 +546,6 @@ struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
        unsigned int      host_acrs[NUM_ACRS];
        struct fpu        host_fpregs;
-       struct fpu        guest_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
index 7aa7991..a52b6cc 100644 (file)
@@ -37,7 +37,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
        regs->psw.addr = ip;
 }
 #else
-#error Live patching support is disabled; check CONFIG_LIVEPATCH
+#error Include linux/livepatch.h, not asm/livepatch.h
 #endif
 
 #endif
index fb1b93e..e485817 100644 (file)
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       spin_lock_init(&mm->context.list_lock);
+       INIT_LIST_HEAD(&mm->context.pgtable_list);
+       INIT_LIST_HEAD(&mm->context.gmap_list);
        cpumask_clear(&mm->context.cpu_attach_mask);
        atomic_set(&mm->context.attach_count, 0);
        mm->context.flush_mm = 0;
-       mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
-       mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #ifdef CONFIG_PGSTE
        mm->context.alloc_pgste = page_table_allocate_pgste;
        mm->context.has_pgste = 0;
        mm->context.use_skey = 0;
 #endif
-       mm->context.asce_limit = STACK_TOP_MAX;
+       if (mm->context.asce_limit == 0) {
+               /* context created by exec, set asce limit to 4TB */
+               mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+                       _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+               mm->context.asce_limit = STACK_TOP_MAX;
+       } else if (mm->context.asce_limit == (1UL << 31)) {
+               mm_inc_nr_pmds(mm);
+       }
        crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
        return 0;
 }
@@ -111,8 +119,6 @@ static inline void activate_mm(struct mm_struct *prev,
 static inline void arch_dup_mmap(struct mm_struct *oldmm,
                                 struct mm_struct *mm)
 {
-       if (oldmm->context.asce_limit < mm->context.asce_limit)
-               crst_table_downgrade(mm, oldmm->context.asce_limit);
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
index 1a9a98d..69aa18b 100644 (file)
@@ -8,10 +8,13 @@
 #include <asm/pci_insn.h>
 
 /* I/O Map */
-#define ZPCI_IOMAP_MAX_ENTRIES         0x7fff
-#define ZPCI_IOMAP_ADDR_BASE           0x8000000000000000ULL
-#define ZPCI_IOMAP_ADDR_IDX_MASK       0x7fff000000000000ULL
-#define ZPCI_IOMAP_ADDR_OFF_MASK       0x0000ffffffffffffULL
+#define ZPCI_IOMAP_SHIFT               48
+#define ZPCI_IOMAP_ADDR_BASE           0x8000000000000000UL
+#define ZPCI_IOMAP_ADDR_OFF_MASK       ((1UL << ZPCI_IOMAP_SHIFT) - 1)
+#define ZPCI_IOMAP_MAX_ENTRIES                                                 \
+       ((ULONG_MAX - ZPCI_IOMAP_ADDR_BASE + 1) / (1UL << ZPCI_IOMAP_SHIFT))
+#define ZPCI_IOMAP_ADDR_IDX_MASK                                               \
+       (~ZPCI_IOMAP_ADDR_OFF_MASK - ZPCI_IOMAP_ADDR_BASE)
 
 struct zpci_iomap_entry {
        u32 fh;
@@ -21,8 +24,9 @@ struct zpci_iomap_entry {
 
 extern struct zpci_iomap_entry *zpci_iomap_start;
 
+#define ZPCI_ADDR(idx) (ZPCI_IOMAP_ADDR_BASE | ((u64) idx << ZPCI_IOMAP_SHIFT))
 #define ZPCI_IDX(addr)                                                         \
-       (((__force u64) addr & ZPCI_IOMAP_ADDR_IDX_MASK) >> 48)
+       (((__force u64) addr & ZPCI_IOMAP_ADDR_IDX_MASK) >> ZPCI_IOMAP_SHIFT)
 #define ZPCI_OFFSET(addr)                                                      \
        ((__force u64) addr & ZPCI_IOMAP_ADDR_OFF_MASK)
 
index 7b7858f..d7cc79f 100644 (file)
@@ -100,12 +100,26 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       spin_lock_init(&mm->context.list_lock);
-       INIT_LIST_HEAD(&mm->context.pgtable_list);
-       INIT_LIST_HEAD(&mm->context.gmap_list);
-       return (pgd_t *) crst_table_alloc(mm);
+       unsigned long *table = crst_table_alloc(mm);
+
+       if (!table)
+               return NULL;
+       if (mm->context.asce_limit == (1UL << 31)) {
+               /* Forking a compat process with 2 page table levels */
+               if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+                       crst_table_free(mm, table);
+                       return NULL;
+               }
+       }
+       return (pgd_t *) table;
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       if (mm->context.asce_limit == (1UL << 31))
+               pgtable_pmd_page_dtor(virt_to_page(pgd));
+       crst_table_free(mm, (unsigned long *) pgd);
 }
-#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
 
 static inline void pmd_populate(struct mm_struct *mm,
                                pmd_t *pmd, pgtable_t pte)
index f16debf..1c4fe12 100644 (file)
@@ -166,14 +166,14 @@ extern __vector128 init_task_fpu_regs[__NUM_VXRS];
  */
 #define start_thread(regs, new_psw, new_stackp) do {                   \
        regs->psw.mask  = PSW_USER_BITS | PSW_MASK_EA | PSW_MASK_BA;    \
-       regs->psw.addr  = new_psw | PSW_ADDR_AMODE;                     \
+       regs->psw.addr  = new_psw;                                      \
        regs->gprs[15]  = new_stackp;                                   \
        execve_tail();                                                  \
 } while (0)
 
 #define start_thread31(regs, new_psw, new_stackp) do {                 \
        regs->psw.mask  = PSW_USER_BITS | PSW_MASK_BA;                  \
-       regs->psw.addr  = new_psw | PSW_ADDR_AMODE;                     \
+       regs->psw.addr  = new_psw;                                      \
        regs->gprs[15]  = new_stackp;                                   \
        crst_table_downgrade(current->mm, 1UL << 31);                   \
        execve_tail();                                                  \
index f00cd35..99bc456 100644 (file)
@@ -149,7 +149,7 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
 #define arch_has_block_step()  (1)
 
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
-#define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
+#define instruction_pointer(regs) ((regs)->psw.addr)
 #define user_stack_pointer(regs)((regs)->gprs[15])
 #define profile_pc(regs) instruction_pointer(regs)
 
@@ -161,7 +161,7 @@ static inline long regs_return_value(struct pt_regs *regs)
 static inline void instruction_pointer_set(struct pt_regs *regs,
                                           unsigned long val)
 {
-       regs->psw.addr = val | PSW_ADDR_AMODE;
+       regs->psw.addr = val;
 }
 
 int regs_query_register_offset(const char *name);
@@ -171,7 +171,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
 
 static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
 {
-       return regs->gprs[15] & PSW_ADDR_INSN;
+       return regs->gprs[15];
 }
 
 #endif /* __ASSEMBLY__ */
index 34ec202..ab3aa68 100644 (file)
 #define __NR_recvmsg           372
 #define __NR_shutdown          373
 #define __NR_mlock2            374
-#define NR_syscalls 375
+#define __NR_copy_file_range   375
+#define NR_syscalls 376
 
 /* 
  * There are some system calls that are not present on 64 bit, some
index 66c9441..4af6037 100644 (file)
@@ -271,7 +271,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
 
        /* Restore high gprs from signal stack */
        if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high,
-                            sizeof(&sregs_ext->gprs_high)))
+                            sizeof(sregs_ext->gprs_high)))
                return -EFAULT;
        for (i = 0; i < NUM_GPRS; i++)
                *(__u32 *)&regs->gprs[i] = gprs_high[i];
index fac4eed..ae2cda5 100644 (file)
@@ -177,3 +177,4 @@ COMPAT_SYSCALL_WRAP3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
 COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len);
 COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len);
 COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags);
+COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags);
index a92b39f..3986c9f 100644 (file)
@@ -59,8 +59,6 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)
        struct save_area *sa;
 
        sa = (void *) memblock_alloc(sizeof(*sa), 8);
-       if (!sa)
-               return NULL;
        if (is_boot_cpu)
                list_add(&sa->list, &dump_save_areas);
        else
index 6fca0e4..c890a55 100644 (file)
@@ -1470,7 +1470,7 @@ debug_dflt_header_fn(debug_info_t * id, struct debug_view *view,
                except_str = "*";
        else
                except_str = "-";
-       caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN;
+       caller = (unsigned long) entry->caller;
        rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p  ",
                      area, (long long)time_spec.tv_sec,
                      time_spec.tv_nsec / 1000, level, except_str,
index dc8e204..02bd02f 100644 (file)
@@ -34,22 +34,21 @@ __show_trace(unsigned long sp, unsigned long low, unsigned long high)
        unsigned long addr;
 
        while (1) {
-               sp = sp & PSW_ADDR_INSN;
                if (sp < low || sp > high - sizeof(*sf))
                        return sp;
                sf = (struct stack_frame *) sp;
-               addr = sf->gprs[8] & PSW_ADDR_INSN;
+               addr = sf->gprs[8];
                printk("([<%016lx>] %pSR)\n", addr, (void *)addr);
                /* Follow the backchain. */
                while (1) {
                        low = sp;
-                       sp = sf->back_chain & PSW_ADDR_INSN;
+                       sp = sf->back_chain;
                        if (!sp)
                                break;
                        if (sp <= low || sp > high - sizeof(*sf))
                                return sp;
                        sf = (struct stack_frame *) sp;
-                       addr = sf->gprs[8] & PSW_ADDR_INSN;
+                       addr = sf->gprs[8];
                        printk(" [<%016lx>] %pSR\n", addr, (void *)addr);
                }
                /* Zero backchain detected, check for interrupt frame. */
@@ -57,7 +56,7 @@ __show_trace(unsigned long sp, unsigned long low, unsigned long high)
                if (sp <= low || sp > high - sizeof(*regs))
                        return sp;
                regs = (struct pt_regs *) sp;
-               addr = regs->psw.addr & PSW_ADDR_INSN;
+               addr = regs->psw.addr;
                printk(" [<%016lx>] %pSR\n", addr, (void *)addr);
                low = sp;
                sp = regs->gprs[15];
index 20a5caf..a0684de 100644 (file)
@@ -252,14 +252,14 @@ static void early_pgm_check_handler(void)
        unsigned long addr;
 
        addr = S390_lowcore.program_old_psw.addr;
-       fixup = search_exception_tables(addr & PSW_ADDR_INSN);
+       fixup = search_exception_tables(addr);
        if (!fixup)
                disabled_wait(0);
        /* Disable low address protection before storing into lowcore. */
        __ctl_store(cr0, 0, 0);
        cr0_new = cr0 & ~(1UL << 28);
        __ctl_load(cr0_new, 0, 0);
-       S390_lowcore.program_old_psw.addr = extable_fixup(fixup)|PSW_ADDR_AMODE;
+       S390_lowcore.program_old_psw.addr = extable_fixup(fixup);
        __ctl_load(cr0, 0, 0);
 }
 
@@ -268,9 +268,9 @@ static noinline __init void setup_lowcore_early(void)
        psw_t psw;
 
        psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA;
-       psw.addr = PSW_ADDR_AMODE | (unsigned long) s390_base_ext_handler;
+       psw.addr = (unsigned long) s390_base_ext_handler;
        S390_lowcore.external_new_psw = psw;
-       psw.addr = PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler;
+       psw.addr = (unsigned long) s390_base_pgm_handler;
        S390_lowcore.program_new_psw = psw;
        s390_base_pgm_handler_fn = early_pgm_check_handler;
 }
@@ -448,7 +448,6 @@ void __init startup_init(void)
        rescue_initrd();
        clear_bss_section();
        init_kernel_storage_key();
-       lockdep_init();
        lockdep_off();
        setup_lowcore_early();
        setup_facility_list();
index e0eaf11..0f7bfeb 100644 (file)
@@ -203,7 +203,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
                goto out;
        if (unlikely(atomic_read(&current->tracing_graph_pause)))
                goto out;
-       ip = (ip & PSW_ADDR_INSN) - MCOUNT_INSN_SIZE;
+       ip -= MCOUNT_INSN_SIZE;
        trace.func = ip;
        trace.depth = current->curr_ret_stack + 1;
        /* Only trace if the calling function expects to. */
index c5febe8..03c2b46 100644 (file)
@@ -16,7 +16,7 @@
 
 __HEAD
 ENTRY(startup_continue)
-       tm      __LC_STFLE_FAC_LIST+6,0x80      # LPP available ?
+       tm      __LC_STFLE_FAC_LIST+5,0x80      # LPP available ?
        jz      0f
        xc      __LC_LPP+1(7,0),__LC_LPP+1      # clear lpp and current_pid
        mvi     __LC_LPP,0x80                   #   and set LPP_MAGIC
index 0a5a6b6..f20abdb 100644 (file)
@@ -2057,12 +2057,12 @@ void s390_reset_system(void)
        /* Set new machine check handler */
        S390_lowcore.mcck_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT;
        S390_lowcore.mcck_new_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) s390_base_mcck_handler;
+               (unsigned long) s390_base_mcck_handler;
 
        /* Set new program check handler */
        S390_lowcore.program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT;
        S390_lowcore.program_new_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler;
+               (unsigned long) s390_base_pgm_handler;
 
        /*
         * Clear subchannel ID and number to signal new kernel that no CCW or
index 389db56..250f597 100644 (file)
@@ -226,7 +226,7 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb,
        __ctl_load(per_kprobe, 9, 11);
        regs->psw.mask |= PSW_MASK_PER;
        regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT);
-       regs->psw.addr = ip | PSW_ADDR_AMODE;
+       regs->psw.addr = ip;
 }
 NOKPROBE_SYMBOL(enable_singlestep);
 
@@ -238,7 +238,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb,
        __ctl_load(kcb->kprobe_saved_ctl, 9, 11);
        regs->psw.mask &= ~PSW_MASK_PER;
        regs->psw.mask |= kcb->kprobe_saved_imask;
-       regs->psw.addr = ip | PSW_ADDR_AMODE;
+       regs->psw.addr = ip;
 }
 NOKPROBE_SYMBOL(disable_singlestep);
 
@@ -310,7 +310,7 @@ static int kprobe_handler(struct pt_regs *regs)
         */
        preempt_disable();
        kcb = get_kprobe_ctlblk();
-       p = get_kprobe((void *)((regs->psw.addr & PSW_ADDR_INSN) - 2));
+       p = get_kprobe((void *)(regs->psw.addr - 2));
 
        if (p) {
                if (kprobe_running()) {
@@ -460,7 +460,7 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
                        break;
        }
 
-       regs->psw.addr = orig_ret_address | PSW_ADDR_AMODE;
+       regs->psw.addr = orig_ret_address;
 
        pop_kprobe(get_kprobe_ctlblk());
        kretprobe_hash_unlock(current, &flags);
@@ -490,7 +490,7 @@ NOKPROBE_SYMBOL(trampoline_probe_handler);
 static void resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       unsigned long ip = regs->psw.addr & PSW_ADDR_INSN;
+       unsigned long ip = regs->psw.addr;
        int fixup = probe_get_fixup_type(p->ainsn.insn);
 
        /* Check if the kprobes location is an enabled ftrace caller */
@@ -605,9 +605,9 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
                 * In case the user-specified fault handler returned
                 * zero, try to fix up.
                 */
-               entry = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
+               entry = search_exception_tables(regs->psw.addr);
                if (entry) {
-                       regs->psw.addr = extable_fixup(entry) | PSW_ADDR_AMODE;
+                       regs->psw.addr = extable_fixup(entry);
                        return 1;
                }
 
@@ -683,7 +683,7 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
        memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs));
 
        /* setup return addr to the jprobe handler routine */
-       regs->psw.addr = (unsigned long) jp->entry | PSW_ADDR_AMODE;
+       regs->psw.addr = (unsigned long) jp->entry;
        regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT);
 
        /* r15 is the stack pointer */
index 61595c1..0943b11 100644 (file)
@@ -74,7 +74,7 @@ static unsigned long guest_is_user_mode(struct pt_regs *regs)
 
 static unsigned long instruction_pointer_guest(struct pt_regs *regs)
 {
-       return sie_block(regs)->gpsw.addr & PSW_ADDR_INSN;
+       return sie_block(regs)->gpsw.addr;
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
@@ -231,29 +231,27 @@ static unsigned long __store_trace(struct perf_callchain_entry *entry,
        struct pt_regs *regs;
 
        while (1) {
-               sp = sp & PSW_ADDR_INSN;
                if (sp < low || sp > high - sizeof(*sf))
                        return sp;
                sf = (struct stack_frame *) sp;
-               perf_callchain_store(entry, sf->gprs[8] & PSW_ADDR_INSN);
+               perf_callchain_store(entry, sf->gprs[8]);
                /* Follow the backchain. */
                while (1) {
                        low = sp;
-                       sp = sf->back_chain & PSW_ADDR_INSN;
+                       sp = sf->back_chain;
                        if (!sp)
                                break;
                        if (sp <= low || sp > high - sizeof(*sf))
                                return sp;
                        sf = (struct stack_frame *) sp;
-                       perf_callchain_store(entry,
-                                            sf->gprs[8] & PSW_ADDR_INSN);
+                       perf_callchain_store(entry, sf->gprs[8]);
                }
                /* Zero backchain detected, check for interrupt frame. */
                sp = (unsigned long) (sf + 1);
                if (sp <= low || sp > high - sizeof(*regs))
                        return sp;
                regs = (struct pt_regs *) sp;
-               perf_callchain_store(entry, sf->gprs[8] & PSW_ADDR_INSN);
+               perf_callchain_store(entry, sf->gprs[8]);
                low = sp;
                sp = regs->gprs[15];
        }
@@ -262,12 +260,13 @@ static unsigned long __store_trace(struct perf_callchain_entry *entry,
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
                           struct pt_regs *regs)
 {
-       unsigned long head;
+       unsigned long head, frame_size;
        struct stack_frame *head_sf;
 
        if (user_mode(regs))
                return;
 
+       frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
        head = regs->gprs[15];
        head_sf = (struct stack_frame *) head;
 
@@ -275,8 +274,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
                return;
 
        head = head_sf->back_chain;
-       head = __store_trace(entry, head, S390_lowcore.async_stack - ASYNC_SIZE,
-                            S390_lowcore.async_stack);
+       head = __store_trace(entry, head,
+                            S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
+                            S390_lowcore.async_stack + frame_size);
 
        __store_trace(entry, head, S390_lowcore.thread_info,
                      S390_lowcore.thread_info + THREAD_SIZE);
index 114ee8b..2bba7df 100644 (file)
@@ -56,10 +56,10 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
                return 0;
        low = task_stack_page(tsk);
        high = (struct stack_frame *) task_pt_regs(tsk);
-       sf = (struct stack_frame *) (tsk->thread.ksp & PSW_ADDR_INSN);
+       sf = (struct stack_frame *) tsk->thread.ksp;
        if (sf <= low || sf > high)
                return 0;
-       sf = (struct stack_frame *) (sf->back_chain & PSW_ADDR_INSN);
+       sf = (struct stack_frame *) sf->back_chain;
        if (sf <= low || sf > high)
                return 0;
        return sf->gprs[8];
@@ -154,7 +154,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
                memset(&frame->childregs, 0, sizeof(struct pt_regs));
                frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
                                PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
-               frame->childregs.psw.addr = PSW_ADDR_AMODE |
+               frame->childregs.psw.addr =
                                (unsigned long) kernel_thread_starter;
                frame->childregs.gprs[9] = new_stackp; /* function */
                frame->childregs.gprs[10] = arg;
@@ -220,14 +220,14 @@ unsigned long get_wchan(struct task_struct *p)
                return 0;
        low = task_stack_page(p);
        high = (struct stack_frame *) task_pt_regs(p);
-       sf = (struct stack_frame *) (p->thread.ksp & PSW_ADDR_INSN);
+       sf = (struct stack_frame *) p->thread.ksp;
        if (sf <= low || sf > high)
                return 0;
        for (count = 0; count < 16; count++) {
-               sf = (struct stack_frame *) (sf->back_chain & PSW_ADDR_INSN);
+               sf = (struct stack_frame *) sf->back_chain;
                if (sf <= low || sf > high)
                        return 0;
-               return_address = sf->gprs[8] & PSW_ADDR_INSN;
+               return_address = sf->gprs[8];
                if (!in_sched_functions(return_address))
                        return return_address;
        }
index 01c37b3..49b1c13 100644 (file)
@@ -84,7 +84,7 @@ void update_cr_regs(struct task_struct *task)
                if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP))
                        new.control |= PER_EVENT_IFETCH;
                new.start = 0;
-               new.end = PSW_ADDR_INSN;
+               new.end = -1UL;
        }
 
        /* Take care of the PER enablement bit in the PSW. */
@@ -148,7 +148,7 @@ static inline unsigned long __peek_user_per(struct task_struct *child,
        else if (addr == (addr_t) &dummy->cr11)
                /* End address of the active per set. */
                return test_thread_flag(TIF_SINGLE_STEP) ?
-                       PSW_ADDR_INSN : child->thread.per_user.end;
+                       -1UL : child->thread.per_user.end;
        else if (addr == (addr_t) &dummy->bits)
                /* Single-step bit. */
                return test_thread_flag(TIF_SINGLE_STEP) ?
@@ -495,8 +495,6 @@ long arch_ptrace(struct task_struct *child, long request,
                }
                return 0;
        default:
-               /* Removing high order bit from addr (only for 31 bit). */
-               addr &= PSW_ADDR_INSN;
                return ptrace_request(child, request, addr, data);
        }
 }
index c6878fb..cedb019 100644 (file)
@@ -301,25 +301,21 @@ static void __init setup_lowcore(void)
        BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096);
        lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0);
        lc->restart_psw.mask = PSW_KERNEL_BITS;
-       lc->restart_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) restart_int_handler;
+       lc->restart_psw.addr = (unsigned long) restart_int_handler;
        lc->external_new_psw.mask = PSW_KERNEL_BITS |
                PSW_MASK_DAT | PSW_MASK_MCHECK;
-       lc->external_new_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) ext_int_handler;
+       lc->external_new_psw.addr = (unsigned long) ext_int_handler;
        lc->svc_new_psw.mask = PSW_KERNEL_BITS |
                PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
-       lc->svc_new_psw.addr = PSW_ADDR_AMODE | (unsigned long) system_call;
+       lc->svc_new_psw.addr = (unsigned long) system_call;
        lc->program_new_psw.mask = PSW_KERNEL_BITS |
                PSW_MASK_DAT | PSW_MASK_MCHECK;
-       lc->program_new_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) pgm_check_handler;
+       lc->program_new_psw.addr = (unsigned long) pgm_check_handler;
        lc->mcck_new_psw.mask = PSW_KERNEL_BITS;
-       lc->mcck_new_psw.addr =
-               PSW_ADDR_AMODE | (unsigned long) mcck_int_handler;
+       lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler;
        lc->io_new_psw.mask = PSW_KERNEL_BITS |
                PSW_MASK_DAT | PSW_MASK_MCHECK;
-       lc->io_new_psw.addr = PSW_ADDR_AMODE | (unsigned long) io_int_handler;
+       lc->io_new_psw.addr = (unsigned long) io_int_handler;
        lc->clock_comparator = -1ULL;
        lc->kernel_stack = ((unsigned long) &init_thread_union)
                + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
@@ -378,17 +374,17 @@ static void __init setup_lowcore(void)
 
 static struct resource code_resource = {
        .name  = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource data_resource = {
        .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource bss_resource = {
        .name = "Kernel bss",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource __initdata *standard_resources[] = {
@@ -412,7 +408,7 @@ static void __init setup_resources(void)
 
        for_each_memblock(memory, reg) {
                res = alloc_bootmem_low(sizeof(*res));
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
                res->name = "System RAM";
                res->start = reg->base;
index 028cc46..d82562c 100644 (file)
@@ -331,13 +331,13 @@ static int setup_frame(int sig, struct k_sigaction *ka,
        /* Set up to return from userspace.  If provided, use a stub
           already in userspace.  */
        if (ka->sa.sa_flags & SA_RESTORER) {
-               restorer = (unsigned long) ka->sa.sa_restorer | PSW_ADDR_AMODE;
+               restorer = (unsigned long) ka->sa.sa_restorer;
        } else {
                /* Signal frame without vector registers are short ! */
                __u16 __user *svc = (void __user *) frame + frame_size - 2;
                if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
                        return -EFAULT;
-               restorer = (unsigned long) svc | PSW_ADDR_AMODE;
+               restorer = (unsigned long) svc;
        }
 
        /* Set up registers for signal handler */
@@ -347,7 +347,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
        regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA |
                (PSW_USER_BITS & PSW_MASK_ASC) |
                (regs->psw.mask & ~PSW_MASK_ASC);
-       regs->psw.addr = (unsigned long) ka->sa.sa_handler | PSW_ADDR_AMODE;
+       regs->psw.addr = (unsigned long) ka->sa.sa_handler;
 
        regs->gprs[2] = sig;
        regs->gprs[3] = (unsigned long) &frame->sc;
@@ -394,13 +394,12 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
        /* Set up to return from userspace.  If provided, use a stub
           already in userspace.  */
        if (ksig->ka.sa.sa_flags & SA_RESTORER) {
-               restorer = (unsigned long)
-                       ksig->ka.sa.sa_restorer | PSW_ADDR_AMODE;
+               restorer = (unsigned long) ksig->ka.sa.sa_restorer;
        } else {
                __u16 __user *svc = &frame->svc_insn;
                if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
                        return -EFAULT;
-               restorer = (unsigned long) svc | PSW_ADDR_AMODE;
+               restorer = (unsigned long) svc;
        }
 
        /* Create siginfo on the signal stack */
@@ -426,7 +425,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
        regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA |
                (PSW_USER_BITS & PSW_MASK_ASC) |
                (regs->psw.mask & ~PSW_MASK_ASC);
-       regs->psw.addr = (unsigned long) ksig->ka.sa.sa_handler | PSW_ADDR_AMODE;
+       regs->psw.addr = (unsigned long) ksig->ka.sa.sa_handler;
 
        regs->gprs[2] = ksig->sig;
        regs->gprs[3] = (unsigned long) &frame->info;
index a13468b..40a6b4f 100644 (file)
@@ -623,8 +623,6 @@ void __init smp_save_dump_cpus(void)
                return;
        /* Allocate a page as dumping area for the store status sigps */
        page = memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, 1UL << 31);
-       if (!page)
-               panic("could not allocate memory for save area\n");
        /* Set multi-threading state to the previous system. */
        pcpu_set_smt(sclp.mtid_prev);
        boot_cpu_addr = stap();
@@ -800,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid)
        set_cpu_online(smp_processor_id(), true);
        inc_irq_stat(CPU_RST);
        local_irq_enable();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 /* Upping and downing of CPUs */
index 1785cd8..8f64ebd 100644 (file)
@@ -21,12 +21,11 @@ static unsigned long save_context_stack(struct stack_trace *trace,
        unsigned long addr;
 
        while(1) {
-               sp &= PSW_ADDR_INSN;
                if (sp < low || sp > high)
                        return sp;
                sf = (struct stack_frame *)sp;
                while(1) {
-                       addr = sf->gprs[8] & PSW_ADDR_INSN;
+                       addr = sf->gprs[8];
                        if (!trace->skip)
                                trace->entries[trace->nr_entries++] = addr;
                        else
@@ -34,7 +33,7 @@ static unsigned long save_context_stack(struct stack_trace *trace,
                        if (trace->nr_entries >= trace->max_entries)
                                return sp;
                        low = sp;
-                       sp = sf->back_chain & PSW_ADDR_INSN;
+                       sp = sf->back_chain;
                        if (!sp)
                                break;
                        if (sp <= low || sp > high - sizeof(*sf))
@@ -46,7 +45,7 @@ static unsigned long save_context_stack(struct stack_trace *trace,
                if (sp <= low || sp > high - sizeof(*regs))
                        return sp;
                regs = (struct pt_regs *)sp;
-               addr = regs->psw.addr & PSW_ADDR_INSN;
+               addr = regs->psw.addr;
                if (savesched || !in_sched_functions(addr)) {
                        if (!trace->skip)
                                trace->entries[trace->nr_entries++] = addr;
@@ -60,33 +59,43 @@ static unsigned long save_context_stack(struct stack_trace *trace,
        }
 }
 
-void save_stack_trace(struct stack_trace *trace)
+static void __save_stack_trace(struct stack_trace *trace, unsigned long sp)
 {
-       register unsigned long sp asm ("15");
-       unsigned long orig_sp, new_sp;
+       unsigned long new_sp, frame_size;
 
-       orig_sp = sp & PSW_ADDR_INSN;
-       new_sp = save_context_stack(trace, orig_sp,
-                                   S390_lowcore.panic_stack - PAGE_SIZE,
-                                   S390_lowcore.panic_stack, 1);
-       if (new_sp != orig_sp)
-               return;
+       frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
+       new_sp = save_context_stack(trace, sp,
+                       S390_lowcore.panic_stack + frame_size - PAGE_SIZE,
+                       S390_lowcore.panic_stack + frame_size, 1);
        new_sp = save_context_stack(trace, new_sp,
-                                   S390_lowcore.async_stack - ASYNC_SIZE,
-                                   S390_lowcore.async_stack, 1);
-       if (new_sp != orig_sp)
-               return;
+                       S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
+                       S390_lowcore.async_stack + frame_size, 1);
        save_context_stack(trace, new_sp,
                           S390_lowcore.thread_info,
                           S390_lowcore.thread_info + THREAD_SIZE, 1);
 }
+
+void save_stack_trace(struct stack_trace *trace)
+{
+       register unsigned long r15 asm ("15");
+       unsigned long sp;
+
+       sp = r15;
+       __save_stack_trace(trace, sp);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
        unsigned long sp, low, high;
 
-       sp = tsk->thread.ksp & PSW_ADDR_INSN;
+       sp = tsk->thread.ksp;
+       if (tsk == current) {
+               /* Get current stack pointer. */
+               asm volatile("la %0,0(15)" : "=a" (sp));
+       }
        low = (unsigned long) task_stack_page(tsk);
        high = (unsigned long) task_pt_regs(tsk);
        save_context_stack(trace, sp, low, high, 0);
@@ -94,3 +103,14 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+       unsigned long sp;
+
+       sp = kernel_stack_pointer(regs);
+       __save_stack_trace(trace, sp);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_regs);
index 5378c3e..293d8b9 100644 (file)
@@ -383,3 +383,4 @@ SYSCALL(sys_recvfrom,compat_sys_recvfrom)
 SYSCALL(sys_recvmsg,compat_sys_recvmsg)
 SYSCALL(sys_shutdown,sys_shutdown)
 SYSCALL(sys_mlock2,compat_sys_mlock2)
+SYSCALL(sys_copy_file_range,compat_sys_copy_file_range) /* 375 */
index 21a5df9..dde7654 100644 (file)
@@ -18,6 +18,9 @@ void trace_s390_diagnose_norecursion(int diag_nr)
        unsigned long flags;
        unsigned int *depth;
 
+       /* Avoid lockdep recursion. */
+       if (IS_ENABLED(CONFIG_LOCKDEP))
+               return;
        local_irq_save(flags);
        depth = this_cpu_ptr(&diagnose_trace_depth);
        if (*depth == 0) {
index d69d648..017eb03 100644 (file)
@@ -32,8 +32,7 @@ static inline void __user *get_trap_ip(struct pt_regs *regs)
                address = *(unsigned long *)(current->thread.trap_tdb + 24);
        else
                address = regs->psw.addr;
-       return (void __user *)
-               ((address - (regs->int_code >> 16)) & PSW_ADDR_INSN);
+       return (void __user *) (address - (regs->int_code >> 16));
 }
 
 static inline void report_user_fault(struct pt_regs *regs, int signr)
@@ -46,7 +45,7 @@ static inline void report_user_fault(struct pt_regs *regs, int signr)
                return;
        printk("User process fault: interruption code %04x ilc:%d ",
               regs->int_code & 0xffff, regs->int_code >> 17);
-       print_vma_addr("in ", regs->psw.addr & PSW_ADDR_INSN);
+       print_vma_addr("in ", regs->psw.addr);
        printk("\n");
        show_regs(regs);
 }
@@ -69,13 +68,13 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
                report_user_fault(regs, si_signo);
         } else {
                 const struct exception_table_entry *fixup;
-                fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
+               fixup = search_exception_tables(regs->psw.addr);
                 if (fixup)
-                       regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE;
+                       regs->psw.addr = extable_fixup(fixup);
                else {
                        enum bug_trap_type btt;
 
-                       btt = report_bug(regs->psw.addr & PSW_ADDR_INSN, regs);
+                       btt = report_bug(regs->psw.addr, regs);
                        if (btt == BUG_TRAP_TYPE_WARN)
                                return;
                        die(regs, str);
index 5fce52c..5ea5af3 100644 (file)
@@ -29,6 +29,7 @@ config KVM
        select HAVE_KVM_IRQFD
        select HAVE_KVM_IRQ_ROUTING
        select SRCU
+       select KVM_VFIO
        ---help---
          Support hosting paravirtualized guest machines using the SIE
          virtualization capability on the mainframe. This should work
index b3b5534..d42fa38 100644 (file)
@@ -7,7 +7,7 @@
 # as published by the Free Software Foundation.
 
 KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
index 47518a3..d697312 100644 (file)
@@ -116,7 +116,7 @@ static void enable_all_hw_wp(struct kvm_vcpu *vcpu)
        if (*cr9 & PER_EVENT_STORE && *cr9 & PER_CONTROL_ALTERATION) {
                *cr9 &= ~PER_CONTROL_ALTERATION;
                *cr10 = 0;
-               *cr11 = PSW_ADDR_INSN;
+               *cr11 = -1UL;
        } else {
                *cr9 &= ~PER_CONTROL_ALTERATION;
                *cr9 |= PER_EVENT_STORE;
@@ -159,7 +159,7 @@ void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->gcr[0] &= ~0x800ul;
                vcpu->arch.sie_block->gcr[9] |= PER_EVENT_IFETCH;
                vcpu->arch.sie_block->gcr[10] = 0;
-               vcpu->arch.sie_block->gcr[11] = PSW_ADDR_INSN;
+               vcpu->arch.sie_block->gcr[11] = -1UL;
        }
 
        if (guestdbg_hw_bp_enabled(vcpu)) {
index f88ca72..9ffc732 100644 (file)
@@ -966,13 +966,13 @@ no_timer:
 
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 {
-       if (waitqueue_active(&vcpu->wq)) {
+       if (swait_active(&vcpu->wq)) {
                /*
                 * The vcpu gave up the cpu voluntarily, mark it as a good
                 * yield-candidate.
                 */
                vcpu->preempted = true;
-               wake_up_interruptible(&vcpu->wq);
+               swake_up(&vcpu->wq);
                vcpu->stat.halt_wakeup++;
        }
 }
index 835d60b..03dfe9c 100644 (file)
@@ -1423,44 +1423,18 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-/*
- * Backs up the current FP/VX register save area on a particular
- * destination.  Used to switch between different register save
- * areas.
- */
-static inline void save_fpu_to(struct fpu *dst)
-{
-       dst->fpc = current->thread.fpu.fpc;
-       dst->regs = current->thread.fpu.regs;
-}
-
-/*
- * Switches the FP/VX register save area from which to lazy
- * restore register contents.
- */
-static inline void load_fpu_from(struct fpu *from)
-{
-       current->thread.fpu.fpc = from->fpc;
-       current->thread.fpu.regs = from->regs;
-}
-
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        /* Save host register state */
        save_fpu_regs();
-       save_fpu_to(&vcpu->arch.host_fpregs);
-
-       if (test_kvm_facility(vcpu->kvm, 129)) {
-               current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
-               /*
-                * Use the register save area in the SIE-control block
-                * for register restore and save in kvm_arch_vcpu_put()
-                */
-               current->thread.fpu.vxrs =
-                       (__vector128 *)&vcpu->run->s.regs.vrs;
-       } else
-               load_fpu_from(&vcpu->arch.guest_fpregs);
+       vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
+       vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
 
+       /* Depending on MACHINE_HAS_VX, data stored to vrs either
+        * has vector register or floating point register format.
+        */
+       current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+       current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
        if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
                current->thread.fpu.fpc = 0;
@@ -1476,19 +1450,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
 
+       /* Save guest register state */
        save_fpu_regs();
+       vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
 
-       if (test_kvm_facility(vcpu->kvm, 129))
-               /*
-                * kvm_arch_vcpu_load() set up the register save area to
-                * the &vcpu->run->s.regs.vrs and, thus, the vector registers
-                * are already saved.  Only the floating-point control must be
-                * copied.
-                */
-               vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
-       else
-               save_fpu_to(&vcpu->arch.guest_fpregs);
-       load_fpu_from(&vcpu->arch.host_fpregs);
+       /* Restore host register state */
+       current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
+       current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
 
        save_access_regs(vcpu->run->s.regs.acrs);
        restore_access_regs(vcpu->arch.host_acrs);
@@ -1506,8 +1474,9 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
        memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
        vcpu->arch.sie_block->gcr[0]  = 0xE0UL;
        vcpu->arch.sie_block->gcr[14] = 0xC2000000UL;
-       vcpu->arch.guest_fpregs.fpc = 0;
-       asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
+       /* make sure the new fpc will be lazily loaded */
+       save_fpu_regs();
+       current->thread.fpu.fpc = 0;
        vcpu->arch.sie_block->gbea = 1;
        vcpu->arch.sie_block->pp = 0;
        vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
@@ -1648,17 +1617,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
 
-       /*
-        * Allocate a save area for floating-point registers.  If the vector
-        * extension is available, register contents are saved in the SIE
-        * control block.  The allocated save area is still required in
-        * particular places, for example, in kvm_s390_vcpu_store_status().
-        */
-       vcpu->arch.guest_fpregs.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
-                                              GFP_KERNEL);
-       if (!vcpu->arch.guest_fpregs.fprs)
-               goto out_free_sie_block;
-
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
                goto out_free_sie_block;
@@ -1879,19 +1837,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
+       /* make sure the new values will be lazily loaded */
+       save_fpu_regs();
        if (test_fp_ctl(fpu->fpc))
                return -EINVAL;
-       memcpy(vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-       vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-       save_fpu_regs();
-       load_fpu_from(&vcpu->arch.guest_fpregs);
+       current->thread.fpu.fpc = fpu->fpc;
+       if (MACHINE_HAS_VX)
+               convert_fp_to_vx(current->thread.fpu.vxrs, (freg_t *)fpu->fprs);
+       else
+               memcpy(current->thread.fpu.fprs, &fpu->fprs, sizeof(fpu->fprs));
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       memcpy(&fpu->fprs, vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
-       fpu->fpc = vcpu->arch.guest_fpregs.fpc;
+       /* make sure we have the latest values */
+       save_fpu_regs();
+       if (MACHINE_HAS_VX)
+               convert_vx_to_fp((freg_t *)fpu->fprs, current->thread.fpu.vxrs);
+       else
+               memcpy(fpu->fprs, current->thread.fpu.fprs, sizeof(fpu->fprs));
+       fpu->fpc = current->thread.fpu.fpc;
        return 0;
 }
 
@@ -2396,6 +2362,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 {
        unsigned char archmode = 1;
+       freg_t fprs[NUM_FPRS];
        unsigned int px;
        u64 clkcomp;
        int rc;
@@ -2411,8 +2378,16 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
                gpa = px;
        } else
                gpa -= __LC_FPREGS_SAVE_AREA;
-       rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
-                            vcpu->arch.guest_fpregs.fprs, 128);
+
+       /* manually convert vector registers if necessary */
+       if (MACHINE_HAS_VX) {
+               convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
+               rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
+                                    fprs, 128);
+       } else {
+               rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
+                                    vcpu->run->s.regs.vrs, 128);
+       }
        rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
                              vcpu->run->s.regs.gprs, 128);
        rc |= write_guest_abs(vcpu, gpa + __LC_PSW_SAVE_AREA,
@@ -2420,7 +2395,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
        rc |= write_guest_abs(vcpu, gpa + __LC_PREFIX_SAVE_AREA,
                              &px, 4);
        rc |= write_guest_abs(vcpu, gpa + __LC_FP_CREG_SAVE_AREA,
-                             &vcpu->arch.guest_fpregs.fpc, 4);
+                             &vcpu->run->s.regs.fpc, 4);
        rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA,
                              &vcpu->arch.sie_block->todpr, 4);
        rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA,
@@ -2443,19 +2418,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
         * it into the save area
         */
        save_fpu_regs();
-       if (test_kvm_facility(vcpu->kvm, 129)) {
-               /*
-                * If the vector extension is available, the vector registers
-                * which overlaps with floating-point registers are saved in
-                * the SIE-control block.  Hence, extract the floating-point
-                * registers and the FPC value and store them in the
-                * guest_fpregs structure.
-                */
-               vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc;
-               convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs,
-                                current->thread.fpu.vxrs);
-       } else
-               save_fpu_to(&vcpu->arch.guest_fpregs);
+       vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
        save_access_regs(vcpu->run->s.regs.acrs);
 
        return kvm_s390_store_status_unloaded(vcpu, addr);
index 1b903f6..791a414 100644 (file)
@@ -228,7 +228,7 @@ static inline void report_user_fault(struct pt_regs *regs, long signr)
                return;
        printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
               regs->int_code & 0xffff, regs->int_code >> 17);
-       print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN);
+       print_vma_addr(KERN_CONT "in ", regs->psw.addr);
        printk(KERN_CONT "\n");
        printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n",
               regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
@@ -256,9 +256,9 @@ static noinline void do_no_context(struct pt_regs *regs)
        const struct exception_table_entry *fixup;
 
        /* Are we prepared to handle this kernel fault?  */
-       fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
+       fixup = search_exception_tables(regs->psw.addr);
        if (fixup) {
-               regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE;
+               regs->psw.addr = extable_fixup(fixup);
                return;
        }
 
index c722400..73e2903 100644 (file)
@@ -98,7 +98,7 @@ void __init paging_init(void)
        __ctl_load(S390_lowcore.kernel_asce, 1, 1);
        __ctl_load(S390_lowcore.kernel_asce, 7, 7);
        __ctl_load(S390_lowcore.kernel_asce, 13, 13);
-       arch_local_irq_restore(4UL << (BITS_PER_LONG - 8));
+       __arch_local_irq_stosm(0x04);
 
        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
index fec59c0..792f9c6 100644 (file)
@@ -93,15 +93,19 @@ static int __memcpy_real(void *dest, void *src, size_t count)
  */
 int memcpy_real(void *dest, void *src, size_t count)
 {
+       int irqs_disabled, rc;
        unsigned long flags;
-       int rc;
 
        if (!count)
                return 0;
-       local_irq_save(flags);
-       __arch_local_irq_stnsm(0xfbUL);
+       flags = __arch_local_irq_stnsm(0xf8UL);
+       irqs_disabled = arch_irqs_disabled_flags(flags);
+       if (!irqs_disabled)
+               trace_hardirqs_off();
        rc = __memcpy_real(dest, src, count);
-       local_irq_restore(flags);
+       if (!irqs_disabled)
+               trace_hardirqs_on();
+       __arch_local_irq_ssm(flags);
        return rc;
 }
 
index ea01477..45c4daa 100644 (file)
@@ -169,12 +169,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
 {
-       if (is_compat_task() || (TASK_SIZE >= (1UL << 53)))
+       if (is_compat_task() || TASK_SIZE >= TASK_MAX_SIZE)
                return 0;
        if (!(flags & MAP_FIXED))
                addr = 0;
        if ((addr + len) >= TASK_SIZE)
-               return crst_table_upgrade(current->mm, 1UL << 53);
+               return crst_table_upgrade(current->mm, TASK_MAX_SIZE);
        return 0;
 }
 
@@ -189,9 +189,9 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
        area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
        if (!(area & ~PAGE_MASK))
                return area;
-       if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
+       if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
                /* Upgrade the page table to 4 levels and retry. */
-               rc = crst_table_upgrade(mm, 1UL << 53);
+               rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
                if (rc)
                        return (unsigned long) rc;
                area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
@@ -211,9 +211,9 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
        area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
        if (!(area & ~PAGE_MASK))
                return area;
-       if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
+       if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
                /* Upgrade the page table to 4 levels and retry. */
-               rc = crst_table_upgrade(mm, 1UL << 53);
+               rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
                if (rc)
                        return (unsigned long) rc;
                area = arch_get_unmapped_area_topdown(filp, addr, len,
index a809fa8..5109827 100644 (file)
@@ -55,7 +55,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
        unsigned long entry;
        int flush;
 
-       BUG_ON(limit > (1UL << 53));
+       BUG_ON(limit > TASK_MAX_SIZE);
        flush = 0;
 repeat:
        table = crst_table_alloc(mm);
index 43f32ce..2794845 100644 (file)
@@ -57,9 +57,7 @@ static __init pg_data_t *alloc_node_data(void)
 {
        pg_data_t *res;
 
-       res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1);
-       if (!res)
-               panic("Could not allocate memory for node data!\n");
+       res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 8);
        memset(res, 0, sizeof(pg_data_t));
        return res;
 }
@@ -162,7 +160,7 @@ static int __init numa_init_late(void)
                register_one_node(nid);
        return 0;
 }
-device_initcall(numa_init_late);
+arch_initcall(numa_init_late);
 
 static int __init parse_debug(char *parm)
 {
index 8a6811b..1884e17 100644 (file)
@@ -16,24 +16,23 @@ __show_trace(unsigned int *depth, unsigned long sp,
        struct pt_regs *regs;
 
        while (*depth) {
-               sp = sp & PSW_ADDR_INSN;
                if (sp < low || sp > high - sizeof(*sf))
                        return sp;
                sf = (struct stack_frame *) sp;
                (*depth)--;
-               oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+               oprofile_add_trace(sf->gprs[8]);
 
                /* Follow the backchain.  */
                while (*depth) {
                        low = sp;
-                       sp = sf->back_chain & PSW_ADDR_INSN;
+                       sp = sf->back_chain;
                        if (!sp)
                                break;
                        if (sp <= low || sp > high - sizeof(*sf))
                                return sp;
                        sf = (struct stack_frame *) sp;
                        (*depth)--;
-                       oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+                       oprofile_add_trace(sf->gprs[8]);
 
                }
 
@@ -46,7 +45,7 @@ __show_trace(unsigned int *depth, unsigned long sp,
                        return sp;
                regs = (struct pt_regs *) sp;
                (*depth)--;
-               oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+               oprofile_add_trace(sf->gprs[8]);
                low = sp;
                sp = regs->gprs[15];
        }
@@ -55,12 +54,13 @@ __show_trace(unsigned int *depth, unsigned long sp,
 
 void s390_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
-       unsigned long head;
+       unsigned long head, frame_size;
        struct stack_frame* head_sf;
 
        if (user_mode(regs))
                return;
 
+       frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
        head = regs->gprs[15];
        head_sf = (struct stack_frame*)head;
 
@@ -69,8 +69,9 @@ void s390_backtrace(struct pt_regs * const regs, unsigned int depth)
 
        head = head_sf->back_chain;
 
-       head = __show_trace(&depth, head, S390_lowcore.async_stack - ASYNC_SIZE,
-                           S390_lowcore.async_stack);
+       head = __show_trace(&depth, head,
+                           S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
+                           S390_lowcore.async_stack + frame_size);
 
        __show_trace(&depth, head, S390_lowcore.thread_info,
                     S390_lowcore.thread_info + THREAD_SIZE);
index 11d4f27..8f19c8f 100644 (file)
@@ -68,9 +68,12 @@ static struct airq_struct zpci_airq = {
        .isc = PCI_ISC,
 };
 
-/* I/O Map */
+#define ZPCI_IOMAP_ENTRIES                                             \
+       min(((unsigned long) CONFIG_PCI_NR_FUNCTIONS * PCI_BAR_COUNT),  \
+           ZPCI_IOMAP_MAX_ENTRIES)
+
 static DEFINE_SPINLOCK(zpci_iomap_lock);
-static DECLARE_BITMAP(zpci_iomap, ZPCI_IOMAP_MAX_ENTRIES);
+static unsigned long *zpci_iomap_bitmap;
 struct zpci_iomap_entry *zpci_iomap_start;
 EXPORT_SYMBOL_GPL(zpci_iomap_start);
 
@@ -265,27 +268,20 @@ void __iomem *pci_iomap_range(struct pci_dev *pdev,
                              unsigned long max)
 {
        struct zpci_dev *zdev = to_zpci(pdev);
-       u64 addr;
        int idx;
 
-       if ((bar & 7) != bar)
+       if (!pci_resource_len(pdev, bar))
                return NULL;
 
        idx = zdev->bars[bar].map_idx;
        spin_lock(&zpci_iomap_lock);
-       if (zpci_iomap_start[idx].count++) {
-               BUG_ON(zpci_iomap_start[idx].fh != zdev->fh ||
-                      zpci_iomap_start[idx].bar != bar);
-       } else {
-               zpci_iomap_start[idx].fh = zdev->fh;
-               zpci_iomap_start[idx].bar = bar;
-       }
        /* Detect overrun */
-       BUG_ON(!zpci_iomap_start[idx].count);
+       WARN_ON(!++zpci_iomap_start[idx].count);
+       zpci_iomap_start[idx].fh = zdev->fh;
+       zpci_iomap_start[idx].bar = bar;
        spin_unlock(&zpci_iomap_lock);
 
-       addr = ZPCI_IOMAP_ADDR_BASE | ((u64) idx << 48);
-       return (void __iomem *) addr + offset;
+       return (void __iomem *) ZPCI_ADDR(idx) + offset;
 }
 EXPORT_SYMBOL(pci_iomap_range);
 
@@ -297,12 +293,11 @@ EXPORT_SYMBOL(pci_iomap);
 
 void pci_iounmap(struct pci_dev *pdev, void __iomem *addr)
 {
-       unsigned int idx;
+       unsigned int idx = ZPCI_IDX(addr);
 
-       idx = (((__force u64) addr) & ~ZPCI_IOMAP_ADDR_BASE) >> 48;
        spin_lock(&zpci_iomap_lock);
        /* Detect underrun */
-       BUG_ON(!zpci_iomap_start[idx].count);
+       WARN_ON(!zpci_iomap_start[idx].count);
        if (!--zpci_iomap_start[idx].count) {
                zpci_iomap_start[idx].fh = 0;
                zpci_iomap_start[idx].bar = 0;
@@ -544,15 +539,15 @@ static void zpci_irq_exit(void)
 
 static int zpci_alloc_iomap(struct zpci_dev *zdev)
 {
-       int entry;
+       unsigned long entry;
 
        spin_lock(&zpci_iomap_lock);
-       entry = find_first_zero_bit(zpci_iomap, ZPCI_IOMAP_MAX_ENTRIES);
-       if (entry == ZPCI_IOMAP_MAX_ENTRIES) {
+       entry = find_first_zero_bit(zpci_iomap_bitmap, ZPCI_IOMAP_ENTRIES);
+       if (entry == ZPCI_IOMAP_ENTRIES) {
                spin_unlock(&zpci_iomap_lock);
                return -ENOSPC;
        }
-       set_bit(entry, zpci_iomap);
+       set_bit(entry, zpci_iomap_bitmap);
        spin_unlock(&zpci_iomap_lock);
        return entry;
 }
@@ -561,7 +556,7 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry)
 {
        spin_lock(&zpci_iomap_lock);
        memset(&zpci_iomap_start[entry], 0, sizeof(struct zpci_iomap_entry));
-       clear_bit(entry, zpci_iomap);
+       clear_bit(entry, zpci_iomap_bitmap);
        spin_unlock(&zpci_iomap_lock);
 }
 
@@ -611,8 +606,7 @@ static int zpci_setup_bus_resources(struct zpci_dev *zdev,
                if (zdev->bars[i].val & 4)
                        flags |= IORESOURCE_MEM_64;
 
-               addr = ZPCI_IOMAP_ADDR_BASE + ((u64) entry << 48);
-
+               addr = ZPCI_ADDR(entry);
                size = 1UL << zdev->bars[i].size;
 
                res = __alloc_res(zdev, addr, size, flags);
@@ -873,23 +867,30 @@ static int zpci_mem_init(void)
        zdev_fmb_cache = kmem_cache_create("PCI_FMB_cache", sizeof(struct zpci_fmb),
                                16, 0, NULL);
        if (!zdev_fmb_cache)
-               goto error_zdev;
+               goto error_fmb;
 
-       /* TODO: use realloc */
-       zpci_iomap_start = kzalloc(ZPCI_IOMAP_MAX_ENTRIES * sizeof(*zpci_iomap_start),
-                                  GFP_KERNEL);
+       zpci_iomap_start = kcalloc(ZPCI_IOMAP_ENTRIES,
+                                  sizeof(*zpci_iomap_start), GFP_KERNEL);
        if (!zpci_iomap_start)
                goto error_iomap;
-       return 0;
 
+       zpci_iomap_bitmap = kcalloc(BITS_TO_LONGS(ZPCI_IOMAP_ENTRIES),
+                                   sizeof(*zpci_iomap_bitmap), GFP_KERNEL);
+       if (!zpci_iomap_bitmap)
+               goto error_iomap_bitmap;
+
+       return 0;
+error_iomap_bitmap:
+       kfree(zpci_iomap_start);
 error_iomap:
        kmem_cache_destroy(zdev_fmb_cache);
-error_zdev:
+error_fmb:
        return -ENOMEM;
 }
 
 static void zpci_mem_exit(void)
 {
+       kfree(zpci_iomap_bitmap);
        kfree(zpci_iomap_start);
        kmem_cache_destroy(zdev_fmb_cache);
 }
index 369a3e0..b0e0475 100644 (file)
@@ -53,6 +53,11 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
 
        pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
               pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
+
+       if (!pdev)
+               return;
+
+       pdev->error_state = pci_channel_io_perm_failure;
 }
 
 void zpci_event_error(void *data)
index b48459a..f3a0649 100644 (file)
@@ -101,7 +101,7 @@ static void __init resource_init(void)
        res->name = "System RAM";
        res->start = MEMORY_START;
        res->end = MEMORY_START + MEMORY_SIZE - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        request_resource(&iomem_resource, res);
 
        request_resource(res, &code_resource);
index f887c64..8a84e05 100644 (file)
@@ -33,7 +33,6 @@
 #endif
 
 #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#define smp_store_mb(var, value) __smp_store_mb(var, value)
 
 #include <asm-generic/barrier.h>
 
index de19cfa..3f1c18b 100644 (file)
@@ -78,17 +78,17 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0, };
 
 static struct resource code_resource = {
        .name = "Kernel code",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource data_resource = {
        .name = "Kernel data",
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
 };
 
 unsigned long memory_start;
@@ -202,7 +202,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
        res->name = "System RAM";
        res->start = start;
        res->end = end - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
        if (request_resource(&iomem_resource, res)) {
                pr_err("unable to request memory_resource 0x%lx 0x%lx\n",
index de6be00..13f633a 100644 (file)
@@ -203,7 +203,7 @@ asmlinkage void start_secondary(void)
        set_cpu_online(cpu, true);
        per_cpu(cpu_state, cpu) = CPU_ONLINE;
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 extern struct {
index eaee146..8496a07 100644 (file)
@@ -24,7 +24,13 @@ LDFLAGS        := -m elf32_sparc
 export BITS    := 32
 UTS_MACHINE    := sparc
 
+# We are adding -Wa,-Av8 to KBUILD_CFLAGS to deal with a specs bug in some
+# versions of gcc.  Some gcc versions won't pass -Av8 to binutils when you
+# give -mcpu=v8.  This silently worked with older bintutils versions but
+# does not any more.
 KBUILD_CFLAGS  += -m32 -mcpu=v8 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7
+KBUILD_CFLAGS  += -Wa,-Av8
+
 KBUILD_AFLAGS  += -m32 -Wa,-Av8
 
 else
index 1c26d44..b6de8b1 100644 (file)
 #define __NR_listen            354
 #define __NR_setsockopt                355
 #define __NR_mlock2            356
+#define __NR_copy_file_range   357
 
-#define NR_syscalls            357
+#define NR_syscalls            358
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK  0x00000001
index 33c02b1..a83707c 100644 (file)
@@ -948,7 +948,24 @@ linux_syscall_trace:
        cmp     %o0, 0
        bne     3f
         mov    -ENOSYS, %o0
+
+       /* Syscall tracing can modify the registers.  */
+       ld      [%sp + STACKFRAME_SZ + PT_G1], %g1
+       sethi   %hi(sys_call_table), %l7
+       ld      [%sp + STACKFRAME_SZ + PT_I0], %i0
+       or      %l7, %lo(sys_call_table), %l7
+       ld      [%sp + STACKFRAME_SZ + PT_I1], %i1
+       ld      [%sp + STACKFRAME_SZ + PT_I2], %i2
+       ld      [%sp + STACKFRAME_SZ + PT_I3], %i3
+       ld      [%sp + STACKFRAME_SZ + PT_I4], %i4
+       ld      [%sp + STACKFRAME_SZ + PT_I5], %i5
+       cmp     %g1, NR_syscalls
+       bgeu    3f
+        mov    -ENOSYS, %o0
+
+       sll     %g1, 2, %l4
        mov     %i0, %o0
+       ld      [%l7 + %l4], %l7
        mov     %i1, %o1
        mov     %i2, %o2
        mov     %i3, %o3
index f2d30ca..cd1f592 100644 (file)
@@ -696,14 +696,6 @@ tlb_fixup_done:
        call    __bzero
         sub    %o1, %o0, %o1
 
-#ifdef CONFIG_LOCKDEP
-       /* We have this call this super early, as even prom_init can grab
-        * spinlocks and thus call into the lockdep code.
-        */
-       call    lockdep_init
-        nop
-#endif
-
        call    prom_init
         mov    %l7, %o0                        ! OpenPROM cif handler
 
index afbaba5..d127130 100644 (file)
@@ -338,8 +338,9 @@ ENTRY(sun4v_mach_set_watchdog)
        mov     %o1, %o4
        mov     HV_FAST_MACH_SET_WATCHDOG, %o5
        ta      HV_FAST_TRAP
+       brnz,a,pn %o4, 0f
        stx     %o1, [%o4]
-       retl
+0:     retl
         nop
 ENDPROC(sun4v_mach_set_watchdog)
 
index d88beff..39aaec1 100644 (file)
@@ -52,7 +52,7 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs)
        unsigned char fenab;
        int err;
 
-       flush_user_windows();
+       synchronize_user_stack();
        if (get_thread_wsaved()                                 ||
            (((unsigned long)ucp) & (sizeof(unsigned long)-1))  ||
            (!__access_ok(ucp, sizeof(*ucp))))
index b3a5d81..fb30e7c 100644 (file)
@@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg)
        local_irq_enable();
 
        wmb();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 
        /* We should never reach here! */
        BUG();
index 19cd08d..8a6151a 100644 (file)
@@ -134,7 +134,7 @@ void smp_callin(void)
 
        local_irq_enable();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 void cpu_panic(void)
index a92d5d2..9e034f2 100644 (file)
@@ -37,6 +37,7 @@ EXPORT_SYMBOL(sun4v_niagara_getperf);
 EXPORT_SYMBOL(sun4v_niagara_setperf);
 EXPORT_SYMBOL(sun4v_niagara2_getperf);
 EXPORT_SYMBOL(sun4v_niagara2_setperf);
+EXPORT_SYMBOL(sun4v_mach_set_watchdog);
 
 /* from hweight.S */
 EXPORT_SYMBOL(__arch_hweight8);
index c690c8e..b489e97 100644 (file)
@@ -264,7 +264,7 @@ static unsigned long mmap_rnd(void)
        unsigned long rnd = 0UL;
 
        if (current->flags & PF_RANDOMIZE) {
-               unsigned long val = get_random_int();
+               unsigned long val = get_random_long();
                if (test_thread_flag(TIF_32BIT))
                        rnd = (val % (1UL << (23UL-PAGE_SHIFT)));
                else
index bb00089..c4a1b5c 100644 (file)
@@ -158,7 +158,25 @@ linux_syscall_trace32:
         add    %sp, PTREGS_OFF, %o0
        brnz,pn %o0, 3f
         mov    -ENOSYS, %o0
+
+       /* Syscall tracing can modify the registers.  */
+       ldx     [%sp + PTREGS_OFF + PT_V9_G1], %g1
+       sethi   %hi(sys_call_table32), %l7
+       ldx     [%sp + PTREGS_OFF + PT_V9_I0], %i0
+       or      %l7, %lo(sys_call_table32), %l7
+       ldx     [%sp + PTREGS_OFF + PT_V9_I1], %i1
+       ldx     [%sp + PTREGS_OFF + PT_V9_I2], %i2
+       ldx     [%sp + PTREGS_OFF + PT_V9_I3], %i3
+       ldx     [%sp + PTREGS_OFF + PT_V9_I4], %i4
+       ldx     [%sp + PTREGS_OFF + PT_V9_I5], %i5
+
+       cmp     %g1, NR_syscalls
+       bgeu,pn %xcc, 3f
+        mov    -ENOSYS, %o0
+
+       sll     %g1, 2, %l4
        srl     %i0, 0, %o0
+       lduw    [%l7 + %l4], %l7
        srl     %i4, 0, %o4
        srl     %i1, 0, %o1
        srl     %i2, 0, %o2
@@ -170,7 +188,25 @@ linux_syscall_trace:
         add    %sp, PTREGS_OFF, %o0
        brnz,pn %o0, 3f
         mov    -ENOSYS, %o0
+
+       /* Syscall tracing can modify the registers.  */
+       ldx     [%sp + PTREGS_OFF + PT_V9_G1], %g1
+       sethi   %hi(sys_call_table64), %l7
+       ldx     [%sp + PTREGS_OFF + PT_V9_I0], %i0
+       or      %l7, %lo(sys_call_table64), %l7
+       ldx     [%sp + PTREGS_OFF + PT_V9_I1], %i1
+       ldx     [%sp + PTREGS_OFF + PT_V9_I2], %i2
+       ldx     [%sp + PTREGS_OFF + PT_V9_I3], %i3
+       ldx     [%sp + PTREGS_OFF + PT_V9_I4], %i4
+       ldx     [%sp + PTREGS_OFF + PT_V9_I5], %i5
+
+       cmp     %g1, NR_syscalls
+       bgeu,pn %xcc, 3f
+        mov    -ENOSYS, %o0
+
+       sll     %g1, 2, %l4
        mov     %i0, %o0
+       lduw    [%l7 + %l4], %l7
        mov     %i1, %o1
        mov     %i2, %o2
        mov     %i3, %o3
index e663b6c..6c3dd6c 100644 (file)
@@ -88,4 +88,4 @@ sys_call_table:
 /*340*/        .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 /*345*/        .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
-/*355*/        .long sys_setsockopt, sys_mlock2
+/*355*/        .long sys_setsockopt, sys_mlock2, sys_copy_file_range
index 1557121..12b524c 100644 (file)
@@ -89,7 +89,7 @@ sys_call_table32:
 /*340*/        .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
        .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
-       .word compat_sys_setsockopt, sys_mlock2
+       .word compat_sys_setsockopt, sys_mlock2, sys_copy_file_range
 
 #endif /* CONFIG_COMPAT */
 
@@ -170,4 +170,4 @@ sys_call_table:
 /*340*/        .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
        .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
-       .word sys_setsockopt, sys_mlock2
+       .word sys_setsockopt, sys_mlock2, sys_copy_file_range
index 6f21685..1cfe6aa 100644 (file)
@@ -2863,17 +2863,17 @@ void hugetlb_setup(struct pt_regs *regs)
 
 static struct resource code_resource = {
        .name   = "Kernel code",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource data_resource = {
        .name   = "Kernel data",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static inline resource_size_t compute_kern_paddr(void *addr)
@@ -2909,7 +2909,7 @@ static int __init report_memory(void)
                res->name = "System RAM";
                res->start = pavail[i].phys_addr;
                res->end = pavail[i].phys_addr + pavail[i].reg_size - 1;
-               res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+               res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
                if (insert_resource(&iomem_resource, res) < 0) {
                        pr_warn("Resource insertion failed.\n");
index bbb855d..a992238 100644 (file)
@@ -1632,14 +1632,14 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 /*
@@ -1673,10 +1673,15 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
                kzalloc(sizeof(struct resource), GFP_ATOMIC);
        if (!res)
                return NULL;
-       res->name = reserved ? "Reserved" : "System RAM";
        res->start = start_pfn << PAGE_SHIFT;
        res->end = (end_pfn << PAGE_SHIFT) - 1;
        res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       if (reserved) {
+               res->name = "Reserved";
+       } else {
+               res->name = "System RAM";
+               res->flags |= IORESOURCE_SYSRAM;
+       }
        if (insert_resource(&iomem_resource, res)) {
                kfree(res);
                return NULL;
index 20d52a9..6c0abaa 100644 (file)
@@ -208,7 +208,7 @@ void online_secondary(void)
        /* Set up tile-timer clock-event device on this cpu */
        setup_tile_timer();
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
index e13d41c..f878bec 100644 (file)
@@ -34,21 +34,18 @@ struct page;
 
 #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
 
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
-#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
-
-#define pte_get_bits(pte, bits) ((pte).pte_low & (bits))
-#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits))
-#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits))
-#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \
-                             smp_wmb(); \
-                             (to).pte_low = (from).pte_low; })
-#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high)
-#define pte_set_val(pte, phys, prot) \
-       ({ (pte).pte_high = (phys) >> 32; \
-          (pte).pte_low = (phys) | pgprot_val(prot); })
+#define pte_val(p) ((p).pte)
+
+#define pte_get_bits(p, bits) ((p).pte & (bits))
+#define pte_set_bits(p, bits) ((p).pte |= (bits))
+#define pte_clear_bits(p, bits) ((p).pte &= ~(bits))
+#define pte_copy(to, from) ({ (to).pte = (from).pte; })
+#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
+#define pte_set_val(p, phys, prot) \
+       ({ (p).pte = (phys) | pgprot_val(prot); })
 
 #define pmd_val(x)     ((x).pmd)
 #define __pmd(x) ((pmd_t) { (x) } )
index 9bdf67a..b60a9f8 100644 (file)
@@ -12,6 +12,7 @@
 #include <skas.h>
 
 void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
 
 static void kill_off_processes(void)
 {
index fc8be0e..57acbd6 100644 (file)
@@ -69,7 +69,7 @@ void do_signal(struct pt_regs *regs)
        struct ksignal ksig;
        int handled_sig = 0;
 
-       if (get_signal(&ksig)) {
+       while (get_signal(&ksig)) {
                handled_sig = 1;
                /* Whee!  Actually deliver the signal.  */
                handle_signal(&ksig, regs);
index 3fa317f..c2bffa5 100644 (file)
@@ -72,13 +72,13 @@ static struct resource mem_res[] = {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
-               .flags = IORESOURCE_MEM
+               .flags = IORESOURCE_SYSTEM_RAM
        }
 };
 
@@ -211,7 +211,7 @@ request_standard_resources(struct meminfo *mi)
                res->name  = "System RAM";
                res->start = mi->bank[i].start;
                res->end   = mi->bank[i].start + mi->bank[i].size - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
                request_resource(&iomem_resource, res);
 
index 1538562..eb3abf8 100644 (file)
@@ -1,6 +1,7 @@
-
 obj-y += entry/
 
+obj-$(CONFIG_PERF_EVENTS) += events/
+
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
index 330e738..8f2e665 100644 (file)
@@ -303,6 +303,9 @@ config ARCH_SUPPORTS_UPROBES
 config FIX_EARLYCON_MEM
        def_bool y
 
+config DEBUG_RODATA
+       def_bool y
+
 config PGTABLE_LEVELS
        int
        default 4 if X86_64
@@ -475,6 +478,7 @@ config X86_UV
        depends on X86_64
        depends on X86_EXTENDED_PLATFORM
        depends on NUMA
+       depends on EFI
        depends on X86_X2APIC
        depends on PCI
        ---help---
@@ -509,11 +513,10 @@ config X86_INTEL_CE
 
 config X86_INTEL_MID
        bool "Intel MID platform support"
-       depends on X86_32
        depends on X86_EXTENDED_PLATFORM
        depends on X86_PLATFORM_DEVICES
        depends on PCI
-       depends on PCI_GOANY
+       depends on X86_64 || (PCI_GOANY && X86_32)
        depends on X86_IO_APIC
        select SFI
        select I2C
@@ -778,8 +781,8 @@ config HPET_TIMER
          HPET is the next generation timer replacing legacy 8254s.
          The HPET provides a stable time base on SMP
          systems, unlike the TSC, but it is more expensive to access,
-         as it is off-chip.  You can find the HPET spec at
-         <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
+         as it is off-chip.  The interface used is documented
+         in the HPET spec, revision 1.
 
          You can safely choose Y here.  However, HPET will only be
          activated if the platform and the BIOS support this feature.
@@ -1160,22 +1163,23 @@ config MICROCODE
        bool "CPU microcode loading support"
        default y
        depends on CPU_SUP_AMD || CPU_SUP_INTEL
-       depends on BLK_DEV_INITRD
        select FW_LOADER
        ---help---
-
          If you say Y here, you will be able to update the microcode on
-         certain Intel and AMD processors. The Intel support is for the
-         IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
-         Xeon etc. The AMD support is for families 0x10 and later. You will
-         obviously need the actual microcode binary data itself which is not
-         shipped with the Linux kernel.
-
-         This option selects the general module only, you need to select
-         at least one vendor specific module as well.
-
-         To compile this driver as a module, choose M here: the module
-         will be called microcode.
+         Intel and AMD processors. The Intel support is for the IA32 family,
+         e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
+         AMD support is for families 0x10 and later. You will obviously need
+         the actual microcode binary data itself which is not shipped with
+         the Linux kernel.
+
+         The preferred method to load microcode from a detached initrd is described
+         in Documentation/x86/early-microcode.txt. For that you need to enable
+         CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
+         initrd for microcode blobs.
+
+         In addition, you can build-in the microcode into the kernel. For that you
+         need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode
+         to the CONFIG_EXTRA_FIRMWARE config option.
 
 config MICROCODE_INTEL
        bool "Intel microcode loading support"
index 9b18ed9..67eec55 100644 (file)
@@ -74,28 +74,16 @@ config EFI_PGT_DUMP
          issues with the mapping of the EFI runtime regions into that
          table.
 
-config DEBUG_RODATA
-       bool "Write protect kernel read-only data structures"
-       default y
-       depends on DEBUG_KERNEL
-       ---help---
-         Mark the kernel read-only data as write-protected in the pagetables,
-         in order to catch accidental (and incorrect) writes to such const
-         data. This is recommended so that we can catch kernel bugs sooner.
-         If in doubt, say "Y".
-
 config DEBUG_RODATA_TEST
-       bool "Testcase for the DEBUG_RODATA feature"
-       depends on DEBUG_RODATA
+       bool "Testcase for the marking rodata read-only"
        default y
        ---help---
-         This option enables a testcase for the DEBUG_RODATA
-         feature as well as for the change_page_attr() infrastructure.
+         This option enables a testcase for the setting rodata read-only
+         as well as for the change_page_attr() infrastructure.
          If in doubt, say "N"
 
 config DEBUG_WX
        bool "Warn on W+X mappings at boot"
-       depends on DEBUG_RODATA
        select X86_PTDUMP_CORE
        ---help---
          Generate a warning if any W+X mappings are found at boot.
@@ -350,16 +338,6 @@ config DEBUG_IMR_SELFTEST
 
          If unsure say N here.
 
-config X86_DEBUG_STATIC_CPU_HAS
-       bool "Debug alternatives"
-       depends on DEBUG_KERNEL
-       ---help---
-         This option causes additional code to be generated which
-         fails if static_cpu_has() is used before alternatives have
-         run.
-
-         If unsure, say N.
-
 config X86_DEBUG_FPU
        bool "Debug the x86 FPU code"
        depends on DEBUG_KERNEL
index ea97697..4cb404f 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef BOOT_CPUFLAGS_H
 #define BOOT_CPUFLAGS_H
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/processor-flags.h>
 
 struct cpu_features {
index 637097e..f72498d 100644 (file)
@@ -17,7 +17,7 @@
 
 #include "../include/asm/required-features.h"
 #include "../include/asm/disabled-features.h"
-#include "../include/asm/cpufeature.h"
+#include "../include/asm/cpufeatures.h"
 #include "../kernel/cpu/capflags.c"
 
 int main(void)
index a7661c4..0702d25 100644 (file)
@@ -49,7 +49,6 @@ typedef unsigned int   u32;
 
 /* This must be large enough to hold the entire setup */
 u8 buf[SETUP_SECT_MAX*512];
-int is_big_kernel;
 
 #define PECOFF_RELOC_RESERVE 0x20
 
index 028be48..e25a163 100644 (file)
@@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_NLS_UTF8=y
 CONFIG_PRINTK_TIME=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_FRAME_WARN=2048
+CONFIG_FRAME_WARN=1024
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_UNUSED_SYMBOLS is not set
 CONFIG_DEBUG_KERNEL=y
index 712b130..3a33124 100644 (file)
@@ -157,7 +157,9 @@ ENTRY(chacha20_4block_xor_ssse3)
        # done with the slightly better performing SSSE3 byte shuffling,
        # 7/12-bit word rotation uses traditional shift+OR.
 
-       sub             $0x40,%rsp
+       mov             %rsp,%r11
+       sub             $0x80,%rsp
+       and             $~63,%rsp
 
        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
@@ -620,6 +622,6 @@ ENTRY(chacha20_4block_xor_ssse3)
        pxor            %xmm1,%xmm15
        movdqu          %xmm15,0xf0(%rsi)
 
-       add             $0x40,%rsp
+       mov             %r11,%rsp
        ret
 ENDPROC(chacha20_4block_xor_ssse3)
index 07d2c6c..27226df 100644 (file)
@@ -33,7 +33,7 @@
 #include <linux/crc32.h>
 #include <crypto/internal/hash.h>
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 #include <asm/fpu/api.h>
 
index 0e98716..0857b1a 100644 (file)
@@ -30,7 +30,7 @@
 #include <linux/kernel.h>
 #include <crypto/internal/hash.h>
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 #include <asm/fpu/internal.h>
 
index a3fcfc9..cd4df93 100644 (file)
@@ -30,7 +30,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 
 asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
index e32206e..9a9e588 100644 (file)
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
        .byte 0xf1
        .endm
 
-#else /* CONFIG_X86_64 */
-
-/*
- * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
- * are different from the entry_32.S versions in not changing the segment
- * registers. So only suitable for in kernel use, not when transitioning
- * from or to user space. The resulting stack frame is not a standard
- * pt_regs frame. The main use case is calling C code from assembler
- * when all the registers need to be preserved.
- */
-
-       .macro SAVE_ALL
-       pushl %eax
-       pushl %ebp
-       pushl %edi
-       pushl %esi
-       pushl %edx
-       pushl %ecx
-       pushl %ebx
-       .endm
-
-       .macro RESTORE_ALL
-       popl %ebx
-       popl %ecx
-       popl %edx
-       popl %esi
-       popl %edi
-       popl %ebp
-       popl %eax
-       .endm
-
 #endif /* CONFIG_X86_64 */
 
 /*
index 0366374..e79d93d 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/traps.h>
 #include <asm/vdso.h>
 #include <asm/uaccess.h>
+#include <asm/cpufeature.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit();
 }
+#else
+static inline void enter_from_user_mode(void) {}
 #endif
 
 static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 
        work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
 
-#ifdef CONFIG_CONTEXT_TRACKING
-       /*
-        * If TIF_NOHZ is set, we are required to call user_exit() before
-        * doing anything that could touch RCU.
-        */
-       if (work & _TIF_NOHZ) {
-               enter_from_user_mode();
-               work &= ~_TIF_NOHZ;
-       }
-#endif
-
 #ifdef CONFIG_SECCOMP
        /*
         * Do seccomp first -- it should minimize exposure of other
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
        if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                BUG_ON(regs != task_pt_regs(current));
 
-       /*
-        * If we stepped into a sysenter/syscall insn, it trapped in
-        * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-        * If user-mode had set TF itself, then it's still clear from
-        * do_debug() and we need to set it again to restore the user
-        * state.  If we entered on the slow path, TF was already set.
-        */
-       if (work & _TIF_SINGLESTEP)
-               regs->flags |= X86_EFLAGS_TF;
-
 #ifdef CONFIG_SECCOMP
        /*
         * Call seccomp_phase2 before running the other hooks so that
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
 /* Called with IRQs disabled. */
 __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 {
+       struct thread_info *ti = pt_regs_to_thread_info(regs);
        u32 cached_flags;
 
        if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 
        lockdep_sys_exit();
 
-       cached_flags =
-               READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+       cached_flags = READ_ONCE(ti->flags);
 
        if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
                exit_to_usermode_loop(regs, cached_flags);
 
+#ifdef CONFIG_COMPAT
+       /*
+        * Compat syscalls set TS_COMPAT.  Make sure we clear it before
+        * returning to user mode.  We need to clear it *after* signal
+        * handling, because syscall restart has a fixup for compat
+        * syscalls.  The fixup is exercised by the ptrace_syscall_32
+        * selftest.
+        */
+       ti->status &= ~TS_COMPAT;
+#endif
+
        user_enter();
 }
 
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
        if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
                syscall_slow_exit_work(regs, cached_flags);
 
-#ifdef CONFIG_COMPAT
+       local_irq_disable();
+       prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       unsigned long nr = regs->orig_ax;
+
+       enter_from_user_mode();
+       local_irq_enable();
+
+       if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+               nr = syscall_trace_enter(regs);
+
        /*
-        * Compat syscalls set TS_COMPAT.  Make sure we clear it before
-        * returning to user mode.
+        * NB: Native and x32 syscalls are dispatched from the same
+        * table.  The only functional difference is the x32 bit in
+        * regs->orig_ax, which changes the behavior of some syscalls.
         */
-       ti->status &= ~TS_COMPAT;
-#endif
+       if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+               regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+                       regs->di, regs->si, regs->dx,
+                       regs->r10, regs->r8, regs->r9);
+       }
 
-       local_irq_disable();
-       prepare_exit_to_usermode(regs);
+       syscall_return_slowpath(regs);
 }
+#endif
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 /*
- * Does a 32-bit syscall.  Called with IRQs on and does all entry and
- * exit work and returns with IRQs off.  This function is extremely hot
- * in workloads that use it, and it's usually called from
+ * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
+ * all entry and exit work and returns with IRQs off.  This function is
+ * extremely hot in workloads that use it, and it's usually called from
  * do_fast_syscall_32, so forcibly inline it to improve performance.
  */
-#ifdef CONFIG_X86_32
-/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
-__visible
-#else
-/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
-static
-#endif
-__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
 {
        struct thread_info *ti = pt_regs_to_thread_info(regs);
        unsigned int nr = (unsigned int)regs->orig_ax;
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
        syscall_return_slowpath(regs);
 }
 
-#ifdef CONFIG_X86_64
-/* Handles INT80 on 64-bit kernels */
-__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
 {
+       enter_from_user_mode();
        local_irq_enable();
        do_syscall_32_irqs_on(regs);
 }
-#endif
 
 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
 __visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
         */
        regs->ip = landing_pad;
 
-       /*
-        * Fetch EBP from where the vDSO stashed it.
-        *
-        * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
-        */
+       enter_from_user_mode();
+
        local_irq_enable();
+
+       /* Fetch EBP from where the vDSO stashed it. */
        if (
 #ifdef CONFIG_X86_64
                /*
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
                /* User code screwed up. */
                local_irq_disable();
                regs->ax = -EFAULT;
-#ifdef CONFIG_CONTEXT_TRACKING
-               enter_from_user_mode();
-#endif
                prepare_exit_to_usermode(regs);
                return 0;       /* Keep it simple: use IRET. */
        }
index 77d8c51..10868aa 100644 (file)
@@ -40,7 +40,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
@@ -287,7 +287,58 @@ need_resched:
 END(resume_kernel)
 #endif
 
-       # SYSENTER  call handler stub
+GLOBAL(__begin_SYSENTER_singlestep_region)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!).  To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+       addl    $5*4, %esp                      /* remove xen-provided frame */
+       jmp     sysenter_past_esp
+#endif
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available.  This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction.  This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ */
 ENTRY(entry_SYSENTER_32)
        movl    TSS_sysenter_sp0(%esp), %esp
 sysenter_past_esp:
@@ -300,6 +351,29 @@ sysenter_past_esp:
        pushl   %eax                    /* pt_regs->orig_ax */
        SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
 
+       /*
+        * SYSENTER doesn't filter flags, so we need to clear NT, AC
+        * and TF ourselves.  To save a few cycles, we can check whether
+        * either was set instead of doing an unconditional popfq.
+        * This needs to happen before enabling interrupts so that
+        * we don't get preempted with NT set.
+        *
+        * If TF is set, we will single-step all the way to here -- do_debug
+        * will ignore all the traps.  (Yes, this is slow, but so is
+        * single-stepping in general.  This allows us to avoid having
+        * a more complicated code to handle the case where a user program
+        * forces us to single-step through the SYSENTER entry code.)
+        *
+        * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+        * out-of-line as an optimization: NT is unlikely to be set in the
+        * majority of the cases and instead of polluting the I$ unnecessarily,
+        * we're keeping that code behind a branch which will predict as
+        * not-taken and therefore its instructions won't be fetched.
+        */
+       testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+       jnz     .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
        /*
         * User mode is traced as though IRQs are on, and SYSENTER
         * turned them off.
@@ -325,6 +399,15 @@ sysenter_past_esp:
        popl    %ebp                    /* pt_regs->bp */
        popl    %eax                    /* pt_regs->ax */
 
+       /*
+        * Restore all flags except IF. (We restore IF separately because
+        * STI gives a one-instruction window in which we won't be interrupted,
+        * whereas POPF does not.)
+        */
+       addl    $PT_EFLAGS-PT_DS, %esp  /* point esp at pt_regs->flags */
+       btr     $X86_EFLAGS_IF_BIT, (%esp)
+       popfl
+
        /*
         * Return back to the vDSO, which will pop ecx and edx.
         * Don't bother with DS and ES (they already contain __USER_DS).
@@ -338,28 +421,63 @@ sysenter_past_esp:
 .popsection
        _ASM_EXTABLE(1b, 2b)
        PTGS_TO_GS_EX
+
+.Lsysenter_fix_flags:
+       pushl   $X86_EFLAGS_FIXED
+       popfl
+       jmp     .Lsysenter_flags_fixed
+GLOBAL(__end_SYSENTER_singlestep_region)
 ENDPROC(entry_SYSENTER_32)
 
-       # system call handler stub
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction.  INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries.  It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call.  (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path.  It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6
+ */
 ENTRY(entry_INT80_32)
        ASM_CLAC
        pushl   %eax                    /* pt_regs->orig_ax */
        SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
 
        /*
-        * User mode is traced as though IRQs are on.  Unlike the 64-bit
-        * case, INT80 is a trap gate on 32-bit kernels, so interrupts
-        * are already on (unless user code is messing around with iopl).
+        * User mode is traced as though IRQs are on, and the interrupt gate
+        * turned them off.
         */
+       TRACE_IRQS_OFF
 
        movl    %esp, %eax
-       call    do_syscall_32_irqs_on
+       call    do_int80_syscall_32
 .Lsyscall_32_done:
 
 restore_all:
        TRACE_IRQS_IRET
 restore_all_notrace:
 #ifdef CONFIG_X86_ESPFIX32
+       ALTERNATIVE     "jmp restore_nocheck", "", X86_BUG_ESPFIX
+
        movl    PT_EFLAGS(%esp), %eax           # mix EFLAGS, SS and CS
        /*
         * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -386,19 +504,6 @@ ENTRY(iret_exc     )
 
 #ifdef CONFIG_X86_ESPFIX32
 ldt_ss:
-#ifdef CONFIG_PARAVIRT
-       /*
-        * The kernel can't run on a non-flat stack if paravirt mode
-        * is active.  Rather than try to fixup the high bits of
-        * ESP, bypass this code entirely.  This may break DOSemu
-        * and/or Wine support in a paravirt VM, although the option
-        * is still available to implement the setting of the high
-        * 16-bits in the INTERRUPT_RETURN paravirt-op.
-        */
-       cmpl    $0, pv_info+PARAVIRT_enabled
-       jne     restore_nocheck
-#endif
-
 /*
  * Setup and switch to ESPFIX stack
  *
@@ -631,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
 END(spurious_interrupt_bug)
 
 #ifdef CONFIG_XEN
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-ENTRY(xen_sysenter_target)
-       addl    $5*4, %esp                      /* remove xen-provided frame */
-       jmp     sysenter_past_esp
-
 ENTRY(xen_hypervisor_callback)
        pushl   $-1                             /* orig_ax = -1 => not a system call */
        SAVE_ALL
@@ -938,51 +1035,48 @@ error_code:
        jmp     ret_from_exception
 END(page_fault)
 
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
-       cmpw    $__KERNEL_CS, 4(%esp)
-       jne     \ok
-\label:
-       movl    TSS_sysenter_sp0 + \offset(%esp), %esp
-       pushfl
-       pushl   $__KERNEL_CS
-       pushl   $sysenter_past_esp
-.endm
-
 ENTRY(debug)
+       /*
+        * #DB can happen at the first instruction of
+        * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
+        * happens, then we will be running on a very small stack.  We
+        * need to detect this condition and switch to the thread
+        * stack before calling any C code at all.
+        *
+        * If you edit this code, keep in mind that NMIs can happen in here.
+        */
        ASM_CLAC
-       cmpl    $entry_SYSENTER_32, (%esp)
-       jne     debug_stack_correct
-       FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
        pushl   $-1                             # mark this as an int
        SAVE_ALL
-       TRACE_IRQS_OFF
        xorl    %edx, %edx                      # error code 0
        movl    %esp, %eax                      # pt_regs pointer
+
+       /* Are we currently on the SYSENTER stack? */
+       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
+       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       jb      .Ldebug_from_sysenter_stack
+
+       TRACE_IRQS_OFF
+       call    do_debug
+       jmp     ret_from_exception
+
+.Ldebug_from_sysenter_stack:
+       /* We're on the SYSENTER stack.  Switch off. */
+       movl    %esp, %ebp
+       movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
+       TRACE_IRQS_OFF
        call    do_debug
+       movl    %ebp, %esp
        jmp     ret_from_exception
 END(debug)
 
 /*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
+ * NMI is doubly nasty.  It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks.  We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
  */
 ENTRY(nmi)
        ASM_CLAC
@@ -993,41 +1087,32 @@ ENTRY(nmi)
        popl    %eax
        je      nmi_espfix_stack
 #endif
-       cmpl    $entry_SYSENTER_32, (%esp)
-       je      nmi_stack_fixup
-       pushl   %eax
-       movl    %esp, %eax
-       /*
-        * Do not access memory above the end of our stack page,
-        * it might not exist.
-        */
-       andl    $(THREAD_SIZE-1), %eax
-       cmpl    $(THREAD_SIZE-20), %eax
-       popl    %eax
-       jae     nmi_stack_correct
-       cmpl    $entry_SYSENTER_32, 12(%esp)
-       je      nmi_debug_stack_check
-nmi_stack_correct:
-       pushl   %eax
+
+       pushl   %eax                            # pt_regs->orig_ax
        SAVE_ALL
        xorl    %edx, %edx                      # zero error code
        movl    %esp, %eax                      # pt_regs pointer
+
+       /* Are we currently on the SYSENTER stack? */
+       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
+       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       jb      .Lnmi_from_sysenter_stack
+
+       /* Not on SYSENTER stack. */
        call    do_nmi
        jmp     restore_all_notrace
 
-nmi_stack_fixup:
-       FIX_STACK 12, nmi_stack_correct, 1
-       jmp     nmi_stack_correct
-
-nmi_debug_stack_check:
-       cmpw    $__KERNEL_CS, 16(%esp)
-       jne     nmi_stack_correct
-       cmpl    $debug, (%esp)
-       jb      nmi_stack_correct
-       cmpl    $debug_esp_fix_insn, (%esp)
-       ja      nmi_stack_correct
-       FIX_STACK 24, nmi_stack_correct, 1
-       jmp     nmi_stack_correct
+.Lnmi_from_sysenter_stack:
+       /*
+        * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
+        * is using the thread stack right now, so it's safe for us to use it.
+        */
+       movl    %esp, %ebp
+       movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
+       call    do_nmi
+       movl    %ebp, %esp
+       jmp     restore_all_notrace
 
 #ifdef CONFIG_X86_ESPFIX32
 nmi_espfix_stack:
index 9d34d3c..858b555 100644 (file)
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
 /*
  * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
+ * This is the only entry point used for 64-bit system calls.  The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries.  There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
  * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
  * then loads new ss, cs, and rip from previously programmed MSRs.
  * rflags gets masked by a value from another MSR (so CLD and CLAC
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
+       TRACE_IRQS_OFF
+
        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                      /* pt_regs->ss */
        pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
-       /*
-        * Re-enable interrupts.
-        * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-        * must execute atomically in the face of possible interrupt-driven
-        * task preemption. We must enable interrupts only after we're done
-        * with using rsp_scratch:
-        */
-       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq   %r11                            /* pt_regs->flags */
        pushq   $__USER_CS                      /* pt_regs->cs */
        pushq   %rcx                            /* pt_regs->ip */
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
        pushq   %r11                            /* pt_regs->r11 */
        sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
 
-       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-       jnz     tracesys
+       /*
+        * If we need to do entry work or if we guess we'll need to do
+        * exit work, go straight to the slow path.
+        */
+       testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       jnz     entry_SYSCALL64_slow_path
+
 entry_SYSCALL_64_fastpath:
+       /*
+        * Easy case: enable interrupts and issue the syscall.  If the syscall
+        * needs pt_regs, we'll call a stub that disables interrupts again
+        * and jumps to the slow path.
+        */
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
 #if __SYSCALL_MASK == ~0
        cmpq    $__NR_syscall_max, %rax
 #else
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
 #endif
        ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
        movq    %r10, %rcx
+
+       /*
+        * This call instruction is handled specially in stub_ptregs_64.
+        * It might end up jumping to the slow path.  If it jumps, RAX
+        * and all argument registers are clobbered.
+        */
        call    *sys_call_table(, %rax, 8)
+.Lentry_SYSCALL_64_after_fastpath_call:
+
        movq    %rax, RAX(%rsp)
 1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
-       LOCKDEP_SYS_EXIT
-       /*
-        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-        * it is too small to ever cause noticeable irq latency.
-        */
-       DISABLE_INTERRUPTS(CLBR_NONE)
 
        /*
-        * We must check ti flags with interrupts (or at least preemption)
-        * off because we must *never* return to userspace without
-        * processing exit work that is enqueued if we're preempted here.
-        * In particular, returning to userspace with any of the one-shot
-        * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
-        * very bad.
+        * If we get here, then we know that pt_regs is clean for SYSRET64.
+        * If we see that no exit work is required (which we are required
+        * to check with IRQs off), then we can go straight to SYSRET64.
         */
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
        testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-       jnz     int_ret_from_sys_call_irqs_off  /* Go to the slow path */
+       jnz     1f
 
-       RESTORE_C_REGS_EXCEPT_RCX_R11
+       LOCKDEP_SYS_EXIT
+       TRACE_IRQS_ON           /* user mode is traced as IRQs on */
        movq    RIP(%rsp), %rcx
        movq    EFLAGS(%rsp), %r11
+       RESTORE_C_REGS_EXCEPT_RCX_R11
        movq    RSP(%rsp), %rsp
-       /*
-        * 64-bit SYSRET restores rip from rcx,
-        * rflags from r11 (but RF and VM bits are forced to 0),
-        * cs and ss are loaded from MSRs.
-        * Restoration of rflags re-enables interrupts.
-        *
-        * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-        * descriptor is not reinitialized.  This means that we should
-        * avoid SYSRET with SS == NULL, which could happen if we schedule,
-        * exit the kernel, and re-enter using an interrupt vector.  (All
-        * interrupt entries on x86_64 set SS to NULL.)  We prevent that
-        * from happening by reloading SS in __switch_to.  (Actually
-        * detecting the failure in 64-bit userspace is tricky but can be
-        * done.)
-        */
        USERGS_SYSRET64
 
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+       /*
+        * The fast path looked good when we started, but something changed
+        * along the way and we need to switch to the slow path.  Calling
+        * raise(3) will trigger this, for example.  IRQs are off.
+        */
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-       jmp int_ret_from_sys_call
-
-       /* Do syscall entry tracing */
-tracesys:
-       movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       call    syscall_trace_enter_phase1
-       test    %rax, %rax
-       jnz     tracesys_phase2                 /* if needed, run the slow path */
-       RESTORE_C_REGS_EXCEPT_RAX               /* else restore clobbered regs */
-       movq    ORIG_RAX(%rsp), %rax
-       jmp     entry_SYSCALL_64_fastpath       /* and return to the fast path */
-
-tracesys_phase2:
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi
-       movl    $AUDIT_ARCH_X86_64, %esi
-       movq    %rax, %rdx
-       call    syscall_trace_enter_phase2
-
-       /*
-        * Reload registers from stack in case ptrace changed them.
-        * We don't reload %rax because syscall_trace_entry_phase2() returned
-        * the value it wants us to use in the table lookup.
-        */
-       RESTORE_C_REGS_EXCEPT_RAX
-       RESTORE_EXTRA_REGS
-#if __SYSCALL_MASK == ~0
-       cmpq    $__NR_syscall_max, %rax
-#else
-       andl    $__SYSCALL_MASK, %eax
-       cmpl    $__NR_syscall_max, %eax
-#endif
-       ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
-       movq    %r10, %rcx                      /* fixup for C */
-       call    *sys_call_table(, %rax, 8)
-       movq    %rax, RAX(%rsp)
-1:
-       /* Use IRET because user could have changed pt_regs->foo */
+       call    syscall_return_slowpath /* returns with IRQs disabled */
+       jmp     return_from_SYSCALL_64
 
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+       /* IRQs are off. */
        SAVE_EXTRA_REGS
        movq    %rsp, %rdi
-       call    syscall_return_slowpath /* returns with IRQs disabled */
+       call    do_syscall_64           /* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
        RESTORE_EXTRA_REGS
        TRACE_IRQS_IRETQ                /* we're about to change IF */
 
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
        jmp     restore_c_regs_and_iret
 END(entry_SYSCALL_64)
 
+ENTRY(stub_ptregs_64)
+       /*
+        * Syscalls marked as needing ptregs land here.
+        * If we are on the fast path, we need to save the extra regs,
+        * which we achieve by trying again on the slow path.  If we are on
+        * the slow path, the extra regs are already saved.
+        *
+        * RAX stores a pointer to the C function implementing the syscall.
+        * IRQs are on.
+        */
+       cmpq    $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
+       jne     1f
 
-       .macro FORK_LIKE func
-ENTRY(stub_\func)
-       SAVE_EXTRA_REGS 8
-       jmp     sys_\func
-END(stub_\func)
-       .endm
-
-       FORK_LIKE  clone
-       FORK_LIKE  fork
-       FORK_LIKE  vfork
-
-ENTRY(stub_execve)
-       call    sys_execve
-return_from_execve:
-       testl   %eax, %eax
-       jz      1f
-       /* exec failed, can use fast SYSRET code path in this case */
-       ret
-1:
-       /* must use IRET code path (pt_regs->cs may have changed) */
-       addq    $8, %rsp
-       ZERO_EXTRA_REGS
-       movq    %rax, RAX(%rsp)
-       jmp     int_ret_from_sys_call
-END(stub_execve)
-/*
- * Remaining execve stubs are only 7 bytes long.
- * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
- */
-       .align  8
-GLOBAL(stub_execveat)
-       call    sys_execveat
-       jmp     return_from_execve
-END(stub_execveat)
-
-#if defined(CONFIG_X86_X32_ABI)
-       .align  8
-GLOBAL(stub_x32_execve)
-       call    compat_sys_execve
-       jmp     return_from_execve
-END(stub_x32_execve)
-       .align  8
-GLOBAL(stub_x32_execveat)
-       call    compat_sys_execveat
-       jmp     return_from_execve
-END(stub_x32_execveat)
-#endif
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
        /*
-        * SAVE_EXTRA_REGS result is not normally needed:
-        * sigreturn overwrites all pt_regs->GPREGS.
-        * But sigreturn can fail (!), and there is no easy way to detect that.
-        * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
-        * we SAVE_EXTRA_REGS here.
+        * Called from fast path -- disable IRQs again, pop return address
+        * and jump to slow path
         */
-       SAVE_EXTRA_REGS 8
-       call    sys_rt_sigreturn
-return_from_stub:
-       addq    $8, %rsp
-       RESTORE_EXTRA_REGS
-       movq    %rax, RAX(%rsp)
-       jmp     int_ret_from_sys_call
-END(stub_rt_sigreturn)
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       popq    %rax
+       jmp     entry_SYSCALL64_slow_path
 
-#ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_rt_sigreturn)
-       SAVE_EXTRA_REGS 8
-       call    sys32_x32_rt_sigreturn
-       jmp     return_from_stub
-END(stub_x32_rt_sigreturn)
-#endif
+1:
+       /* Called from C */
+       jmp     *%rax                           /* called from C */
+END(stub_ptregs_64)
+
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+       leaq    \func(%rip), %rax
+       jmp     stub_ptregs_64
+END(ptregs_\func)
+.endm
+
+/* Instantiate ptregs_stub for each ptregs-using syscall */
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
 
 /*
  * A newly forked process directly context switches into this address.
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
  * rdi: prev task we switched from
  */
 ENTRY(ret_from_fork)
-
        LOCK ; btr $TIF_FORK, TI_flags(%r8)
 
        pushq   $0x0002
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
 
        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
-       RESTORE_EXTRA_REGS
-
        testb   $3, CS(%rsp)                    /* from kernel_thread? */
+       jnz     1f
 
        /*
-        * By the time we get here, we have no idea whether our pt_regs,
-        * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-        * the slow path, or one of the 32-bit compat paths.
-        * Use IRET code path to return, since it can safely handle
-        * all of the above.
+        * We came from kernel_thread.  This code path is quite twisted, and
+        * someone should clean it up.
+        *
+        * copy_thread_tls stashes the function pointer in RBX and the
+        * parameter to be passed in RBP.  The called function is permitted
+        * to call do_execve and thereby jump to user mode.
         */
-       jnz     int_ret_from_sys_call
+       movq    RBP(%rsp), %rdi
+       call    *RBX(%rsp)
+       movl    $0, RAX(%rsp)
 
        /*
-        * We came from kernel_thread
-        * nb: we depend on RESTORE_EXTRA_REGS above
+        * Fall through as though we're exiting a syscall.  This makes a
+        * twisted sort of sense if we just called do_execve.
         */
-       movq    %rbp, %rdi
-       call    *%rbx
-       movl    $0, RAX(%rsp)
-       RESTORE_EXTRA_REGS
-       jmp     int_ret_from_sys_call
+
+1:
+       movq    %rsp, %rdi
+       call    syscall_return_slowpath /* returns with IRQs disabled */
+       TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
+       SWAPGS
+       jmp     restore_regs_and_iret
 END(ret_from_fork)
 
 /*
index ff1c6d6..847f2f0 100644 (file)
        .section .entry.text, "ax"
 
 /*
- * 32-bit SYSENTER instruction entry.
+ * 32-bit SYSENTER entry.
  *
- * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
- * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction.  This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
  * SYSENTER does not save anything on the stack,
- * and does not save old rip (!!!) and rflags.
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
  *
  * Arguments:
  * eax  system call number
  * edi  arg5
  * ebp  user stack
  * 0(%ebp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
  */
 ENTRY(entry_SYSENTER_compat)
        /* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
         */
        pushfq                          /* pt_regs->flags (except IF = 0) */
        orl     $X86_EFLAGS_IF, (%rsp)  /* Fix saved flags */
-       ASM_CLAC                        /* Clear AC after saving FLAGS */
-
        pushq   $__USER32_CS            /* pt_regs->cs */
        xorq    %r8,%r8
        pushq   %r8                     /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
        cld
 
        /*
-        * Sysenter doesn't filter flags, so we need to clear NT
+        * SYSENTER doesn't filter flags, so we need to clear NT and AC
         * ourselves.  To save a few cycles, we can check whether
-        * NT was set instead of doing an unconditional popfq.
+        * either was set instead of doing an unconditional popfq.
         * This needs to happen before enabling interrupts so that
         * we don't get preempted with NT set.
         *
+        * If TF is set, we will single-step all the way to here -- do_debug
+        * will ignore all the traps.  (Yes, this is slow, but so is
+        * single-stepping in general.  This allows us to avoid having
+        * a more complicated code to handle the case where a user program
+        * forces us to single-step through the SYSENTER entry code.)
+        *
         * NB.: .Lsysenter_fix_flags is a label with the code under it moved
         * out-of-line as an optimization: NT is unlikely to be set in the
         * majority of the cases and instead of polluting the I$ unnecessarily,
         * we're keeping that code behind a branch which will predict as
         * not-taken and therefore its instructions won't be fetched.
         */
-       testl   $X86_EFLAGS_NT, EFLAGS(%rsp)
+       testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
        jnz     .Lsysenter_fix_flags
 .Lsysenter_flags_fixed:
 
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
        pushq   $X86_EFLAGS_FIXED
        popfq
        jmp     .Lsysenter_flags_fixed
+GLOBAL(__end_entry_SYSENTER_compat)
 ENDPROC(entry_SYSENTER_compat)
 
 /*
- * 32-bit SYSCALL instruction entry.
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO.  In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ *  - The calling convention for SYSCALL has changed several times without
+ *    anyone noticing.
  *
- * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
+ *  - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
+ *    user task that did SYSCALL without immediately reloading SS
+ *    would randomly crash.
  *
- * Note: rflags saving+masking-with-MSR happens only in Long mode
+ *  - Most programmers do not directly target AMD CPUs, and the 32-bit
+ *    SYSCALL instruction does not exist on Intel CPUs.  Even on AMD
+ *    CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ *    because the SYSCALL instruction in legacy/native 32-bit mode (as
+ *    opposed to compat mode) is sufficiently poorly designed as to be
+ *    essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs.  RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed).  SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
  * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
  * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
  * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
  *
@@ -236,7 +267,21 @@ sysret32_from_system_call:
 END(entry_SYSCALL_compat)
 
 /*
- * Emulated IA32 system calls via int 0x80.
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction.  INT $0x80 lands here.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls.  Instances of INT $0x80 can be found inline in
+ * various programs and libraries.  It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method.  Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path.  It is not used by most libc
+ * implementations on modern hardware except during process startup.
  *
  * Arguments:
  * eax  system call number
@@ -245,22 +290,14 @@ END(entry_SYSCALL_compat)
  * edx  arg3
  * esi  arg4
  * edi  arg5
- * ebp  arg6   (note: not saved in the stack frame, should not be touched)
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except eax must be saved (but ptrace may violate that).
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
+ * ebp  arg6
  */
-
 ENTRY(entry_INT80_compat)
        /*
         * Interrupts are off on entry.
         */
        PARAVIRT_ADJUST_EXCEPTION_FRAME
+       ASM_CLAC                        /* Do this early to minimize exposure */
        SWAPGS
 
        /*
@@ -299,7 +336,7 @@ ENTRY(entry_INT80_compat)
        TRACE_IRQS_OFF
 
        movq    %rsp, %rdi
-       call    do_syscall_32_irqs_off
+       call    do_int80_syscall_32
 .Lsyscall_32_done:
 
        /* Go back to user mode. */
index 9a66498..8f895ee 100644 (file)
@@ -6,17 +6,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#ifdef CONFIG_IA32_EMULATION
-#define SYM(sym, compat) compat
-#else
-#define SYM(sym, compat) sym
-#endif
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
+#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
index 41283d2..9dbc5ab 100644 (file)
@@ -6,19 +6,14 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
 
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
index dc1040a..2e5b565 100644 (file)
@@ -21,7 +21,7 @@
 12     common  brk                     sys_brk
 13     64      rt_sigaction            sys_rt_sigaction
 14     common  rt_sigprocmask          sys_rt_sigprocmask
-15     64      rt_sigreturn            stub_rt_sigreturn
+15     64      rt_sigreturn            sys_rt_sigreturn/ptregs
 16     64      ioctl                   sys_ioctl
 17     common  pread64                 sys_pread64
 18     common  pwrite64                sys_pwrite64
 53     common  socketpair              sys_socketpair
 54     64      setsockopt              sys_setsockopt
 55     64      getsockopt              sys_getsockopt
-56     common  clone                   stub_clone
-57     common  fork                    stub_fork
-58     common  vfork                   stub_vfork
-59     64      execve                  stub_execve
+56     common  clone                   sys_clone/ptregs
+57     common  fork                    sys_fork/ptregs
+58     common  vfork                   sys_vfork/ptregs
+59     64      execve                  sys_execve/ptregs
 60     common  exit                    sys_exit
 61     common  wait4                   sys_wait4
 62     common  kill                    sys_kill
 169    common  reboot                  sys_reboot
 170    common  sethostname             sys_sethostname
 171    common  setdomainname           sys_setdomainname
-172    common  iopl                    sys_iopl
+172    common  iopl                    sys_iopl/ptregs
 173    common  ioperm                  sys_ioperm
 174    64      create_module
 175    common  init_module             sys_init_module
 319    common  memfd_create            sys_memfd_create
 320    common  kexec_file_load         sys_kexec_file_load
 321    common  bpf                     sys_bpf
-322    64      execveat                stub_execveat
+322    64      execveat                sys_execveat/ptregs
 323    common  userfaultfd             sys_userfaultfd
 324    common  membarrier              sys_membarrier
 325    common  mlock2                  sys_mlock2
 # for native 64-bit operation.
 #
 512    x32     rt_sigaction            compat_sys_rt_sigaction
-513    x32     rt_sigreturn            stub_x32_rt_sigreturn
+513    x32     rt_sigreturn            sys32_x32_rt_sigreturn
 514    x32     ioctl                   compat_sys_ioctl
 515    x32     readv                   compat_sys_readv
 516    x32     writev                  compat_sys_writev
 517    x32     recvfrom                compat_sys_recvfrom
 518    x32     sendmsg                 compat_sys_sendmsg
 519    x32     recvmsg                 compat_sys_recvmsg
-520    x32     execve                  stub_x32_execve
+520    x32     execve                  compat_sys_execve/ptregs
 521    x32     ptrace                  compat_sys_ptrace
 522    x32     rt_sigpending           compat_sys_rt_sigpending
 523    x32     rt_sigtimedwait         compat_sys_rt_sigtimedwait
 542    x32     getsockopt              compat_sys_getsockopt
 543    x32     io_setup                compat_sys_io_setup
 544    x32     io_submit               compat_sys_io_submit
-545    x32     execveat                stub_x32_execveat
+545    x32     execveat                compat_sys_execveat/ptregs
index 0e7f8ec..cd3d301 100644 (file)
@@ -3,13 +3,63 @@
 in="$1"
 out="$2"
 
+syscall_macro() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+
+    # Entry can be either just a function name or "function/qualifier"
+    real_entry="${entry%%/*}"
+    qualifier="${entry:${#real_entry}}"                # Strip the function name
+    qualifier="${qualifier:1}"                 # Strip the slash, if any
+
+    echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+}
+
+emit() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+    compat="$4"
+
+    if [ "$abi" == "64" -a -n "$compat" ]; then
+       echo "a compat entry for a 64-bit syscall makes no sense" >&2
+       exit 1
+    fi
+
+    if [ -z "$compat" ]; then
+       if [ -n "$entry" ]; then
+           syscall_macro "$abi" "$nr" "$entry"
+       fi
+    else
+       echo "#ifdef CONFIG_X86_32"
+       if [ -n "$entry" ]; then
+           syscall_macro "$abi" "$nr" "$entry"
+       fi
+       echo "#else"
+       syscall_macro "$abi" "$nr" "$compat"
+       echo "#endif"
+    fi
+}
+
 grep '^[0-9]' "$in" | sort -n | (
     while read nr abi name entry compat; do
        abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
-       if [ -n "$compat" ]; then
-           echo "__SYSCALL_${abi}($nr, $entry, $compat)"
-       elif [ -n "$entry" ]; then
-           echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+       if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
+           # COMMON is the same as 64, except that we don't expect X32
+           # programs to use it.  Our expectation has nothing to do with
+           # any generated code, so treat them the same.
+           emit 64 "$nr" "$entry" "$compat"
+       elif [ "$abi" == "X32" ]; then
+           # X32 is equivalent to 64 on an X32-compatible kernel.
+           echo "#ifdef CONFIG_X86_X32_ABI"
+           emit 64 "$nr" "$entry" "$compat"
+           echo "#endif"
+       elif [ "$abi" == "I386" ]; then
+           emit "$abi" "$nr" "$entry" "$compat"
+       else
+           echo "Unknown abi $abi" >&2
+           exit 1
        fi
     done
 ) > "$out"
index 0224987..63a03bb 100644 (file)
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
        fprintf(outfile, "#include <asm/vdso.h>\n");
        fprintf(outfile, "\n");
        fprintf(outfile,
-               "static unsigned char raw_data[%lu] __page_aligned_data = {",
+               "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
                mapping_size);
        for (j = 0; j < stripped_len; j++) {
                if (j % 10 == 0)
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
        }
        fprintf(outfile, "\n};\n\n");
 
-       fprintf(outfile, "static struct page *pages[%lu];\n\n",
-               mapping_size / 4096);
-
        fprintf(outfile, "const struct vdso_image %s = {\n", name);
        fprintf(outfile, "\t.data = raw_data,\n");
        fprintf(outfile, "\t.size = %lu,\n", mapping_size);
-       fprintf(outfile, "\t.text_mapping = {\n");
-       fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
-       fprintf(outfile, "\t\t.pages = pages,\n");
-       fprintf(outfile, "\t},\n");
        if (alt_sec) {
                fprintf(outfile, "\t.alt = %lu,\n",
                        (unsigned long)GET_LE(&alt_sec->sh_offset));
index 08a317a..7853b53 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/mm_types.h>
 
-#include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/vdso.h>
 
index 3a1d929..0109ac6 100644 (file)
@@ -3,7 +3,7 @@
 */
 
 #include <asm/dwarf2.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 /*
index b8f69e2..10f7045 100644 (file)
@@ -20,6 +20,7 @@
 #include <asm/page.h>
 #include <asm/hpet.h>
 #include <asm/desc.h>
+#include <asm/cpufeature.h>
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
 
 void __init init_vdso_image(const struct vdso_image *image)
 {
-       int i;
-       int npages = (image->size) / PAGE_SIZE;
-
        BUG_ON(image->size % PAGE_SIZE != 0);
-       for (i = 0; i < npages; i++)
-               image->text_mapping.pages[i] =
-                       virt_to_page(image->data + i*PAGE_SIZE);
 
        apply_alternatives((struct alt_instr *)(image->data + image->alt),
                           (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 #endif
 }
 
+static int vdso_fault(const struct vm_special_mapping *sm,
+                     struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+       if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+               return VM_FAULT_SIGBUS;
+
+       vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+       get_page(vmf->page);
+       return 0;
+}
+
+static const struct vm_special_mapping text_mapping = {
+       .name = "[vdso]",
+       .fault = vdso_fault,
+};
+
+static int vvar_fault(const struct vm_special_mapping *sm,
+                     struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+       long sym_offset;
+       int ret = -EFAULT;
+
+       if (!image)
+               return VM_FAULT_SIGBUS;
+
+       sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+               image->sym_vvar_start;
+
+       /*
+        * Sanity check: a symbol offset of zero means that the page
+        * does not exist for this vdso image, not that the page is at
+        * offset zero relative to the text mapping.  This should be
+        * impossible here, because sym_offset should only be zero for
+        * the page past the end of the vvar mapping.
+        */
+       if (sym_offset == 0)
+               return VM_FAULT_SIGBUS;
+
+       if (sym_offset == image->sym_vvar_page) {
+               ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+                                   __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+       } else if (sym_offset == image->sym_hpet_page) {
+#ifdef CONFIG_HPET_TIMER
+               if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
+                       ret = vm_insert_pfn_prot(
+                               vma,
+                               (unsigned long)vmf->virtual_address,
+                               hpet_address >> PAGE_SHIFT,
+                               pgprot_noncached(PAGE_READONLY));
+               }
+#endif
+       } else if (sym_offset == image->sym_pvclock_page) {
+               struct pvclock_vsyscall_time_info *pvti =
+                       pvclock_pvti_cpu0_va();
+               if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
+                       ret = vm_insert_pfn(
+                               vma,
+                               (unsigned long)vmf->virtual_address,
+                               __pa(pvti) >> PAGE_SHIFT);
+               }
+       }
+
+       if (ret == 0 || ret == -EBUSY)
+               return VM_FAULT_NOPAGE;
+
+       return VM_FAULT_SIGBUS;
+}
+
 static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long addr, text_start;
        int ret = 0;
-       static struct page *no_pages[] = {NULL};
-       static struct vm_special_mapping vvar_mapping = {
+       static const struct vm_special_mapping vvar_mapping = {
                .name = "[vvar]",
-               .pages = no_pages,
+               .fault = vvar_fault,
        };
-       struct pvclock_vsyscall_time_info *pvti;
 
        if (calculate_addr) {
                addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 
        text_start = addr - image->sym_vvar_start;
        current->mm->context.vdso = (void __user *)text_start;
+       current->mm->context.vdso_image = image;
 
        /*
         * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
                                       image->size,
                                       VM_READ|VM_EXEC|
                                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-                                      &image->text_mapping);
+                                      &text_mapping);
 
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
        vma = _install_special_mapping(mm,
                                       addr,
                                       -image->sym_vvar_start,
-                                      VM_READ|VM_MAYREAD,
+                                      VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+                                      VM_PFNMAP,
                                       &vvar_mapping);
 
        if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
                goto up_fail;
        }
 
-       if (image->sym_vvar_page)
-               ret = remap_pfn_range(vma,
-                                     text_start + image->sym_vvar_page,
-                                     __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
-                                     PAGE_SIZE,
-                                     PAGE_READONLY);
-
-       if (ret)
-               goto up_fail;
-
-#ifdef CONFIG_HPET_TIMER
-       if (hpet_address && image->sym_hpet_page) {
-               ret = io_remap_pfn_range(vma,
-                       text_start + image->sym_hpet_page,
-                       hpet_address >> PAGE_SHIFT,
-                       PAGE_SIZE,
-                       pgprot_noncached(PAGE_READONLY));
-
-               if (ret)
-                       goto up_fail;
-       }
-#endif
-
-       pvti = pvclock_pvti_cpu0_va();
-       if (pvti && image->sym_pvclock_page) {
-               ret = remap_pfn_range(vma,
-                                     text_start + image->sym_pvclock_page,
-                                     __pa(pvti) >> PAGE_SHIFT,
-                                     PAGE_SIZE,
-                                     PAGE_READONLY);
-
-               if (ret)
-                       goto up_fail;
-       }
-
 up_fail:
        if (ret)
                current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
 #ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
 #endif
-       if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
+       if (static_cpu_has(X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);
 
        /*
index 51e3304..0fb3a10 100644 (file)
@@ -16,6 +16,8 @@
 #include <asm/vgtod.h>
 #include <asm/vvar.h>
 
+int vclocks_used __read_mostly;
+
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
 
 void update_vsyscall(struct timekeeper *tk)
 {
+       int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
        struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
 
+       /* Mark the new vclock used. */
+       BUILD_BUG_ON(VCLOCK_MAX >= 32);
+       WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
+
        gtod_write_begin(vdata);
 
        /* copy vsyscall data */
-       vdata->vclock_mode      = tk->tkr_mono.clock->archdata.vclock_mode;
+       vdata->vclock_mode      = vclock_mode;
        vdata->cycle_last       = tk->tkr_mono.cycle_last;
        vdata->mask             = tk->tkr_mono.mask;
        vdata->mult             = tk->tkr_mono.mult;
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
new file mode 100644 (file)
index 0000000..fdfea15
--- /dev/null
@@ -0,0 +1,13 @@
+obj-y                  += core.o
+
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/core.o amd/uncore.o
+obj-$(CONFIG_X86_LOCAL_APIC)            += amd/ibs.o msr.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD)               += amd/iommu.o
+endif
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/core.o intel/bts.o intel/cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/cstate.o intel/ds.o intel/knc.o 
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += intel/rapl.o msr.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
new file mode 100644 (file)
index 0000000..049ada8
--- /dev/null
@@ -0,0 +1,731 @@
+#include <linux/perf_event.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/apicdef.h>
+
+#include "../perf_event.h"
+
+static __initconst const u64 amd_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
+  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
+  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+       return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+       int offset;
+
+       if (!index)
+               return index;
+
+       if (eventsel)
+               offset = event_offsets[index];
+       else
+               offset = count_offsets[index];
+
+       if (offset)
+               return offset;
+
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               offset = index;
+       else
+               offset = index << 1;
+
+       if (eventsel)
+               event_offsets[index] = offset;
+       else
+               count_offsets[index] = offset;
+
+       return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+       if (event->attr.exclude_host && event->attr.exclude_guest)
+               /*
+                * When HO == GO == 1 the hardware treats that as GO == HO == 0
+                * and will count in both modes. We don't want to count in that
+                * case so we emulate no-counting by setting US = OS = 0.
+                */
+               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+                                     ARCH_PERFMON_EVENTSEL_OS);
+       else if (event->attr.exclude_host)
+               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+       else if (event->attr.exclude_guest)
+               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+       return 0;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+       return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+
+       return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+       int ret;
+
+       /* pass precise event sampling to ibs: */
+       if (event->attr.precise_ip && get_ibs_caps())
+               return -ENOENT;
+
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
+       ret = x86_pmu_hw_config(event);
+       if (ret)
+               return ret;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+       return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+                                          struct perf_event *event)
+{
+       struct amd_nb *nb = cpuc->amd_nb;
+       int i;
+
+       /*
+        * need to scan whole list because event may not have
+        * been assigned during scheduling
+        *
+        * no race condition possible because event can only
+        * be removed on one CPU at a time AND PMU is disabled
+        * when we come here
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (cmpxchg(nb->owners + i, event, NULL) == event)
+                       break;
+       }
+}
+
+ /*
+  * AMD64 NorthBridge events need special treatment because
+  * counter access needs to be synchronized across all cores
+  * of a package. Refer to BKDG section 3.12
+  *
+  * NB events are events measuring L3 cache, Hypertransport
+  * traffic. They are identified by an event code >= 0xe00.
+  * They measure events on the NorthBride which is shared
+  * by all cores on a package. NB events are counted on a
+  * shared set of counters. When a NB event is programmed
+  * in a counter, the data actually comes from a shared
+  * counter. Thus, access to those counters needs to be
+  * synchronized.
+  *
+  * We implement the synchronization such that no two cores
+  * can be measuring NB events using the same counters. Thus,
+  * we maintain a per-NB allocation table. The available slot
+  * is propagated using the event_constraint structure.
+  *
+  * We provide only one choice for each NB event based on
+  * the fact that only NB events have restrictions. Consequently,
+  * if a counter is available, there is a guarantee the NB event
+  * will be assigned to it. If no slot is available, an empty
+  * constraint is returned and scheduling will eventually fail
+  * for this event.
+  *
+  * Note that all cores attached the same NB compete for the same
+  * counters to host NB events, this is why we use atomic ops. Some
+  * multi-chip CPUs may have more than one NB.
+  *
+  * Given that resources are allocated (cmpxchg), they must be
+  * eventually freed for others to use. This is accomplished by
+  * calling __amd_put_nb_event_constraints()
+  *
+  * Non NB events are not impacted by this restriction.
+  */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                              struct event_constraint *c)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct amd_nb *nb = cpuc->amd_nb;
+       struct perf_event *old;
+       int idx, new = -1;
+
+       if (!c)
+               c = &unconstrained;
+
+       if (cpuc->is_fake)
+               return c;
+
+       /*
+        * detect if already present, if so reuse
+        *
+        * cannot merge with actual allocation
+        * because of possible holes
+        *
+        * event can already be present yet not assigned (in hwc->idx)
+        * because of successive calls to x86_schedule_events() from
+        * hw_perf_group_sched_in() without hw_perf_enable()
+        */
+       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+               if (new == -1 || hwc->idx == idx)
+                       /* assign free slot, prefer hwc->idx */
+                       old = cmpxchg(nb->owners + idx, NULL, event);
+               else if (nb->owners[idx] == event)
+                       /* event already present */
+                       old = event;
+               else
+                       continue;
+
+               if (old && old != event)
+                       continue;
+
+               /* reassign to this slot */
+               if (new != -1)
+                       cmpxchg(nb->owners + new, event, NULL);
+               new = idx;
+
+               /* already present, reuse */
+               if (old == event)
+                       break;
+       }
+
+       if (new == -1)
+               return &emptyconstraint;
+
+       return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+       struct amd_nb *nb;
+       int i;
+
+       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+       if (!nb)
+               return NULL;
+
+       nb->nb_id = -1;
+
+       /*
+        * initialize all possible NB constraints
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               __set_bit(i, nb->event_constraints[i].idxmsk);
+               nb->event_constraints[i].weight = 1;
+       }
+       return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       WARN_ON_ONCE(cpuc->amd_nb);
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return NOTIFY_OK;
+
+       cpuc->amd_nb = amd_alloc_nb(cpu);
+       if (!cpuc->amd_nb)
+               return NOTIFY_BAD;
+
+       return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+       struct amd_nb *nb;
+       int i, nb_id;
+
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       nb_id = amd_get_nb_id(cpu);
+       WARN_ON_ONCE(nb_id == BAD_APICID);
+
+       for_each_online_cpu(i) {
+               nb = per_cpu(cpu_hw_events, i).amd_nb;
+               if (WARN_ON_ONCE(!nb))
+                       continue;
+
+               if (nb->nb_id == nb_id) {
+                       *onln = cpuc->amd_nb;
+                       cpuc->amd_nb = nb;
+                       break;
+               }
+       }
+
+       cpuc->amd_nb->nb_id = nb_id;
+       cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+       struct cpu_hw_events *cpuhw;
+
+       if (boot_cpu_data.x86_max_cores < 2)
+               return;
+
+       cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+       if (cpuhw->amd_nb) {
+               struct amd_nb *nb = cpuhw->amd_nb;
+
+               if (nb->nb_id == -1 || --nb->refcnt == 0)
+                       kfree(nb);
+
+               cpuhw->amd_nb = NULL;
+       }
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       /*
+        * if not NB event or no NB, then no constraints
+        */
+       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+               return &unconstrained;
+
+       return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+                                     struct perf_event *event)
+{
+       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+               __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *amd_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
+
+#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS                0x000000C0ULL
+#define AMD_EVENT_DE           0x000000D0ULL
+#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000       FP      PERF_CTL[5:3]
+ * 0x010       FP      PERF_CTL[5:3]
+ * 0x020       LS      PERF_CTL[5:0]
+ * 0x030       LS      PERF_CTL[5:0]
+ * 0x040       DC      PERF_CTL[5:0]
+ * 0x050       DC      PERF_CTL[5:0]
+ * 0x060       CU      PERF_CTL[2:0]
+ * 0x070       CU      PERF_CTL[2:0]
+ * 0x080       IC/DE   PERF_CTL[2:0]
+ * 0x090       IC/DE   PERF_CTL[2:0]
+ * 0x0A0       ---
+ * 0x0B0       ---
+ * 0x0C0       EX/LS   PERF_CTL[5:0]
+ * 0x0D0       DE      PERF_CTL[2:0]
+ * 0x0E0       NB      NB_PERF_CTL[3:0]
+ * 0x0F0       NB      NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003       FP      PERF_CTL[3]
+ * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B       FP      PERF_CTL[3]
+ * 0x00D       FP      PERF_CTL[3]
+ * 0x023       DE      PERF_CTL[2:0]
+ * 0x02D       LS      PERF_CTL[3]
+ * 0x02E       LS      PERF_CTL[3,0]
+ * 0x031       LS      PERF_CTL[2:0] (**)
+ * 0x043       CU      PERF_CTL[2:0]
+ * 0x045       CU      PERF_CTL[2:0]
+ * 0x046       CU      PERF_CTL[2:0]
+ * 0x054       CU      PERF_CTL[2:0]
+ * 0x055       CU      PERF_CTL[2:0]
+ * 0x08F       IC      PERF_CTL[0]
+ * 0x187       DE      PERF_CTL[0]
+ * 0x188       DE      PERF_CTL[0]
+ * 0x0DB       EX      PERF_CTL[5:0]
+ * 0x0DC       LS      PERF_CTL[5:0]
+ * 0x0DD       LS      PERF_CTL[5:0]
+ * 0x0DE       LS      PERF_CTL[5:0]
+ * 0x0DF       LS      PERF_CTL[5:0]
+ * 0x1C0       EX      PERF_CTL[5:3]
+ * 0x1D6       EX      PERF_CTL[5:0]
+ * 0x1D8       EX      PERF_CTL[5:0]
+ *
+ * (*)  depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+                              struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int event_code = amd_get_event_code(hwc);
+
+       switch (event_code & AMD_EVENT_TYPE_MASK) {
+       case AMD_EVENT_FP:
+               switch (event_code) {
+               case 0x000:
+                       if (!(hwc->config & 0x0000F000ULL))
+                               break;
+                       if (!(hwc->config & 0x00000F00ULL))
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x004:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               break;
+                       return &amd_f15_PMC3;
+               case 0x003:
+               case 0x00B:
+               case 0x00D:
+                       return &amd_f15_PMC3;
+               }
+               return &amd_f15_PMC53;
+       case AMD_EVENT_LS:
+       case AMD_EVENT_DC:
+       case AMD_EVENT_EX_LS:
+               switch (event_code) {
+               case 0x023:
+               case 0x043:
+               case 0x045:
+               case 0x046:
+               case 0x054:
+               case 0x055:
+                       return &amd_f15_PMC20;
+               case 0x02D:
+                       return &amd_f15_PMC3;
+               case 0x02E:
+                       return &amd_f15_PMC30;
+               case 0x031:
+                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+                               return &amd_f15_PMC20;
+                       return &emptyconstraint;
+               case 0x1C0:
+                       return &amd_f15_PMC53;
+               default:
+                       return &amd_f15_PMC50;
+               }
+       case AMD_EVENT_CU:
+       case AMD_EVENT_IC_DE:
+       case AMD_EVENT_DE:
+               switch (event_code) {
+               case 0x08F:
+               case 0x187:
+               case 0x188:
+                       return &amd_f15_PMC0;
+               case 0x0DB ... 0x0DF:
+               case 0x1D6:
+               case 0x1D8:
+                       return &amd_f15_PMC50;
+               default:
+                       return &amd_f15_PMC20;
+               }
+       case AMD_EVENT_NB:
+               /* moved to perf_event_amd_uncore.c */
+               return &emptyconstraint;
+       default:
+               return &emptyconstraint;
+       }
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+                   (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+       .name                   = "AMD",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = x86_pmu_enable_all,
+       .enable                 = x86_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = amd_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_K7_EVNTSEL0,
+       .perfctr                = MSR_K7_PERFCTR0,
+       .addr_offset            = amd_pmu_addr_offset,
+       .event_map              = amd_pmu_event_map,
+       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
+       .num_counters           = AMD64_NUM_COUNTERS,
+       .cntval_bits            = 48,
+       .cntval_mask            = (1ULL << 48) - 1,
+       .apic                   = 1,
+       /* use highest bit to detect overflow */
+       .max_period             = (1ULL << 47) - 1,
+       .get_event_constraints  = amd_get_event_constraints,
+       .put_event_constraints  = amd_put_event_constraints,
+
+       .format_attrs           = amd_format_attr,
+       .events_sysfs_show      = amd_event_sysfs_show,
+
+       .cpu_prepare            = amd_pmu_cpu_prepare,
+       .cpu_starting           = amd_pmu_cpu_starting,
+       .cpu_dead               = amd_pmu_cpu_dead,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+               return 0;
+
+       switch (boot_cpu_data.x86) {
+       case 0x15:
+               pr_cont("Fam15h ");
+               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+               break;
+
+       default:
+               pr_err("core perfctr but no constraints; unknown hardware!\n");
+               return -ENODEV;
+       }
+
+       /*
+        * If core performance counter extensions exists, we must use
+        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+        * amd_pmu_addr_offset().
+        */
+       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
+       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
+       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
+
+       pr_cont("core perfctr, ");
+       return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+       int ret;
+
+       /* Performance-monitoring supported from K7 and later: */
+       if (boot_cpu_data.x86 < 6)
+               return -ENODEV;
+
+       x86_pmu = amd_pmu;
+
+       ret = amd_core_pmu_init();
+       if (ret)
+               return ret;
+
+       /* Events are common for all AMDs */
+       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+              sizeof(hw_cache_event_ids));
+
+       return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       cpuc->perf_ctr_virt_mask = 0;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * We only mask out the Host-only bit so that host-only counting works
+        * when SVM is disabled. If someone sets up a guest-only counter when
+        * SVM is disabled the Guest-only bits still gets set and the counter
+        * will not count anything.
+        */
+       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+       /* Reload all events */
+       x86_pmu_disable_all();
+       x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
new file mode 100644 (file)
index 0000000..51087c2
--- /dev/null
@@ -0,0 +1,959 @@
+/*
+ * Performance events - AMD IBS
+ *
+ *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
+
+enum ibs_states {
+       IBS_ENABLED     = 0,
+       IBS_STARTED     = 1,
+       IBS_STOPPING    = 2,
+
+       IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+       struct perf_event       *event;
+       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+       struct pmu                      pmu;
+       unsigned int                    msr;
+       u64                             config_mask;
+       u64                             cnt_mask;
+       u64                             enable_mask;
+       u64                             valid_mask;
+       u64                             max_period;
+       unsigned long                   offset_mask[1];
+       int                             offset_max;
+       struct cpu_perf_ibs __percpu    *pcpu;
+
+       struct attribute                **format_attrs;
+       struct attribute_group          format_group;
+       const struct attribute_group    *attr_groups[2];
+
+       u64                             (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+       u32             size;
+       union {
+               u32     data[0];        /* data buffer starts here */
+               u32     caps;
+       };
+       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int overflow = 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       if (unlikely(left < (s64)min)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               overflow = 1;
+       }
+
+       /*
+        * If the hw period that triggers the sw overflow is too short
+        * we might hit the irq handler. This biases the results.
+        * Thus we shorten the next-to-last period and set the last
+        * period to the max period.
+        */
+       if (left > max) {
+               left -= max;
+               if (left > max)
+                       left = max;
+               else if (left < min)
+                       left = min;
+       }
+
+       *hw_period = (u64)left;
+
+       return overflow;
+}
+
+static  int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - width;
+       u64 prev_raw_count;
+       u64 delta;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+       prev_raw_count = local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               return 0;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+       if (perf_ibs_fetch.pmu.type == type)
+               return &perf_ibs_fetch;
+       if (perf_ibs_op.pmu.type == type)
+               return &perf_ibs_op;
+       return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
+ *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+       switch (event->attr.precise_ip) {
+       case 0:
+               return -ENOENT;
+       case 1:
+       case 2:
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       switch (event->attr.type) {
+       case PERF_TYPE_HARDWARE:
+               switch (event->attr.config) {
+               case PERF_COUNT_HW_CPU_CYCLES:
+                       *config = 0;
+                       return 0;
+               }
+               break;
+       case PERF_TYPE_RAW:
+               switch (event->attr.config) {
+               case 0x0076:
+                       *config = 0;
+                       return 0;
+               case 0x00C1:
+                       *config = IBS_OP_CNT_CTL;
+                       return 0;
+               }
+               break;
+       default:
+               return -ENOENT;
+       }
+
+       return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+       .exclude_user   = 1,
+       .exclude_kernel = 1,
+       .exclude_hv     = 1,
+       .exclude_idle   = 1,
+       .exclude_host   = 1,
+       .exclude_guest  = 1,
+};
+
+static int perf_ibs_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs;
+       u64 max_cnt, config;
+       int ret;
+
+       perf_ibs = get_ibs_pmu(event->attr.type);
+       if (perf_ibs) {
+               config = event->attr.config;
+       } else {
+               perf_ibs = &perf_ibs_op;
+               ret = perf_ibs_precise_event(event, &config);
+               if (ret)
+                       return ret;
+       }
+
+       if (event->pmu != &perf_ibs->pmu)
+               return -ENOENT;
+
+       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+               return -EINVAL;
+
+       if (config & ~perf_ibs->config_mask)
+               return -EINVAL;
+
+       if (hwc->sample_period) {
+               if (config & perf_ibs->cnt_mask)
+                       /* raw max_cnt may not be set */
+                       return -EINVAL;
+               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+                       /*
+                        * lower 4 bits can not be set in ibs max cnt,
+                        * but allowing it in case we adjust the
+                        * sample period to set a frequency.
+                        */
+                       return -EINVAL;
+               hwc->sample_period &= ~0x0FULL;
+               if (!hwc->sample_period)
+                       hwc->sample_period = 0x10;
+       } else {
+               max_cnt = config & perf_ibs->cnt_mask;
+               config &= ~perf_ibs->cnt_mask;
+               event->attr.sample_period = max_cnt << 4;
+               hwc->sample_period = event->attr.sample_period;
+       }
+
+       if (!hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * If we modify hwc->sample_period, we also need to update
+        * hwc->last_period and hwc->period_left.
+        */
+       hwc->last_period = hwc->sample_period;
+       local64_set(&hwc->period_left, hwc->sample_period);
+
+       hwc->config_base = perf_ibs->msr;
+       hwc->config = config;
+
+       return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+                              struct hw_perf_event *hwc, u64 *period)
+{
+       int overflow;
+
+       /* ignore lower 4 bits in min count: */
+       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+       local64_set(&hwc->prev_count, 0);
+
+       return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+       return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+       u64 count = 0;
+
+       if (config & IBS_OP_VAL)
+               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+       if (ibs_caps & IBS_CAPS_RDWROPCNT)
+               count += (config & IBS_OP_CUR_CNT) >> 32;
+
+       return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+                     u64 *config)
+{
+       u64 count = perf_ibs->get_count(*config);
+
+       /*
+        * Set width to 64 since we do not overflow on max width but
+        * instead on max count. In perf_ibs_set_period() we clear
+        * prev count manually on overflow.
+        */
+       while (!perf_event_try_update(event, count, 64)) {
+               rdmsrl(event->hw.config_base, *config);
+               count = perf_ibs->get_count(*config);
+       }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+                                        struct hw_perf_event *hwc, u64 config)
+{
+       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+                                         struct hw_perf_event *hwc, u64 config)
+{
+       config &= ~perf_ibs->cnt_mask;
+       wrmsrl(hwc->config_base, config);
+       config &= ~perf_ibs->enable_mask;
+       wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 period;
+
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       perf_ibs_set_period(perf_ibs, hwc, &period);
+       set_bit(IBS_STARTED, pcpu->state);
+       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       u64 config;
+       int stopping;
+
+       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+               return;
+
+       rdmsrl(hwc->config_base, config);
+
+       if (stopping) {
+               set_bit(IBS_STOPPING, pcpu->state);
+               perf_ibs_disable_event(perf_ibs, hwc, config);
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       /*
+        * Clear valid bit to not count rollovers on update, rollovers
+        * are only updated in the irq handler.
+        */
+       config &= ~perf_ibs->valid_mask;
+
+       perf_ibs_event_update(perf_ibs, event, &config);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+               return -ENOSPC;
+
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       pcpu->event = event;
+
+       if (flags & PERF_EF_START)
+               perf_ibs_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+               return;
+
+       perf_ibs_stop(event, PERF_EF_UPDATE);
+
+       pcpu->event = NULL;
+
+       perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en,       "config:57");
+PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+       &format_attr_rand_en.attr,
+       NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+       NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSFETCHCTL,
+       .config_mask            = IBS_FETCH_CONFIG_MASK,
+       .cnt_mask               = IBS_FETCH_MAX_CNT,
+       .enable_mask            = IBS_FETCH_ENABLE,
+       .valid_mask             = IBS_FETCH_VAL,
+       .max_period             = IBS_FETCH_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
+       .format_attrs           = ibs_fetch_format_attrs,
+
+       .get_count              = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+       .pmu = {
+               .task_ctx_nr    = perf_invalid_context,
+
+               .event_init     = perf_ibs_init,
+               .add            = perf_ibs_add,
+               .del            = perf_ibs_del,
+               .start          = perf_ibs_start,
+               .stop           = perf_ibs_stop,
+               .read           = perf_ibs_read,
+       },
+       .msr                    = MSR_AMD64_IBSOPCTL,
+       .config_mask            = IBS_OP_CONFIG_MASK,
+       .cnt_mask               = IBS_OP_MAX_CNT,
+       .enable_mask            = IBS_OP_ENABLE,
+       .valid_mask             = IBS_OP_VAL,
+       .max_period             = IBS_OP_MAX_CNT << 4,
+       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
+       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
+       .format_attrs           = ibs_op_format_attrs,
+
+       .get_count              = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+       struct perf_event *event = pcpu->event;
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_sample_data data;
+       struct perf_raw_record raw;
+       struct pt_regs regs;
+       struct perf_ibs_data ibs_data;
+       int offset, size, check_rip, offset_max, throttle = 0;
+       unsigned int msr;
+       u64 *buf, *config, period;
+
+       if (!test_bit(IBS_STARTED, pcpu->state)) {
+               /*
+                * Catch spurious interrupts after stopping IBS: After
+                * disabling IBS there could be still incoming NMIs
+                * with samples that even have the valid bit cleared.
+                * Mark all this NMIs as handled.
+                */
+               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+       }
+
+       msr = hwc->config_base;
+       buf = ibs_data.regs;
+       rdmsrl(msr, *buf);
+       if (!(*buf++ & perf_ibs->valid_mask))
+               return 0;
+
+       config = &ibs_data.regs[0];
+       perf_ibs_event_update(perf_ibs, event, config);
+       perf_sample_data_init(&data, 0, hwc->last_period);
+       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+               goto out;       /* no sw counter overflow */
+
+       ibs_data.caps = ibs_caps;
+       size = 1;
+       offset = 1;
+       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+       if (event->attr.sample_type & PERF_SAMPLE_RAW)
+               offset_max = perf_ibs->offset_max;
+       else if (check_rip)
+               offset_max = 2;
+       else
+               offset_max = 1;
+       do {
+               rdmsrl(msr + offset, *buf++);
+               size++;
+               offset = find_next_bit(perf_ibs->offset_mask,
+                                      perf_ibs->offset_max,
+                                      offset + 1);
+       } while (offset < offset_max);
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               /*
+                * Read IbsBrTarget and IbsOpData4 separately
+                * depending on their availability.
+                * Can't add to offset_max as they are staggered
+                */
+               if (ibs_caps & IBS_CAPS_BRNTRGT) {
+                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+                       size++;
+               }
+               if (ibs_caps & IBS_CAPS_OPDATA4) {
+                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+                       size++;
+               }
+       }
+       ibs_data.size = sizeof(u64) * size;
+
+       regs = *iregs;
+       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+               regs.flags &= ~PERF_EFLAGS_EXACT;
+       } else {
+               set_linear_ip(&regs, ibs_data.regs[1]);
+               regs.flags |= PERF_EFLAGS_EXACT;
+       }
+
+       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+               raw.size = sizeof(u32) + ibs_data.size;
+               raw.data = ibs_data.data;
+               data.raw = &raw;
+       }
+
+       throttle = perf_event_overflow(event, &data, &regs);
+out:
+       if (throttle)
+               perf_ibs_disable_event(perf_ibs, hwc, *config);
+       else
+               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+       perf_event_update_userpage(event);
+
+       return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       int handled = 0;
+
+       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+       struct cpu_perf_ibs __percpu *pcpu;
+       int ret;
+
+       pcpu = alloc_percpu(struct cpu_perf_ibs);
+       if (!pcpu)
+               return -ENOMEM;
+
+       perf_ibs->pcpu = pcpu;
+
+       /* register attributes */
+       if (perf_ibs->format_attrs[0]) {
+               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+               perf_ibs->format_group.name     = "format";
+               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
+
+               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
+               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
+       }
+
+       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+       if (ret) {
+               perf_ibs->pcpu = NULL;
+               free_percpu(pcpu);
+       }
+
+       return ret;
+}
+
+static __init int perf_event_ibs_init(void)
+{
+       struct attribute **attr = ibs_op_format_attrs;
+
+       if (!ibs_caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+       if (ibs_caps & IBS_CAPS_OPCNT) {
+               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+               *attr++ = &format_attr_cnt_ctl.attr;
+       }
+       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+       pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+       return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+       u32 caps;
+       unsigned int max_level;
+
+       if (!boot_cpu_has(X86_FEATURE_IBS))
+               return 0;
+
+       /* check IBS cpuid feature flags */
+       max_level = cpuid_eax(0x80000000);
+       if (max_level < IBS_CPUID_FEATURES)
+               return IBS_CAPS_DEFAULT;
+
+       caps = cpuid_eax(IBS_CPUID_FEATURES);
+       if (!(caps & IBS_CAPS_AVAIL))
+               /* cpuid flags not valid */
+               return IBS_CAPS_DEFAULT;
+
+       return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+       return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+       return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+       int offset;
+       u64 val;
+       int valid = 0;
+
+       preempt_disable();
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       if (!get_eilvt(offset)) {
+               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+               goto out;
+       }
+
+       valid = 1;
+out:
+       preempt_enable();
+
+       return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+       struct pci_dev *cpu_cfg;
+       int nodes;
+       u32 value = 0;
+
+       nodes = 0;
+       cpu_cfg = NULL;
+       do {
+               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
+                                        cpu_cfg);
+               if (!cpu_cfg)
+                       break;
+               ++nodes;
+               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+                                      | IBSCTL_LVT_OFFSET_VALID);
+               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+                       pci_dev_put(cpu_cfg);
+                       pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+                                value);
+                       return -EINVAL;
+               }
+       } while (1);
+
+       if (!nodes) {
+               pr_debug("No CPU node configured for IBS\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
+ * is using the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+       int offset;
+       int ret;
+
+       preempt_disable();
+       /* find the next free available EILVT entry, skip offset 0 */
+       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+               if (get_eilvt(offset))
+                       break;
+       }
+       preempt_enable();
+
+       if (offset == APIC_EILVT_NR_MAX) {
+               pr_debug("No EILVT entry available\n");
+               return;
+       }
+
+       ret = setup_ibs_ctl(offset);
+       if (ret)
+               goto out;
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       pr_info("IBS: LVT offset %d assigned\n", offset);
+
+       return;
+out:
+       preempt_disable();
+       put_eilvt(offset);
+       preempt_enable();
+       return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+       /*
+        * Force LVT offset assignment for family 10h: The offsets are
+        * not assigned by the BIOS for this family, so the OS is
+        * responsible for doing it. If the OS assignment fails, fall
+        * back to BIOS settings and try to setup this.
+        */
+       if (boot_cpu_data.x86 == 0x10)
+               force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       if (!(val & IBSCTL_LVT_OFFSET_VALID))
+               return -EINVAL;
+
+       return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset < 0)
+               goto failed;
+
+       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+               return;
+failed:
+       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+               smp_processor_id());
+}
+
+static void clear_APIC_ibs(void *dummy)
+{
+       int offset;
+
+       offset = get_ibs_lvt_offset();
+       if (offset >= 0)
+               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+       clear_APIC_ibs(NULL);
+       return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+       ibs_eilvt_setup();
+       setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+       .resume         = perf_ibs_resume,
+       .suspend        = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+       register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int
+perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_STARTING:
+               setup_APIC_ibs(NULL);
+               break;
+       case CPU_DYING:
+               clear_APIC_ibs(NULL);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static __init int amd_ibs_init(void)
+{
+       u32 caps;
+       int ret = -EINVAL;
+
+       caps = __get_ibs_caps();
+       if (!caps)
+               return -ENODEV; /* ibs not supported by the cpu */
+
+       ibs_eilvt_setup();
+
+       if (!ibs_eilvt_valid())
+               goto out;
+
+       perf_ibs_pm_init();
+       cpu_notifier_register_begin();
+       ibs_caps = caps;
+       /* make ibs_caps visible to other cpus: */
+       smp_mb();
+       smp_call_function(setup_APIC_ibs, NULL, 1);
+       __perf_cpu_notifier(perf_ibs_cpu_notifier);
+       cpu_notifier_register_done();
+
+       ret = perf_event_ibs_init();
+out:
+       if (ret)
+               pr_err("Failed to setup IBS, %d\n", ret);
+       return ret;
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
new file mode 100644 (file)
index 0000000..635e5eb
--- /dev/null
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+#define COUNTER_SHIFT          16
+
+#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
+#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+
+/* iommu pmu config masks */
+#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
+#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
+#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
+#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
+#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
+#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
+#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+
+static struct perf_amd_iommu __perf_iommu;
+
+struct perf_amd_iommu {
+       struct pmu pmu;
+       u8 max_banks;
+       u8 max_counters;
+       u64 cntr_assign_mask;
+       raw_spinlock_t lock;
+       const struct attribute_group *attr_groups[4];
+};
+
+#define format_group   attr_groups[0]
+#define cpumask_group  attr_groups[1]
+#define events_group   attr_groups[2]
+#define null_group     attr_groups[3]
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource,    "config:0-7");
+PMU_FORMAT_ATTR(devid,      "config:8-23");
+PMU_FORMAT_ATTR(pasid,      "config:24-39");
+PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+
+static struct attribute *iommu_format_attrs[] = {
+       &format_attr_csource.attr,
+       &format_attr_devid.attr,
+       &format_attr_pasid.attr,
+       &format_attr_domid.attr,
+       &format_attr_devid_mask.attr,
+       &format_attr_pasid_mask.attr,
+       &format_attr_domid_mask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+       .name = "format",
+       .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+struct amd_iommu_event_desc {
+       struct kobj_attribute attr;
+       const char *event;
+};
+
+static ssize_t _iommu_event_show(struct kobject *kobj,
+                               struct kobj_attribute *attr, char *buf)
+{
+       struct amd_iommu_event_desc *event =
+               container_of(attr, struct amd_iommu_event_desc, attr);
+       return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
+{                                                              \
+       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
+       .event = _event,                                        \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
+       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
+       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
+       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
+       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
+       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
+       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
+       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
+       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
+       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
+       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
+       { /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+                                  struct device_attribute *attr,
+                                  char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+       .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+{
+       unsigned long flags;
+       int shift, bank, cntr, retval;
+       int max_banks = perf_iommu->max_banks;
+       int max_cntrs = perf_iommu->max_counters;
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+
+       for (bank = 0, shift = 0; bank < max_banks; bank++) {
+               for (cntr = 0; cntr < max_cntrs; cntr++) {
+                       shift = bank + (bank*3) + cntr;
+                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+                               continue;
+                       } else {
+                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
+                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+                               goto out;
+                       }
+               }
+       }
+       retval = -ENOSPC;
+out:
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+       return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+                                       u8 bank, u8 cntr)
+{
+       unsigned long flags;
+       int max_banks, max_cntrs;
+       int shift = 0;
+
+       max_banks = perf_iommu->max_banks;
+       max_cntrs = perf_iommu->max_counters;
+
+       if ((bank > max_banks) || (cntr > max_cntrs))
+               return -EINVAL;
+
+       shift = bank + cntr + (bank*3);
+
+       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+       return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct perf_amd_iommu *perf_iommu;
+       u64 config, config1;
+
+       /* test the event attr type check for PMU enumeration */
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * IOMMU counters are shared across all cores.
+        * Therefore, it does not support per-process mode.
+        * Also, it does not support event sampling mode.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* IOMMU counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       perf_iommu = &__perf_iommu;
+
+       if (event->pmu != &perf_iommu->pmu)
+               return -ENOENT;
+
+       if (perf_iommu) {
+               config = event->attr.config;
+               config1 = event->attr.config1;
+       } else {
+               return -EINVAL;
+       }
+
+       /* integrate with iommu base devid (0000), assume one iommu */
+       perf_iommu->max_banks =
+               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+       perf_iommu->max_counters =
+               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+               return -EINVAL;
+
+       /* update the hw_perf_event struct with the iommu config data */
+       hwc->config = config;
+       hwc->extra_reg.config = config1;
+
+       return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+       u8 csource = _GET_CSOURCE(ev);
+       u16 devid = _GET_DEVID(ev);
+       u64 reg = 0ULL;
+
+       reg = csource;
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+       if (reg)
+               reg |= (1UL << 31);
+       amd_iommu_pc_get_set_reg_val(devid,
+                       _GET_BANK(ev), _GET_CNTR(ev) ,
+                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+       u64 reg = 0ULL;
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                       _GET_BANK(event), _GET_CNTR(event),
+                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       pr_debug("perf: amd_iommu:perf_iommu_start\n");
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+       hwc->state = 0;
+
+       if (flags & PERF_EF_RELOAD) {
+               u64 prev_raw_count =  local64_read(&hwc->prev_count);
+               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+       }
+
+       perf_iommu_enable_event(event);
+       perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+       u64 count = 0ULL;
+       u64 prev_raw_count = 0ULL;
+       u64 delta = 0ULL;
+       struct hw_perf_event *hwc = &event->hw;
+       pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+                               _GET_BANK(event), _GET_CNTR(event),
+                               IOMMU_PC_COUNTER_REG, &count, false);
+
+       /* IOMMU pc counter register is only 48 bits */
+       count &= 0xFFFFFFFFFFFFULL;
+
+       prev_raw_count =  local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       count) != prev_raw_count)
+               return;
+
+       /* Handling 48-bit counter overflowing */
+       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       perf_iommu_disable_event(event);
+       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if (hwc->state & PERF_HES_UPTODATE)
+               return;
+
+       config = hwc->config;
+       perf_iommu_read(event);
+       hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+       int retval;
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_add\n");
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       /* request an iommu bank/counter */
+       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+       if (retval != -ENOSPC)
+               event->hw.extra_reg.reg = (u16)retval;
+       else
+               return retval;
+
+       if (flags & PERF_EF_START)
+               perf_iommu_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+       struct perf_amd_iommu *perf_iommu =
+                       container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+       pr_debug("perf: amd_iommu:perf_iommu_del\n");
+       perf_iommu_stop(event, PERF_EF_UPDATE);
+
+       /* clear the assigned iommu bank/counter */
+       clear_avail_iommu_bnk_cntr(perf_iommu,
+                                    _GET_BANK(event),
+                                    _GET_CNTR(event));
+
+       perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+       struct attribute **attrs;
+       struct attribute_group *attr_group;
+       int i = 0, j;
+
+       while (amd_iommu_v2_event_descs[i].attr.attr.name)
+               i++;
+
+       attr_group = kzalloc(sizeof(struct attribute *)
+               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+       if (!attr_group)
+               return -ENOMEM;
+
+       attrs = (struct attribute **)(attr_group + 1);
+       for (j = 0; j < i; j++)
+               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+       attr_group->name = "events";
+       attr_group->attrs = attrs;
+       perf_iommu->events_group = attr_group;
+
+       return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+       if (__perf_iommu.events_group != NULL) {
+               kfree(__perf_iommu.events_group);
+               __perf_iommu.events_group = NULL;
+       }
+}
+
+static __init int _init_perf_amd_iommu(
+       struct perf_amd_iommu *perf_iommu, char *name)
+{
+       int ret;
+
+       raw_spin_lock_init(&perf_iommu->lock);
+
+       /* Init format attributes */
+       perf_iommu->format_group = &amd_iommu_format_group;
+
+       /* Init cpumask attributes to only core 0 */
+       cpumask_set_cpu(0, &iommu_cpumask);
+       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+       /* Init events attributes */
+       if (_init_events_attrs(perf_iommu) != 0)
+               pr_err("perf: amd_iommu: Only support raw events.\n");
+
+       /* Init null attributes */
+       perf_iommu->null_group = NULL;
+       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+       if (ret) {
+               pr_err("perf: amd_iommu: Failed to initialized.\n");
+               amd_iommu_pc_exit();
+       } else {
+               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+       }
+
+       return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+       .pmu = {
+               .event_init     = perf_iommu_event_init,
+               .add            = perf_iommu_add,
+               .del            = perf_iommu_del,
+               .start          = perf_iommu_start,
+               .stop           = perf_iommu_stop,
+               .read           = perf_iommu_read,
+       },
+       .max_banks              = 0x00,
+       .max_counters           = 0x00,
+       .cntr_assign_mask       = 0ULL,
+       .format_group           = NULL,
+       .cpumask_group          = NULL,
+       .events_group           = NULL,
+       .null_group             = NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+       /* Make sure the IOMMU PC resource is available */
+       if (!amd_iommu_pc_supported())
+               return -ENODEV;
+
+       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+       return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
new file mode 100644 (file)
index 0000000..845d173
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG                   0x00
+#define IOMMU_PC_COUNTER_SRC_REG               0x08
+#define IOMMU_PC_PASID_MATCH_REG               0x10
+#define IOMMU_PC_DOMID_MATCH_REG               0x18
+#define IOMMU_PC_DEVID_MATCH_REG               0x20
+#define IOMMU_PC_COUNTER_REPORT_REG            0x28
+
+/* maximun specified bank/counters */
+#define PC_MAX_SPEC_BNKS                       64
+#define PC_MAX_SPEC_CNTRS                      16
+
+/* iommu pc reg masks*/
+#define IOMMU_BASE_DEVID                       0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+                       u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
new file mode 100644 (file)
index 0000000..3db9569
--- /dev/null
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB                4
+#define NUM_COUNTERS_L2                4
+#define MAX_COUNTERS           NUM_COUNTERS_NB
+
+#define RDPMC_BASE_NB          6
+#define RDPMC_BASE_L2          10
+
+#define COUNTER_SHIFT          16
+
+struct amd_uncore {
+       int id;
+       int refcnt;
+       int cpu;
+       int num_counters;
+       int rdpmc_base;
+       u32 msr_base;
+       cpumask_t *active_mask;
+       struct pmu *pmu;
+       struct perf_event *events[MAX_COUNTERS];
+       struct amd_uncore *free_when_cpu_online;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_l2;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_l2_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_l2_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_l2_event(struct perf_event *event)
+{
+       return event->pmu->type == amd_l2_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+       if (is_nb_event(event) && amd_uncore_nb)
+               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+       else if (is_l2_event(event) && amd_uncore_l2)
+               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+
+       return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev, new;
+       s64 delta;
+
+       /*
+        * since we do not enable counter overflow interrupts,
+        * we do not have to worry about prev_count changing on us
+        */
+
+       prev = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new);
+       local64_set(&hwc->prev_count, new);
+       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+       delta >>= COUNTER_SHIFT;
+       local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (flags & PERF_EF_RELOAD)
+               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+       hwc->state = 0;
+       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+       perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+       hwc->state |= PERF_HES_STOPPED;
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               amd_uncore_read(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       /* are we already assigned? */
+       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+               goto out;
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (uncore->events[i] == event) {
+                       hwc->idx = i;
+                       goto out;
+               }
+       }
+
+       /* if not, take the first available counter */
+       hwc->idx = -1;
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+                       hwc->idx = i;
+                       break;
+               }
+       }
+
+out:
+       if (hwc->idx == -1)
+               return -EBUSY;
+
+       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (flags & PERF_EF_START)
+               amd_uncore_start(event, PERF_EF_RELOAD);
+
+       return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+       int i;
+       struct amd_uncore *uncore = event_to_amd_uncore(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       amd_uncore_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < uncore->num_counters; i++) {
+               if (cmpxchg(&uncore->events[i], event, NULL) == event)
+                       break;
+       }
+
+       hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+       struct amd_uncore *uncore;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * NB and L2 counters (MSRs) are shared across all cores that share the
+        * same NB / L2 cache. Interrupts can be directed to a single target
+        * core, however, event counts generated by processes running on other
+        * cores cannot be masked out. So we do not support sampling and
+        * per-thread events.
+        */
+       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+               return -EINVAL;
+
+       /* NB and L2 counters do not have usr/os/guest/host bits */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+           event->attr.exclude_host || event->attr.exclude_guest)
+               return -EINVAL;
+
+       /* and we do not enable counter overflow interrupts */
+       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+       hwc->idx = -1;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       uncore = event_to_amd_uncore(event);
+       if (!uncore)
+               return -ENODEV;
+
+       /*
+        * since request can come in to any of the shared cores, we will remap
+        * to a single common cpu.
+        */
+       event->cpu = uncore->cpu;
+
+       return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+                                           struct device_attribute *attr,
+                                           char *buf)
+{
+       cpumask_t *active_mask;
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu->type == amd_nb_pmu.type)
+               active_mask = &amd_nb_active_mask;
+       else if (pmu->type == amd_l2_pmu.type)
+               active_mask = &amd_l2_active_mask;
+       else
+               return 0;
+
+       return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+       .attrs = amd_uncore_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+
+static struct attribute *amd_uncore_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       NULL,
+};
+
+static struct attribute_group amd_uncore_format_group = {
+       .name = "format",
+       .attrs = amd_uncore_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_attr_groups[] = {
+       &amd_uncore_attr_group,
+       &amd_uncore_format_group,
+       NULL,
+};
+
+static struct pmu amd_nb_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_nb",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct pmu amd_l2_pmu = {
+       .attr_groups    = amd_uncore_attr_groups,
+       .name           = "amd_l2",
+       .event_init     = amd_uncore_event_init,
+       .add            = amd_uncore_add,
+       .del            = amd_uncore_del,
+       .start          = amd_uncore_start,
+       .stop           = amd_uncore_stop,
+       .read           = amd_uncore_read,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+                       cpu_to_node(cpu));
+}
+
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+
+       if (amd_uncore_nb) {
+               uncore_nb = amd_uncore_alloc(cpu);
+               if (!uncore_nb)
+                       goto fail;
+               uncore_nb->cpu = cpu;
+               uncore_nb->num_counters = NUM_COUNTERS_NB;
+               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+               uncore_nb->active_mask = &amd_nb_active_mask;
+               uncore_nb->pmu = &amd_nb_pmu;
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+       }
+
+       if (amd_uncore_l2) {
+               uncore_l2 = amd_uncore_alloc(cpu);
+               if (!uncore_l2)
+                       goto fail;
+               uncore_l2->cpu = cpu;
+               uncore_l2->num_counters = NUM_COUNTERS_L2;
+               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+               uncore_l2->active_mask = &amd_l2_active_mask;
+               uncore_l2->pmu = &amd_l2_pmu;
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+       }
+
+       return 0;
+
+fail:
+       if (amd_uncore_nb)
+               *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+       kfree(uncore_nb);
+       return -ENOMEM;
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+                              struct amd_uncore * __percpu *uncores)
+{
+       unsigned int cpu;
+       struct amd_uncore *that;
+
+       for_each_online_cpu(cpu) {
+               that = *per_cpu_ptr(uncores, cpu);
+
+               if (!that)
+                       continue;
+
+               if (this == that)
+                       continue;
+
+               if (this->id == that->id) {
+                       that->free_when_cpu_online = this;
+                       this = that;
+                       break;
+               }
+       }
+
+       this->refcnt++;
+       return this;
+}
+
+static void amd_uncore_cpu_starting(unsigned int cpu)
+{
+       unsigned int eax, ebx, ecx, edx;
+       struct amd_uncore *uncore;
+
+       if (amd_uncore_nb) {
+               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+               uncore->id = ecx & 0xff;
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+       }
+
+       if (amd_uncore_l2) {
+               unsigned int apicid = cpu_data(cpu).apicid;
+               unsigned int nshared;
+
+               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+               nshared = ((eax >> 14) & 0xfff) + 1;
+               uncore->id = apicid - (apicid % nshared);
+
+               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+       }
+}
+
+static void uncore_online(unsigned int cpu,
+                         struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       kfree(uncore->free_when_cpu_online);
+       uncore->free_when_cpu_online = NULL;
+
+       if (cpu == uncore->cpu)
+               cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_online(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+                               struct amd_uncore * __percpu *uncores)
+{
+       unsigned int i;
+       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+       if (this->cpu != cpu)
+               return;
+
+       /* this cpu is going down, migrate to a shared sibling if possible */
+       for_each_online_cpu(i) {
+               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+               if (cpu == i)
+                       continue;
+
+               if (this == that) {
+                       perf_pmu_migrate_context(this->pmu, cpu, i);
+                       cpumask_clear_cpu(cpu, that->active_mask);
+                       cpumask_set_cpu(i, that->active_mask);
+                       that->cpu = i;
+                       break;
+               }
+       }
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_down_prepare(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+       if (cpu == uncore->cpu)
+               cpumask_clear_cpu(cpu, uncore->active_mask);
+
+       if (!--uncore->refcnt)
+               kfree(uncore);
+       *per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+       if (amd_uncore_nb)
+               uncore_dead(cpu, amd_uncore_nb);
+
+       if (amd_uncore_l2)
+               uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+                       void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               if (amd_uncore_cpu_up_prepare(cpu))
+                       return notifier_from_errno(-ENOMEM);
+               break;
+
+       case CPU_STARTING:
+               amd_uncore_cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               amd_uncore_cpu_online(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               amd_uncore_cpu_down_prepare(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               amd_uncore_cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+       .notifier_call  = amd_uncore_cpu_notifier,
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_starting(cpu);
+       amd_uncore_cpu_online(cpu);
+}
+
+static void cleanup_cpu_online(void *dummy)
+{
+       unsigned int cpu = smp_processor_id();
+
+       amd_uncore_cpu_dead(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+       unsigned int cpu, cpu2;
+       int ret = -ENODEV;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               goto fail_nodev;
+
+       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+               goto fail_nodev;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_nb) {
+                       ret = -ENOMEM;
+                       goto fail_nb;
+               }
+               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+               if (ret)
+                       goto fail_nb;
+
+               pr_info("perf: AMD NB counters detected\n");
+               ret = 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+               if (!amd_uncore_l2) {
+                       ret = -ENOMEM;
+                       goto fail_l2;
+               }
+               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+               if (ret)
+                       goto fail_l2;
+
+               pr_info("perf: AMD L2I counters detected\n");
+               ret = 0;
+       }
+
+       if (ret)
+               goto fail_nodev;
+
+       cpu_notifier_register_begin();
+
+       /* init cpus already online before registering for hotplug notifier */
+       for_each_online_cpu(cpu) {
+               ret = amd_uncore_cpu_up_prepare(cpu);
+               if (ret)
+                       goto fail_online;
+               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
+       }
+
+       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+       cpu_notifier_register_done();
+
+       return 0;
+
+
+fail_online:
+       for_each_online_cpu(cpu2) {
+               if (cpu2 == cpu)
+                       break;
+               smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+       }
+       cpu_notifier_register_done();
+
+       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+       amd_uncore_nb = amd_uncore_l2 = NULL;
+
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
+               perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+               perf_pmu_unregister(&amd_nb_pmu);
+       if (amd_uncore_l2)
+               free_percpu(amd_uncore_l2);
+fail_nb:
+       if (amd_uncore_nb)
+               free_percpu(amd_uncore_nb);
+
+fail_nodev:
+       return ret;
+}
+device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
new file mode 100644 (file)
index 0000000..5e830d0
--- /dev/null
@@ -0,0 +1,2442 @@
+/*
+ * Performance events x86 architecture code
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+
+#include "perf_event.h"
+
+struct x86_pmu x86_pmu __read_mostly;
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+       .enabled = 1,
+};
+
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
+u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+u64 x86_perf_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int shift = 64 - x86_pmu.cntval_bits;
+       u64 prev_raw_count, new_raw_count;
+       int idx = hwc->idx;
+       s64 delta;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * Careful: an NMI might modify the previous event value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic event atomically:
+        */
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
+}
+
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       struct extra_reg *er;
+
+       reg = &event->hw.extra_reg;
+
+       if (!x86_pmu.extra_regs)
+               return 0;
+
+       for (er = x86_pmu.extra_regs; er->msr; er++) {
+               if (er->event != (config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+               /* Check if the extra msrs can be safely accessed*/
+               if (!er->extra_msr_access)
+                       return -ENXIO;
+
+               reg->idx = er->idx;
+               reg->config = event->attr.config1;
+               reg->reg = er->msr;
+               break;
+       }
+       return 0;
+}
+
+static atomic_t active_events;
+static atomic_t pmc_refcount;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static bool reserve_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
+                       goto perfctr_fail;
+       }
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
+                       goto eventsel_fail;
+       }
+
+       return true;
+
+eventsel_fail:
+       for (i--; i >= 0; i--)
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+
+       i = x86_pmu.num_counters;
+
+perfctr_fail:
+       for (i--; i >= 0; i--)
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+
+       return false;
+}
+
+static void release_pmc_hardware(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               release_perfctr_nmi(x86_pmu_event_addr(i));
+               release_evntsel_nmi(x86_pmu_config_addr(i));
+       }
+}
+
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
+static bool check_hw_exists(void)
+{
+       u64 val, val_fail, val_new= ~0;
+       int i, reg, reg_fail, ret = 0;
+       int bios_fail = 0;
+       int reg_safe = -1;
+
+       /*
+        * Check to see if the BIOS enabled any of the counters, if so
+        * complain and bail.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+                       bios_fail = 1;
+                       val_fail = val;
+                       reg_fail = reg;
+               } else {
+                       reg_safe = i;
+               }
+       }
+
+       if (x86_pmu.num_counters_fixed) {
+               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+                       if (val & (0x03 << i*4)) {
+                               bios_fail = 1;
+                               val_fail = val;
+                               reg_fail = reg;
+                       }
+               }
+       }
+
+       /*
+        * If all the counters are enabled, the below test will always
+        * fail.  The tools will also become useless in this scenario.
+        * Just fail and disable the hardware counters.
+        */
+
+       if (reg_safe == -1) {
+               reg = reg_safe;
+               goto msr_fail;
+       }
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches, this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       reg = x86_pmu_event_addr(reg_safe);
+       if (rdmsrl_safe(reg, &val))
+               goto msr_fail;
+       val ^= 0xffffUL;
+       ret = wrmsrl_safe(reg, val);
+       ret |= rdmsrl_safe(reg, &val_new);
+       if (ret || val != val_new)
+               goto msr_fail;
+
+       /*
+        * We still allow the PMU driver to operate:
+        */
+       if (bios_fail) {
+               pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+               pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+                             reg_fail, val_fail);
+       }
+
+       return true;
+
+msr_fail:
+       pr_cont("Broken PMU hardware detected, using software events only.\n");
+       pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
+               reg, val_new);
+
+       return false;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       atomic_dec(&active_events);
+}
+
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+       hw_perf_event_destroy(event);
+
+       /* undo the lbr/bts event accounting */
+       x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       unsigned int cache_type, cache_op, cache_result;
+       u64 config, val;
+
+       config = attr->config;
+
+       cache_type = (config >>  0) & 0xff;
+       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+               return -EINVAL;
+
+       cache_op = (config >>  8) & 0xff;
+       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+               return -EINVAL;
+
+       cache_result = (config >> 16) & 0xff;
+       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+               return -EINVAL;
+
+       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+       if (val == 0)
+               return -ENOENT;
+
+       if (val == -1)
+               return -EINVAL;
+
+       hwc->config |= val;
+       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+       return x86_pmu_extra_regs(val, event);
+}
+
+int x86_reserve_hardware(void)
+{
+       int err = 0;
+
+       if (!atomic_inc_not_zero(&pmc_refcount)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&pmc_refcount) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               reserve_ds_buffers();
+               }
+               if (!err)
+                       atomic_inc(&pmc_refcount);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       return err;
+}
+
+void x86_release_hardware(void)
+{
+       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+               release_pmc_hardware();
+               release_ds_buffers();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+       int i;
+
+       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+               mutex_lock(&pmc_reserve_mutex);
+               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+                               goto fail_unlock;
+               }
+               atomic_inc(&x86_pmu.lbr_exclusive[what]);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       atomic_inc(&active_events);
+       return 0;
+
+fail_unlock:
+       mutex_unlock(&pmc_reserve_mutex);
+       return -EBUSY;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+       atomic_dec(&x86_pmu.lbr_exclusive[what]);
+       atomic_dec(&active_events);
+}
+
+int x86_setup_perfctr(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 config;
+
+       if (!is_sampling_event(event)) {
+               hwc->sample_period = x86_pmu.max_period;
+               hwc->last_period = hwc->sample_period;
+               local64_set(&hwc->period_left, hwc->sample_period);
+       }
+
+       if (attr->type == PERF_TYPE_RAW)
+               return x86_pmu_extra_regs(event->attr.config, event);
+
+       if (attr->type == PERF_TYPE_HW_CACHE)
+               return set_ext_hw_attr(hwc, event);
+
+       if (attr->config >= x86_pmu.max_events)
+               return -EINVAL;
+
+       /*
+        * The generic map:
+        */
+       config = x86_pmu.event_map(attr->config);
+
+       if (config == 0)
+               return -ENOENT;
+
+       if (config == -1LL)
+               return -EINVAL;
+
+       /*
+        * Branch tracing:
+        */
+       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !attr->freq && hwc->sample_period == 1) {
+               /* BTS is not supported by this architecture. */
+               if (!x86_pmu.bts_active)
+                       return -EOPNOTSUPP;
+
+               /* BTS is currently only allowed for user-mode. */
+               if (!attr->exclude_kernel)
+                       return -EOPNOTSUPP;
+
+               /* disallow bts if conflicting events are present */
+               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                       return -EBUSY;
+
+               event->destroy = hw_perf_lbr_event_destroy;
+       }
+
+       hwc->config |= config;
+
+       return 0;
+}
+
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+       u64 m = event->attr.branch_sample_type;
+       u64 b = 0;
+
+       /* must capture all branches */
+       if (!(m & PERF_SAMPLE_BRANCH_ANY))
+               return 0;
+
+       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_user)
+               b |= PERF_SAMPLE_BRANCH_USER;
+
+       if (!event->attr.exclude_kernel)
+               b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+       /*
+        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+        */
+
+       return m == b;
+}
+
+int x86_pmu_hw_config(struct perf_event *event)
+{
+       if (event->attr.precise_ip) {
+               int precise = 0;
+
+               /* Support for constant skid */
+               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+                       precise++;
+
+                       /* Support for IP fixup */
+                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
+                               precise++;
+
+                       if (x86_pmu.pebs_prec_dist)
+                               precise++;
+               }
+
+               if (event->attr.precise_ip > precise)
+                       return -EOPNOTSUPP;
+       }
+       /*
+        * check that PEBS LBR correction does not conflict with
+        * whatever the user is asking with attr->branch_sample_type
+        */
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+               u64 *br_type = &event->attr.branch_sample_type;
+
+               if (has_branch_stack(event)) {
+                       if (!precise_br_compat(event))
+                               return -EOPNOTSUPP;
+
+                       /* branch_sample_type is compatible */
+
+               } else {
+                       /*
+                        * user did not specify  branch_sample_type
+                        *
+                        * For PEBS fixups, we capture all
+                        * the branches at the priv level of the
+                        * event.
+                        */
+                       *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+                       if (!event->attr.exclude_user)
+                               *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+                       if (!event->attr.exclude_kernel)
+                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+               }
+       }
+
+       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+               event->attach_state |= PERF_ATTACH_TASK_DATA;
+
+       /*
+        * Generate PMC IRQs:
+        * (keep 'enabled' bit clear for now)
+        */
+       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+       /*
+        * Count user and OS events unless requested not to
+        */
+       if (!event->attr.exclude_user)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+       if (!event->attr.exclude_kernel)
+               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+       if (event->attr.type == PERF_TYPE_RAW)
+               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+       if (event->attr.sample_period && x86_pmu.limit_period) {
+               if (x86_pmu.limit_period(event, event->attr.sample_period) >
+                               event->attr.sample_period)
+                       return -EINVAL;
+       }
+
+       return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __x86_pmu_event_init(struct perf_event *event)
+{
+       int err;
+
+       if (!x86_pmu_initialized())
+               return -ENODEV;
+
+       err = x86_reserve_hardware();
+       if (err)
+               return err;
+
+       atomic_inc(&active_events);
+       event->destroy = hw_perf_event_destroy;
+
+       event->hw.idx = -1;
+       event->hw.last_cpu = -1;
+       event->hw.last_tag = ~0ULL;
+
+       /* mark unused */
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       return x86_pmu.hw_config(event);
+}
+
+void x86_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               u64 val;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               rdmsrl(x86_pmu_config_addr(idx), val);
+               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+                       continue;
+               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               wrmsrl(x86_pmu_config_addr(idx), val);
+       }
+}
+
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
+static void x86_pmu_disable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->n_added = 0;
+       cpuc->enabled = 0;
+       barrier();
+
+       x86_pmu.disable_all();
+}
+
+void x86_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
+{
+       return event->pmu == &pmu;
+}
+
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+       int     weight;
+       int     event;          /* event index */
+       int     counter;        /* counter index */
+       int     unassigned;     /* number of events to be assigned left */
+       int     nr_gp;          /* number of GP counters used */
+       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define        SCHED_STATES_MAX        2
+
+struct perf_sched {
+       int                     max_weight;
+       int                     max_events;
+       int                     max_gp;
+       int                     saved_states;
+       struct event_constraint **constraints;
+       struct sched_state      state;
+       struct sched_state      saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize interator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+                           int num, int wmin, int wmax, int gpmax)
+{
+       int idx;
+
+       memset(sched, 0, sizeof(*sched));
+       sched->max_events       = num;
+       sched->max_weight       = wmax;
+       sched->max_gp           = gpmax;
+       sched->constraints      = constraints;
+
+       for (idx = 0; idx < num; idx++) {
+               if (constraints[idx]->weight == wmin)
+                       break;
+       }
+
+       sched->state.event      = idx;          /* start with min weight */
+       sched->state.weight     = wmin;
+       sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+               return;
+
+       sched->saved[sched->saved_states] = sched->state;
+       sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+       if (!sched->saved_states)
+               return false;
+
+       sched->saved_states--;
+       sched->state = sched->saved[sched->saved_states];
+
+       /* continue with next counter: */
+       clear_bit(sched->state.counter++, sched->state.used);
+
+       return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+       int idx;
+
+       if (!sched->state.unassigned)
+               return false;
+
+       if (sched->state.event >= sched->max_events)
+               return false;
+
+       c = sched->constraints[sched->state.event];
+       /* Prefer fixed purpose counters */
+       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+               idx = INTEL_PMC_IDX_FIXED;
+               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+                       if (!__test_and_set_bit(idx, sched->state.used))
+                               goto done;
+               }
+       }
+
+       /* Grab the first unused counter starting with idx */
+       idx = sched->state.counter;
+       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+               if (!__test_and_set_bit(idx, sched->state.used)) {
+                       if (sched->state.nr_gp++ >= sched->max_gp)
+                               return false;
+
+                       goto done;
+               }
+       }
+
+       return false;
+
+done:
+       sched->state.counter = idx;
+
+       if (c->overlap)
+               perf_sched_save_state(sched);
+
+       return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+       while (!__perf_sched_find_counter(sched)) {
+               if (!perf_sched_restore_state(sched))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+
+       if (!sched->state.unassigned || !--sched->state.unassigned)
+               return false;
+
+       do {
+               /* next event */
+               sched->state.event++;
+               if (sched->state.event >= sched->max_events) {
+                       /* next weight */
+                       sched->state.event = 0;
+                       sched->state.weight++;
+                       if (sched->state.weight > sched->max_weight)
+                               return false;
+               }
+               c = sched->constraints[sched->state.event];
+       } while (c->weight != sched->state.weight);
+
+       sched->state.counter = 0;       /* start with first counter */
+
+       return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign)
+{
+       struct perf_sched sched;
+
+       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+
+       do {
+               if (!perf_sched_find_counter(&sched))
+                       break;  /* failed */
+               if (assign)
+                       assign[sched.state.event] = sched.state.counter;
+       } while (perf_sched_next_event(&sched));
+
+       return sched.state.unassigned;
+}
+EXPORT_SYMBOL_GPL(perf_assign_events);
+
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       struct event_constraint *c;
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       struct perf_event *e;
+       int i, wmin, wmax, unsched = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+       if (x86_pmu.start_scheduling)
+               x86_pmu.start_scheduling(cpuc);
+
+       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               cpuc->event_constraint[i] = NULL;
+               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+               cpuc->event_constraint[i] = c;
+
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /*
+        * fastpath, try to reuse previous register
+        */
+       for (i = 0; i < n; i++) {
+               hwc = &cpuc->event_list[i]->hw;
+               c = cpuc->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+
+       /* slow path */
+       if (i != n) {
+               int gpmax = x86_pmu.num_counters;
+
+               /*
+                * Do not allow scheduling of more than half the available
+                * generic counters.
+                *
+                * This helps avoid counter starvation of sibling thread by
+                * ensuring at most half the counters cannot be in exclusive
+                * mode. There is no designated counters for the limits. Any
+                * N/2 counters can be used. This helps with events with
+                * specific counter constraints.
+                */
+               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+                       gpmax /= 2;
+
+               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+                                            wmax, gpmax, assign);
+       }
+
+       /*
+        * In case of success (unsched = 0), mark events as committed,
+        * so we do not put_constraint() in case new events are added
+        * and fail to be scheduled
+        *
+        * We invoke the lower level commit callback to lock the resource
+        *
+        * We do not need to do all of this in case we are called to
+        * validate an event group (assign == NULL)
+        */
+       if (!unsched && assign) {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+                       if (x86_pmu.commit_scheduling)
+                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+               }
+       } else {
+               for (i = 0; i < n; i++) {
+                       e = cpuc->event_list[i];
+                       /*
+                        * do not put_constraint() on comitted events,
+                        * because they are good to go
+                        */
+                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+                               continue;
+
+                       /*
+                        * release events that failed scheduling
+                        */
+                       if (x86_pmu.put_event_constraints)
+                               x86_pmu.put_event_constraints(cpuc, e);
+               }
+       }
+
+       if (x86_pmu.stop_scheduling)
+               x86_pmu.stop_scheduling(cpuc);
+
+       return unsched ? -EINVAL : 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+
+       /* current number of events already accepted */
+       n = cpuc->n_events;
+
+       if (is_x86_event(leader)) {
+               if (n >= max_count)
+                       return -EINVAL;
+               cpuc->event_list[n] = leader;
+               n++;
+       }
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_x86_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               cpuc->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+                               struct cpu_hw_events *cpuc, int i)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = cpuc->assign[i];
+       hwc->last_cpu = smp_processor_id();
+       hwc->last_tag = ++cpuc->tags[i];
+
+       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+               hwc->config_base = 0;
+               hwc->event_base = 0;
+       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+       } else {
+               hwc->config_base = x86_pmu_config_addr(hwc->idx);
+               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
+               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+       }
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+                                       struct cpu_hw_events *cpuc,
+                                       int i)
+{
+       return hwc->idx == cpuc->assign[i] &&
+               hwc->last_cpu == smp_processor_id() &&
+               hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
+static void x86_pmu_enable(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int i, added = cpuc->n_added;
+
+       if (!x86_pmu_initialized())
+               return;
+
+       if (cpuc->enabled)
+               return;
+
+       if (cpuc->n_added) {
+               int n_running = cpuc->n_events - cpuc->n_added;
+               /*
+                * apply assignment obtained either from
+                * hw_perf_group_sched_in() or x86_pmu_enable()
+                *
+                * step1: save events moving to new counters
+                */
+               for (i = 0; i < n_running; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       /*
+                        * we can avoid reprogramming counter if:
+                        * - assigned same counter as last time
+                        * - running on same CPU as last time
+                        * - no other event has used the counter since
+                        */
+                       if (hwc->idx == -1 ||
+                           match_prev_assignment(hwc, cpuc, i))
+                               continue;
+
+                       /*
+                        * Ensure we don't accidentally enable a stopped
+                        * counter simply because we rescheduled.
+                        */
+                       if (hwc->state & PERF_HES_STOPPED)
+                               hwc->state |= PERF_HES_ARCH;
+
+                       x86_pmu_stop(event, PERF_EF_UPDATE);
+               }
+
+               /*
+                * step2: reprogram moved events into new counters
+                */
+               for (i = 0; i < cpuc->n_events; i++) {
+                       event = cpuc->event_list[i];
+                       hwc = &event->hw;
+
+                       if (!match_prev_assignment(hwc, cpuc, i))
+                               x86_assign_hw_event(event, cpuc, i);
+                       else if (i < n_running)
+                               continue;
+
+                       if (hwc->state & PERF_HES_ARCH)
+                               continue;
+
+                       x86_pmu_start(event, PERF_EF_RELOAD);
+               }
+               cpuc->n_added = 0;
+               perf_events_lapic_init();
+       }
+
+       cpuc->enabled = 1;
+       barrier();
+
+       x86_pmu.enable_all(added);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       s64 left = local64_read(&hwc->period_left);
+       s64 period = hwc->sample_period;
+       int ret = 0, idx = hwc->idx;
+
+       if (idx == INTEL_PMC_IDX_FIXED_BTS)
+               return 0;
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               local64_set(&hwc->period_left, left);
+               hwc->last_period = period;
+               ret = 1;
+       }
+       /*
+        * Quirk: certain CPUs dont like it if just 1 hw_event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
+
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
+       if (x86_pmu.limit_period)
+               left = x86_pmu.limit_period(event, left);
+
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+           local64_read(&hwc->prev_count) != (u64)-left) {
+               /*
+                * The hw event starts counting from this event offset,
+                * mark it to be able to extra future deltas:
+                */
+               local64_set(&hwc->prev_count, (u64)-left);
+
+               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       /*
+        * Due to erratum on certan cpu we need
+        * a second write to be sure the register
+        * is updated properly
+        */
+       if (x86_pmu.perfctr_second_write) {
+               wrmsrl(hwc->event_base,
+                       (u64)(-left) & x86_pmu.cntval_mask);
+       }
+
+       perf_event_update_userpage(event);
+
+       return ret;
+}
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+       if (__this_cpu_read(cpu_hw_events.enabled))
+               __x86_pmu_enable_event(&event->hw,
+                                      ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc;
+       int assign[X86_PMC_IDX_MAX];
+       int n, n0, ret;
+
+       hwc = &event->hw;
+
+       n0 = cpuc->n_events;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       /*
+        * If group events scheduling transaction was started,
+        * skip the schedulability test here, it will be performed
+        * at commit time (->commit_txn) as a whole.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               goto done_collect;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               goto out;
+       /*
+        * copy new assignment, now we know it is possible
+        * will be used by hw_perf_enable()
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+       /*
+        * Commit the collect_events() state. See x86_pmu_del() and
+        * x86_pmu_*_txn().
+        */
+       cpuc->n_events = n;
+       cpuc->n_added += n - n0;
+       cpuc->n_txn += n - n0;
+
+       ret = 0;
+out:
+       return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               x86_perf_event_set_period(event);
+       }
+
+       event->hw.state = 0;
+
+       cpuc->events[idx] = event;
+       __set_bit(idx, cpuc->active_mask);
+       __set_bit(idx, cpuc->running);
+       x86_pmu.enable(event);
+       perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+       u64 pebs, debugctl;
+       struct cpu_hw_events *cpuc;
+       unsigned long flags;
+       int cpu, idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.version >= 2) {
+               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+               pr_info("\n");
+               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+               if (x86_pmu.pebs_constraints) {
+                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+               }
+               if (x86_pmu.lbr_nr) {
+                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+               }
+       }
+       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
+
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+                       cpu, idx, pmc_ctrl);
+               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+                       cpu, idx, pmc_count);
+       }
+       local_irq_restore(flags);
+}
+
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+               x86_pmu.disable(event);
+               cpuc->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               x86_perf_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int i;
+
+       /*
+        * event is descheduled
+        */
+       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+       /*
+        * If we're called during a txn, we don't need to do anything.
+        * The events never got scheduled and ->cancel_txn will truncate
+        * the event_list.
+        *
+        * XXX assumes any ->del() called during a TXN will only be on
+        * an event added during that same TXN.
+        */
+       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Not a TXN, therefore cleanup properly.
+        */
+       x86_pmu_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < cpuc->n_events; i++) {
+               if (event == cpuc->event_list[i])
+                       break;
+       }
+
+       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+               return;
+
+       /* If we have a newly added event; make sure to decrease n_added. */
+       if (i >= cpuc->n_events - cpuc->n_added)
+               --cpuc->n_added;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(cpuc, event);
+
+       /* Delete the array entry. */
+       while (++i < cpuc->n_events) {
+               cpuc->event_list[i-1] = cpuc->event_list[i];
+               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+       }
+       --cpuc->n_events;
+
+       perf_event_update_userpage(event);
+}
+
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * Some chipsets need to unmask the LVTPC in a particular spot
+        * inside the nmi handler.  As a result, the unmasking was pushed
+        * into all the nmi handlers.
+        *
+        * This generic handler doesn't seem to have any issues where the
+        * unmasking occurs so it was left at the top.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /*
+                        * Though we deactivated the counter some cpus
+                        * might still deliver spurious interrupts still
+                        * in flight. Catch them:
+                        */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+
+               val = x86_perf_event_update(event);
+               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+                       continue;
+
+               /*
+                * event overflow
+                */
+               handled++;
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+       if (!x86_pmu.apic || !x86_pmu_initialized())
+               return;
+
+       /*
+        * Always use NMI for PMU
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+       u64 start_clock;
+       u64 finish_clock;
+       int ret;
+
+       /*
+        * All PMUs/events that share this PMI handler should make sure to
+        * increment active_events for their events.
+        */
+       if (!atomic_read(&active_events))
+               return NMI_DONE;
+
+       start_clock = sched_clock();
+       ret = x86_pmu.handle_irq(regs);
+       finish_clock = sched_clock();
+
+       perf_sample_event_took(finish_clock - start_clock);
+
+       return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
+static int
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int i, ret = NOTIFY_OK;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+                       cpuc->kfree_on_online[i] = NULL;
+               if (x86_pmu.cpu_prepare)
+                       ret = x86_pmu.cpu_prepare(cpu);
+               break;
+
+       case CPU_STARTING:
+               if (x86_pmu.cpu_starting)
+                       x86_pmu.cpu_starting(cpu);
+               break;
+
+       case CPU_ONLINE:
+               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+                       kfree(cpuc->kfree_on_online[i]);
+                       cpuc->kfree_on_online[i] = NULL;
+               }
+               break;
+
+       case CPU_DYING:
+               if (x86_pmu.cpu_dying)
+                       x86_pmu.cpu_dying(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               if (x86_pmu.cpu_dead)
+                       x86_pmu.cpu_dead(cpu);
+               break;
+
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+static void __init pmu_check_apic(void)
+{
+       if (cpu_has_apic)
+               return;
+
+       x86_pmu.apic = 0;
+       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+       pr_info("no hardware sampling interrupt available.\n");
+
+       /*
+        * If we have a PMU initialized but no APIC
+        * interrupts, we cannot sample hardware
+        * events (user-space has to fall back and
+        * sample via a hrtimer based software event):
+        */
+       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group = {
+       .name = "format",
+       .attrs = NULL,
+};
+
+/*
+ * Remove all undefined events (x86_pmu.event_map(id) == 0)
+ * out of events_attr attributes.
+ */
+static void __init filter_events(struct attribute **attrs)
+{
+       struct device_attribute *d;
+       struct perf_pmu_events_attr *pmu_attr;
+       int offset = 0;
+       int i, j;
+
+       for (i = 0; attrs[i]; i++) {
+               d = (struct device_attribute *)attrs[i];
+               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
+               /* str trumps id */
+               if (pmu_attr->event_str)
+                       continue;
+               if (x86_pmu.event_map(i + offset))
+                       continue;
+
+               for (j = i; attrs[j]; j++)
+                       attrs[j] = attrs[j + 1];
+
+               /* Check the shifted attr. */
+               i--;
+
+               /*
+                * event_map() is index based, the attrs array is organized
+                * by increasing event index. If we shift the events, then
+                * we need to compensate for the event_map(), otherwise
+                * we are looking up the wrong event in the map
+                */
+               offset++;
+       }
+}
+
+/* Merge two pointer arrays */
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+{
+       struct attribute **new;
+       int j, i;
+
+       for (j = 0; a[j]; j++)
+               ;
+       for (i = 0; b[i]; i++)
+               j++;
+       j++;
+
+       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       j = 0;
+       for (i = 0; a[i]; i++)
+               new[j++] = a[i];
+       for (i = 0; b[i]; i++)
+               new[j++] = b[i];
+       new[j] = NULL;
+
+       return new;
+}
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page)
+{
+       struct perf_pmu_events_attr *pmu_attr = \
+               container_of(attr, struct perf_pmu_events_attr, attr);
+       u64 config = x86_pmu.event_map(pmu_attr->id);
+
+       /* string trumps id */
+       if (pmu_attr->event_str)
+               return sprintf(page, "%s", pmu_attr->event_str);
+
+       return x86_pmu.events_sysfs_show(page, config);
+}
+
+EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
+EVENT_ATTR(instructions,               INSTRUCTIONS            );
+EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
+EVENT_ATTR(cache-misses,               CACHE_MISSES            );
+EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
+EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
+EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
+EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
+EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+       EVENT_PTR(CPU_CYCLES),
+       EVENT_PTR(INSTRUCTIONS),
+       EVENT_PTR(CACHE_REFERENCES),
+       EVENT_PTR(CACHE_MISSES),
+       EVENT_PTR(BRANCH_INSTRUCTIONS),
+       EVENT_PTR(BRANCH_MISSES),
+       EVENT_PTR(BUS_CYCLES),
+       EVENT_PTR(STALLED_CYCLES_FRONTEND),
+       EVENT_PTR(STALLED_CYCLES_BACKEND),
+       EVENT_PTR(REF_CPU_CYCLES),
+       NULL,
+};
+
+static struct attribute_group x86_pmu_events_group = {
+       .name = "events",
+       .attrs = events_attr,
+};
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
+       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
+       ssize_t ret;
+
+       /*
+       * We have whole page size to spend and just little data
+       * to write, so we can safely use sprintf.
+       */
+       ret = sprintf(page, "event=0x%02llx", event);
+
+       if (umask)
+               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+       if (edge)
+               ret += sprintf(page + ret, ",edge");
+
+       if (pc)
+               ret += sprintf(page + ret, ",pc");
+
+       if (any)
+               ret += sprintf(page + ret, ",any");
+
+       if (inv)
+               ret += sprintf(page + ret, ",inv");
+
+       if (cmask)
+               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+       ret += sprintf(page + ret, "\n");
+
+       return ret;
+}
+
+static int __init init_hw_perf_events(void)
+{
+       struct x86_pmu_quirk *quirk;
+       int err;
+
+       pr_info("Performance Events: ");
+
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_INTEL:
+               err = intel_pmu_init();
+               break;
+       case X86_VENDOR_AMD:
+               err = amd_pmu_init();
+               break;
+       default:
+               err = -ENOTSUPP;
+       }
+       if (err != 0) {
+               pr_cont("no PMU driver, software events only.\n");
+               return 0;
+       }
+
+       pmu_check_apic();
+
+       /* sanity check that the hardware exists or is emulated */
+       if (!check_hw_exists())
+               return 0;
+
+       pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
+       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+               quirk->func();
+
+       if (!x86_pmu.intel_ctrl)
+               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       perf_events_lapic_init();
+       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
+
+       unconstrained = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+                                  0, x86_pmu.num_counters, 0, 0);
+
+       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
+       if (x86_pmu.event_attrs)
+               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
+
+       if (!x86_pmu.events_sysfs_show)
+               x86_pmu_events_group.attrs = &empty_attrs;
+       else
+               filter_events(x86_pmu_events_group.attrs);
+
+       if (x86_pmu.cpu_events) {
+               struct attribute **tmp;
+
+               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
+               if (!WARN_ON(!tmp))
+                       x86_pmu_events_group.attrs = tmp;
+       }
+
+       pr_info("... version:                %d\n",     x86_pmu.version);
+       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
+       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
+       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
+       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
+
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+       perf_cpu_notifier(x86_pmu_notifier);
+
+       return 0;
+}
+early_initcall(init_hw_perf_events);
+
+static inline void x86_pmu_read(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+}
+
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
+ */
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
+
+       cpuc->txn_flags = txn_flags;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       perf_pmu_disable(pmu);
+       __this_cpu_write(cpu_hw_events.n_txn, 0);
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(struct pmu *pmu)
+{
+       unsigned int txn_flags;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       txn_flags = cpuc->txn_flags;
+       cpuc->txn_flags = 0;
+       if (txn_flags & ~PERF_PMU_TXN_ADD)
+               return;
+
+       /*
+        * Truncate collected array by the number of events added in this
+        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
+        */
+       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+       perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int assign[X86_PMC_IDX_MAX];
+       int n, ret;
+
+       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+               cpuc->txn_flags = 0;
+               return 0;
+       }
+
+       n = cpuc->n_events;
+
+       if (!x86_pmu_initialized())
+               return -EAGAIN;
+
+       ret = x86_pmu.schedule_events(cpuc, n, assign);
+       if (ret)
+               return ret;
+
+       /*
+        * copy new assignment, now we know it is possible
+        * will be used by hw_perf_enable()
+        */
+       memcpy(cpuc->assign, assign, n*sizeof(int));
+
+       cpuc->txn_flags = 0;
+       perf_pmu_enable(pmu);
+       return 0;
+}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+       kfree(cpuc->shared_regs);
+       kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+       struct cpu_hw_events *cpuc;
+       int cpu = raw_smp_processor_id();
+
+       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+       if (!cpuc)
+               return ERR_PTR(-ENOMEM);
+
+       /* only needed, if we have extra_regs */
+       if (x86_pmu.extra_regs) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto error;
+       }
+       cpuc->is_fake = 1;
+       return cpuc;
+error:
+       free_fake_cpuc(cpuc);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+       struct cpu_hw_events *fake_cpuc;
+       struct event_constraint *c;
+       int ret = 0;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+
+       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
+
+       if (!c || !c->weight)
+               ret = -EINVAL;
+
+       if (x86_pmu.put_event_constraints)
+               x86_pmu.put_event_constraints(fake_cpuc, event);
+
+       free_fake_cpuc(fake_cpuc);
+
+       return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation include:
+ *     - check events are compatible which each other
+ *     - events do not compete for the same counter
+ *     - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct cpu_hw_events *fake_cpuc;
+       int ret = -EINVAL, n;
+
+       fake_cpuc = allocate_fake_cpuc();
+       if (IS_ERR(fake_cpuc))
+               return PTR_ERR(fake_cpuc);
+       /*
+        * the event is not yet connected with its
+        * siblings therefore we must first collect
+        * existing siblings, then add the new event
+        * before we can simulate the scheduling
+        */
+       n = collect_events(fake_cpuc, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+       n = collect_events(fake_cpuc, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_cpuc->n_events = n;
+
+       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
+
+out:
+       free_fake_cpuc(fake_cpuc);
+       return ret;
+}
+
+static int x86_pmu_event_init(struct perf_event *event)
+{
+       struct pmu *tmp;
+       int err;
+
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
+       if (!err) {
+               /*
+                * we temporarily connect event to its pmu
+                * such that validate_group() can classify
+                * it as an x86 event using is_x86_event()
+                */
+               tmp = event->pmu;
+               event->pmu = &pmu;
+
+               if (event->group_leader != event)
+                       err = validate_group(event);
+               else
+                       err = validate_event(event);
+
+               event->pmu = tmp;
+       }
+       if (err) {
+               if (event->destroy)
+                       event->destroy(event);
+       }
+
+       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
+       return err;
+}
+
+static void refresh_pce(void *ignored)
+{
+       if (current->mm)
+               load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+       if (!current->mm)
+               return;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return;
+
+       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+       int idx = event->hw.idx;
+
+       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+               return 0;
+
+       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+               idx -= INTEL_PMC_IDX_FIXED;
+               idx |= 1 << 30;
+       }
+
+       return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t count)
+{
+       unsigned long val;
+       ssize_t ret;
+
+       ret = kstrtoul(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val > 2)
+               return -EINVAL;
+
+       if (x86_pmu.attr_rdpmc_broken)
+               return -ENOTSUPP;
+
+       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+               /*
+                * Changing into or out of always available, aka
+                * perf-event-bypassing mode.  This path is extremely slow,
+                * but only root can trigger it, so it's okay.
+                */
+               if (val == 2)
+                       static_key_slow_inc(&rdpmc_always_available);
+               else
+                       static_key_slow_dec(&rdpmc_always_available);
+               on_each_cpu(refresh_pce, NULL, 1);
+       }
+
+       x86_pmu.attr_rdpmc = val;
+
+       return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+       &dev_attr_rdpmc.attr,
+       NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+       .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+       &x86_pmu_attr_group,
+       &x86_pmu_format_group,
+       &x86_pmu_events_group,
+       NULL,
+};
+
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (x86_pmu.sched_task)
+               x86_pmu.sched_task(ctx, sched_in);
+}
+
+void perf_check_microcode(void)
+{
+       if (x86_pmu.check_microcode)
+               x86_pmu.check_microcode();
+}
+EXPORT_SYMBOL_GPL(perf_check_microcode);
+
+static struct pmu pmu = {
+       .pmu_enable             = x86_pmu_enable,
+       .pmu_disable            = x86_pmu_disable,
+
+       .attr_groups            = x86_pmu_attr_groups,
+
+       .event_init             = x86_pmu_event_init,
+
+       .event_mapped           = x86_pmu_event_mapped,
+       .event_unmapped         = x86_pmu_event_unmapped,
+
+       .add                    = x86_pmu_add,
+       .del                    = x86_pmu_del,
+       .start                  = x86_pmu_start,
+       .stop                   = x86_pmu_stop,
+       .read                   = x86_pmu_read,
+
+       .start_txn              = x86_pmu_start_txn,
+       .cancel_txn             = x86_pmu_cancel_txn,
+       .commit_txn             = x86_pmu_commit_txn,
+
+       .event_idx              = x86_pmu_event_idx,
+       .sched_task             = x86_pmu_sched_task,
+       .task_ctx_size          = sizeof(struct x86_perf_task_context),
+};
+
+void arch_perf_update_userpage(struct perf_event *event,
+                              struct perf_event_mmap_page *userpg, u64 now)
+{
+       struct cyc2ns_data *data;
+
+       userpg->cap_user_time = 0;
+       userpg->cap_user_time_zero = 0;
+       userpg->cap_user_rdpmc =
+               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+       userpg->pmc_width = x86_pmu.cntval_bits;
+
+       if (!sched_clock_stable())
+               return;
+
+       data = cyc2ns_read_begin();
+
+       /*
+        * Internal timekeeping for enabled/running/stopped times
+        * is always in the local_clock domain.
+        */
+       userpg->cap_user_time = 1;
+       userpg->time_mult = data->cyc2ns_mul;
+       userpg->time_shift = data->cyc2ns_shift;
+       userpg->time_offset = data->cyc2ns_offset - now;
+
+       /*
+        * cap_user_time_zero doesn't make sense when we're using a different
+        * time base for the records.
+        */
+       if (event->clock == &local_clock) {
+               userpg->cap_user_time_zero = 1;
+               userpg->time_zero = data->cyc2ns_offset;
+       }
+
+       cyc2ns_read_end(data);
+}
+
+/*
+ * callchain support
+ */
+
+static int backtrace_stack(void *data, char *name)
+{
+       return 0;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct perf_callchain_entry *entry = data;
+
+       perf_callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+       .walk_stack             = print_context_stack_bp,
+};
+
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
+
+       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+}
+
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+       struct desc_struct *desc;
+       int idx = segment >> 3;
+
+       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+               struct ldt_struct *ldt;
+
+               if (idx > LDT_ENTRIES)
+                       return 0;
+
+               /* IRQs are off, so this synchronizes with smp_store_release */
+               ldt = lockless_dereference(current->active_mm->context.ldt);
+               if (!ldt || idx > ldt->size)
+                       return 0;
+
+               desc = &ldt->entries[idx];
+#else
+               return 0;
+#endif
+       } else {
+               if (idx > GDT_ENTRIES)
+                       return 0;
+
+               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+       }
+
+       return get_desc_base(desc);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <asm/compat.h>
+
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+       /* 32-bit process in 64-bit kernel. */
+       unsigned long ss_base, cs_base;
+       struct stack_frame_ia32 frame;
+       const void __user *fp;
+
+       if (!test_thread_flag(TIF_IA32))
+               return 0;
+
+       cs_base = get_segment_base(regs->cs);
+       ss_base = get_segment_base(regs->ss);
+
+       fp = compat_ptr(ss_base + regs->bp);
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame     = 0;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 8))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, cs_base + frame.return_address);
+               fp = compat_ptr(ss_base + frame.next_frame);
+       }
+       pagefault_enable();
+       return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+    return 0;
+}
+#endif
+
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+       struct stack_frame frame;
+       const void __user *fp;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       /*
+        * We don't know what to do with VM86 stacks.. ignore them for now.
+        */
+       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+               return;
+
+       fp = (void __user *)regs->bp;
+
+       perf_callchain_store(entry, regs->ip);
+
+       if (!current->mm)
+               return;
+
+       if (perf_callchain_user32(regs, entry))
+               return;
+
+       pagefault_disable();
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               unsigned long bytes;
+               frame.next_frame             = NULL;
+               frame.return_address = 0;
+
+               if (!access_ok(VERIFY_READ, fp, 16))
+                       break;
+
+               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+               if (bytes != 0)
+                       break;
+               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
+               if (bytes != 0)
+                       break;
+
+               if (!valid_user_frame(fp, sizeof(frame)))
+                       break;
+
+               perf_callchain_store(entry, frame.return_address);
+               fp = (void __user *)frame.next_frame;
+       }
+       pagefault_enable();
+}
+
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ *   VM86 - the good olde 16 bit days, where the linear address is
+ *          20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ *          to figure out what the 32bit base address is.
+ *
+ *    X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
+{
+       /*
+        * For IA32 we look at the GDT/LDT segment base to convert the
+        * effective IP to a linear address.
+        */
+
+#ifdef CONFIG_X86_32
+       /*
+        * If we are in VM86 mode, add the segment offset to convert to a
+        * linear address.
+        */
+       if (regs->flags & X86_VM_MASK)
+               return 0x10 * regs->cs;
+
+       if (user_mode(regs) && regs->cs != __USER_CS)
+               return get_segment_base(regs->cs);
+#else
+       if (user_mode(regs) && !user_64bit_mode(regs) &&
+           regs->cs != __USER32_CS)
+               return get_segment_base(regs->cs);
+#endif
+       return 0;
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+               return perf_guest_cbs->get_guest_ip();
+
+       return regs->ip + code_segment_base(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       int misc = 0;
+
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               if (perf_guest_cbs->is_user_mode())
+                       misc |= PERF_RECORD_MISC_GUEST_USER;
+               else
+                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+       } else {
+               if (user_mode(regs))
+                       misc |= PERF_RECORD_MISC_USER;
+               else
+                       misc |= PERF_RECORD_MISC_KERNEL;
+       }
+
+       if (regs->flags & PERF_EFLAGS_EXACT)
+               misc |= PERF_RECORD_MISC_EXACT_IP;
+
+       return misc;
+}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+       cap->version            = x86_pmu.version;
+       cap->num_counters_gp    = x86_pmu.num_counters;
+       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+       cap->bit_width_gp       = x86_pmu.cntval_bits;
+       cap->bit_width_fixed    = x86_pmu.cntval_bits;
+       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
+       cap->events_mask_len    = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
new file mode 100644 (file)
index 0000000..b99dc92
--- /dev/null
@@ -0,0 +1,544 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+struct bts_ctx {
+       struct perf_output_handle       handle;
+       struct debug_store              ds_back;
+       int                             started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE                24
+#define BTS_SAFETY_MARGIN      4080
+
+struct bts_phys {
+       struct page     *page;
+       unsigned long   size;
+       unsigned long   offset;
+       unsigned long   displacement;
+};
+
+struct bts_buffer {
+       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
+       unsigned int    nr_pages;
+       unsigned int    nr_bufs;
+       unsigned int    cur_buf;
+       bool            snapshot;
+       local_t         data_size;
+       local_t         lost;
+       local_t         head;
+       unsigned long   end;
+       void            **data_pages;
+       struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+static size_t buf_size(struct page *page)
+{
+       return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+       struct bts_buffer *buf;
+       struct page *page;
+       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+       unsigned long offset;
+       size_t size = nr_pages << PAGE_SHIFT;
+       int pg, nbuf, pad;
+
+       /* count all the high order buffers */
+       for (pg = 0, nbuf = 0; pg < nr_pages;) {
+               page = virt_to_page(pages[pg]);
+               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+                       return NULL;
+               pg += 1 << page_private(page);
+               nbuf++;
+       }
+
+       /*
+        * to avoid interrupts in overwrite mode, only allow one physical
+        */
+       if (overwrite && nbuf > 1)
+               return NULL;
+
+       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->nr_pages = nr_pages;
+       buf->nr_bufs = nbuf;
+       buf->snapshot = overwrite;
+       buf->data_pages = pages;
+       buf->real_size = size - size % BTS_RECORD_SIZE;
+
+       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+               unsigned int __nr_pages;
+
+               page = virt_to_page(pages[pg]);
+               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+               buf->buf[nbuf].page = page;
+               buf->buf[nbuf].offset = offset;
+               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+               buf->buf[nbuf].size -= pad;
+
+               pg += __nr_pages;
+               offset += __nr_pages << PAGE_SHIFT;
+       }
+
+       return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+       kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+       return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_phys *phys = &buf->buf[buf->cur_buf];
+       unsigned long index, thresh = 0, end = phys->size;
+       struct page *page = phys->page;
+
+       index = local_read(&buf->head);
+
+       if (!buf->snapshot) {
+               if (buf->end < phys->offset + buf_size(page))
+                       end = buf->end - phys->offset - phys->displacement;
+
+               index -= phys->offset + phys->displacement;
+
+               if (end - index > BTS_SAFETY_MARGIN)
+                       thresh = end - BTS_SAFETY_MARGIN;
+               else if (end - index > BTS_RECORD_SIZE)
+                       thresh = end - BTS_RECORD_SIZE;
+               else
+                       thresh = end;
+       }
+
+       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+       ds->bts_index = ds->bts_buffer_base + index;
+       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+       ds->bts_interrupt_threshold = !buf->snapshot
+               ? ds->bts_buffer_base + thresh
+               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+       unsigned long index = head - phys->offset;
+
+       memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= bts->handle.size ||
+           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+               return true;
+
+       return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+       int cpu = raw_smp_processor_id();
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+       if (!buf)
+               return;
+
+       head = index + bts_buffer_offset(buf, buf->cur_buf);
+       old = local_xchg(&buf->head, head);
+
+       if (!buf->snapshot) {
+               if (old == head)
+                       return;
+
+               if (ds->bts_index >= ds->bts_absolute_maximum)
+                       local_inc(&buf->lost);
+
+               /*
+                * old and head are always in the same physical buffer, so we
+                * can subtract them to get the data size.
+                */
+               local_add(head - old, &buf->data_size);
+       } else {
+               local_set(&buf->data_size, head);
+       }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+       u64 config = 0;
+
+       if (!buf || bts_buffer_is_full(buf, bts))
+               return;
+
+       event->hw.itrace_started = 1;
+       event->hw.state = 0;
+
+       if (!buf->snapshot)
+               config |= ARCH_PERFMON_EVENTSEL_INT;
+       if (!event->attr.exclude_kernel)
+               config |= ARCH_PERFMON_EVENTSEL_OS;
+       if (!event->attr.exclude_user)
+               config |= ARCH_PERFMON_EVENTSEL_USR;
+
+       bts_config_buffer(buf);
+
+       /*
+        * local barrier to make sure that ds configuration made it
+        * before we enable BTS
+        */
+       wmb();
+
+       intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       __bts_event_start(event);
+
+       /* PMI handler: this counter is running and likely generating PMIs */
+       ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+       /*
+        * No extra synchronization is mandated by the documentation to have
+        * BTS data stores globally visible.
+        */
+       intel_pmu_disable_bts();
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
+       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       /* PMI handler: don't restart this counter */
+       ACCESS_ONCE(bts->started) = 0;
+
+       __bts_event_stop(event);
+
+       if (flags & PERF_EF_UPDATE)
+               bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event && bts->started)
+               __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+       if (bts->handle.event)
+               __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+       unsigned long head, space, next_space, pad, gap, skip, wakeup;
+       unsigned int next_buf;
+       struct bts_phys *phys, *next_phys;
+       int ret;
+
+       if (buf->snapshot)
+               return 0;
+
+       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+       if (WARN_ON_ONCE(head != local_read(&buf->head)))
+               return -EINVAL;
+
+       phys = &buf->buf[buf->cur_buf];
+       space = phys->offset + phys->displacement + phys->size - head;
+       pad = space;
+       if (space > handle->size) {
+               space = handle->size;
+               space -= space % BTS_RECORD_SIZE;
+       }
+       if (space <= BTS_SAFETY_MARGIN) {
+               /* See if next phys buffer has more space */
+               next_buf = buf->cur_buf + 1;
+               if (next_buf >= buf->nr_bufs)
+                       next_buf = 0;
+               next_phys = &buf->buf[next_buf];
+               gap = buf_size(phys->page) - phys->displacement - phys->size +
+                     next_phys->displacement;
+               skip = pad + gap;
+               if (handle->size >= skip) {
+                       next_space = next_phys->size;
+                       if (next_space + skip > handle->size) {
+                               next_space = handle->size - skip;
+                               next_space -= next_space % BTS_RECORD_SIZE;
+                       }
+                       if (next_space > space || !space) {
+                               if (pad)
+                                       bts_buffer_pad_out(phys, head);
+                               ret = perf_aux_output_skip(handle, skip);
+                               if (ret)
+                                       return ret;
+                               /* Advance to next phys buffer */
+                               phys = next_phys;
+                               space = next_space;
+                               head = phys->offset + phys->displacement;
+                               /*
+                                * After this, cur_buf and head won't match ds
+                                * anymore, so we must not be racing with
+                                * bts_update().
+                                */
+                               buf->cur_buf = next_buf;
+                               local_set(&buf->head, head);
+                       }
+               }
+       }
+
+       /* Don't go far beyond wakeup watermark */
+       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+                handle->head;
+       if (space > wakeup) {
+               space = wakeup;
+               space -= space % BTS_RECORD_SIZE;
+       }
+
+       buf->end = head + space;
+
+       /*
+        * If we have no space, the lost notification would have been sent when
+        * we hit absolute_maximum - see bts_update()
+        */
+       if (!space)
+               return -ENOSPC;
+
+       return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct perf_event *event = bts->handle.event;
+       struct bts_buffer *buf;
+       s64 old_head;
+       int err;
+
+       if (!event || !bts->started)
+               return 0;
+
+       buf = perf_get_aux(&bts->handle);
+       /*
+        * Skip snapshot counters: they don't use the interrupt, but
+        * there's no other way of telling, because the pointer will
+        * keep moving
+        */
+       if (!buf || buf->snapshot)
+               return 0;
+
+       old_head = local_read(&buf->head);
+       bts_update(bts);
+
+       /* no new data */
+       if (old_head == local_read(&buf->head))
+               return 0;
+
+       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                           !!local_xchg(&buf->lost, 0));
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return 1;
+
+       err = bts_buffer_reset(buf, &bts->handle);
+       if (err)
+               perf_aux_output_end(&bts->handle, 0, false);
+
+       return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+       bts_event_stop(event, PERF_EF_UPDATE);
+
+       if (buf) {
+               if (buf->snapshot)
+                       bts->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+                                   !!local_xchg(&buf->lost, 0));
+       }
+
+       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+       struct bts_buffer *buf;
+       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               return -EBUSY;
+
+       if (bts->handle.event)
+               return -EBUSY;
+
+       buf = perf_aux_output_begin(&bts->handle, event);
+       if (!buf)
+               return -EINVAL;
+
+       ret = bts_buffer_reset(buf, &bts->handle);
+       if (ret) {
+               perf_aux_output_end(&bts->handle, 0, false);
+               return ret;
+       }
+
+       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+       if (mode & PERF_EF_START) {
+               bts_event_start(event, 0);
+               if (hwc->state & PERF_HES_STOPPED) {
+                       bts_event_del(event, 0);
+                       return -EBUSY;
+               }
+       }
+
+       return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+       x86_release_hardware();
+       x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+       int ret;
+
+       if (event->attr.type != bts_pmu.type)
+               return -ENOENT;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_bts))
+               return -EBUSY;
+
+       /*
+        * BTS leaks kernel addresses even when CPL0 tracing is
+        * disabled, so disallow intel_bts driver for unprivileged
+        * users on paranoid systems since it provides trace data
+        * to the user in a zero-copy fashion.
+        *
+        * Note that the default paranoia setting permits unprivileged
+        * users to profile the kernel.
+        */
+       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+           !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       ret = x86_reserve_hardware();
+       if (ret) {
+               x86_del_exclusive(x86_lbr_exclusive_bts);
+               return ret;
+       }
+
+       event->destroy = bts_event_destroy;
+
+       return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+               return -ENODEV;
+
+       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+       bts_pmu.task_ctx_nr     = perf_sw_context;
+       bts_pmu.event_init      = bts_event_init;
+       bts_pmu.add             = bts_event_add;
+       bts_pmu.del             = bts_event_del;
+       bts_pmu.start           = bts_event_start;
+       bts_pmu.stop            = bts_event_stop;
+       bts_pmu.read            = bts_event_read;
+       bts_pmu.setup_aux       = bts_buffer_setup_aux;
+       bts_pmu.free_aux        = bts_buffer_free_aux;
+
+       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+arch_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
new file mode 100644 (file)
index 0000000..68fa55b
--- /dev/null
@@ -0,0 +1,3796 @@
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
+       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
+       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
+       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
+       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
+       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
+       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
+       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
+};
+
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
+{
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+       EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
+{
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_skl_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7,
+                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7,
+                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+       /*
+        * Note the low 8 bits eventsel code is not a continuous field, containing
+        * some #GPing bits. These are masked out.
+        */
+       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+       EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
+
+struct attribute *nhm_events_attrs[] = {
+       EVENT_PTR(mem_ld_nhm),
+       NULL,
+};
+
+struct attribute *snb_events_attrs[] = {
+       EVENT_PTR(mem_ld_snb),
+       EVENT_PTR(mem_st_snb),
+       NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
+       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
+       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
+       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+       EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+       return intel_perfmon_event_map[hw_event];
+}
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
+#define SKL_DEMAND_RFO                 BIT_ULL(1)
+#define SKL_ANY_RESPONSE               BIT_ULL(16)
+#define SKL_SUPPLIER_NONE              BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
+#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT                    BIT_ULL(30)
+#define SKL_SNOOP_NONE                 BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define SKL_SNOOP_MISS                 BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define SKL_SNOOP_HITM                 BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
+#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
+                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS|SKL_ANY_SNOOP|
+                                      SKL_SUPPLIER_NONE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+#define SNB_DMND_DATA_RD       (1ULL << 0)
+#define SNB_DMND_RFO           (1ULL << 1)
+#define SNB_DMND_IFETCH                (1ULL << 2)
+#define SNB_DMND_WB            (1ULL << 3)
+#define SNB_PF_DATA_RD         (1ULL << 4)
+#define SNB_PF_RFO             (1ULL << 5)
+#define SNB_PF_IFETCH          (1ULL << 6)
+#define SNB_LLC_DATA_RD                (1ULL << 7)
+#define SNB_LLC_RFO            (1ULL << 8)
+#define SNB_LLC_IFETCH         (1ULL << 9)
+#define SNB_BUS_LOCKS          (1ULL << 10)
+#define SNB_STRM_ST            (1ULL << 11)
+#define SNB_OTHER              (1ULL << 15)
+#define SNB_RESP_ANY           (1ULL << 16)
+#define SNB_NO_SUPP            (1ULL << 17)
+#define SNB_LLC_HITM           (1ULL << 18)
+#define SNB_LLC_HITE           (1ULL << 19)
+#define SNB_LLC_HITS           (1ULL << 20)
+#define SNB_LLC_HITF           (1ULL << 21)
+#define SNB_LOCAL              (1ULL << 22)
+#define SNB_REMOTE             (0xffULL << 23)
+#define SNB_SNP_NONE           (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
+#define SNB_SNP_MISS           (1ULL << 33)
+#define SNB_NO_FWD             (1ULL << 34)
+#define SNB_SNP_FWD            (1ULL << 35)
+#define SNB_HITM               (1ULL << 36)
+#define SNB_NON_DRAM           (1ULL << 37)
+
+#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+                                SNB_HITM)
+
+#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS          SNB_RESP_ANY
+#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 snb_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
+               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
+#define HSW_DEMAND_RFO                 BIT_ULL(1)
+#define HSW_ANY_RESPONSE               BIT_ULL(16)
+#define HSW_SUPPLIER_NONE              BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
+#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE                 BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
+#define HSW_SNOOP_MISS                 BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
+#define HSW_SNOOP_HITM                 BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
+#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
+                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
+                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL              BIT(26)
+#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
+                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+                                        HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
+               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
+               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
+               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
+               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS|HSW_ANY_SNOOP,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_LOCAL_DRAM|
+                                      HSW_SNOOP_DRAM,
+               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+                                      HSW_L3_MISS_REMOTE|
+                                      HSW_SNOOP_DRAM,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+};
+
+static __initconst const u64 westmere_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+/*
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
+ */
+
+#define NHM_DMND_DATA_RD       (1 << 0)
+#define NHM_DMND_RFO           (1 << 1)
+#define NHM_DMND_IFETCH                (1 << 2)
+#define NHM_DMND_WB            (1 << 3)
+#define NHM_PF_DATA_RD         (1 << 4)
+#define NHM_PF_DATA_RFO                (1 << 5)
+#define NHM_PF_IFETCH          (1 << 6)
+#define NHM_OFFCORE_OTHER      (1 << 7)
+#define NHM_UNCORE_HIT         (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM    (1 << 10)
+                               /* reserved */
+#define NHM_REMOTE_CACHE_FWD   (1 << 12)
+#define NHM_REMOTE_DRAM                (1 << 13)
+#define NHM_LOCAL_DRAM         (1 << 14)
+#define NHM_NON_DRAM           (1 << 15)
+
+#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE             (NHM_REMOTE_DRAM)
+
+#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
+       },
+ },
+};
+
+static __initconst const u64 nehalem_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
+               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       /*
+        * Use RFO, not WRITEBACK, because a write miss would typically occur
+        * on RFO.
+        */
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+};
+
+static __initconst const u64 core2_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static __initconst const u64 atom_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
+               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
+       EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ          SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE         SNB_DMND_RFO
+#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS         SNB_RESP_ANY
+#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 slm_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+       },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+               [ C(RESULT_MISS)   ] = 0x0280, /* ICACGE.MISSES */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+       [ C(OP_PREFETCH) ] = {
+               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+               [ C(RESULT_ACCESS) ] = 0x01b7,
+               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+               [ C(RESULT_MISS)   ] = 0x01b7,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
+#define KNL_MCDRAM_FAR         BIT_ULL(22)
+#define KNL_DDR_LOCAL          BIT_ULL(23)
+#define KNL_DDR_FAR            BIT_ULL(24)
+#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ            SLM_DMND_READ
+#define KNL_L2_WRITE           SLM_DMND_WRITE
+#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS          SLM_LLC_ACCESS
+#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
+                                                 SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+       [C(LL)] = {
+               [C(OP_READ)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = 0,
+               },
+               [C(OP_WRITE)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
+               },
+               [C(OP_PREFETCH)] = {
+                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
+               },
+       },
+};
+
+/*
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged. hw.state in
+ * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
+ * calls.
+ */
+static void __intel_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               intel_pmu_disable_bts();
+       else
+               intel_bts_disable_local();
+
+       intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+       __intel_pmu_disable_all();
+       intel_pmu_lbr_disable_all();
+}
+
+static void __intel_pmu_enable_all(int added, bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       intel_pmu_pebs_enable_all();
+       intel_pmu_lbr_enable_all(pmi);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+               struct perf_event *event =
+                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+               if (WARN_ON_ONCE(!event))
+                       return;
+
+               intel_pmu_enable_bts(event->hw.config);
+       } else
+               intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+       __intel_pmu_enable_all(added, false);
+}
+
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
+ *
+ * The official story:
+ *   These chips need to be 'reset' when adding counters by programming the
+ *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ *   in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       static const unsigned long nhm_magic[4] = {
+               0x4300B5,
+               0x4300D2,
+               0x4300B1,
+               0x4300B1
+       };
+       struct perf_event *event;
+       int i;
+
+       /*
+        * The Errata requires below steps:
+        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+        * 2) Configure 4 PERFEVTSELx with the magic events and clear
+        *    the corresponding PMCx;
+        * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+        * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+        */
+
+       /*
+        * The real steps we choose are a little different from above.
+        * A) To reduce MSR operations, we don't run step 1) as they
+        *    are already cleared before this function is called;
+        * B) Call x86_perf_event_update to save PMCx before configuring
+        *    PERFEVTSELx with magic number;
+        * C) With step 5), we do clear only when the PERFEVTSELx is
+        *    not used currently.
+        * D) Call x86_perf_event_set_period to restore PMCx;
+        */
+
+       /* We always operate 4 pairs of PERF Counters */
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+               if (event)
+                       x86_perf_event_update(event);
+       }
+
+       for (i = 0; i < 4; i++) {
+               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+       }
+
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+       for (i = 0; i < 4; i++) {
+               event = cpuc->events[i];
+
+               if (event) {
+                       x86_perf_event_set_period(event);
+                       __x86_pmu_enable_event(&event->hw,
+                                       ARCH_PERFMON_EVENTSEL_ENABLE);
+               } else
+                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+       }
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+       if (added)
+               intel_pmu_nhm_workaround();
+       intel_pmu_enable_all(added);
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, mask;
+
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               intel_pmu_disable_bts();
+               intel_pmu_drain_bts_buffer();
+               return;
+       }
+
+       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+       /*
+        * must disable before any actual event
+        * because any event may be combined with LBR
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_disable(event);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_disable_fixed(hwc);
+               return;
+       }
+
+       x86_pmu_disable_event(event);
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_disable(event);
+}
+
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+       u64 ctrl_val, bits, mask;
+
+       /*
+        * Enable IRQ generation (0x8),
+        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+        * if requested:
+        */
+       bits = 0x8ULL;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+               bits |= 0x2;
+       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+               bits |= 0x1;
+
+       /*
+        * ANY bit is supported in v3 and up
+        */
+       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+               bits |= 0x4;
+
+       bits <<= (idx * 4);
+       mask = 0xfULL << (idx * 4);
+
+       rdmsrl(hwc->config_base, ctrl_val);
+       ctrl_val &= ~mask;
+       ctrl_val |= bits;
+       wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+               if (!__this_cpu_read(cpu_hw_events.enabled))
+                       return;
+
+               intel_pmu_enable_bts(hwc->config);
+               return;
+       }
+       /*
+        * must enabled before any actual event
+        * because any event may be combined with LBR
+        */
+       if (needs_branch_stack(event))
+               intel_pmu_lbr_enable(event);
+
+       if (event->attr.exclude_host)
+               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+       if (event->attr.exclude_guest)
+               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+
+       if (unlikely(event_is_checkpointed(event)))
+               cpuc->intel_cp_status |= (1ull << hwc->idx);
+
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc);
+               return;
+       }
+
+       if (unlikely(event->attr.precise_ip))
+               intel_pmu_pebs_enable(event);
+
+       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+int intel_pmu_save_and_restart(struct perf_event *event)
+{
+       x86_perf_event_update(event);
+       /*
+        * For a checkpointed counter always reset back to 0.  This
+        * avoids a situation where the counter overflows, aborts the
+        * transaction and is then set back to shortly before the
+        * overflow, and overflows and aborts again.
+        */
+       if (unlikely(event_is_checkpointed(event))) {
+               /* No race with NMIs because the counter should not be armed */
+               wrmsrl(event->hw.event_base, 0);
+               local64_set(&event->hw.prev_count, 0);
+       }
+       return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+
+       if (ds)
+               ds->bts_index = ds->bts_buffer_base;
+
+       /* Ack all overflows and disable fixed counters */
+       if (x86_pmu.version >= 2) {
+               intel_pmu_ack_status(intel_pmu_get_status());
+               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+       }
+
+       /* Reset LBRs and LBR freezing */
+       if (x86_pmu.lbr_nr) {
+               update_debugctlmsr(get_debugctlmsr() &
+                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int bit, loops;
+       u64 status;
+       int handled;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       /*
+        * No known reason to not always do late ACK,
+        * but just in case do it opt-in.
+        */
+       if (!x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       __intel_pmu_disable_all();
+       handled = intel_pmu_drain_bts_buffer();
+       handled += intel_bts_interrupt();
+       status = intel_pmu_get_status();
+       if (!status)
+               goto done;
+
+       loops = 0;
+again:
+       intel_pmu_lbr_read();
+       intel_pmu_ack_status(status);
+       if (++loops > 100) {
+               static bool warned = false;
+               if (!warned) {
+                       WARN(1, "perfevents: irq loop stuck!\n");
+                       perf_event_print_debug();
+                       warned = true;
+               }
+               intel_pmu_reset();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+
+       /*
+        * Ignore a range of extra bits in status that do not indicate
+        * overflow by themselves.
+        */
+       status &= ~(GLOBAL_STATUS_COND_CHG |
+                   GLOBAL_STATUS_ASIF |
+                   GLOBAL_STATUS_LBRS_FROZEN);
+       if (!status)
+               goto done;
+
+       /*
+        * PEBS overflow sets bit 62 in the global status register
+        */
+       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+               handled++;
+               x86_pmu.drain_pebs(regs);
+               /*
+                * There are cases where, even though, the PEBS ovfl bit is set
+                * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+                * overflow bits set for their counters. We must clear them
+                * here because they have been processed as exact samples in
+                * the drain_pebs() routine. They must not be processed again
+                * in the for_each_bit_set() loop for regular samples below.
+                */
+               status &= ~cpuc->pebs_enabled;
+               status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+       }
+
+       /*
+        * Intel PT
+        */
+       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+               handled++;
+               intel_pt_interrupt();
+       }
+
+       /*
+        * Checkpointed counters can lead to 'spurious' PMIs because the
+        * rollback caused by the PMI will have cleared the overflow status
+        * bit. Therefore always force probe these counters.
+        */
+       status |= cpuc->intel_cp_status;
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (has_branch_stack(event))
+                       data.br_stack = &cpuc->lbr_stack;
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = intel_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               __intel_pmu_enable_all(0, true);
+
+       /*
+        * Only unmask the NMI after the overflow counters
+        * have been reset. This avoids spurious NMIs on
+        * Haswell CPUs.
+        */
+       if (x86_pmu.late_ack)
+               apic_write(APIC_LVTPC, APIC_DM_NMI);
+       return handled;
+}
+
+static struct event_constraint *
+intel_bts_constraints(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned int hw_event, bts_event;
+
+       if (event->attr.freq)
+               return NULL;
+
+       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
+               return &bts_constraint;
+
+       return NULL;
+}
+
+static int intel_alt_er(int idx, u64 config)
+{
+       int alt_idx = idx;
+
+       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
+               return idx;
+
+       if (idx == EXTRA_REG_RSP_0)
+               alt_idx = EXTRA_REG_RSP_1;
+
+       if (idx == EXTRA_REG_RSP_1)
+               alt_idx = EXTRA_REG_RSP_0;
+
+       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+               return idx;
+
+       return alt_idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+       event->hw.extra_reg.idx = idx;
+
+       if (idx == EXTRA_REG_RSP_0) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+       } else if (idx == EXTRA_REG_RSP_1) {
+               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+       }
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
+static struct event_constraint *
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+                                  struct perf_event *event,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct event_constraint *c = &emptyconstraint;
+       struct er_account *era;
+       unsigned long flags;
+       int idx = reg->idx;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake cpuc we
+        * need to ignore this, otherwise we might fail to allocate proper fake
+        * state for this extra reg constraint. Also see the comment below.
+        */
+       if (reg->alloc && !cpuc->is_fake)
+               return NULL; /* call x86_get_event_constraint() */
+
+again:
+       era = &cpuc->shared_regs->regs[idx];
+       /*
+        * we use spin_lock_irqsave() to avoid lockdep issues when
+        * passing a fake cpuc
+        */
+       raw_spin_lock_irqsave(&era->lock, flags);
+
+       if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+               /*
+                * If its a fake cpuc -- as per validate_{group,event}() we
+                * shouldn't touch event state and we can avoid doing so
+                * since both will only call get_event_constraints() once
+                * on each event, this avoids the need for reg->alloc.
+                *
+                * Not doing the ER fixup will only result in era->reg being
+                * wrong, but since we won't actually try and program hardware
+                * this isn't a problem either.
+                */
+               if (!cpuc->is_fake) {
+                       if (idx != reg->idx)
+                               intel_fixup_er(event, idx);
+
+                       /*
+                        * x86_schedule_events() can call get_event_constraints()
+                        * multiple times on events in the case of incremental
+                        * scheduling(). reg->alloc ensures we only do the ER
+                        * allocation once.
+                        */
+                       reg->alloc = 1;
+               }
+
+               /* lock in msr value */
+               era->config = reg->config;
+               era->reg = reg->reg;
+
+               /* one more user */
+               atomic_inc(&era->ref);
+
+               /*
+                * need to call x86_get_event_constraint()
+                * to check if associated event has constraints
+                */
+               c = NULL;
+       } else {
+               idx = intel_alt_er(idx, reg->config);
+               if (idx != reg->idx) {
+                       raw_spin_unlock_irqrestore(&era->lock, flags);
+                       goto again;
+               }
+       }
+       raw_spin_unlock_irqrestore(&era->lock, flags);
+
+       return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+                                  struct hw_perf_event_extra *reg)
+{
+       struct er_account *era;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also takes
+        * care of event which do not use an extra shared reg.
+        *
+        * Also, if this is a fake cpuc we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+        * either since it'll be thrown out.
+        */
+       if (!reg->alloc || cpuc->is_fake)
+               return;
+
+       era = &cpuc->shared_regs->regs[reg->idx];
+
+       /* one fewer user */
+       atomic_dec(&era->ref);
+
+       /* allocate again next time */
+       reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+                             struct perf_event *event)
+{
+       struct event_constraint *c = NULL, *d;
+       struct hw_perf_event_extra *xreg, *breg;
+
+       xreg = &event->hw.extra_reg;
+       if (xreg->idx != EXTRA_REG_NONE) {
+               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+               if (c == &emptyconstraint)
+                       return c;
+       }
+       breg = &event->hw.branch_reg;
+       if (breg->idx != EXTRA_REG_NONE) {
+               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+               if (d == &emptyconstraint) {
+                       __intel_shared_reg_put_constraints(cpuc, xreg);
+                       c = d;
+               }
+       }
+       return c;
+}
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (x86_pmu.event_constraints) {
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &unconstrained;
+}
+
+static struct event_constraint *
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_bts_constraints(event);
+       if (c)
+               return c;
+
+       c = intel_shared_regs_constraints(cpuc, event);
+       if (c)
+               return c;
+
+       c = intel_pebs_constraints(event);
+       if (c)
+               return c;
+
+       return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = true;
+       /*
+        * lock shared state until we are done scheduling
+        * in stop_event_scheduling()
+        * makes scheduling appear as a transaction
+        */
+       raw_spin_lock(&excl_cntrs->lock);
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct event_constraint *c = cpuc->event_constraint[idx];
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       lockdep_assert_held(&excl_cntrs->lock);
+
+       if (c->flags & PERF_X86_EVENT_EXCL)
+               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+       else
+               xl->state[cntr] = INTEL_EXCL_SHARED;
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       xl = &excl_cntrs->states[tid];
+
+       xl->sched_started = false;
+       /*
+        * release shared state lock (acquired in intel_start_scheduling())
+        */
+       raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+                          int idx, struct event_constraint *c)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct intel_excl_states *xlo;
+       int tid = cpuc->excl_thread_id;
+       int is_excl, i;
+
+       /*
+        * validating a group does not require
+        * enforcing cross-thread  exclusion
+        */
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return c;
+
+       /*
+        * no exclusion needed
+        */
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return c;
+
+       /*
+        * because we modify the constraint, we need
+        * to make a copy. Static constraints come
+        * from static const tables.
+        *
+        * only needed when constraint has not yet
+        * been cloned (marked dynamic)
+        */
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+               struct event_constraint *cx;
+
+               /*
+                * grab pre-allocated constraint entry
+                */
+               cx = &cpuc->constraint_list[idx];
+
+               /*
+                * initialize dynamic constraint
+                * with static constraint
+                */
+               *cx = *c;
+
+               /*
+                * mark constraint as dynamic, so we
+                * can free it later on
+                */
+               cx->flags |= PERF_X86_EVENT_DYNAMIC;
+               c = cx;
+       }
+
+       /*
+        * From here on, the constraint is dynamic.
+        * Either it was just allocated above, or it
+        * was allocated during a earlier invocation
+        * of this function
+        */
+
+       /*
+        * state of sibling HT
+        */
+       xlo = &excl_cntrs->states[tid ^ 1];
+
+       /*
+        * event requires exclusive counter access
+        * across HT threads
+        */
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+               if (!cpuc->n_excl++)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+       }
+
+       /*
+        * Modify static constraint with current dynamic
+        * state of thread
+        *
+        * EXCLUSIVE: sibling counter measuring exclusive event
+        * SHARED   : sibling counter measuring non-exclusive event
+        * UNUSED   : sibling counter unused
+        */
+       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
+               /*
+                * exclusive event in sibling counter
+                * our corresponding counter cannot be used
+                * regardless of our event
+                */
+               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+                       __clear_bit(i, c->idxmsk);
+               /*
+                * if measuring an exclusive event, sibling
+                * measuring non-exclusive, then counter cannot
+                * be used
+                */
+               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+                       __clear_bit(i, c->idxmsk);
+       }
+
+       /*
+        * recompute actual bit weight for scheduling algorithm
+        */
+       c->weight = hweight64(c->idxmsk64);
+
+       /*
+        * if we return an empty mask, then switch
+        * back to static empty constraint to avoid
+        * the cost of freeing later on
+        */
+       if (c->weight == 0)
+               c = &emptyconstraint;
+
+       return c;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                           struct perf_event *event)
+{
+       struct event_constraint *c1 = NULL;
+       struct event_constraint *c2;
+
+       if (idx >= 0) /* fake does < 0 */
+               c1 = cpuc->event_constraint[idx];
+
+       /*
+        * first time only
+        * - static constraint: no change across incremental scheduling calls
+        * - dynamic constraint: handled by intel_get_excl_constraints()
+        */
+       c2 = __intel_get_event_constraints(cpuc, idx, event);
+       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+               c1->weight = c2->weight;
+               c2 = c1;
+       }
+
+       if (cpuc->excl_cntrs)
+               return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+       return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+               struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       int tid = cpuc->excl_thread_id;
+       struct intel_excl_states *xl;
+
+       /*
+        * nothing needed if in group validation mode
+        */
+       if (cpuc->is_fake)
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+               if (!--cpuc->n_excl)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+       }
+
+       /*
+        * If event was actually assigned, then mark the counter state as
+        * unused now.
+        */
+       if (hwc->idx >= 0) {
+               xl = &excl_cntrs->states[tid];
+
+               /*
+                * put_constraint may be called from x86_schedule_events()
+                * which already has the lock held so here make locking
+                * conditional.
+                */
+               if (!xl->sched_started)
+                       raw_spin_lock(&excl_cntrs->lock);
+
+               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+               if (!xl->sched_started)
+                       raw_spin_unlock(&excl_cntrs->lock);
+       }
+}
+
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+
+       reg = &event->hw.extra_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+
+       reg = &event->hw.branch_reg;
+       if (reg->idx != EXTRA_REG_NONE)
+               __intel_shared_reg_put_constraints(cpuc, reg);
+}
+
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+                                       struct perf_event *event)
+{
+       intel_put_shared_regs_event_constraints(cpuc, event);
+
+       /*
+        * is PMU has exclusive counter restrictions, then
+        * all events are subject to and must call the
+        * put_excl_constraints() routine
+        */
+       if (cpuc->excl_cntrs)
+               intel_put_excl_constraints(cpuc, event);
+}
+
+static void intel_pebs_aliases_core2(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.ANY_P
+                * (0x00c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * INST_RETIRED.ANY_P counts the number of cycles that retires
+                * CNTMASK instructions. By setting CNTMASK to a value (16)
+                * larger than the maximum number of instructions that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or less instructions, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use UOPS_RETIRED.ALL
+                * (0x01c2), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * UOPS_RETIRED.ALL counts the number of cycles that retires
+                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+                * larger than the maximum number of micro-ops that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or less micro-ops, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+                * (0x01c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * The PREC_DIST event has special support to minimize sample
+                * shadowing effects. One drawback is that it can be
+                * only programmed on counter 1, but that seems like an
+                * acceptable trade off.
+                */
+               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_snb(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+       if (event->attr.precise_ip < 3)
+               return intel_pebs_aliases_core2(event);
+       return intel_pebs_aliases_precdist(event);
+}
+
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+       unsigned long flags = x86_pmu.free_running_flags;
+
+       if (event->attr.use_clockid)
+               flags &= ~PERF_SAMPLE_TIME;
+       return flags;
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+       int ret = x86_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+
+       if (event->attr.precise_ip) {
+               if (!event->attr.freq) {
+                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+                       if (!(event->attr.sample_type &
+                             ~intel_pmu_free_running_flags(event)))
+                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+               }
+               if (x86_pmu.pebs_aliases)
+                       x86_pmu.pebs_aliases(event);
+       }
+
+       if (needs_branch_stack(event)) {
+               ret = intel_pmu_setup_lbr_filter(event);
+               if (ret)
+                       return ret;
+
+               /*
+                * BTS is set up earlier in this path, so don't account twice
+                */
+               if (!intel_pmu_has_bts(event)) {
+                       /* disallow lbr if conflicting events are present */
+                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+                               return -EBUSY;
+
+                       event->destroy = hw_perf_lbr_event_destroy;
+               }
+       }
+
+       if (event->attr.type != PERF_TYPE_RAW)
+               return 0;
+
+       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+               return 0;
+
+       if (x86_pmu.version < 3)
+               return -EINVAL;
+
+       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+       return 0;
+}
+
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+       if (x86_pmu.guest_get_msrs)
+               return x86_pmu.guest_get_msrs(nr);
+       *nr = 0;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+
+       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
+       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+       /*
+        * If PMU counter has PEBS enabled it is not enough to disable counter
+        * on a guest entry since PEBS memory write can overshoot guest entry
+        * and corrupt guest memory. Disabling PEBS solves the problem.
+        */
+       arr[1].msr = MSR_IA32_PEBS_ENABLE;
+       arr[1].host = cpuc->pebs_enabled;
+       arr[1].guest = 0;
+
+       *nr = 2;
+       return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
+               struct perf_event *event = cpuc->events[idx];
+
+               arr[idx].msr = x86_pmu_config_addr(idx);
+               arr[idx].host = arr[idx].guest = 0;
+
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+
+               arr[idx].host = arr[idx].guest =
+                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+               if (event->attr.exclude_host)
+                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+               else if (event->attr.exclude_guest)
+                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       }
+
+       *nr = x86_pmu.num_counters;
+       return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+       if (!event->attr.exclude_host)
+               x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+               if (!test_bit(idx, cpuc->active_mask) ||
+                               cpuc->events[idx]->attr.exclude_host)
+                       continue;
+
+               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+       }
+}
+
+static int hsw_hw_config(struct perf_event *event)
+{
+       int ret = intel_pmu_hw_config(event);
+
+       if (ret)
+               return ret;
+       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+               return 0;
+       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+       /*
+        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+        * PEBS or in ANY thread mode. Since the results are non-sensical forbid
+        * this combination.
+        */
+       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+             event->attr.precise_ip > 0))
+               return -EOPNOTSUPP;
+
+       if (event_is_checkpointed(event)) {
+               /*
+                * Sampling of checkpointed events can cause situations where
+                * the CPU constantly aborts because of a overflow, which is
+                * then checkpointed back and ignored. Forbid checkpointing
+                * for sampling.
+                *
+                * But still allow a long sampling period, so that perf stat
+                * from KVM works.
+                */
+               if (event->attr.sample_period > 0 &&
+                   event->attr.sample_period < 0x7fffffff)
+                       return -EOPNOTSUPP;
+       }
+       return 0;
+}
+
+static struct event_constraint counter2_constraint =
+                       EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       c = intel_get_event_constraints(cpuc, idx, event);
+
+       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+               if (c->idxmsk64 & (1U << 2))
+                       return &counter2_constraint;
+               return &emptyconstraint;
+       }
+
+       return c;
+}
+
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
+               if (left < 128)
+                       left = 128;
+               left &= ~0x3fu;
+       }
+       return left;
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+PMU_FORMAT_ATTR(in_tx,  "config:32");
+PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+
+static struct attribute *intel_arch_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+       return x86_event_sysfs_show(page, config, event);
+}
+
+struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       struct intel_shared_regs *regs;
+       int i;
+
+       regs = kzalloc_node(sizeof(struct intel_shared_regs),
+                           GFP_KERNEL, cpu_to_node(cpu));
+       if (regs) {
+               /*
+                * initialize the locks to keep lockdep happy
+                */
+               for (i = 0; i < EXTRA_REG_MAX; i++)
+                       raw_spin_lock_init(&regs->regs[i].lock);
+
+               regs->core_id = -1;
+       }
+       return regs;
+}
+
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+       struct intel_excl_cntrs *c;
+
+       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+                        GFP_KERNEL, cpu_to_node(cpu));
+       if (c) {
+               raw_spin_lock_init(&c->lock);
+               c->core_id = -1;
+       }
+       return c;
+}
+
+static int intel_pmu_cpu_prepare(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+               cpuc->shared_regs = allocate_shared_regs(cpu);
+               if (!cpuc->shared_regs)
+                       goto err;
+       }
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+               if (!cpuc->constraint_list)
+                       goto err_shared_regs;
+
+               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+               if (!cpuc->excl_cntrs)
+                       goto err_constraint_list;
+
+               cpuc->excl_thread_id = 0;
+       }
+
+       return NOTIFY_OK;
+
+err_constraint_list:
+       kfree(cpuc->constraint_list);
+       cpuc->constraint_list = NULL;
+
+err_shared_regs:
+       kfree(cpuc->shared_regs);
+       cpuc->shared_regs = NULL;
+
+err:
+       return NOTIFY_BAD;
+}
+
+static void intel_pmu_cpu_starting(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       int core_id = topology_core_id(cpu);
+       int i;
+
+       init_debug_store_on_cpu(cpu);
+       /*
+        * Deal with CPUs that don't clear their LBRs on power-up.
+        */
+       intel_pmu_lbr_reset();
+
+       cpuc->lbr_sel = NULL;
+
+       if (!cpuc->shared_regs)
+               return;
+
+       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_shared_regs *pc;
+
+                       pc = per_cpu(cpu_hw_events, i).shared_regs;
+                       if (pc && pc->core_id == core_id) {
+                               cpuc->kfree_on_online[0] = cpuc->shared_regs;
+                               cpuc->shared_regs = pc;
+                               break;
+                       }
+               }
+               cpuc->shared_regs->core_id = core_id;
+               cpuc->shared_regs->refcnt++;
+       }
+
+       if (x86_pmu.lbr_sel_map)
+               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+                       struct intel_excl_cntrs *c;
+
+                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
+                       if (c && c->core_id == core_id) {
+                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+                               cpuc->excl_cntrs = c;
+                               cpuc->excl_thread_id = 1;
+                               break;
+                       }
+               }
+               cpuc->excl_cntrs->core_id = core_id;
+               cpuc->excl_cntrs->refcnt++;
+       }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_excl_cntrs *c;
+
+       c = cpuc->excl_cntrs;
+       if (c) {
+               if (c->core_id == -1 || --c->refcnt == 0)
+                       kfree(c);
+               cpuc->excl_cntrs = NULL;
+               kfree(cpuc->constraint_list);
+               cpuc->constraint_list = NULL;
+       }
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+       struct intel_shared_regs *pc;
+
+       pc = cpuc->shared_regs;
+       if (pc) {
+               if (pc->core_id == -1 || --pc->refcnt == 0)
+                       kfree(pc);
+               cpuc->shared_regs = NULL;
+       }
+
+       free_excl_cntrs(cpu);
+
+       fini_debug_store_on_cpu(cpu);
+}
+
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+                                bool sched_in)
+{
+       if (x86_pmu.pebs_active)
+               intel_pmu_pebs_sched_task(ctx, sched_in);
+       if (x86_pmu.lbr_nr)
+               intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_any.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       &format_attr_in_tx.attr,
+       &format_attr_in_tx_cp.attr,
+
+       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+       &format_attr_ldlat.attr, /* PEBS load latency */
+       NULL,
+};
+
+static struct attribute *skl_format_attr[] = {
+       &format_attr_frontend.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu core_pmu = {
+       .name                   = "core",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = x86_pmu_disable_all,
+       .enable_all             = core_pmu_enable_all,
+       .enable                 = core_pmu_enable_event,
+       .disable                = x86_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+
+       /*
+        * Intel PMCs cannot be accessed sanely above 32-bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL<<31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .event_constraints      = intel_core_event_constraints,
+       .guest_get_msrs         = core_guest_get_msrs,
+       .format_attrs           = intel_arch_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       /*
+        * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
+        * together with PMU version 1 and thus be using core_pmu with
+        * shared_regs. We need following callbacks here to allocate
+        * it properly.
+        */
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+};
+
+static __initconst const struct x86_pmu intel_pmu = {
+       .name                   = "Intel",
+       .handle_irq             = intel_pmu_handle_irq,
+       .disable_all            = intel_pmu_disable_all,
+       .enable_all             = intel_pmu_enable_all,
+       .enable                 = intel_pmu_enable_event,
+       .disable                = intel_pmu_disable_event,
+       .hw_config              = intel_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
+       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
+       .event_map              = intel_pmu_event_map,
+       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
+       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic event period:
+        */
+       .max_period             = (1ULL << 31) - 1,
+       .get_event_constraints  = intel_get_event_constraints,
+       .put_event_constraints  = intel_put_event_constraints,
+       .pebs_aliases           = intel_pebs_aliases_core2,
+
+       .format_attrs           = intel_arch3_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+       .cpu_prepare            = intel_pmu_cpu_prepare,
+       .cpu_starting           = intel_pmu_cpu_starting,
+       .cpu_dying              = intel_pmu_cpu_dying,
+       .guest_get_msrs         = intel_guest_get_msrs,
+       .sched_task             = intel_pmu_sched_task,
+};
+
+static __init void intel_clovertown_quirk(void)
+{
+       /*
+        * PEBS is unreliable due to:
+        *
+        *   AJ67  - PEBS may experience CPL leaks
+        *   AJ68  - PEBS PMI may be delayed by one event
+        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+        *
+        * AJ67 could be worked around by restricting the OS/USR flags.
+        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+        *
+        * AJ106 could possibly be worked around by not allowing LBR
+        *       usage from PEBS, including the fixup.
+        * AJ68  could possibly be worked around by always programming
+        *       a pebs_event_reset[0] value and coping with the lost events.
+        *
+        * But taken together it might just make sense to not enable PEBS on
+        * these chips.
+        */
+       pr_warn("PEBS disabled due to CPU errata\n");
+       x86_pmu.pebs = 0;
+       x86_pmu.pebs_constraints = NULL;
+}
+
+static int intel_snb_pebs_broken(int cpu)
+{
+       u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+       switch (cpu_data(cpu).x86_model) {
+       case 42: /* SNB */
+               rev = 0x28;
+               break;
+
+       case 45: /* SNB-EP */
+               switch (cpu_data(cpu).x86_mask) {
+               case 6: rev = 0x618; break;
+               case 7: rev = 0x70c; break;
+               }
+       }
+
+       return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+       int pebs_broken = 0;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+                       break;
+       }
+       put_online_cpus();
+
+       if (pebs_broken == x86_pmu.pebs_broken)
+               return;
+
+       /*
+        * Serialized by the microcode lock..
+        */
+       if (x86_pmu.pebs_broken) {
+               pr_info("PEBS enabled due to microcode update\n");
+               x86_pmu.pebs_broken = 0;
+       } else {
+               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+               x86_pmu.pebs_broken = 1;
+       }
+}
+
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+       u64 val_old, val_new, val_tmp;
+
+       /*
+        * Read the current value, change it and read it back to see if it
+        * matches, this is needed to detect certain hardware emulators
+        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+        */
+       if (rdmsrl_safe(msr, &val_old))
+               return false;
+
+       /*
+        * Only change the bits which can be updated by wrmsrl.
+        */
+       val_tmp = val_old ^ mask;
+       if (wrmsrl_safe(msr, val_tmp) ||
+           rdmsrl_safe(msr, &val_new))
+               return false;
+
+       if (val_new != val_tmp)
+               return false;
+
+       /* Here it's sure that the MSR can be safely accessed.
+        * Restore the old value and return.
+        */
+       wrmsrl(msr, val_old);
+
+       return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+       x86_pmu.check_microcode = intel_snb_check_microcode;
+       intel_snb_check_microcode();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+       int bit;
+
+       /* disable event that reported as not presend by cpuid */
+       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+               pr_warn("CPUID marked event: \'%s\' unavailable\n",
+                       intel_arch_events_map[bit].name);
+       }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+       union cpuid10_ebx ebx;
+
+       ebx.full = x86_pmu.events_maskl;
+       if (ebx.split.no_branch_misses_retired) {
+               /*
+                * Erratum AAJ80 detected, we work it around by using
+                * the BR_MISP_EXEC.ANY event. This will over-count
+                * branch-misses, but it's still much better than the
+                * architectural event which is often completely bogus:
+                */
+               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+               ebx.split.no_branch_misses_retired = 0;
+               x86_pmu.events_maskl = ebx.full;
+               pr_info("CPU erratum AAJ80 worked around\n");
+       }
+}
+
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+       x86_pmu.start_scheduling = intel_start_scheduling;
+       x86_pmu.commit_scheduling = intel_commit_scheduling;
+       x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
+EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+       EVENT_PTR(tx_start),
+       EVENT_PTR(tx_commit),
+       EVENT_PTR(tx_abort),
+       EVENT_PTR(tx_capacity),
+       EVENT_PTR(tx_conflict),
+       EVENT_PTR(el_start),
+       EVENT_PTR(el_commit),
+       EVENT_PTR(el_abort),
+       EVENT_PTR(el_capacity),
+       EVENT_PTR(el_conflict),
+       EVENT_PTR(cycles_t),
+       EVENT_PTR(cycles_ct),
+       EVENT_PTR(mem_ld_hsw),
+       EVENT_PTR(mem_st_hsw),
+       NULL
+};
+
+__init int intel_pmu_init(void)
+{
+       union cpuid10_edx edx;
+       union cpuid10_eax eax;
+       union cpuid10_ebx ebx;
+       struct event_constraint *c;
+       unsigned int unused;
+       struct extra_reg *er;
+       int version, i;
+
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+               switch (boot_cpu_data.x86) {
+               case 0x6:
+                       return p6_pmu_init();
+               case 0xb:
+                       return knc_pmu_init();
+               case 0xf:
+                       return p4_pmu_init();
+               }
+               return -ENODEV;
+       }
+
+       /*
+        * Check whether the Architectural PerfMon supports
+        * Branch Misses Retired hw_event or not.
+        */
+       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
+               return -ENODEV;
+
+       version = eax.split.version_id;
+       if (version < 2)
+               x86_pmu = core_pmu;
+       else
+               x86_pmu = intel_pmu;
+
+       x86_pmu.version                 = version;
+       x86_pmu.num_counters            = eax.split.num_counters;
+       x86_pmu.cntval_bits             = eax.split.bit_width;
+       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
+
+       x86_pmu.events_maskl            = ebx.full;
+       x86_pmu.events_mask_len         = eax.split.mask_length;
+
+       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose events, so
+        * assume at least 3 events:
+        */
+       if (version > 1)
+               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+       if (boot_cpu_has(X86_FEATURE_PDCM)) {
+               u64 capabilities;
+
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+               x86_pmu.intel_cap.capabilities = capabilities;
+       }
+
+       intel_ds_init();
+
+       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
+       /*
+        * Install the hw-cache-events table:
+        */
+       switch (boot_cpu_data.x86_model) {
+       case 14: /* 65nm Core "Yonah" */
+               pr_cont("Core events, ");
+               break;
+
+       case 15: /* 65nm Core2 "Merom"          */
+               x86_add_quirk(intel_clovertown_quirk);
+       case 22: /* 65nm Core2 "Merom-L"        */
+       case 23: /* 45nm Core2 "Penryn"         */
+       case 29: /* 45nm Core2 "Dunnington (MP) */
+               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_core();
+
+               x86_pmu.event_constraints = intel_core2_event_constraints;
+               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
+               pr_cont("Core2 events, ");
+               break;
+
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_nehalem_event_constraints;
+               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               x86_add_quirk(intel_nehalem_quirk);
+
+               pr_cont("Nehalem events, ");
+               break;
+
+       case 28: /* 45nm Atom "Pineview"   */
+       case 38: /* 45nm Atom "Lincroft"   */
+       case 39: /* 32nm Atom "Penwell"    */
+       case 53: /* 32nm Atom "Cloverview" */
+       case 54: /* 32nm Atom "Cedarview"  */
+               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_gen_event_constraints;
+               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
+               pr_cont("Atom events, ");
+               break;
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 76: /* 14nm Atom "Airmont"                   */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+                       sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_atom();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_slm_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               pr_cont("Silvermont events, ");
+               break;
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_nhm();
+
+               x86_pmu.event_constraints = intel_westmere_event_constraints;
+               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_westmere_extra_regs;
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+
+               x86_pmu.cpu_events = nhm_events_attrs;
+
+               /* UOPS_ISSUED.STALLED_CYCLES */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+               intel_pmu_pebs_data_source_nhm();
+               pr_cont("Westmere events, ");
+               break;
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+               x86_add_quirk(intel_sandybridge_quirk);
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_snb_event_constraints;
+               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+               if (boot_cpu_data.x86_model == 45)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("SandyBridge events, ");
+               break;
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+               x86_add_quirk(intel_ht_bug);
+               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+                      sizeof(hw_cache_event_ids));
+               /* dTLB-load-misses on IVB is different than SNB */
+               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+                      sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_snb();
+
+               x86_pmu.event_constraints = intel_ivb_event_constraints;
+               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               if (boot_cpu_data.x86_model == 62)
+                       x86_pmu.extra_regs = intel_snbep_extra_regs;
+               else
+                       x86_pmu.extra_regs = intel_snb_extra_regs;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.cpu_events = snb_events_attrs;
+
+               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+               pr_cont("IvyBridge events, ");
+               break;
+
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+               x86_add_quirk(intel_ht_bug);
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_hsw_event_constraints;
+               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.lbr_double_abort = true;
+               pr_cont("Haswell events, ");
+               break;
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+                                                                         HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+               intel_pmu_lbr_init_hsw();
+
+               x86_pmu.event_constraints = intel_bdw_event_constraints;
+               x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_snbep_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.limit_period = bdw_limit_period;
+               pr_cont("Broadwell events, ");
+               break;
+
+       case 87: /* Knights Landing Xeon Phi */
+               memcpy(hw_cache_event_ids,
+                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs,
+                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_knl();
+
+               x86_pmu.event_constraints = intel_slm_event_constraints;
+               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_knl_extra_regs;
+
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               pr_cont("Knights Landing events, ");
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               x86_pmu.late_ack = true;
+               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+               intel_pmu_lbr_init_skl();
+
+               x86_pmu.event_constraints = intel_skl_event_constraints;
+               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+               x86_pmu.extra_regs = intel_skl_extra_regs;
+               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+               x86_pmu.pebs_prec_dist = true;
+               /* all extra regs are per-cpu when HT is on */
+               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+               x86_pmu.hw_config = hsw_hw_config;
+               x86_pmu.get_event_constraints = hsw_get_event_constraints;
+               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+                                                 skl_format_attr);
+               WARN_ON(!x86_pmu.format_attrs);
+               x86_pmu.cpu_events = hsw_events_attrs;
+               pr_cont("Skylake events, ");
+               break;
+
+       default:
+               switch (x86_pmu.version) {
+               case 1:
+                       x86_pmu.event_constraints = intel_v1_event_constraints;
+                       pr_cont("generic architected perfmon v1, ");
+                       break;
+               default:
+                       /*
+                        * default constraints for v2 and up
+                        */
+                       x86_pmu.event_constraints = intel_gen_event_constraints;
+                       pr_cont("generic architected perfmon, ");
+                       break;
+               }
+       }
+
+       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+       }
+       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+       }
+
+       x86_pmu.intel_ctrl |=
+               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+       if (x86_pmu.event_constraints) {
+               /*
+                * event on fixed counter2 (REF_CYCLES) only works on this
+                * counter, so do not extend mask to generic counters
+                */
+               for_each_event_constraint(c, x86_pmu.event_constraints) {
+                       if (c->cmask == FIXED_EVENT_FLAGS
+                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+                       }
+                       c->idxmsk64 &=
+                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+                       c->weight = hweight64(c->idxmsk64);
+               }
+       }
+
+       /*
+        * Access LBR MSR may cause #GP under certain circumstances.
+        * E.g. KVM doesn't support LBR MSR
+        * Check all LBT MSR here.
+        * Disable LBR access if any LBR MSRs can not be accessed.
+        */
+       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+               x86_pmu.lbr_nr = 0;
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+                       x86_pmu.lbr_nr = 0;
+       }
+
+       /*
+        * Access extra MSR may cause #GP under certain circumstances.
+        * E.g. KVM doesn't support offcore event
+        * Check all extra_regs here.
+        */
+       if (x86_pmu.extra_regs) {
+               for (er = x86_pmu.extra_regs; er->msr; er++) {
+                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
+                       /* Disable LBR select mapping */
+                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+                               x86_pmu.lbr_sel_map = NULL;
+               }
+       }
+
+       /* Support full width counters using alternative MSR range */
+       if (x86_pmu.intel_cap.full_width_write) {
+               x86_pmu.max_period = x86_pmu.cntval_mask;
+               x86_pmu.perfctr = MSR_IA32_PMC0;
+               pr_cont("full-width counters, ");
+       }
+
+       return 0;
+}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+       int cpu = smp_processor_id();
+       int w, c;
+       /*
+        * problem not present on this CPU model, nothing to do
+        */
+       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+               return 0;
+
+       w = cpumask_weight(topology_sibling_cpumask(cpu));
+       if (w > 1) {
+               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+               return 0;
+       }
+
+       if (lockup_detector_suspend() != 0) {
+               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+               return 0;
+       }
+
+       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+       x86_pmu.start_scheduling = NULL;
+       x86_pmu.commit_scheduling = NULL;
+       x86_pmu.stop_scheduling = NULL;
+
+       lockup_detector_resume();
+
+       get_online_cpus();
+
+       for_each_online_cpu(c) {
+               free_excl_cntrs(c);
+       }
+
+       put_online_cpus();
+       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+       return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
new file mode 100644 (file)
index 0000000..93cb412
--- /dev/null
@@ -0,0 +1,1381 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC     0x0c8f
+#define MSR_IA32_QM_CTR                0x0c8e
+#define MSR_IA32_QM_EVTSEL     0x0c8d
+
+static u32 cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:              The cached Resource Monitoring ID
+ * @closid:            The cached Class Of Service ID
+ * @rmid_usecnt:       The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+       u32                     rmid;
+       u32                     closid;
+       int                     rmid_usecnt;
+};
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR         (1ULL << 63)
+#define RMID_VAL_UNAVAIL       (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static u32 intel_cqm_rotation_rmid;
+
+#define INVALID_RMID           (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(u32 rmid)
+{
+       if (!rmid || rmid == INVALID_RMID)
+               return false;
+
+       return true;
+}
+
+static u64 __rmid_read(u32 rmid)
+{
+       u64 val;
+
+       /*
+        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+        * it just says that to increase confusion.
+        */
+       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+       rdmsrl(MSR_IA32_QM_CTR, val);
+
+       /*
+        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+        * the number of cachelines tagged with @rmid.
+        */
+       return val;
+}
+
+enum rmid_recycle_state {
+       RMID_YOUNG = 0,
+       RMID_AVAILABLE,
+       RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+       u32 rmid;
+       enum rmid_recycle_state state;
+       struct list_head list;
+       unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       entry = cqm_rmid_ptrs[rmid];
+       WARN_ON(entry->rmid != rmid);
+
+       return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static u32 __get_rmid(void)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       if (list_empty(&cqm_rmid_free_lru))
+               return INVALID_RMID;
+
+       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+       list_del(&entry->list);
+
+       return entry->rmid;
+}
+
+static void __put_rmid(u32 rmid)
+{
+       struct cqm_rmid_entry *entry;
+
+       lockdep_assert_held(&cache_mutex);
+
+       WARN_ON(!__rmid_valid(rmid));
+       entry = __rmid_entry(rmid);
+
+       entry->queue_time = jiffies;
+       entry->state = RMID_YOUNG;
+
+       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+       struct cqm_rmid_entry *entry;
+       unsigned int nr_rmids;
+       int r = 0;
+
+       nr_rmids = cqm_max_rmid + 1;
+       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+                               nr_rmids, GFP_KERNEL);
+       if (!cqm_rmid_ptrs)
+               return -ENOMEM;
+
+       for (; r <= cqm_max_rmid; r++) {
+               struct cqm_rmid_entry *entry;
+
+               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+               if (!entry)
+                       goto fail;
+
+               INIT_LIST_HEAD(&entry->list);
+               entry->rmid = r;
+               cqm_rmid_ptrs[r] = entry;
+
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+       /*
+        * RMID 0 is special and is always allocated. It's used for all
+        * tasks that are not monitored.
+        */
+       entry = __rmid_entry(0);
+       list_del(&entry->list);
+
+       mutex_lock(&cache_mutex);
+       intel_cqm_rotation_rmid = __get_rmid();
+       mutex_unlock(&cache_mutex);
+
+       return 0;
+fail:
+       while (r--)
+               kfree(cqm_rmid_ptrs[r]);
+
+       kfree(cqm_rmid_ptrs);
+       return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+       /* Per-cpu and task events don't mix */
+       if ((a->attach_state & PERF_ATTACH_TASK) !=
+           (b->attach_state & PERF_ATTACH_TASK))
+               return false;
+
+#ifdef CONFIG_CGROUP_PERF
+       if (a->cgrp != b->cgrp)
+               return false;
+#endif
+
+       /* If not task event, we're machine wide */
+       if (!(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Events that target same task are placed into the same cache group.
+        */
+       if (a->hw.target == b->hw.target)
+               return true;
+
+       /*
+        * Are we an inherited event?
+        */
+       if (b->parent == a)
+               return true;
+
+       return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return perf_cgroup_from_task(event->hw.target, event->ctx);
+
+       return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *                PROHIBITS
+ *     system-wide    ->       cgroup and task
+ *     cgroup        ->        system-wide
+ *                           ->        task in cgroup
+ *     task          ->        system-wide
+ *                           ->        task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+       /*
+        * We can have any number of cgroups but only one system-wide
+        * event at a time.
+        */
+       if (a->cgrp && b->cgrp) {
+               struct perf_cgroup *ac = a->cgrp;
+               struct perf_cgroup *bc = b->cgrp;
+
+               /*
+                * This condition should have been caught in
+                * __match_event() and we should be sharing an RMID.
+                */
+               WARN_ON_ONCE(ac == bc);
+
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+
+       if (a->cgrp || b->cgrp) {
+               struct perf_cgroup *ac, *bc;
+
+               /*
+                * cgroup and system-wide events are mutually exclusive
+                */
+               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+                       return true;
+
+               /*
+                * Ensure neither event is part of the other's cgroup
+                */
+               ac = event_to_cgroup(a);
+               bc = event_to_cgroup(b);
+               if (ac == bc)
+                       return true;
+
+               /*
+                * Must have cgroup and non-intersecting task events.
+                */
+               if (!ac || !bc)
+                       return false;
+
+               /*
+                * We have cgroup and task events, and the task belongs
+                * to a cgroup. Check for for overlap.
+                */
+               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+                       return true;
+
+               return false;
+       }
+#endif
+       /*
+        * If one of them is not a task, same story as above with cgroups.
+        */
+       if (!(a->attach_state & PERF_ATTACH_TASK) ||
+           !(b->attach_state & PERF_ATTACH_TASK))
+               return true;
+
+       /*
+        * Must be non-overlapping.
+        */
+       return false;
+}
+
+struct rmid_read {
+       u32 rmid;
+       atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
+{
+       struct perf_event *event;
+       struct list_head *head = &group->hw.cqm_group_entry;
+       u32 old_rmid = group->hw.cqm_rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       /*
+        * If our RMID is being deallocated, perform a read now.
+        */
+       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+               struct rmid_read rr = {
+                       .value = ATOMIC64_INIT(0),
+                       .rmid = old_rmid,
+               };
+
+               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+                                &rr, 1);
+               local64_set(&group->count, atomic64_read(&rr.value));
+       }
+
+       raw_spin_lock_irq(&cache_lock);
+
+       group->hw.cqm_rmid = rmid;
+       list_for_each_entry(event, head, hw.cqm_group_entry)
+               event->hw.cqm_rmid = rmid;
+
+       raw_spin_unlock_irq(&cache_lock);
+
+       return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+       struct cqm_rmid_entry *entry;
+
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               if (entry->state != RMID_AVAILABLE)
+                       break;
+
+               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+                       entry->state = RMID_DIRTY;
+       }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(u32 rmid)
+{
+       struct perf_event *leader, *event;
+
+       lockdep_assert_held(&cache_mutex);
+
+       leader = list_first_entry(&cache_groups, struct perf_event,
+                                 hw.cqm_groups_entry);
+       event = leader;
+
+       list_for_each_entry_continue(event, &cache_groups,
+                                    hw.cqm_groups_entry) {
+               if (__rmid_valid(event->hw.cqm_rmid))
+                       continue;
+
+               if (__conflict_event(event, leader))
+                       continue;
+
+               intel_cqm_xchg_rmid(event, rmid);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+       struct cqm_rmid_entry *entry, *tmp;
+
+       lockdep_assert_held(&cache_mutex);
+
+       *available = 0;
+       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+               unsigned long min_queue_time;
+               unsigned long now = jiffies;
+
+               /*
+                * We hold RMIDs placed into limbo for a minimum queue
+                * time. Before the minimum queue time has elapsed we do
+                * not recycle RMIDs.
+                *
+                * The reasoning is that until a sufficient time has
+                * passed since we stopped using an RMID, any RMID
+                * placed onto the limbo list will likely still have
+                * data tagged in the cache, which means we'll probably
+                * fail to recycle it anyway.
+                *
+                * We can save ourselves an expensive IPI by skipping
+                * any RMIDs that have not been queued for the minimum
+                * time.
+                */
+               min_queue_time = entry->queue_time +
+                       msecs_to_jiffies(__rmid_queue_time_ms);
+
+               if (time_after(min_queue_time, now))
+                       break;
+
+               entry->state = RMID_AVAILABLE;
+               (*available)++;
+       }
+
+       /*
+        * Fast return if none of the RMIDs on the limbo list have been
+        * sitting on the queue for the minimum queue time.
+        */
+       if (!*available)
+               return false;
+
+       /*
+        * Test whether an RMID is free for each package.
+        */
+       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+               /*
+                * Exhausted all RMIDs that have waited min queue time.
+                */
+               if (entry->state == RMID_YOUNG)
+                       break;
+
+               if (entry->state == RMID_DIRTY)
+                       continue;
+
+               list_del(&entry->list); /* remove from limbo */
+
+               /*
+                * The rotation RMID gets priority if it's
+                * currently invalid. In which case, skip adding
+                * the RMID to the the free lru.
+                */
+               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+                       intel_cqm_rotation_rmid = entry->rmid;
+                       continue;
+               }
+
+               /*
+                * If we have groups waiting for RMIDs, hand
+                * them one now provided they don't conflict.
+                */
+               if (intel_cqm_sched_in_event(entry->rmid))
+                       continue;
+
+               /*
+                * Otherwise place it onto the free list.
+                */
+               list_add_tail(&entry->list, &cqm_rmid_free_lru);
+       }
+
+
+       return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+       struct perf_event *rotor;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       rotor = list_first_entry(&cache_groups, struct perf_event,
+                                hw.cqm_groups_entry);
+
+       /*
+        * The group at the front of the list should always have a valid
+        * RMID. If it doesn't then no groups have RMIDs assigned and we
+        * don't need to rotate the list.
+        */
+       if (next == rotor)
+               return;
+
+       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+       __put_rmid(rmid);
+
+       list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+       struct perf_event *group, *g;
+       u32 rmid;
+
+       lockdep_assert_held(&cache_mutex);
+
+       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+               if (group == event)
+                       continue;
+
+               rmid = group->hw.cqm_rmid;
+
+               /*
+                * Skip events that don't have a valid RMID.
+                */
+               if (!__rmid_valid(rmid))
+                       continue;
+
+               /*
+                * No conflict? No problem! Leave the event alone.
+                */
+               if (!__conflict_event(group, event))
+                       continue;
+
+               intel_cqm_xchg_rmid(group, INVALID_RMID);
+               __put_rmid(rmid);
+       }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+       struct perf_event *group, *start = NULL;
+       unsigned int threshold_limit;
+       unsigned int nr_needed = 0;
+       unsigned int nr_available;
+       bool rotated = false;
+
+       mutex_lock(&cache_mutex);
+
+again:
+       /*
+        * Fast path through this function if there are no groups and no
+        * RMIDs that need cleaning.
+        */
+       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+               if (!__rmid_valid(group->hw.cqm_rmid)) {
+                       if (!start)
+                               start = group;
+                       nr_needed++;
+               }
+       }
+
+       /*
+        * We have some event groups, but they all have RMIDs assigned
+        * and no RMIDs need cleaning.
+        */
+       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+               goto out;
+
+       if (!nr_needed)
+               goto stabilize;
+
+       /*
+        * We have more event groups without RMIDs than available RMIDs,
+        * or we have event groups that conflict with the ones currently
+        * scheduled.
+        *
+        * We force deallocate the rmid of the group at the head of
+        * cache_groups. The first event group without an RMID then gets
+        * assigned intel_cqm_rotation_rmid. This ensures we always make
+        * forward progress.
+        *
+        * Rotate the cache_groups list so the previous head is now the
+        * tail.
+        */
+       __intel_cqm_pick_and_rotate(start);
+
+       /*
+        * If the rotation is going to succeed, reduce the threshold so
+        * that we don't needlessly reuse dirty RMIDs.
+        */
+       if (__rmid_valid(intel_cqm_rotation_rmid)) {
+               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+               intel_cqm_rotation_rmid = __get_rmid();
+
+               intel_cqm_sched_out_conflicting_events(start);
+
+               if (__intel_cqm_threshold)
+                       __intel_cqm_threshold--;
+       }
+
+       rotated = true;
+
+stabilize:
+       /*
+        * We now need to stablize the RMID we freed above (if any) to
+        * ensure that the next time we rotate we have an RMID with zero
+        * occupancy value.
+        *
+        * Alternatively, if we didn't need to perform any rotation,
+        * we'll have a bunch of RMIDs in limbo that need stabilizing.
+        */
+       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+       while (intel_cqm_rmid_stabilize(&nr_available) &&
+              __intel_cqm_threshold < threshold_limit) {
+               unsigned int steal_limit;
+
+               /*
+                * Don't spin if nobody is actively waiting for an RMID,
+                * the rotation worker will be kicked as soon as an
+                * event needs an RMID anyway.
+                */
+               if (!nr_needed)
+                       break;
+
+               /* Allow max 25% of RMIDs to be in limbo. */
+               steal_limit = (cqm_max_rmid + 1) / 4;
+
+               /*
+                * We failed to stabilize any RMIDs so our rotation
+                * logic is now stuck. In order to make forward progress
+                * we have a few options:
+                *
+                *   1. rotate ("steal") another RMID
+                *   2. increase the threshold
+                *   3. do nothing
+                *
+                * We do both of 1. and 2. until we hit the steal limit.
+                *
+                * The steal limit prevents all RMIDs ending up on the
+                * limbo list. This can happen if every RMID has a
+                * non-zero occupancy above threshold_limit, and the
+                * occupancy values aren't dropping fast enough.
+                *
+                * Note that there is prioritisation at work here - we'd
+                * rather increase the number of RMIDs on the limbo list
+                * than increase the threshold, because increasing the
+                * threshold skews the event data (because we reuse
+                * dirty RMIDs) - threshold bumps are a last resort.
+                */
+               if (nr_available < steal_limit)
+                       goto again;
+
+               __intel_cqm_threshold++;
+       }
+
+out:
+       mutex_unlock(&cache_mutex);
+       return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+       unsigned long delay;
+
+       __intel_cqm_rmid_rotate();
+
+       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+       schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+                                 struct perf_event **group)
+{
+       struct perf_event *iter;
+       bool conflict = false;
+       u32 rmid;
+
+       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+               rmid = iter->hw.cqm_rmid;
+
+               if (__match_event(iter, event)) {
+                       /* All tasks in a group share an RMID */
+                       event->hw.cqm_rmid = rmid;
+                       *group = iter;
+                       return;
+               }
+
+               /*
+                * We only care about conflicts for events that are
+                * actually scheduled in (and hence have a valid RMID).
+                */
+               if (__conflict_event(iter, event) && __rmid_valid(rmid))
+                       conflict = true;
+       }
+
+       if (conflict)
+               rmid = INVALID_RMID;
+       else
+               rmid = __get_rmid();
+
+       event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+       unsigned long flags;
+       u32 rmid;
+       u64 val;
+
+       /*
+        * Task events are handled by intel_cqm_event_count().
+        */
+       if (event->cpu == -1)
+               return;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       rmid = event->hw.cqm_rmid;
+
+       if (!__rmid_valid(rmid))
+               goto out;
+
+       val = __rmid_read(rmid);
+
+       /*
+        * Ignore this reading on error states and do not update the value.
+        */
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               goto out;
+
+       local64_set(&event->count, val);
+out:
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+       struct rmid_read *rr = info;
+       u64 val;
+
+       val = __rmid_read(rr->rmid);
+
+       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+               return;
+
+       atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+       return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+       unsigned long flags;
+       struct rmid_read rr = {
+               .value = ATOMIC64_INIT(0),
+       };
+
+       /*
+        * We only need to worry about task events. System-wide events
+        * are handled like usual, i.e. entirely with
+        * intel_cqm_event_read().
+        */
+       if (event->cpu != -1)
+               return __perf_event_count(event);
+
+       /*
+        * Only the group leader gets to report values. This stops us
+        * reporting duplicate values to userspace, and gives us a clear
+        * rule for which task gets to report the values.
+        *
+        * Note that it is impossible to attribute these values to
+        * specific packages - we forfeit that ability when we create
+        * task events.
+        */
+       if (!cqm_group_leader(event))
+               return 0;
+
+       /*
+        * Getting up-to-date values requires an SMP IPI which is not
+        * possible if we're being called in interrupt context. Return
+        * the cached values instead.
+        */
+       if (unlikely(in_interrupt()))
+               goto out;
+
+       /*
+        * Notice that we don't perform the reading of an RMID
+        * atomically, because we can't hold a spin lock across the
+        * IPIs.
+        *
+        * Speculatively perform the read, since @event might be
+        * assigned a different (possibly invalid) RMID while we're
+        * busying performing the IPI calls. It's therefore necessary to
+        * check @event's RMID afterwards, and if it has changed,
+        * discard the result of the read.
+        */
+       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+       if (!__rmid_valid(rr.rmid))
+               goto out;
+
+       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+       if (event->hw.cqm_rmid == rr.rmid)
+               local64_set(&event->count, atomic64_read(&rr.value));
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+       return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+       u32 rmid = event->hw.cqm_rmid;
+
+       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+               return;
+
+       event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+       if (state->rmid_usecnt++) {
+               if (!WARN_ON_ONCE(state->rmid != rmid))
+                       return;
+       } else {
+               WARN_ON_ONCE(state->rmid);
+       }
+
+       state->rmid = rmid;
+       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+       if (event->hw.cqm_state & PERF_HES_STOPPED)
+               return;
+
+       event->hw.cqm_state |= PERF_HES_STOPPED;
+
+       intel_cqm_event_read(event);
+
+       if (!--state->rmid_usecnt) {
+               state->rmid = 0;
+               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
+       } else {
+               WARN_ON_ONCE(!state->rmid);
+       }
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+       unsigned long flags;
+       u32 rmid;
+
+       raw_spin_lock_irqsave(&cache_lock, flags);
+
+       event->hw.cqm_state = PERF_HES_STOPPED;
+       rmid = event->hw.cqm_rmid;
+
+       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+               intel_cqm_event_start(event, mode);
+
+       raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+       return 0;
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+       struct perf_event *group_other = NULL;
+
+       mutex_lock(&cache_mutex);
+
+       /*
+        * If there's another event in this group...
+        */
+       if (!list_empty(&event->hw.cqm_group_entry)) {
+               group_other = list_first_entry(&event->hw.cqm_group_entry,
+                                              struct perf_event,
+                                              hw.cqm_group_entry);
+               list_del(&event->hw.cqm_group_entry);
+       }
+
+       /*
+        * And we're the group leader..
+        */
+       if (cqm_group_leader(event)) {
+               /*
+                * If there was a group_other, make that leader, otherwise
+                * destroy the group and return the RMID.
+                */
+               if (group_other) {
+                       list_replace(&event->hw.cqm_groups_entry,
+                                    &group_other->hw.cqm_groups_entry);
+               } else {
+                       u32 rmid = event->hw.cqm_rmid;
+
+                       if (__rmid_valid(rmid))
+                               __put_rmid(rmid);
+                       list_del(&event->hw.cqm_groups_entry);
+               }
+       }
+
+       mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+       struct perf_event *group = NULL;
+       bool rotate = false;
+
+       if (event->attr.type != intel_cqm_pmu.type)
+               return -ENOENT;
+
+       if (event->attr.config & ~QOS_EVENT_MASK)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+       event->destroy = intel_cqm_event_destroy;
+
+       mutex_lock(&cache_mutex);
+
+       /* Will also set rmid */
+       intel_cqm_setup_event(event, &group);
+
+       if (group) {
+               list_add_tail(&event->hw.cqm_group_entry,
+                             &group->hw.cqm_group_entry);
+       } else {
+               list_add_tail(&event->hw.cqm_groups_entry,
+                             &cache_groups);
+
+               /*
+                * All RMIDs are either in use or have recently been
+                * used. Kick the rotation worker to clean/free some.
+                *
+                * We only do this for the group leader, rather than for
+                * every event in a group to save on needless work.
+                */
+               if (!__rmid_valid(event->hw.cqm_rmid))
+                       rotate = true;
+       }
+
+       mutex_unlock(&cache_mutex);
+
+       if (rotate)
+               schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+       return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+       EVENT_PTR(intel_cqm_llc),
+       EVENT_PTR(intel_cqm_llc_pkg),
+       EVENT_PTR(intel_cqm_llc_unit),
+       EVENT_PTR(intel_cqm_llc_scale),
+       EVENT_PTR(intel_cqm_llc_snapshot),
+       NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+       .name = "events",
+       .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+       .name = "format",
+       .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+                          char *page)
+{
+       ssize_t rv;
+
+       mutex_lock(&cache_mutex);
+       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+       mutex_unlock(&cache_mutex);
+
+       return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+                           struct device_attribute *attr,
+                           const char *buf, size_t count)
+{
+       unsigned int bytes, cachelines;
+       int ret;
+
+       ret = kstrtouint(buf, 0, &bytes);
+       if (ret)
+               return ret;
+
+       mutex_lock(&cache_mutex);
+
+       __intel_cqm_max_threshold = bytes;
+       cachelines = bytes / cqm_l3_scale;
+
+       /*
+        * The new maximum takes effect immediately.
+        */
+       if (__intel_cqm_threshold > cachelines)
+               __intel_cqm_threshold = cachelines;
+
+       mutex_unlock(&cache_mutex);
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+       &dev_attr_max_recycle_threshold.attr,
+       NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+       .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+       &intel_cqm_events_group,
+       &intel_cqm_format_group,
+       &intel_cqm_group,
+       NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+       .attr_groups         = intel_cqm_attr_groups,
+       .task_ctx_nr         = perf_sw_context,
+       .event_init          = intel_cqm_event_init,
+       .add                 = intel_cqm_event_add,
+       .del                 = intel_cqm_event_stop,
+       .start               = intel_cqm_event_start,
+       .stop                = intel_cqm_event_stop,
+       .read                = intel_cqm_event_read,
+       .count               = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+       int reader;
+
+       /* First online cpu in package becomes the reader */
+       reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
+       if (reader >= nr_cpu_ids)
+               cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+       state->rmid = 0;
+       state->closid = 0;
+       state->rmid_usecnt = 0;
+
+       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+       int target;
+
+       /* Is @cpu the current cqm reader for this package ? */
+       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+               return;
+
+       /* Find another online reader in this package */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &cqm_cpumask);
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu  = (unsigned long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               intel_cqm_cpu_exit(cpu);
+               break;
+       case CPU_STARTING:
+               intel_cqm_cpu_starting(cpu);
+               cqm_pick_event_reader(cpu);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+       {}
+};
+
+static int __init intel_cqm_init(void)
+{
+       char *str, scale[20];
+       int i, cpu, ret;
+
+       if (!x86_match_cpu(intel_cqm_match))
+               return -ENODEV;
+
+       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+       /*
+        * It's possible that not all resources support the same number
+        * of RMIDs. Instead of making scheduling much more complicated
+        * (where we have to match a task's RMID to a cpu that supports
+        * that many RMIDs) just find the minimum RMIDs supported across
+        * all cpus.
+        *
+        * Also, check that the scales match on all cpus.
+        */
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu) {
+               struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+               if (c->x86_cache_max_rmid < cqm_max_rmid)
+                       cqm_max_rmid = c->x86_cache_max_rmid;
+
+               if (c->x86_cache_occ_scale != cqm_l3_scale) {
+                       pr_err("Multiple LLC scale values, disabling\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * A reasonable upper limit on the max threshold is the number
+        * of lines tagged per RMID if all RMIDs have the same number of
+        * lines tagged in the LLC.
+        *
+        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+        */
+       __intel_cqm_max_threshold =
+               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+       str = kstrdup(scale, GFP_KERNEL);
+       if (!str) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       event_attr_intel_cqm_llc_scale.event_str = str;
+
+       ret = intel_cqm_setup_rmid_cache();
+       if (ret)
+               goto out;
+
+       for_each_online_cpu(i) {
+               intel_cqm_cpu_starting(i);
+               cqm_pick_event_reader(i);
+       }
+
+       __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+       if (ret)
+               pr_err("Intel CQM perf registration failed: %d\n", ret);
+       else
+               pr_info("Intel CQM monitoring enabled\n");
+
+out:
+       cpu_notifier_register_done();
+
+       return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
new file mode 100644 (file)
index 0000000..7946c42
--- /dev/null
@@ -0,0 +1,694 @@
+/*
+ * perf_event_intel_cstate.c: support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file export cstate related free running (read-only) counters
+ * for perf. These counters may be use simultaneously by other tools,
+ * such as turbostat. However, it still make sense to implement them
+ * in perf. Because we can conveniently collect them together with
+ * other events, and allow to use them from tools without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ *  - 'cstate_core': The counter is available for each physical core.
+ *    The counters include CORE_C*_RESIDENCY.
+ *  - 'cstate_pkg': The counter is available for each physical package.
+ *    The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer.s Manual Vol3b.
+ *
+ * Model specific counters:
+ *     MSR_CORE_C1_RES: CORE C1 Residency Counter
+ *                      perf code: 0x00
+ *                      Available model: SLM,AMT
+ *                      Scope: Core (each processor core has a MSR)
+ *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ *                            perf code: 0x03
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Core
+ *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
+ *                            perf code: 0x00
+ *                            Available model: SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
+ *                            perf code: 0x01
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
+ *                            perf code: 0x02
+ *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
+ *                            perf code: 0x03
+ *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
+ *                            perf code: 0x04
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
+ *                            perf code: 0x05
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ *                            perf code: 0x06
+ *                            Available model: HSW ULT only
+ *                            Scope: Package (physical package)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
+static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf);
+
+struct perf_cstate_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+
+/* cstate_core PMU */
+
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_id {
+       /*
+        * cstate_core events
+        */
+       PERF_CSTATE_CORE_C1_RES = 0,
+       PERF_CSTATE_CORE_C3_RES,
+       PERF_CSTATE_CORE_C6_RES,
+       PERF_CSTATE_CORE_C7_RES,
+
+       PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+bool test_core(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES ||
+                   idx == PERF_CSTATE_CORE_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_CORE_C1_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
+       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
+       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
+       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+       .name = "events",
+       .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+       &format_attr_core_event.attr,
+       NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+       .name = "format",
+       .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+       .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+       &core_events_attr_group,
+       &core_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_core PMU end */
+
+
+/* cstate_pkg PMU */
+
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_id {
+       /*
+        * cstate_pkg events
+        */
+       PERF_CSTATE_PKG_C2_RES = 0,
+       PERF_CSTATE_PKG_C3_RES,
+       PERF_CSTATE_PKG_C6_RES,
+       PERF_CSTATE_PKG_C7_RES,
+       PERF_CSTATE_PKG_C8_RES,
+       PERF_CSTATE_PKG_C9_RES,
+       PERF_CSTATE_PKG_C10_RES,
+
+       PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+bool test_pkg(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+               if (idx == PERF_CSTATE_CORE_C3_RES ||
+                   idx == PERF_CSTATE_CORE_C6_RES ||
+                   idx == PERF_CSTATE_CORE_C7_RES)
+                       return true;
+               break;
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES)
+                       return true;
+               break;
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_CSTATE_CORE_C6_RES)
+                       return true;
+               break;
+       case 69: /* 22nm Haswell ULT */
+               if (idx == PERF_CSTATE_PKG_C2_RES ||
+                   idx == PERF_CSTATE_PKG_C3_RES ||
+                   idx == PERF_CSTATE_PKG_C6_RES ||
+                   idx == PERF_CSTATE_PKG_C7_RES ||
+                   idx == PERF_CSTATE_PKG_C8_RES ||
+                   idx == PERF_CSTATE_PKG_C9_RES ||
+                   idx == PERF_CSTATE_PKG_C10_RES)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
+       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
+       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
+       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
+       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
+       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
+       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+       .name = "events",
+       .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+       &format_attr_pkg_event.attr,
+       NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+       .name = "format",
+       .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+       &pkg_events_attr_group,
+       &pkg_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+/* cstate_pkg PMU end*/
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       if (pmu == &cstate_core_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+       else if (pmu == &cstate_pkg_pmu)
+               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+       else
+               return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+       int ret = 0;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (event->pmu == &cstate_core_pmu) {
+               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+                       return -EINVAL;
+               if (!core_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = core_msr[cfg].msr;
+       } else if (event->pmu == &cstate_pkg_pmu) {
+               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+                       return -EINVAL;
+               if (!pkg_msr[cfg].attr)
+                       return -EINVAL;
+               event->hw.event_base = pkg_msr[cfg].msr;
+       } else
+               return -ENOENT;
+
+       /* must be done before validate_group */
+       event->hw.config = cfg;
+       event->hw.idx = -1;
+
+       return ret;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+       u64 val;
+
+       rdmsrl(event->hw.event_base, val);
+       return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       new_raw_count = cstate_pmu_read_counter(event);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count)
+               goto again;
+
+       local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+       if (mode & PERF_EF_START)
+               cstate_pmu_event_start(event, mode);
+
+       return 0;
+}
+
+static void cstate_cpu_exit(int cpu)
+{
+       int i, id, target;
+
+       /* cpu exit for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_core_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+       }
+
+       /* cpu exit for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               target = -1;
+
+               for_each_online_cpu(i) {
+                       if (i == cpu)
+                               continue;
+                       if (id == topology_physical_package_id(i)) {
+                               target = i;
+                               break;
+                       }
+               }
+               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
+               if (target >= 0)
+                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+       }
+}
+
+static void cstate_cpu_init(int cpu)
+{
+       int i, id;
+
+       /* cpu init for cstate core */
+       if (has_cstate_core) {
+               id = topology_core_id(cpu);
+               for_each_cpu(i, &cstate_core_cpu_mask) {
+                       if (id == topology_core_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+       }
+
+       /* cpu init for cstate pkg */
+       if (has_cstate_pkg) {
+               id = topology_physical_package_id(cpu);
+               for_each_cpu(i, &cstate_pkg_cpu_mask) {
+                       if (id == topology_physical_package_id(i))
+                               break;
+               }
+               if (i >= nr_cpu_ids)
+                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+       }
+}
+
+static int cstate_cpu_notifier(struct notifier_block *self,
+                                 unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               break;
+       case CPU_STARTING:
+               cstate_cpu_init(cpu);
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               break;
+       case CPU_ONLINE:
+       case CPU_DEAD:
+               break;
+       case CPU_DOWN_PREPARE:
+               cstate_cpu_exit(cpu);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there is no available events.
+ */
+static bool cstate_probe_msr(struct perf_cstate_msr *msr,
+                            struct attribute   **events_attrs,
+                            int max_event_nr)
+{
+       int i, j = 0;
+       u64 val;
+
+       /* Probe the cstate events. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining events in the sysfs attrs. */
+       for (i = 0; i < max_event_nr; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       return (j > 0) ? true : false;
+}
+
+static int __init cstate_init(void)
+{
+       /* SLM has different MSR for PKG C6 */
+       switch (boot_cpu_data.x86_model) {
+       case 55:
+       case 76:
+       case 77:
+               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+       }
+
+       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
+               has_cstate_core = true;
+
+       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
+               has_cstate_pkg = true;
+
+       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static void __init cstate_cpumask_init(void)
+{
+       int cpu;
+
+       cpu_notifier_register_begin();
+
+       for_each_online_cpu(cpu)
+               cstate_cpu_init(cpu);
+
+       __perf_cpu_notifier(cstate_cpu_notifier);
+
+       cpu_notifier_register_done();
+}
+
+static struct pmu cstate_core_pmu = {
+       .attr_groups    = core_attr_groups,
+       .name           = "cstate_core",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu cstate_pkg_pmu = {
+       .attr_groups    = pkg_attr_groups,
+       .name           = "cstate_pkg",
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = cstate_pmu_event_init,
+       .add            = cstate_pmu_event_add, /* must have */
+       .del            = cstate_pmu_event_del, /* must have */
+       .start          = cstate_pmu_event_start,
+       .stop           = cstate_pmu_event_stop,
+       .read           = cstate_pmu_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static void __init cstate_pmus_register(void)
+{
+       int err;
+
+       if (has_cstate_core) {
+               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_core_pmu.name, err);
+       }
+
+       if (has_cstate_pkg) {
+               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+               if (WARN_ON(err))
+                       pr_info("Failed to register PMU %s error %d\n",
+                               cstate_pkg_pmu.name, err);
+       }
+}
+
+static int __init cstate_pmu_init(void)
+{
+       int err;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       err = cstate_init();
+       if (err)
+               return err;
+
+       cstate_cpumask_init();
+
+       cstate_pmus_register();
+
+       return 0;
+}
+
+device_initcall(cstate_pmu_init);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
new file mode 100644 (file)
index 0000000..ce7211a
--- /dev/null
@@ -0,0 +1,1410 @@
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
+#define PEBS_FIXUP_SIZE                PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+       u32 flags, ip;
+       u32 ax, bc, cx, dx;
+       u32 si, di, bp, sp;
+};
+
+ */
+
+union intel_x86_pebs_dse {
+       u64 val;
+       struct {
+               unsigned int ld_dse:4;
+               unsigned int ld_stlb_miss:1;
+               unsigned int ld_locked:1;
+               unsigned int ld_reserved:26;
+       };
+       struct {
+               unsigned int st_l1d_hit:1;
+               unsigned int st_reserved1:3;
+               unsigned int st_stlb_miss:1;
+               unsigned int st_locked:1;
+               unsigned int st_reserved2:26;
+       };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
+       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
+       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
+       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
+       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
+       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
+       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
+       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
+       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+       pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
+       pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+       pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+}
+
+static u64 precise_store_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+       dse.val = status;
+
+       /*
+        * bit 4: TLB access
+        * 1 = stored missed 2nd level TLB
+        *
+        * so it either hit the walker or the OS
+        * otherwise hit 2nd level TLB
+        */
+       if (dse.st_stlb_miss)
+               val |= P(TLB, MISS);
+       else
+               val |= P(TLB, HIT);
+
+       /*
+        * bit 0: hit L1 data cache
+        * if not set, then all we know is that
+        * it missed L1D
+        */
+       if (dse.st_l1d_hit)
+               val |= P(LVL, HIT);
+       else
+               val |= P(LVL, MISS);
+
+       /*
+        * bit 5: Locked prefix
+        */
+       if (dse.st_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
+
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
+{
+       union perf_mem_data_src dse;
+
+       dse.val = PERF_MEM_NA;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+               dse.mem_op = PERF_MEM_OP_STORE;
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+               dse.mem_op = PERF_MEM_OP_LOAD;
+
+       /*
+        * L1 info only valid for following events:
+        *
+        * MEM_UOPS_RETIRED.STLB_MISS_STORES
+        * MEM_UOPS_RETIRED.LOCK_STORES
+        * MEM_UOPS_RETIRED.SPLIT_STORES
+        * MEM_UOPS_RETIRED.ALL_STORES
+        */
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+               if (status & 1)
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+               else
+                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+       }
+       return dse.val;
+}
+
+static u64 load_latency_data(u64 status)
+{
+       union intel_x86_pebs_dse dse;
+       u64 val;
+       int model = boot_cpu_data.x86_model;
+       int fam = boot_cpu_data.x86;
+
+       dse.val = status;
+
+       /*
+        * use the mapping table for bit 0-3
+        */
+       val = pebs_data_source[dse.ld_dse];
+
+       /*
+        * Nehalem models do not support TLB, Lock infos
+        */
+       if (fam == 0x6 && (model == 26 || model == 30
+           || model == 31 || model == 46)) {
+               val |= P(TLB, NA) | P(LOCK, NA);
+               return val;
+       }
+       /*
+        * bit 4: TLB access
+        * 0 = did not miss 2nd level TLB
+        * 1 = missed 2nd level TLB
+        */
+       if (dse.ld_stlb_miss)
+               val |= P(TLB, MISS) | P(TLB, L2);
+       else
+               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+       /*
+        * bit 5: locked prefix
+        */
+       if (dse.ld_locked)
+               val |= P(LOCK, LOCKED);
+
+       return val;
+}
+
+struct pebs_record_core {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+};
+
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+       struct {
+               u32 cycles_last_block     : 32,
+                   hle_abort             : 1,
+                   rtm_abort             : 1,
+                   instruction_abort     : 1,
+                   non_instruction_abort : 1,
+                   retry                 : 1,
+                   data_conflict         : 1,
+                   capacity_writes       : 1,
+                   capacity_reads        : 1;
+       };
+       u64         value;
+};
+
+#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
+
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+       u64 tsc;
+};
+
+void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)((u64)(unsigned long)ds),
+                    (u32)((u64)(unsigned long)ds >> 32));
+}
+
+void fini_debug_store_on_cpu(int cpu)
+{
+       if (!per_cpu(cpu_hw_events, cpu).ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static DEFINE_PER_CPU(void *, insn_buffer);
+
+static int alloc_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max;
+       void *buffer, *ibuffer;
+
+       if (!x86_pmu.pebs)
+               return 0;
+
+       buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+       if (unlikely(!buffer))
+               return -ENOMEM;
+
+       /*
+        * HSW+ already provides us the eventing ip; no need to allocate this
+        * buffer then.
+        */
+       if (x86_pmu.intel_cap.pebs_format < 2) {
+               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!ibuffer) {
+                       kfree(buffer);
+                       return -ENOMEM;
+               }
+               per_cpu(insn_buffer, cpu) = ibuffer;
+       }
+
+       max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+
+       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+       ds->pebs_index = ds->pebs_buffer_base;
+       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+               max * x86_pmu.pebs_record_size;
+
+       return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.pebs)
+               return;
+
+       kfree(per_cpu(insn_buffer, cpu));
+       per_cpu(insn_buffer, cpu) = NULL;
+
+       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       ds->pebs_buffer_base = 0;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+       int node = cpu_to_node(cpu);
+       int max, thresh;
+       void *buffer;
+
+       if (!x86_pmu.bts)
+               return 0;
+
+       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+       if (unlikely(!buffer)) {
+               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+               return -ENOMEM;
+       }
+
+       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+       thresh = max / 16;
+
+       ds->bts_buffer_base = (u64)(unsigned long)buffer;
+       ds->bts_index = ds->bts_buffer_base;
+       ds->bts_absolute_maximum = ds->bts_buffer_base +
+               max * BTS_RECORD_SIZE;
+       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+               thresh * BTS_RECORD_SIZE;
+
+       return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds || !x86_pmu.bts)
+               return;
+
+       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       ds->bts_buffer_base = 0;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+       int node = cpu_to_node(cpu);
+       struct debug_store *ds;
+
+       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+       if (unlikely(!ds))
+               return -ENOMEM;
+
+       per_cpu(cpu_hw_events, cpu).ds = ds;
+
+       return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+       if (!ds)
+               return;
+
+       per_cpu(cpu_hw_events, cpu).ds = NULL;
+       kfree(ds);
+}
+
+void release_ds_buffers(void)
+{
+       int cpu;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               release_pebs_buffer(cpu);
+               release_bts_buffer(cpu);
+               release_ds_buffer(cpu);
+       }
+       put_online_cpus();
+}
+
+void reserve_ds_buffers(void)
+{
+       int bts_err = 0, pebs_err = 0;
+       int cpu;
+
+       x86_pmu.bts_active = 0;
+       x86_pmu.pebs_active = 0;
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       if (!x86_pmu.bts)
+               bts_err = 1;
+
+       if (!x86_pmu.pebs)
+               pebs_err = 1;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               if (alloc_ds_buffer(cpu)) {
+                       bts_err = 1;
+                       pebs_err = 1;
+               }
+
+               if (!bts_err && alloc_bts_buffer(cpu))
+                       bts_err = 1;
+
+               if (!pebs_err && alloc_pebs_buffer(cpu))
+                       pebs_err = 1;
+
+               if (bts_err && pebs_err)
+                       break;
+       }
+
+       if (bts_err) {
+               for_each_possible_cpu(cpu)
+                       release_bts_buffer(cpu);
+       }
+
+       if (pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_pebs_buffer(cpu);
+       }
+
+       if (bts_err && pebs_err) {
+               for_each_possible_cpu(cpu)
+                       release_ds_buffer(cpu);
+       } else {
+               if (x86_pmu.bts && !bts_err)
+                       x86_pmu.bts_active = 1;
+
+               if (x86_pmu.pebs && !pebs_err)
+                       x86_pmu.pebs_active = 1;
+
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       }
+
+       put_online_cpus();
+}
+
+/*
+ * BTS
+ */
+
+struct event_constraint bts_constraint =
+       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
+
+void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long debugctlmsr;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr |= DEBUGCTLMSR_TR;
+       debugctlmsr |= DEBUGCTLMSR_BTS;
+       if (config & ARCH_PERFMON_EVENTSEL_INT)
+               debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long debugctlmsr;
+
+       if (!cpuc->ds)
+               return;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr &=
+               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+int intel_pmu_drain_bts_buffer(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct bts_record {
+               u64     from;
+               u64     to;
+               u64     flags;
+       };
+       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+       struct bts_record *at, *base, *top;
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+       struct perf_sample_data data;
+       unsigned long skip = 0;
+       struct pt_regs regs;
+
+       if (!event)
+               return 0;
+
+       if (!x86_pmu.bts_active)
+               return 0;
+
+       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+       top  = (struct bts_record *)(unsigned long)ds->bts_index;
+
+       if (top <= base)
+               return 0;
+
+       memset(&regs, 0, sizeof(regs));
+
+       ds->bts_index = ds->bts_buffer_base;
+
+       perf_sample_data_init(&data, 0, event->hw.last_period);
+
+       /*
+        * BTS leaks kernel addresses in branches across the cpl boundary,
+        * such as traps or system calls, so unless the user is asking for
+        * kernel tracing (and right now it's not possible), we'd need to
+        * filter them out. But first we need to count how many of those we
+        * have in the current batch. This is an extra O(n) pass, however,
+        * it's much faster than the other one especially considering that
+        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+        * alloc_bts_buffer()).
+        */
+       for (at = base; at < top; at++) {
+               /*
+                * Note that right now *this* BTS code only works if
+                * attr::exclude_kernel is set, but let's keep this extra
+                * check here in case that changes.
+                */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       skip++;
+       }
+
+       /*
+        * Prepare a generic sample, i.e. fill in the invariant fields.
+        * We will overwrite the from and to address before we output
+        * the sample.
+        */
+       perf_prepare_sample(&header, &data, event, &regs);
+
+       if (perf_output_begin(&handle, event, header.size *
+                             (top - base - skip)))
+               return 1;
+
+       for (at = base; at < top; at++) {
+               /* Filter out any records that contain kernel addresses. */
+               if (event->attr.exclude_kernel &&
+                   (kernel_ip(at->from) || kernel_ip(at->to)))
+                       continue;
+
+               data.ip         = at->from;
+               data.addr       = at->to;
+
+               perf_output_sample(&handle, &header, &data, event);
+       }
+
+       perf_output_end(&handle);
+
+       /* There's new data available. */
+       event->hw.interrupts++;
+       event->pending_kill = POLL_IN;
+       return 1;
+}
+
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+       struct pt_regs regs;
+
+       x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (!sched_in)
+               intel_pmu_drain_pebs_buffer();
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
+       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_snb_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+        EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
+       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
+       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
+       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
+       /* Allow all events as PEBS with no flags */
+       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+       EVENT_CONSTRAINT_END
+};
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
+{
+       struct event_constraint *c;
+
+       if (!event->attr.precise_ip)
+               return NULL;
+
+       if (x86_pmu.pebs_constraints) {
+               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+                       if ((event->hw.config & c->cmask) == c->code) {
+                               event->hw.flags |= c->flags;
+                               return c;
+                       }
+               }
+       }
+
+       return &emptyconstraint;
+}
+
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
+void intel_pmu_pebs_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool first_pebs;
+       u64 threshold;
+
+       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+       first_pebs = !pebs_is_enabled(cpuc);
+       cpuc->pebs_enabled |= 1ULL << hwc->idx;
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled |= 1ULL << 63;
+
+       /*
+        * When the event is constrained enough we can use a larger
+        * threshold and run the event with less frequent PMI.
+        */
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+               threshold = ds->pebs_absolute_maximum -
+                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+               if (first_pebs)
+                       perf_sched_cb_inc(event->ctx->pmu);
+       } else {
+               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+               /*
+                * If not all events can use larger buffer,
+                * roll back to threshold = 1
+                */
+               if (!first_pebs &&
+                   (ds->pebs_interrupt_threshold > threshold))
+                       perf_sched_cb_dec(event->ctx->pmu);
+       }
+
+       /* Use auto-reload if possible to save a MSR write in the PMI */
+       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+               ds->pebs_event_reset[hwc->idx] =
+                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+       }
+
+       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+               ds->pebs_interrupt_threshold = threshold;
+}
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool large_pebs = ds->pebs_interrupt_threshold >
+               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+       if (large_pebs)
+               intel_pmu_drain_pebs_buffer();
+
+       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+               cpuc->pebs_enabled &= ~(1ULL << 63);
+
+       if (large_pebs && !pebs_is_enabled(cpuc))
+               perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_enable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->pebs_enabled)
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       unsigned long from = cpuc->lbr_entries[0].from;
+       unsigned long old_to, to = cpuc->lbr_entries[0].to;
+       unsigned long ip = regs->ip;
+       int is_64bit = 0;
+       void *kaddr;
+       int size;
+
+       /*
+        * We don't need to fixup if the PEBS assist is fault like
+        */
+       if (!x86_pmu.intel_cap.pebs_trap)
+               return 1;
+
+       /*
+        * No LBR entry, no basic block, no rewinding
+        */
+       if (!cpuc->lbr_stack.nr || !from || !to)
+               return 0;
+
+       /*
+        * Basic blocks should never cross user/kernel boundaries
+        */
+       if (kernel_ip(ip) != kernel_ip(to))
+               return 0;
+
+       /*
+        * unsigned math, either ip is before the start (impossible) or
+        * the basic block is larger than 1 page (sanity)
+        */
+       if ((ip - to) > PEBS_FIXUP_SIZE)
+               return 0;
+
+       /*
+        * We sampled a branch insn, rewind using the LBR stack
+        */
+       if (ip == to) {
+               set_linear_ip(regs, from);
+               return 1;
+       }
+
+       size = ip - to;
+       if (!kernel_ip(ip)) {
+               int bytes;
+               u8 *buf = this_cpu_read(insn_buffer);
+
+               /* 'size' must fit our buffer, see above */
+               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+               if (bytes != 0)
+                       return 0;
+
+               kaddr = buf;
+       } else {
+               kaddr = (void *)to;
+       }
+
+       do {
+               struct insn insn;
+
+               old_to = to;
+
+#ifdef CONFIG_X86_64
+               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+#endif
+               insn_init(&insn, kaddr, size, is_64bit);
+               insn_get_length(&insn);
+               /*
+                * Make sure there was not a problem decoding the
+                * instruction and getting the length.  This is
+                * doubly important because we have an infinite
+                * loop if insn.length=0.
+                */
+               if (!insn.length)
+                       break;
+
+               to += insn.length;
+               kaddr += insn.length;
+               size -= insn.length;
+       } while (to < ip);
+
+       if (to == ip) {
+               set_linear_ip(regs, old_to);
+               return 1;
+       }
+
+       /*
+        * Even though we decoded the basic block, the instruction stream
+        * never matched the given IP, either the TO or the IP got corrupted.
+        */
+       return 0;
+}
+
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
+{
+       if (pebs->tsx_tuning) {
+               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+               return tsx.cycles_last_block;
+       }
+       return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
+{
+       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+       /* For RTM XABORTs also log the abort code from AX */
+       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+       return txn;
+}
+
+static void setup_pebs_sample_data(struct perf_event *event,
+                                  struct pt_regs *iregs, void *__pebs,
+                                  struct perf_sample_data *data,
+                                  struct pt_regs *regs)
+{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+               (PERF_X86_EVENT_PEBS_ST_HSW | \
+                PERF_X86_EVENT_PEBS_LD_HSW | \
+                PERF_X86_EVENT_PEBS_NA_HSW)
+       /*
+        * We cast to the biggest pebs_record but are careful not to
+        * unconditionally access the 'extra' entries.
+        */
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct pebs_record_skl *pebs = __pebs;
+       u64 sample_type;
+       int fll, fst, dsrc;
+       int fl = event->hw.flags;
+
+       if (pebs == NULL)
+               return;
+
+       sample_type = event->attr.sample_type;
+       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+       perf_sample_data_init(data, 0, event->hw.last_period);
+
+       data->period = event->hw.last_period;
+
+       /*
+        * Use latency for weight (only avail with PEBS-LL)
+        */
+       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+               data->weight = pebs->lat;
+
+       /*
+        * data.data_src encodes the data source
+        */
+       if (dsrc) {
+               u64 val = PERF_MEM_NA;
+               if (fll)
+                       val = load_latency_data(pebs->dse);
+               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+                       val = precise_datala_hsw(event, pebs->dse);
+               else if (fst)
+                       val = precise_store_data(pebs->dse);
+               data->data_src.val = val;
+       }
+
+       /*
+        * We use the interrupt regs as a base because the PEBS record
+        * does not contain a full regs set, specifically it seems to
+        * lack segment descriptors, which get used by things like
+        * user_mode().
+        *
+        * In the simple case fix up only the IP and BP,SP regs, for
+        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+        */
+       *regs = *iregs;
+       regs->flags = pebs->flags;
+       set_linear_ip(regs, pebs->ip);
+       regs->bp = pebs->bp;
+       regs->sp = pebs->sp;
+
+       if (sample_type & PERF_SAMPLE_REGS_INTR) {
+               regs->ax = pebs->ax;
+               regs->bx = pebs->bx;
+               regs->cx = pebs->cx;
+               regs->dx = pebs->dx;
+               regs->si = pebs->si;
+               regs->di = pebs->di;
+               regs->bp = pebs->bp;
+               regs->sp = pebs->sp;
+
+               regs->flags = pebs->flags;
+#ifndef CONFIG_X86_32
+               regs->r8 = pebs->r8;
+               regs->r9 = pebs->r9;
+               regs->r10 = pebs->r10;
+               regs->r11 = pebs->r11;
+               regs->r12 = pebs->r12;
+               regs->r13 = pebs->r13;
+               regs->r14 = pebs->r14;
+               regs->r15 = pebs->r15;
+#endif
+       }
+
+       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
+               regs->ip = pebs->real_ip;
+               regs->flags |= PERF_EFLAGS_EXACT;
+       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+               regs->flags |= PERF_EFLAGS_EXACT;
+       else
+               regs->flags &= ~PERF_EFLAGS_EXACT;
+
+       if ((sample_type & PERF_SAMPLE_ADDR) &&
+           x86_pmu.intel_cap.pebs_format >= 1)
+               data->addr = pebs->dla;
+
+       if (x86_pmu.intel_cap.pebs_format >= 2) {
+               /* Only set the TSX weight when no memory weight. */
+               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+                       data->weight = intel_hsw_weight(pebs);
+
+               if (sample_type & PERF_SAMPLE_TRANSACTION)
+                       data->txn = intel_hsw_transaction(pebs);
+       }
+
+       /*
+        * v3 supplies an accurate time stamp, so we use that
+        * for the time stamp.
+        *
+        * We can only do this for the default trace clock.
+        */
+       if (x86_pmu.intel_cap.pebs_format >= 3 &&
+               event->attr.use_clockid == 0)
+               data->time = native_sched_clock_from_tsc(pebs->tsc);
+
+       if (has_branch_stack(event))
+               data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       void *at;
+       u64 pebs_status;
+
+       /*
+        * fmt0 does not have a status bitfield (does not use
+        * perf_record_nhm format)
+        */
+       if (x86_pmu.intel_cap.pebs_format < 1)
+               return base;
+
+       if (base == NULL)
+               return NULL;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+
+               if (test_bit(bit, (unsigned long *)&p->status)) {
+                       /* PEBS v3 has accurate status bits */
+                       if (x86_pmu.intel_cap.pebs_format >= 3)
+                               return at;
+
+                       if (p->status == (1 << bit))
+                               return at;
+
+                       /* clear non-PEBS bit and re-check */
+                       pebs_status = p->status & cpuc->pebs_enabled;
+                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+                       if (pebs_status == (1 << bit))
+                               return at;
+               }
+       }
+       return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                  struct pt_regs *iregs,
+                                  void *base, void *top,
+                                  int bit, int count)
+{
+       struct perf_sample_data data;
+       struct pt_regs regs;
+       void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+       if (!intel_pmu_save_and_restart(event) &&
+           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+               return;
+
+       while (count > 1) {
+               setup_pebs_sample_data(event, iregs, at, &data, &regs);
+               perf_event_output(event, &data, &regs);
+               at += x86_pmu.pebs_record_size;
+               at = get_next_pebs_record_by_bit(at, top, bit);
+               count--;
+       }
+
+       setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+       /*
+        * All but the last records are processed.
+        * The last one is left to be able to call the overflow handler.
+        */
+       if (perf_event_overflow(event, &data, &regs)) {
+               x86_pmu_stop(event, 0);
+               return;
+       }
+
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+       struct pebs_record_core *at, *top;
+       int n;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+       /*
+        * Whatever else happens, drain the thing
+        */
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (!test_bit(0, cpuc->active_mask))
+               return;
+
+       WARN_ON_ONCE(!event);
+
+       if (!event->attr.precise_ip)
+               return;
+
+       n = top - at;
+       if (n <= 0)
+               return;
+
+       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct debug_store *ds = cpuc->ds;
+       struct perf_event *event;
+       void *base, *at, *top;
+       short counts[MAX_PEBS_EVENTS] = {};
+       short error[MAX_PEBS_EVENTS] = {};
+       int bit, i;
+
+       if (!x86_pmu.pebs_active)
+               return;
+
+       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+       ds->pebs_index = ds->pebs_buffer_base;
+
+       if (unlikely(base >= top))
+               return;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
+               u64 pebs_status;
+
+               /* PEBS v3 has accurate status bits */
+               if (x86_pmu.intel_cap.pebs_format >= 3) {
+                       for_each_set_bit(bit, (unsigned long *)&p->status,
+                                        MAX_PEBS_EVENTS)
+                               counts[bit]++;
+
+                       continue;
+               }
+
+               pebs_status = p->status & cpuc->pebs_enabled;
+               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+               /*
+                * On some CPUs the PEBS status can be zero when PEBS is
+                * racing with clearing of GLOBAL_STATUS.
+                *
+                * Normally we would drop that record, but in the
+                * case when there is only a single active PEBS event
+                * we can assume it's for that event.
+                */
+               if (!pebs_status && cpuc->pebs_enabled &&
+                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+                       pebs_status = cpuc->pebs_enabled;
+
+               bit = find_first_bit((unsigned long *)&pebs_status,
+                                       x86_pmu.max_pebs_events);
+               if (bit >= x86_pmu.max_pebs_events)
+                       continue;
+
+               /*
+                * The PEBS hardware does not deal well with the situation
+                * when events happen near to each other and multiple bits
+                * are set. But it should happen rarely.
+                *
+                * If these events include one PEBS and multiple non-PEBS
+                * events, it doesn't impact PEBS record. The record will
+                * be handled normally. (slow path)
+                *
+                * If these events include two or more PEBS events, the
+                * records for the events can be collapsed into a single
+                * one, and it's not possible to reconstruct all events
+                * that caused the PEBS record. It's called collision.
+                * If collision happened, the record will be dropped.
+                */
+               if (p->status != (1ULL << bit)) {
+                       for_each_set_bit(i, (unsigned long *)&pebs_status,
+                                        x86_pmu.max_pebs_events)
+                               error[i]++;
+                       continue;
+               }
+
+               counts[bit]++;
+       }
+
+       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+               if ((counts[bit] == 0) && (error[bit] == 0))
+                       continue;
+
+               event = cpuc->events[bit];
+               WARN_ON_ONCE(!event);
+               WARN_ON_ONCE(!event->attr.precise_ip);
+
+               /* log dropped samples number */
+               if (error[bit])
+                       perf_log_lost_samples(event, error[bit]);
+
+               if (counts[bit]) {
+                       __intel_pmu_pebs_event(event, iregs, base,
+                                              top, bit, counts[bit]);
+               }
+       }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+void __init intel_ds_init(void)
+{
+       /*
+        * No support for 32bit formats
+        */
+       if (!boot_cpu_has(X86_FEATURE_DTES64))
+               return;
+
+       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+       x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+       if (x86_pmu.pebs) {
+               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+               int format = x86_pmu.intel_cap.pebs_format;
+
+               switch (format) {
+               case 0:
+                       pr_cont("PEBS fmt0%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+                       /*
+                        * Using >PAGE_SIZE buffers makes the WRMSR to
+                        * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+                        * mysteriously hang on Core2.
+                        *
+                        * As a workaround, we don't do this.
+                        */
+                       x86_pmu.pebs_buffer_size = PAGE_SIZE;
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+                       break;
+
+               case 1:
+                       pr_cont("PEBS fmt1%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 2:
+                       pr_cont("PEBS fmt2%c, ", pebs_type);
+                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       break;
+
+               case 3:
+                       pr_cont("PEBS fmt3%c, ", pebs_type);
+                       x86_pmu.pebs_record_size =
+                                               sizeof(struct pebs_record_skl);
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+                       break;
+
+               default:
+                       pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
+                       x86_pmu.pebs = 0;
+               }
+       }
+}
+
+void perf_restore_debug_store(void)
+{
+       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+       if (!x86_pmu.bts && !x86_pmu.pebs)
+               return;
+
+       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c
new file mode 100644 (file)
index 0000000..548d5f7
--- /dev/null
@@ -0,0 +1,321 @@
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+
+#include "../perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               /* On Xeon Phi event "0" is a valid DATA_READ          */
+               /*   (L1 Data Cache Reads) Instruction.                */
+               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
+               /* bit will always be set in x86_pmu_hw_config().      */
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ           */
+               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
+               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
+               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
+       },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
+               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
+               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+                                               /* DATA_READ */
+                                               /* see note on L1 OP_READ */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
+               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = 0x0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
+               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
+               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+       return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
+       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
+       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
+       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
+       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
+       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
+       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
+       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
+       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
+       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
+       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
+       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
+       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
+       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
+       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
+       EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
+
+#define KNC_ENABLE_COUNTER0                    0x00000001
+#define KNC_ENABLE_COUNTER1                    0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+       u64 val;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+       u64 status;
+
+       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+       return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       int handled = 0;
+       int bit, loops;
+       u64 status;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       knc_pmu_disable_all();
+
+       status = knc_pmu_get_status();
+       if (!status) {
+               knc_pmu_enable_all(0);
+               return handled;
+       }
+
+       loops = 0;
+again:
+       knc_pmu_ack_status(status);
+       if (++loops > 100) {
+               WARN_ONCE(1, "perf: irq loop stuck!\n");
+               perf_event_print_debug();
+               goto done;
+       }
+
+       inc_irq_stat(apic_perf_irqs);
+
+       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+               struct perf_event *event = cpuc->events[bit];
+
+               handled++;
+
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+
+               if (!intel_pmu_save_and_restart(event))
+                       continue;
+
+               perf_sample_data_init(&data, 0, event->hw.last_period);
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       /*
+        * Repeat if there is more work to be done:
+        */
+       status = knc_pmu_get_status();
+       if (status)
+               goto again;
+
+done:
+       /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+       if (cpuc->enabled)
+               knc_pmu_enable_all(0);
+
+       return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_knc_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+       .name                   = "knc",
+       .handle_irq             = knc_pmu_handle_irq,
+       .disable_all            = knc_pmu_disable_all,
+       .enable_all             = knc_pmu_enable_all,
+       .enable                 = knc_pmu_enable_event,
+       .disable                = knc_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_KNC_EVNTSEL0,
+       .perfctr                = MSR_KNC_PERFCTR0,
+       .event_map              = knc_pmu_event_map,
+       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 39) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       .cntval_bits            = 40,
+       .cntval_mask            = (1ULL << 40) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = knc_event_constraints,
+       .format_attrs           = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+       x86_pmu = knc_pmu;
+
+       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, 
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
new file mode 100644 (file)
index 0000000..69dd118
--- /dev/null
@@ -0,0 +1,1062 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+enum {
+       LBR_FORMAT_32           = 0x00,
+       LBR_FORMAT_LIP          = 0x01,
+       LBR_FORMAT_EIP          = 0x02,
+       LBR_FORMAT_EIP_FLAGS    = 0x03,
+       LBR_FORMAT_EIP_FLAGS2   = 0x04,
+       LBR_FORMAT_INFO         = 0x05,
+       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
+};
+
+static enum {
+       LBR_EIP_FLAGS           = 1,
+       LBR_TSX                 = 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
+       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
+};
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
+#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT            2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT         5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
+#define LBR_FAR_BIT            8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT     9 /* enable call stack */
+
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
+
+#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
+#define LBR_USER       (1 << LBR_USER_BIT)
+#define LBR_JCC                (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN     (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR                (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
+#define LBR_IGN                0       /* ignored */
+
+#define LBR_ANY                 \
+       (LBR_JCC        |\
+        LBR_REL_CALL   |\
+        LBR_IND_CALL   |\
+        LBR_RETURN     |\
+        LBR_REL_JMP    |\
+        LBR_IND_JMP    |\
+        LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+       X86_BR_NONE             = 0,      /* unknown */
+
+       X86_BR_USER             = 1 << 0, /* branch target is user */
+       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
+
+       X86_BR_CALL             = 1 << 2, /* call */
+       X86_BR_RET              = 1 << 3, /* return */
+       X86_BR_SYSCALL          = 1 << 4, /* syscall */
+       X86_BR_SYSRET           = 1 << 5, /* syscall return */
+       X86_BR_INT              = 1 << 6, /* sw interrupt */
+       X86_BR_IRET             = 1 << 7, /* return from interrupt */
+       X86_BR_JCC              = 1 << 8, /* conditional */
+       X86_BR_JMP              = 1 << 9, /* jump */
+       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
+       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
+       X86_BR_ABORT            = 1 << 12,/* transaction abort */
+       X86_BR_IN_TX            = 1 << 13,/* in transaction */
+       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
+       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
+       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
+       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY       \
+       (X86_BR_CALL    |\
+        X86_BR_RET     |\
+        X86_BR_SYSCALL |\
+        X86_BR_SYSRET  |\
+        X86_BR_INT     |\
+        X86_BR_IRET    |\
+        X86_BR_JCC     |\
+        X86_BR_JMP      |\
+        X86_BR_IRQ      |\
+        X86_BR_ABORT    |\
+        X86_BR_IND_CALL |\
+        X86_BR_IND_JMP  |\
+        X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL                 \
+       (X86_BR_CALL            |\
+        X86_BR_IND_CALL        |\
+        X86_BR_ZERO_CALL       |\
+        X86_BR_SYSCALL         |\
+        X86_BR_IRQ             |\
+        X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       u64 debugctl, lbr_select = 0, orig_debugctl;
+
+       /*
+        * No need to unfreeze manually, as v4 can do that as part
+        * of the GLOBAL_STATUS ack.
+        */
+       if (pmi && x86_pmu.version >= 4)
+               return;
+
+       /*
+        * No need to reprogram LBR_SELECT in a PMI, as it
+        * did not change.
+        */
+       if (cpuc->lbr_sel)
+               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+       if (!pmi && cpuc->lbr_sel)
+               wrmsrl(MSR_LBR_SELECT, lbr_select);
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       orig_debugctl = debugctl;
+       debugctl |= DEBUGCTLMSR_LBR;
+       /*
+        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+        * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+        * may cause superfluous increase/decrease of LBR_TOS.
+        */
+       if (!(lbr_select & LBR_CALL_STACK))
+               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+       if (orig_debugctl != debugctl)
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+       u64 debugctl;
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++)
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               wrmsrl(x86_pmu.lbr_from + i, 0);
+               wrmsrl(x86_pmu.lbr_to   + i, 0);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
+       }
+}
+
+void intel_pmu_lbr_reset(void)
+{
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_reset_32();
+       else
+               intel_pmu_lbr_reset_64();
+}
+
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+       u64 tos;
+
+       rdmsrl(x86_pmu.lbr_tos, tos);
+       return tos;
+}
+
+enum {
+       LBR_NONE,
+       LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0 ||
+           task_ctx->lbr_stack_state == LBR_NONE) {
+               intel_pmu_lbr_reset();
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
+       tos = task_ctx->tos;
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       wrmsrl(x86_pmu.lbr_tos, tos);
+       task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+       int i;
+       unsigned lbr_idx, mask;
+       u64 tos;
+
+       if (task_ctx->lbr_callstack_users == 0) {
+               task_ctx->lbr_stack_state = LBR_NONE;
+               return;
+       }
+
+       mask = x86_pmu.lbr_nr - 1;
+       tos = intel_pmu_lbr_tos();
+       for (i = 0; i < tos; i++) {
+               lbr_idx = (tos - i) & mask;
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+       }
+       task_ctx->tos = tos;
+       task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       /*
+        * If LBR callstack feature is enabled and the stack was saved when
+        * the task was scheduled out, restore the stack. Otherwise flush
+        * the LBR stack.
+        */
+       task_ctx = ctx ? ctx->task_ctx_data : NULL;
+       if (task_ctx) {
+               if (sched_in) {
+                       __intel_pmu_lbr_restore(task_ctx);
+                       cpuc->lbr_context = ctx;
+               } else {
+                       __intel_pmu_lbr_save(task_ctx);
+               }
+               return;
+       }
+
+       /*
+        * When sampling the branck stack in system-wide, it may be
+        * necessary to flush the stack on context switch. This happens
+        * when the branch stack does not tag its entries with the pid
+        * of the current task. Otherwise it becomes impossible to
+        * associate a branch entry with a task. This ambiguity is more
+        * likely to appear when the branch stack supports priv level
+        * filtering and the user sets it to monitor only at the user
+        * level (which could be a useful measurement in system-wide
+        * mode). In that case, the risk is high of having a branch
+        * stack with branch from multiple tasks.
+        */
+       if (sched_in) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = ctx;
+       }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
+void intel_pmu_lbr_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       /*
+        * Reset the LBR stack if we changed task context to
+        * avoid data leaks.
+        */
+       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+               intel_pmu_lbr_reset();
+               cpuc->lbr_context = event->ctx;
+       }
+       cpuc->br_sel = event->hw.branch_reg.reg;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users++;
+       }
+
+       cpuc->lbr_users++;
+       perf_sched_cb_inc(event->ctx->pmu);
+}
+
+void intel_pmu_lbr_disable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       struct x86_perf_task_context *task_ctx;
+
+       if (!x86_pmu.lbr_nr)
+               return;
+
+       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+                                       event->ctx->task_ctx_data) {
+               task_ctx = event->ctx->task_ctx_data;
+               task_ctx->lbr_callstack_users--;
+       }
+
+       cpuc->lbr_users--;
+       WARN_ON_ONCE(cpuc->lbr_users < 0);
+       perf_sched_cb_dec(event->ctx->pmu);
+
+       if (cpuc->enabled && !cpuc->lbr_users) {
+               __intel_pmu_lbr_disable();
+               /* avoid stale pointer */
+               cpuc->lbr_context = NULL;
+       }
+}
+
+void intel_pmu_lbr_enable_all(bool pmi)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_enable(pmi);
+}
+
+void intel_pmu_lbr_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (cpuc->lbr_users)
+               __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+
+       for (i = 0; i < x86_pmu.lbr_nr; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               union {
+                       struct {
+                               u32 from;
+                               u32 to;
+                       };
+                       u64     lbr;
+               } msr_lastbranch;
+
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
+               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
+               cpuc->lbr_entries[i].mispred    = 0;
+               cpuc->lbr_entries[i].predicted  = 0;
+               cpuc->lbr_entries[i].reserved   = 0;
+       }
+       cpuc->lbr_stack.nr = i;
+}
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+       bool need_info = false;
+       unsigned long mask = x86_pmu.lbr_nr - 1;
+       int lbr_format = x86_pmu.intel_cap.lbr_format;
+       u64 tos = intel_pmu_lbr_tos();
+       int i;
+       int out = 0;
+       int num = x86_pmu.lbr_nr;
+
+       if (cpuc->lbr_sel) {
+               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+                       num = tos;
+       }
+
+       for (i = 0; i < num; i++) {
+               unsigned long lbr_idx = (tos - i) & mask;
+               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+               int skip = 0;
+               u16 cycles = 0;
+               int lbr_flags = lbr_desc[lbr_format];
+
+               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+               if (lbr_format == LBR_FORMAT_INFO && need_info) {
+                       u64 info;
+
+                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+                       mis = !!(info & LBR_INFO_MISPRED);
+                       pred = !mis;
+                       in_tx = !!(info & LBR_INFO_IN_TX);
+                       abort = !!(info & LBR_INFO_ABORT);
+                       cycles = (info & LBR_INFO_CYCLES);
+               }
+               if (lbr_flags & LBR_EIP_FLAGS) {
+                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
+                       pred = !mis;
+                       skip = 1;
+               }
+               if (lbr_flags & LBR_TSX) {
+                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+                       abort = !!(from & LBR_FROM_FLAG_ABORT);
+                       skip = 3;
+               }
+               from = (u64)((((s64)from) << skip) >> skip);
+
+               /*
+                * Some CPUs report duplicated abort records,
+                * with the second entry not having an abort bit set.
+                * Skip them here. This loop runs backwards,
+                * so we need to undo the previous record.
+                * If the abort just happened outside the window
+                * the extra entry cannot be removed.
+                */
+               if (abort && x86_pmu.lbr_double_abort && out > 0)
+                       out--;
+
+               cpuc->lbr_entries[out].from      = from;
+               cpuc->lbr_entries[out].to        = to;
+               cpuc->lbr_entries[out].mispred   = mis;
+               cpuc->lbr_entries[out].predicted = pred;
+               cpuc->lbr_entries[out].in_tx     = in_tx;
+               cpuc->lbr_entries[out].abort     = abort;
+               cpuc->lbr_entries[out].cycles    = cycles;
+               cpuc->lbr_entries[out].reserved  = 0;
+               out++;
+       }
+       cpuc->lbr_stack.nr = out;
+}
+
+void intel_pmu_lbr_read(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       if (!cpuc->lbr_users)
+               return;
+
+       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+               intel_pmu_lbr_read_32(cpuc);
+       else
+               intel_pmu_lbr_read_64(cpuc);
+
+       intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+       u64 br_type = event->attr.branch_sample_type;
+       int mask = 0;
+
+       if (br_type & PERF_SAMPLE_BRANCH_USER)
+               mask |= X86_BR_USER;
+
+       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+               mask |= X86_BR_KERNEL;
+
+       /* we ignore BRANCH_HV here */
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY)
+               mask |= X86_BR_ANY;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+               mask |= X86_BR_ANY_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+               mask |= X86_BR_IND_CALL;
+
+       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+               mask |= X86_BR_ABORT;
+
+       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+               mask |= X86_BR_IN_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+               mask |= X86_BR_NO_TX;
+
+       if (br_type & PERF_SAMPLE_BRANCH_COND)
+               mask |= X86_BR_JCC;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+               if (!x86_pmu_has_lbr_callstack())
+                       return -EOPNOTSUPP;
+               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+                       return -EINVAL;
+               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+                       X86_BR_CALL_STACK;
+       }
+
+       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+               mask |= X86_BR_IND_JMP;
+
+       if (br_type & PERF_SAMPLE_BRANCH_CALL)
+               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+       /*
+        * stash actual user request into reg, it may
+        * be used by fixup code for some CPU
+        */
+       event->hw.branch_reg.reg = mask;
+       return 0;
+}
+
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg;
+       u64 br_type = event->attr.branch_sample_type;
+       u64 mask = 0, v;
+       int i;
+
+       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+               if (!(br_type & (1ULL << i)))
+                       continue;
+
+               v = x86_pmu.lbr_sel_map[i];
+               if (v == LBR_NOT_SUPP)
+                       return -EOPNOTSUPP;
+
+               if (v != LBR_IGN)
+                       mask |= v;
+       }
+
+       reg = &event->hw.branch_reg;
+       reg->idx = EXTRA_REG_LBR;
+
+       /*
+        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+        * in suppress mode. So LBR_SELECT should be set to
+        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+        */
+       reg->config = mask ^ x86_pmu.lbr_sel_mask;
+
+       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+               reg->config |= LBR_NO_INFO;
+
+       return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+       int ret = 0;
+
+       /*
+        * no LBR on this PMU
+        */
+       if (!x86_pmu.lbr_nr)
+               return -EOPNOTSUPP;
+
+       /*
+        * setup SW LBR filter
+        */
+       ret = intel_pmu_setup_sw_lbr_filter(event);
+       if (ret)
+               return ret;
+
+       /*
+        * setup HW LBR filter, if any
+        */
+       if (x86_pmu.lbr_sel_map)
+               ret = intel_pmu_setup_hw_lbr_filter(event);
+
+       return ret;
+}
+
+/*
+ * return the type of control flow change at address "from"
+ * intruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to, int abort)
+{
+       struct insn insn;
+       void *addr;
+       int bytes_read, bytes_left;
+       int ret = X86_BR_NONE;
+       int ext, to_plm, from_plm;
+       u8 buf[MAX_INSN_SIZE];
+       int is64 = 0;
+
+       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+       /*
+        * maybe zero if lbr did not fill up after a reset by the time
+        * we get a PMU interrupt
+        */
+       if (from == 0 || to == 0)
+               return X86_BR_NONE;
+
+       if (abort)
+               return X86_BR_ABORT | to_plm;
+
+       if (from_plm == X86_BR_USER) {
+               /*
+                * can happen if measuring at the user level only
+                * and we interrupt in a kernel thread, e.g., idle.
+                */
+               if (!current->mm)
+                       return X86_BR_NONE;
+
+               /* may fail if text not present */
+               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+                                               MAX_INSN_SIZE);
+               bytes_read = MAX_INSN_SIZE - bytes_left;
+               if (!bytes_read)
+                       return X86_BR_NONE;
+
+               addr = buf;
+       } else {
+               /*
+                * The LBR logs any address in the IP, even if the IP just
+                * faulted. This means userspace can control the from address.
+                * Ensure we don't blindy read any address by validating it is
+                * a known text address.
+                */
+               if (kernel_text_address(from)) {
+                       addr = (void *)from;
+                       /*
+                        * Assume we can get the maximum possible size
+                        * when grabbing kernel data.  This is not
+                        * _strictly_ true since we could possibly be
+                        * executing up next to a memory hole, but
+                        * it is very unlikely to be a problem.
+                        */
+                       bytes_read = MAX_INSN_SIZE;
+               } else {
+                       return X86_BR_NONE;
+               }
+       }
+
+       /*
+        * decoder needs to know the ABI especially
+        * on 64-bit systems running 32-bit apps
+        */
+#ifdef CONFIG_X86_64
+       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+       insn_init(&insn, addr, bytes_read, is64);
+       insn_get_opcode(&insn);
+       if (!insn.opcode.got)
+               return X86_BR_ABORT;
+
+       switch (insn.opcode.bytes[0]) {
+       case 0xf:
+               switch (insn.opcode.bytes[1]) {
+               case 0x05: /* syscall */
+               case 0x34: /* sysenter */
+                       ret = X86_BR_SYSCALL;
+                       break;
+               case 0x07: /* sysret */
+               case 0x35: /* sysexit */
+                       ret = X86_BR_SYSRET;
+                       break;
+               case 0x80 ... 0x8f: /* conditional */
+                       ret = X86_BR_JCC;
+                       break;
+               default:
+                       ret = X86_BR_NONE;
+               }
+               break;
+       case 0x70 ... 0x7f: /* conditional */
+               ret = X86_BR_JCC;
+               break;
+       case 0xc2: /* near ret */
+       case 0xc3: /* near ret */
+       case 0xca: /* far ret */
+       case 0xcb: /* far ret */
+               ret = X86_BR_RET;
+               break;
+       case 0xcf: /* iret */
+               ret = X86_BR_IRET;
+               break;
+       case 0xcc ... 0xce: /* int */
+               ret = X86_BR_INT;
+               break;
+       case 0xe8: /* call near rel */
+               insn_get_immediate(&insn);
+               if (insn.immediate1.value == 0) {
+                       /* zero length call */
+                       ret = X86_BR_ZERO_CALL;
+                       break;
+               }
+       case 0x9a: /* call far absolute */
+               ret = X86_BR_CALL;
+               break;
+       case 0xe0 ... 0xe3: /* loop jmp */
+               ret = X86_BR_JCC;
+               break;
+       case 0xe9 ... 0xeb: /* jmp */
+               ret = X86_BR_JMP;
+               break;
+       case 0xff: /* call near absolute, call far absolute ind */
+               insn_get_modrm(&insn);
+               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+               switch (ext) {
+               case 2: /* near ind call */
+               case 3: /* far ind call */
+                       ret = X86_BR_IND_CALL;
+                       break;
+               case 4:
+               case 5:
+                       ret = X86_BR_IND_JMP;
+                       break;
+               }
+               break;
+       default:
+               ret = X86_BR_NONE;
+       }
+       /*
+        * interrupts, traps, faults (and thus ring transition) may
+        * occur on any instructions. Thus, to classify them correctly,
+        * we need to first look at the from and to priv levels. If they
+        * are different and to is in the kernel, then it indicates
+        * a ring transition. If the from instruction is not a ring
+        * transition instr (syscall, systenter, int), then it means
+        * it was a irq, trap or fault.
+        *
+        * we have no way of detecting kernel to kernel faults.
+        */
+       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+               ret = X86_BR_IRQ;
+
+       /*
+        * branch priv level determined by target as
+        * is done by HW when LBR_SELECT is implemented
+        */
+       if (ret != X86_BR_NONE)
+               ret |= to_plm;
+
+       return ret;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+       u64 from, to;
+       int br_sel = cpuc->br_sel;
+       int i, j, type;
+       bool compress = false;
+
+       /* if sampling all branches, then nothing to filter */
+       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+               return;
+
+       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+               from = cpuc->lbr_entries[i].from;
+               to = cpuc->lbr_entries[i].to;
+
+               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+                       if (cpuc->lbr_entries[i].in_tx)
+                               type |= X86_BR_IN_TX;
+                       else
+                               type |= X86_BR_NO_TX;
+               }
+
+               /* if type does not correspond, then discard */
+               if (type == X86_BR_NONE || (br_sel & type) != type) {
+                       cpuc->lbr_entries[i].from = 0;
+                       compress = true;
+               }
+       }
+
+       if (!compress)
+               return;
+
+       /* remove all entries with from=0 */
+       for (i = 0; i < cpuc->lbr_stack.nr; ) {
+               if (!cpuc->lbr_entries[i].from) {
+                       j = i;
+                       while (++j < cpuc->lbr_stack.nr)
+                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+                       cpuc->lbr_stack.nr--;
+                       if (!cpuc->lbr_entries[i].from)
+                               continue;
+               }
+               i++;
+       }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
+                                               | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+        */
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
+        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+       /*
+        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+        */
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
+       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
+       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
+       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_FAR,
+       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
+       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
+                                               | LBR_RETURN | LBR_CALL_STACK,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
+       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
+};
+
+/* core */
+void __init intel_pmu_lbr_init_core(void)
+{
+       x86_pmu.lbr_nr     = 4;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("4-deep LBR, ");
+}
+
+/* nehalem/westmere */
+void __init intel_pmu_lbr_init_nhm(void)
+{
+       x86_pmu.lbr_nr     = 16;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - workaround LBR_SEL errata (see above)
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* sandy bridge */
+void __init intel_pmu_lbr_init_snb(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("16-deep LBR, ");
+}
+
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+       x86_pmu.lbr_nr   = 16;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       pr_cont("16-deep LBR, ");
+}
+
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+       x86_pmu.lbr_nr   = 32;
+       x86_pmu.lbr_tos  = MSR_LBR_TOS;
+       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+       /*
+        * SW branch filter usage:
+        * - support syscall, sysret capture.
+        *   That requires LBR_FAR but that means far
+        *   jmp need to be filtered out
+        */
+       pr_cont("32-deep LBR, ");
+}
+
+/* atom */
+void __init intel_pmu_lbr_init_atom(void)
+{
+       /*
+        * only models starting at stepping 10 seems
+        * to have an operational LBR which can freeze
+        * on PMU interrupt
+        */
+       if (boot_cpu_data.x86_model == 28
+           && boot_cpu_data.x86_mask < 10) {
+               pr_cont("LBR disabled due to erratum");
+               return;
+       }
+
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
+
+       /*
+        * SW branch filter usage:
+        * - compensate for lack of HW filter
+        */
+       pr_cont("8-deep LBR, ");
+}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+       x86_pmu.lbr_nr     = 8;
+       x86_pmu.lbr_tos    = MSR_LBR_TOS;
+       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+       pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
new file mode 100644 (file)
index 0000000..0a5ede1
--- /dev/null
@@ -0,0 +1,1376 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/perf_event_p4.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+       unsigned int opcode;                    /* Event code and ESCR selector */
+       unsigned int escr_msr[2];               /* ESCR MSR for this event */
+       unsigned int escr_emask;                /* valid ESCR EventMask bits */
+       unsigned int shared;                    /* event is shared across threads */
+       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
+};
+
+struct p4_pebs_bind {
+       unsigned int metric_pebs;
+       unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
+       [P4_PEBS_METRIC__##name] = {                            \
+               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
+               .metric_vert = vert,                            \
+       }
+
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
+       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
+       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
+       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
+       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
+       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
+       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
+       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+       [P4_EVENT_TC_DELIVER_MODE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+               .shared         = 1,
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_BPU_FETCH_REQUEST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_ITLB_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_MEMORY_CANCEL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MEMORY_COMPLETE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_LOAD_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_STORE_PORT_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+               .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_MOB_LOAD_REPLAY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_PAGE_WALK_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ALLOCATION] = {
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_FSB_DATA_ACTIVITY] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+               .shared         = 1,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
+               .cntr           = { {0, -1, -1}, {1, -1, -1} },
+       },
+       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
+               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+               .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
+               .cntr           = { {2, -1, -1}, {3, -1, -1} },
+       },
+       [P4_EVENT_SSE_INPUT_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_PACKED_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_SP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_SCALAR_DP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_64BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_128BIT_MMX_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_X87_FP_UOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_TC_MISC] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
+               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_TC_MS_XFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_UOP_QUEUE_WRITES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
+               .cntr           = { {4, 5, -1}, {6, 7, -1} },
+       },
+       [P4_EVENT_RESOURCE_STALL] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_WC_BUFFER] = {
+               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
+               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+               .shared         = 1,
+               .cntr           = { {8, 9, -1}, {10, 11, -1} },
+       },
+       [P4_EVENT_B2B_CYCLES] = {
+               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_BNR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BNR),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_SNOOP] = {
+               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_RESPONSE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
+               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
+               .cntr           = { {0, -1, -1}, {2, -1, -1} },
+       },
+       [P4_EVENT_FRONT_END_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_EXECUTION_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_REPLAY_EVENT] = {
+               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOPS_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_UOP_TYPE] = {
+               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
+               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_X87_ASSIST] = {
+               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_MACHINE_CLEAR] = {
+               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+       [P4_EVENT_INSTR_COMPLETED] = {
+               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
+               .cntr           = { {12, 13, 16}, {14, 15, 17} },
+       },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
+       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
+                           P4_ESCR_EMASK_BIT(event, bit))              | \
+       p4_config_pack_cccr(metric                                      | \
+                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+       },
+},
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0,
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+                                               P4_PEBS_METRIC__none),
+               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+                                               P4_PEBS_METRIC__none),
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(NODE) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+       u64 original;
+       u64 alternative;
+} p4_event_aliases[] = {
+       {
+               /*
+                * Non-halted cycles can be substituted with non-sleeping cycles (see
+                * Intel SDM Vol3b for details). We need this alias to be able
+                * to run nmi-watchdog and 'perf top' (or any other user space tool
+                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+                * simultaneously.
+                */
+       .original       =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+       .alternative    =
+               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
+                                   P4_CCCR_COMPARE),
+       },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+       u64 config_match;
+       int i;
+
+       /*
+        * Only event with special mark is allowed,
+        * we're to be sure it didn't come as malformed
+        * RAW event.
+        */
+       if (!(config & P4_CONFIG_ALIASABLE))
+               return 0;
+
+       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+               if (config_match == p4_event_aliases[i].original) {
+                       config_match = p4_event_aliases[i].alternative;
+                       break;
+               } else if (config_match == p4_event_aliases[i].alternative) {
+                       config_match = p4_event_aliases[i].original;
+                       break;
+               }
+       }
+
+       if (i >= ARRAY_SIZE(p4_event_aliases))
+               return 0;
+
+       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
+               P4_CONFIG_ALIASABLE,
+
+  /*
+   * retired instructions
+   * in a sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]        =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
+               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
+       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+       unsigned int evnt = p4_config_unpack_event(config);
+       struct p4_event_bind *bind = NULL;
+
+       if (evnt < ARRAY_SIZE(p4_event_bind_map))
+               bind = &p4_event_bind_map[evnt];
+
+       return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+       struct p4_event_bind *bind;
+       unsigned int esel;
+       u64 config;
+
+       config = p4_general_events[hw_event];
+       bind = p4_config_get_bind(config);
+       esel = P4_OPCODE_ESEL(bind->opcode);
+       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+       return config;
+}
+
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+       /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+               if (boot_cpu_data.x86_model != 3 &&
+                       boot_cpu_data.x86_model != 4 &&
+                       boot_cpu_data.x86_model != 6)
+                       return false;
+       }
+
+       /*
+        * For info
+        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+        */
+
+       return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+       unsigned int v, emask;
+
+       /* User data may have out-of-bound event index */
+       v = p4_config_unpack_event(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_event_bind_map))
+               return -EINVAL;
+
+       /* It may be unsupported: */
+       if (!p4_event_match_cpu_model(v))
+               return -EINVAL;
+
+       /*
+        * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+        * in Architectural Performance Monitoring, it means not
+        * on _which_ logical cpu to count but rather _when_, ie it
+        * depends on logical cpu state -- count event if one cpu active,
+        * none, both or any, so we just allow user to pass any value
+        * desired.
+        *
+        * In turn we always set Tx_OS/Tx_USR bits bound to logical
+        * cpu without their propagation to another cpu
+        */
+
+       /*
+        * if an event is shared across the logical threads
+        * the user needs special permissions to be able to use it
+        */
+       if (p4_ht_active() && p4_event_bind_map[v].shared) {
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
+       /* ESCR EventMask bits may be invalid */
+       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+       if (emask & ~p4_event_bind_map[v].escr_emask)
+               return -EINVAL;
+
+       /*
+        * it may have some invalid PEBS bits
+        */
+       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+               return -EINVAL;
+
+       v = p4_config_unpack_metric(event->attr.config);
+       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+               return -EINVAL;
+
+       return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+       int cpu = get_cpu();
+       int rc = 0;
+       u32 escr, cccr;
+
+       /*
+        * the reason we use cpu that early is that: if we get scheduled
+        * first time on the same cpu -- we will not need swap thread
+        * specific flags in config (and will save some cpu cycles)
+        */
+
+       cccr = p4_default_cccr_conf(cpu);
+       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+                                        event->attr.exclude_user);
+       event->hw.config = p4_config_pack_escr(escr) |
+                          p4_config_pack_cccr(cccr);
+
+       if (p4_ht_active() && p4_ht_thread(cpu))
+               event->hw.config = p4_set_ht_bit(event->hw.config);
+
+       if (event->attr.type == PERF_TYPE_RAW) {
+               struct p4_event_bind *bind;
+               unsigned int esel;
+               /*
+                * Clear bits we reserve to be managed by kernel itself
+                * and never allowed from a user space
+                */
+                event->attr.config &= P4_CONFIG_MASK;
+
+               rc = p4_validate_raw_event(event);
+               if (rc)
+                       goto out;
+
+               /*
+                * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+                * bits since we keep additional info here (for cache events and etc)
+                */
+               event->hw.config |= event->attr.config;
+               bind = p4_config_get_bind(event->attr.config);
+               if (!bind) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+               esel = P4_OPCODE_ESEL(bind->opcode);
+               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+       }
+
+       rc = x86_setup_perfctr(event);
+out:
+       put_cpu();
+       return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+       u64 v;
+
+       /* an official way for overflow indication */
+       rdmsrl(hwc->config_base, v);
+       if (v & P4_CCCR_OVF) {
+               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
+               return 1;
+       }
+
+       /*
+        * In some circumstances the overflow might issue an NMI but did
+        * not set P4_CCCR_OVF bit. Because a counter holds a negative value
+        * we simply check for high bit being set, if it's cleared it means
+        * the counter has reached zero value and continued counting before
+        * real NMI signal was received:
+        */
+       rdmsrl(hwc->event_base, v);
+       if (!(v & ARCH_P4_UNFLAGGED_BIT))
+               return 1;
+
+       return 0;
+}
+
+static void p4_pmu_disable_pebs(void)
+{
+       /*
+        * FIXME
+        *
+        * It's still allowed that two threads setup same cache
+        * events so we can't simply clear metrics until we knew
+        * no one is depending on us, so we need kind of counter
+        * for "ReplayEvent" users.
+        *
+        * What is more complex -- RAW events, if user (for some
+        * reason) will pass some cache event metric with improper
+        * event opcode -- it's fine from hardware point of view
+        * but completely nonsense from "meaning" of such action.
+        *
+        * So at moment let leave metrics turned on forever -- it's
+        * ok for now but need to be revisited!
+        *
+        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
+        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
+        */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       /*
+        * If event gets disabled while counter is in overflowed
+        * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+        * asserted again and again
+        */
+       (void)wrmsrl_safe(hwc->config_base,
+               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_disable_event(event);
+       }
+
+       p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+       struct p4_pebs_bind *bind;
+       unsigned int idx;
+
+       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+       idx = p4_config_unpack_metric(config);
+       if (idx == P4_PEBS_METRIC__none)
+               return;
+
+       bind = &p4_pebs_bind_map[idx];
+
+       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       int thread = p4_ht_config_thread(hwc->config);
+       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+       unsigned int idx = p4_config_unpack_event(hwc->config);
+       struct p4_event_bind *bind;
+       u64 escr_addr, cccr;
+
+       bind = &p4_event_bind_map[idx];
+       escr_addr = bind->escr_msr[thread];
+
+       /*
+        * - we dont support cascaded counters yet
+        * - and counter 1 is broken (erratum)
+        */
+       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+       WARN_ON_ONCE(hwc->idx == 1);
+
+       /* we need a real Event value */
+       escr_conf &= ~P4_ESCR_EVENT_MASK;
+       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       /*
+        * it could be Cache event so we need to write metrics
+        * into additional MSRs
+        */
+       p4_pmu_enable_pebs(hwc->config);
+
+       (void)wrmsrl_safe(escr_addr, escr_conf);
+       (void)wrmsrl_safe(hwc->config_base,
+                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       int idx;
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_event *event = cpuc->events[idx];
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               p4_pmu_enable_event(event);
+       }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+       struct perf_sample_data data;
+       struct cpu_hw_events *cpuc;
+       struct perf_event *event;
+       struct hw_perf_event *hwc;
+       int idx, handled = 0;
+       u64 val;
+
+       cpuc = this_cpu_ptr(&cpu_hw_events);
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               int overflow;
+
+               if (!test_bit(idx, cpuc->active_mask)) {
+                       /* catch in-flight IRQs */
+                       if (__test_and_clear_bit(idx, cpuc->running))
+                               handled++;
+                       continue;
+               }
+
+               event = cpuc->events[idx];
+               hwc = &event->hw;
+
+               WARN_ON_ONCE(hwc->idx != idx);
+
+               /* it might be unflagged overflow */
+               overflow = p4_pmu_clear_cccr_ovf(hwc);
+
+               val = x86_perf_event_update(event);
+               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+                       continue;
+
+               handled += overflow;
+
+               /* event overflow for sure */
+               perf_sample_data_init(&data, 0, hwc->last_period);
+
+               if (!x86_perf_event_set_period(event))
+                       continue;
+
+
+               if (perf_event_overflow(event, &data, regs))
+                       x86_pmu_stop(event, 0);
+       }
+
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+
+       /*
+        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+        * been observed that the OVF bit flag has to be cleared first _before_
+        * the LVTPC can be unmasked.
+        *
+        * The reason is the NMI line will continue to be asserted while the OVF
+        * bit is set.  This causes a second NMI to generate if the LVTPC is
+        * unmasked before the OVF bit is cleared, leading to unknown NMI
+        * messages.
+        */
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+       return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+       u32 escr, cccr;
+
+       /*
+        * we either lucky and continue on same cpu or no HT support
+        */
+       if (!p4_should_swap_ts(hwc->config, cpu))
+               return;
+
+       /*
+        * the event is migrated from an another logical
+        * cpu, so we need to swap thread specific flags
+        */
+
+       escr = p4_config_unpack_escr(hwc->config);
+       cccr = p4_config_unpack_cccr(hwc->config);
+
+       if (p4_ht_thread(cpu)) {
+               cccr &= ~P4_CCCR_OVF_PMI_T0;
+               cccr |= P4_CCCR_OVF_PMI_T1;
+               if (escr & P4_ESCR_T0_OS) {
+                       escr &= ~P4_ESCR_T0_OS;
+                       escr |= P4_ESCR_T1_OS;
+               }
+               if (escr & P4_ESCR_T0_USR) {
+                       escr &= ~P4_ESCR_T0_USR;
+                       escr |= P4_ESCR_T1_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config |= P4_CONFIG_HT;
+       } else {
+               cccr &= ~P4_CCCR_OVF_PMI_T1;
+               cccr |= P4_CCCR_OVF_PMI_T0;
+               if (escr & P4_ESCR_T1_OS) {
+                       escr &= ~P4_ESCR_T1_OS;
+                       escr |= P4_ESCR_T0_OS;
+               }
+               if (escr & P4_ESCR_T1_USR) {
+                       escr &= ~P4_ESCR_T1_USR;
+                       escr |= P4_ESCR_T0_USR;
+               }
+               hwc->config  = p4_config_pack_escr(escr);
+               hwc->config |= p4_config_pack_cccr(cccr);
+               hwc->config &= ~P4_CONFIG_HT;
+       }
+}
+
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE               0x000003a0
+#define P4_ESCR_MSR_MAX                        0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+       unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
+                       !p4_escr_table[idx]             ||
+                       p4_escr_table[idx] != addr)) {
+               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+               return -1;
+       }
+
+       return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+                       struct p4_event_bind *bind)
+{
+       int i, j;
+
+       for (i = 0; i < P4_CNTR_LIMIT; i++) {
+               j = bind->cntr[thread][i];
+               if (j != -1 && !test_bit(j, used_mask))
+                       return j;
+       }
+
+       return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+       int cpu = smp_processor_id();
+       struct hw_perf_event *hwc;
+       struct p4_event_bind *bind;
+       unsigned int i, thread, num;
+       int cntr_idx, escr_idx;
+       u64 config_alias;
+       int pass;
+
+       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+       for (i = 0, num = n; i < n; i++, num--) {
+
+               hwc = &cpuc->event_list[i]->hw;
+               thread = p4_ht_thread(cpu);
+               pass = 0;
+
+again:
+               /*
+                * It's possible to hit a circular lock
+                * between original and alternative events
+                * if both are scheduled already.
+                */
+               if (pass > 2)
+                       goto done;
+
+               bind = p4_config_get_bind(hwc->config);
+               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+               if (unlikely(escr_idx == -1))
+                       goto done;
+
+               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+                       cntr_idx = hwc->idx;
+                       if (assign)
+                               assign[i] = hwc->idx;
+                       goto reserve;
+               }
+
+               cntr_idx = p4_next_cntr(thread, used_mask, bind);
+               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+                       /*
+                        * Check whether an event alias is still available.
+                        */
+                       config_alias = p4_get_alias_event(hwc->config);
+                       if (!config_alias)
+                               goto done;
+                       hwc->config = config_alias;
+                       pass++;
+                       goto again;
+               }
+               /*
+                * Perf does test runs to see if a whole group can be assigned
+                * together succesfully.  There can be multiple rounds of this.
+                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+                * bits, such that the next round of group assignments will
+                * cause the above p4_should_swap_ts to pass instead of fail.
+                * This leads to counters exclusive to thread0 being used by
+                * thread1.
+                *
+                * Solve this with a cheap hack, reset the idx back to -1 to
+                * force a new lookup (p4_next_cntr) to get the right counter
+                * for the right thread.
+                *
+                * This probably doesn't comply with the general spirit of how
+                * perf wants to work, but P4 is special. :-(
+                */
+               if (p4_should_swap_ts(hwc->config, cpu))
+                       hwc->idx = -1;
+               p4_pmu_swap_config_ts(hwc, cpu);
+               if (assign)
+                       assign[i] = cntr_idx;
+reserve:
+               set_bit(cntr_idx, used_mask);
+               set_bit(escr_idx, escr_mask);
+       }
+
+done:
+       return num ? -EINVAL : 0;
+}
+
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht,   "config:63"   );
+
+static struct attribute *intel_p4_formats_attr[] = {
+       &format_attr_cccr.attr,
+       &format_attr_escr.attr,
+       &format_attr_ht.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p4_pmu = {
+       .name                   = "Netburst P4/Xeon",
+       .handle_irq             = p4_pmu_handle_irq,
+       .disable_all            = p4_pmu_disable_all,
+       .enable_all             = p4_pmu_enable_all,
+       .enable                 = p4_pmu_enable_event,
+       .disable                = p4_pmu_disable_event,
+       .eventsel               = MSR_P4_BPU_CCCR0,
+       .perfctr                = MSR_P4_BPU_PERFCTR0,
+       .event_map              = p4_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p4_general_events),
+       .get_event_constraints  = x86_get_event_constraints,
+       /*
+        * IF HT disabled we may need to use all
+        * ARCH_P4_MAX_CCCR counters simulaneously
+        * though leave it restricted at moment assuming
+        * HT is on
+        */
+       .num_counters           = ARCH_P4_MAX_CCCR,
+       .apic                   = 1,
+       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
+       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
+       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+       .hw_config              = p4_hw_config,
+       .schedule_events        = p4_pmu_schedule_events,
+       /*
+        * This handles erratum N15 in intel doc 249199-029,
+        * the counter may not be updated correctly on write
+        * so we need a second write operation to do the trick
+        * (the official workaround didn't work)
+        *
+        * the former idea is taken from OProfile code
+        */
+       .perfctr_second_write   = 1,
+
+       .format_attrs           = intel_p4_formats_attr,
+};
+
+__init int p4_pmu_init(void)
+{
+       unsigned int low, high;
+       int i, reg;
+
+       /* If we get stripped -- indexing fails */
+       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
+
+       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+       if (!(low & (1 << 7))) {
+               pr_cont("unsupported Netburst CPU model %d ",
+                       boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       pr_cont("Netburst events, ");
+
+       x86_pmu = p4_pmu;
+
+       /*
+        * Even though the counters are configured to interrupt a particular
+        * logical processor when an overflow happens, testing has shown that
+        * on kdump kernels (which uses a single cpu), thread1's counter
+        * continues to run and will report an NMI on thread0.  Due to the
+        * overflow bug, this leads to a stream of unknown NMIs.
+        *
+        * Solve this by zero'ing out the registers to mimic a reset.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu_config_addr(i);
+               wrmsrl_safe(reg, 0ULL);
+       }
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
new file mode 100644 (file)
index 0000000..1f5c47a
--- /dev/null
@@ -0,0 +1,279 @@
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf_event.h"
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
+  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
+  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
+  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
+  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
+  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
+                [ C(RESULT_MISS)   ] = 0x0045, /* DCU_LINES_IN        */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
+       },
+        [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+        },
+ },
+ [ C(L1I ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(LL  ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(DTLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = 0,
+               [ C(RESULT_MISS)   ] = 0,
+       },
+ },
+ [ C(ITLB) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
+               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
+       },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+ [ C(BPU ) ] = {
+       [ C(OP_READ) ] = {
+               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
+               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
+        },
+       [ C(OP_WRITE) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+       [ C(OP_PREFETCH) ] = {
+               [ C(RESULT_ACCESS) ] = -1,
+               [ C(RESULT_MISS)   ] = -1,
+       },
+ },
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+       return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT                   0x0000002EULL
+
+static struct event_constraint p6_event_constraints[] =
+{
+       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
+       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
+       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
+       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
+       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
+       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+       u64 val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(int added)
+{
+       unsigned long val;
+
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val = P6_NOP_EVENT;
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 val;
+
+       val = hwc->config;
+
+       /*
+        * p6 only has a global event enable, set on PerfEvtSel0
+        * We "disable" events by programming P6_NOP_EVENT
+        * and we rely on p6_pmu_enable_all() being called
+        * to actually enable the events.
+        */
+
+       (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7"    );
+PMU_FORMAT_ATTR(umask, "config:8-15"   );
+PMU_FORMAT_ATTR(edge,  "config:18"     );
+PMU_FORMAT_ATTR(pc,    "config:19"     );
+PMU_FORMAT_ATTR(inv,   "config:23"     );
+PMU_FORMAT_ATTR(cmask, "config:24-31"  );
+
+static struct attribute *intel_p6_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_pc.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask.attr,
+       NULL,
+};
+
+static __initconst const struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = x86_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_event,
+       .disable                = p6_pmu_disable_event,
+       .hw_config              = x86_pmu_hw_config,
+       .schedule_events        = x86_schedule_events,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       /*
+        * Events have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of a event for P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .cntval_bits            = 32,
+       .cntval_mask            = (1ULL << 32) - 1,
+       .get_event_constraints  = x86_get_event_constraints,
+       .event_constraints      = p6_event_constraints,
+
+       .format_attrs           = intel_p6_formats_attr,
+       .events_sysfs_show      = intel_event_sysfs_show,
+
+};
+
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+       if (boot_cpu_data.x86_mask < 9) {
+               /*
+                * PPro erratum 26; fixed in stepping 9 and above.
+                */
+               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+               x86_pmu.attr_rdpmc_broken = 1;
+               x86_pmu.attr_rdpmc = 0;
+       }
+}
+
+__init int p6_pmu_init(void)
+{
+       x86_pmu = p6_pmu;
+
+       switch (boot_cpu_data.x86_model) {
+       case  1: /* Pentium Pro */
+               x86_add_quirk(p6_pmu_rdpmc_quirk);
+               break;
+
+       case  3: /* Pentium II - Klamath */
+       case  5: /* Pentium II - Deschutes */
+       case  6: /* Pentium II - Mendocino */
+               break;
+
+       case  7: /* Pentium III - Katmai */
+       case  8: /* Pentium III - Coppermine */
+       case 10: /* Pentium III Xeon */
+       case 11: /* Pentium III - Tualatin */
+               break;
+
+       case  9: /* Pentium M - Banias */
+       case 13: /* Pentium M - Dothan */
+               break;
+
+       default:
+               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+
+       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+               sizeof(hw_cache_event_ids));
+
+       return 0;
+}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
new file mode 100644 (file)
index 0000000..6af7cf7
--- /dev/null
@@ -0,0 +1,1188 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/intel_pt.h>
+
+#include "../perf_event.h"
+#include "pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+       CR_EAX = 0,
+       CR_ECX,
+       CR_EDX,
+       CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)                                         \
+       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
+                           .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+       const char      *name;
+       u32             leaf;
+       u8              reg;
+       u32             mask;
+} pt_caps[] = {
+       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
+       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
+       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
+       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
+       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
+       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
+       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
+       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
+       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
+       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
+       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+       struct pt_cap_desc *cd = &pt_caps[cap];
+       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+       unsigned int shift = __ffs(cd->mask);
+
+       return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct dev_ext_attribute *ea =
+               container_of(attr, struct dev_ext_attribute, attr);
+       enum pt_capabilities cap = (long)ea->var;
+
+       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+       .name   = "caps",
+};
+
+PMU_FORMAT_ATTR(cyc,           "config:1"      );
+PMU_FORMAT_ATTR(mtc,           "config:9"      );
+PMU_FORMAT_ATTR(tsc,           "config:10"     );
+PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
+PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
+PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
+PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
+
+static struct attribute *pt_formats_attr[] = {
+       &format_attr_cyc.attr,
+       &format_attr_mtc.attr,
+       &format_attr_tsc.attr,
+       &format_attr_noretcomp.attr,
+       &format_attr_mtc_period.attr,
+       &format_attr_cyc_thresh.attr,
+       &format_attr_psb_period.attr,
+       NULL,
+};
+
+static struct attribute_group pt_format_group = {
+       .name   = "format",
+       .attrs  = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+       &pt_cap_group,
+       &pt_format_group,
+       NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+       struct dev_ext_attribute *de_attrs;
+       struct attribute **attrs;
+       size_t size;
+       int ret;
+       long i;
+
+       attrs = NULL;
+
+       for (i = 0; i < PT_CPUID_LEAVES; i++) {
+               cpuid_count(20, i,
+                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
+       }
+
+       ret = -ENOMEM;
+       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+       attrs = kzalloc(size, GFP_KERNEL);
+       if (!attrs)
+               goto fail;
+
+       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+       de_attrs = kzalloc(size, GFP_KERNEL);
+       if (!de_attrs)
+               goto fail;
+
+       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+               struct dev_ext_attribute *de_attr = de_attrs + i;
+
+               de_attr->attr.attr.name = pt_caps[i].name;
+
+               sysfs_attr_init(&de_attr->attr.attr);
+
+               de_attr->attr.attr.mode         = S_IRUGO;
+               de_attr->attr.show              = pt_cap_show;
+               de_attr->var                    = (void *)i;
+
+               attrs[i] = &de_attr->attr.attr;
+       }
+
+       pt_cap_group.attrs = attrs;
+
+       return 0;
+
+fail:
+       kfree(attrs);
+
+       return ret;
+}
+
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
+                         RTIT_CTL_CYC_THRESH   | \
+                         RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
+                        RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
+                       RTIT_CTL_DISRETC        | \
+                       RTIT_CTL_CYC_PSB        | \
+                       RTIT_CTL_MTC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+       u64 config = event->attr.config;
+       u64 allowed, requested;
+
+       if ((config & PT_CONFIG_MASK) != config)
+               return false;
+
+       if (config & RTIT_CTL_CYC_PSB) {
+               if (!pt_cap_get(PT_CAP_psb_cyc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_psb_periods);
+               requested = (config & RTIT_CTL_PSB_FREQ) >>
+                       RTIT_CTL_PSB_FREQ_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+               requested = (config & RTIT_CTL_CYC_THRESH) >>
+                       RTIT_CTL_CYC_THRESH_OFFSET;
+               if (requested && (!(allowed & BIT(requested))))
+                       return false;
+       }
+
+       if (config & RTIT_CTL_MTC) {
+               /*
+                * In the unlikely case that CPUID lists valid mtc periods,
+                * but not the mtc capability, drop out here.
+                *
+                * Spec says that setting mtc period bits while mtc bit in
+                * CPUID is 0 will #GP, so better safe than sorry.
+                */
+               if (!pt_cap_get(PT_CAP_mtc))
+                       return false;
+
+               allowed = pt_cap_get(PT_CAP_mtc_periods);
+               if (!allowed)
+                       return false;
+
+               requested = (config & RTIT_CTL_MTC_RANGE) >>
+                       RTIT_CTL_MTC_RANGE_OFFSET;
+
+               if (!(allowed & BIT(requested)))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static void pt_config(struct perf_event *event)
+{
+       u64 reg;
+
+       if (!event->hw.itrace_started) {
+               event->hw.itrace_started = 1;
+               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+       }
+
+       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+       if (!event->attr.exclude_kernel)
+               reg |= RTIT_CTL_OS;
+       if (!event->attr.exclude_user)
+               reg |= RTIT_CTL_USR;
+
+       reg |= (event->attr.config & PT_CONFIG_MASK);
+
+       wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+       u64 ctl;
+
+       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+       if (start)
+               ctl |= RTIT_CTL_TRACEEN;
+       else
+               ctl &= ~RTIT_CTL_TRACEEN;
+       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+       /*
+        * A wrmsr that disables trace generation serializes other PT
+        * registers and causes all data packets to be written to memory,
+        * but a fence is required for the data to become globally visible.
+        *
+        * The below WMB, separating data store and aux_head store matches
+        * the consumer's RMB that separates aux_head load and data load.
+        */
+       if (!start)
+               wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+                            unsigned int output_off)
+{
+       u64 reg;
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:     actual ToPA table entries, as understood by PT hardware
+ * @list:      linkage to struct pt_buffer's list of tables
+ * @phys:      physical address of this page
+ * @offset:    offset of the first entry in this table in the buffer
+ * @size:      total size of all entries in this table
+ * @last:      index of the last initialized entry in this table
+ */
+struct topa {
+       struct topa_entry       table[TENTS_PER_PAGE];
+       struct list_head        list;
+       u64                     phys;
+       u64                     offset;
+       size_t                  size;
+       int                     last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:       CPU on which to allocate.
+ * @gfp:       Allocation flags.
+ *
+ * Return:     On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+       int node = cpu_to_node(cpu);
+       struct topa *topa;
+       struct page *p;
+
+       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+       if (!p)
+               return NULL;
+
+       topa = page_address(p);
+       topa->last = 0;
+       topa->phys = page_to_phys(p);
+
+       /*
+        * In case of singe-entry ToPA, always put the self-referencing END
+        * link as the 2nd entry in the table
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(topa, 1)->end = 1;
+       }
+
+       return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:      Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+       free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:        PT buffer that's being extended.
+ * @topa:       New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+       struct topa *last = buf->last;
+
+       list_add_tail(&topa->list, &buf->tables);
+
+       if (!buf->first) {
+               buf->first = buf->last = buf->cur = topa;
+               return;
+       }
+
+       topa->offset = last->offset + last->size;
+       buf->last = topa;
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return;
+
+       BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+       TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:      ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+       /* single-entry ToPA is a special case */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return !!topa->last;
+
+       return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:       PT buffer being initialized.
+ * @gfp:       Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:     0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+       struct topa *topa = buf->last;
+       int order = 0;
+       struct page *p;
+
+       p = virt_to_page(buf->data_pages[buf->nr_pages]);
+       if (PagePrivate(p))
+               order = page_private(p);
+
+       if (topa_table_full(topa)) {
+               topa = topa_alloc(buf->cpu, gfp);
+               if (!topa)
+                       return -ENOMEM;
+
+               topa_insert_table(buf, topa);
+       }
+
+       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+       TOPA_ENTRY(topa, -1)->size = order;
+       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(topa, -1)->intr = 1;
+               TOPA_ENTRY(topa, -1)->stop = 1;
+       }
+
+       topa->last++;
+       topa->size += sizes(order);
+
+       buf->nr_pages += 1ul << order;
+
+       return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:       PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+       struct topa *topa;
+
+       list_for_each_entry(topa, &buf->tables, list) {
+               int i;
+
+               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+                        topa->phys, topa->offset, topa->size);
+               for (i = 0; i < TENTS_PER_PAGE; i++) {
+                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+                                &topa->table[i],
+                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
+                                sizes(topa->table[i].size),
+                                topa->table[i].end ?  'E' : ' ',
+                                topa->table[i].intr ? 'I' : ' ',
+                                topa->table[i].stop ? 'S' : ' ',
+                                *(u64 *)&topa->table[i]);
+                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+                            topa->table[i].stop) ||
+                           topa->table[i].end)
+                               break;
+               }
+       }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:       PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+       buf->output_off = 0;
+       buf->cur_idx++;
+
+       if (buf->cur_idx == buf->cur->last) {
+               if (buf->cur == buf->last)
+                       buf->cur = buf->first;
+               else
+                       buf->cur = list_entry(buf->cur->list.next, struct topa,
+                                             list);
+               buf->cur_idx = 0;
+       }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:                Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       u64 topa_idx, base, old;
+
+       /* offset of the first region in this table from the beginning of buf */
+       base = buf->cur->offset + buf->output_off;
+
+       /* offset of the current output region within this table */
+       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+               base += sizes(buf->cur->table[topa_idx].size);
+
+       if (buf->snapshot) {
+               local_set(&buf->data_size, base);
+       } else {
+               old = (local64_xchg(&buf->head, base) &
+                      ((buf->nr_pages << PAGE_SHIFT) - 1));
+               if (base < old)
+                       base += buf->nr_pages << PAGE_SHIFT;
+
+               local_add(base - old, &buf->data_size);
+       }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:       PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:       PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+       return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:                Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+       int advance = 0;
+       u64 status;
+
+       rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+       if (status & RTIT_STATUS_ERROR) {
+               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+               pt_topa_dump(buf);
+               status &= ~RTIT_STATUS_ERROR;
+       }
+
+       if (status & RTIT_STATUS_STOPPED) {
+               status &= ~RTIT_STATUS_STOPPED;
+
+               /*
+                * On systems that only do single-entry ToPA, hitting STOP
+                * means we are already losing data; need to let the decoder
+                * know.
+                */
+               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+                       local_inc(&buf->lost);
+                       advance++;
+               }
+       }
+
+       /*
+        * Also on single-entry ToPA implementations, interrupt will come
+        * before the output reaches its output region's boundary.
+        */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+               void *head = pt_buffer_region(buf);
+
+               /* everything within this margin needs to be zeroed out */
+               memset(head + buf->output_off, 0,
+                      pt_buffer_region_size(buf) -
+                      buf->output_off);
+               advance++;
+       }
+
+       if (advance)
+               pt_buffer_advance(buf);
+
+       wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:       PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+       u64 offset, base_topa;
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+       buf->cur = phys_to_virt(base_topa);
+
+       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+       /* offset within current output region */
+       buf->output_off = offset >> 32;
+       /* index of current output region within this table */
+       buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:       PT buffer.
+ * @pg:                Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+       struct topa_entry *te = buf->topa_index[pg];
+
+       /* one region */
+       if (buf->first == buf->last && buf->first->last == 1)
+               return pg;
+
+       do {
+               pg++;
+               pg &= buf->nr_pages - 1;
+       } while (buf->topa_index[pg] == te);
+
+       return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:       PT buffer.
+ * @handle:    Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+                                  struct perf_output_handle *handle)
+
+{
+       unsigned long head = local64_read(&buf->head);
+       unsigned long idx, npages, wakeup;
+
+       /* can't stop in the middle of an output region */
+       if (buf->output_off + handle->size + 1 <
+           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+               return -EINVAL;
+
+
+       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               return 0;
+
+       /* clear STOP and INT from current entry */
+       buf->topa_index[buf->stop_pos]->stop = 0;
+       buf->topa_index[buf->intr_pos]->intr = 0;
+
+       /* how many pages till the STOP marker */
+       npages = handle->size >> PAGE_SHIFT;
+
+       /* if it's on a page boundary, fill up one more page */
+       if (!offset_in_page(head + handle->size + 1))
+               npages++;
+
+       idx = (head >> PAGE_SHIFT) + npages;
+       idx &= buf->nr_pages - 1;
+       buf->stop_pos = idx;
+
+       wakeup = handle->wakeup >> PAGE_SHIFT;
+
+       /* in the worst case, wake up the consumer one page before hard stop */
+       idx = (head >> PAGE_SHIFT) + npages - 1;
+       if (idx > wakeup)
+               idx = wakeup;
+
+       idx &= buf->nr_pages - 1;
+       buf->intr_pos = idx;
+
+       buf->topa_index[buf->stop_pos]->stop = 1;
+       buf->topa_index[buf->intr_pos]->intr = 1;
+
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:       PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+       struct topa *cur = buf->first, *prev = buf->last;
+       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+       int pg = 0, idx = 0;
+
+       while (pg < buf->nr_pages) {
+               int tidx;
+
+               /* pages within one topa entry */
+               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+                       buf->topa_index[pg] = te_prev;
+
+               te_prev = te_cur;
+
+               if (idx == cur->last - 1) {
+                       /* advance to next topa table */
+                       idx = 0;
+                       cur = list_entry(cur->list.next, struct topa, list);
+               } else {
+                       idx++;
+               }
+               te_cur = TOPA_ENTRY(cur, idx);
+       }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:       PT buffer.
+ * @head:      Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+       int pg;
+
+       if (buf->snapshot)
+               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+       pg = pt_topa_next_entry(buf, pg);
+
+       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
+       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+       local64_set(&buf->head, head);
+       local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:       PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+       struct topa *topa, *iter;
+
+       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+               /*
+                * right now, this is in free_aux() path only, so
+                * no need to unlink this table from the list
+                */
+               topa_free(topa);
+       }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:       PT buffer.
+ * @size:      Total size of all regions within this ToPA.
+ * @gfp:       Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+                              gfp_t gfp)
+{
+       struct topa *topa;
+       int err;
+
+       topa = topa_alloc(buf->cpu, gfp);
+       if (!topa)
+               return -ENOMEM;
+
+       topa_insert_table(buf, topa);
+
+       while (buf->nr_pages < nr_pages) {
+               err = topa_insert_pages(buf, gfp);
+               if (err) {
+                       pt_buffer_fini_topa(buf);
+                       return -ENOMEM;
+               }
+       }
+
+       pt_buffer_setup_topa_index(buf);
+
+       /* link last table to the first one, unless we're double buffering */
+       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+               TOPA_ENTRY(buf->last, -1)->end = 1;
+       }
+
+       pt_topa_dump(buf);
+       return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:       Cpu on which to allocate, -1 means current.
+ * @pages:     Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:  Number of pages in the buffer.
+ * @snapshot:  If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:     Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+       struct pt_buffer *buf;
+       int node, ret;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (cpu == -1)
+               cpu = raw_smp_processor_id();
+       node = cpu_to_node(cpu);
+
+       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+                          GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->cpu = cpu;
+       buf->snapshot = snapshot;
+       buf->data_pages = pages;
+
+       INIT_LIST_HEAD(&buf->tables);
+
+       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+       if (ret) {
+               kfree(buf);
+               return NULL;
+       }
+
+       return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:      PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+       struct pt_buffer *buf = data;
+
+       pt_buffer_fini_topa(buf);
+       kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:       PT buffer.
+ * @pt:                Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+       if (buf->snapshot)
+               return false;
+
+       if (local_read(&buf->data_size) >= pt->handle.size)
+               return true;
+
+       return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+       struct perf_event *event = pt->handle.event;
+
+       /*
+        * There may be a dangling PT bit in the interrupt status register
+        * after PT has been disabled by pt_event_stop(). Make sure we don't
+        * do anything (particularly, re-enable) for this event here.
+        */
+       if (!ACCESS_ONCE(pt->handle_nmi))
+               return;
+
+       pt_config_start(false);
+
+       if (!event)
+               return;
+
+       buf = perf_get_aux(&pt->handle);
+       if (!buf)
+               return;
+
+       pt_read_offset(buf);
+
+       pt_handle_status(pt);
+
+       pt_update_head(pt);
+
+       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                           local_xchg(&buf->lost, 0));
+
+       if (!event->hw.state) {
+               int ret;
+
+               buf = perf_aux_output_begin(&pt->handle, event);
+               if (!buf) {
+                       event->hw.state = PERF_HES_STOPPED;
+                       return;
+               }
+
+               pt_buffer_reset_offsets(buf, pt->handle.head);
+               /* snapshot counters don't use PMI, so it's safe */
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret) {
+                       perf_aux_output_end(&pt->handle, 0, true);
+                       return;
+               }
+
+               pt_config_buffer(buf->cur->table, buf->cur_idx,
+                                buf->output_off);
+               pt_config(event);
+       }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+       if (!buf || pt_buffer_is_full(buf, pt)) {
+               event->hw.state = PERF_HES_STOPPED;
+               return;
+       }
+
+       ACCESS_ONCE(pt->handle_nmi) = 1;
+       event->hw.state = 0;
+
+       pt_config_buffer(buf->cur->table, buf->cur_idx,
+                        buf->output_off);
+       pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       /*
+        * Protect against the PMI racing with disabling wrmsr,
+        * see comment in intel_pt_interrupt().
+        */
+       ACCESS_ONCE(pt->handle_nmi) = 0;
+       pt_config_start(false);
+
+       if (event->hw.state == PERF_HES_STOPPED)
+               return;
+
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_UPDATE) {
+               struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+               if (!buf)
+                       return;
+
+               if (WARN_ON_ONCE(pt->handle.event != event))
+                       return;
+
+               pt_read_offset(buf);
+
+               pt_handle_status(pt);
+
+               pt_update_head(pt);
+       }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct pt_buffer *buf;
+
+       pt_event_stop(event, PERF_EF_UPDATE);
+
+       buf = perf_get_aux(&pt->handle);
+
+       if (buf) {
+               if (buf->snapshot)
+                       pt->handle.head =
+                               local_xchg(&buf->data_size,
+                                          buf->nr_pages << PAGE_SHIFT);
+               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+                                   local_xchg(&buf->lost, 0));
+       }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+       struct pt_buffer *buf;
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct hw_perf_event *hwc = &event->hw;
+       int ret = -EBUSY;
+
+       if (pt->handle.event)
+               goto fail;
+
+       buf = perf_aux_output_begin(&pt->handle, event);
+       ret = -EINVAL;
+       if (!buf)
+               goto fail_stop;
+
+       pt_buffer_reset_offsets(buf, pt->handle.head);
+       if (!buf->snapshot) {
+               ret = pt_buffer_reset_markers(buf, &pt->handle);
+               if (ret)
+                       goto fail_end_stop;
+       }
+
+       if (mode & PERF_EF_START) {
+               pt_event_start(event, 0);
+               ret = -EBUSY;
+               if (hwc->state == PERF_HES_STOPPED)
+                       goto fail_end_stop;
+       } else {
+               hwc->state = PERF_HES_STOPPED;
+       }
+
+       return 0;
+
+fail_end_stop:
+       perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+       hwc->state = PERF_HES_STOPPED;
+fail:
+       return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+       x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+       if (event->attr.type != pt_pmu.pmu.type)
+               return -ENOENT;
+
+       if (!pt_event_valid(event))
+               return -EINVAL;
+
+       if (x86_add_exclusive(x86_lbr_exclusive_pt))
+               return -EBUSY;
+
+       event->destroy = pt_event_destroy;
+
+       return 0;
+}
+
+void cpu_emergency_stop_pt(void)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+       if (pt->handle.event)
+               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
+static __init int pt_init(void)
+{
+       int ret, cpu, prior_warn = 0;
+
+       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+               return -ENODEV;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               u64 ctl;
+
+               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+               if (!ret && (ctl & RTIT_CTL_TRACEEN))
+                       prior_warn++;
+       }
+       put_online_cpus();
+
+       if (prior_warn) {
+               x86_add_exclusive(x86_lbr_exclusive_pt);
+               pr_warn("PT is enabled at boot time, doing nothing\n");
+
+               return -EBUSY;
+       }
+
+       ret = pt_pmu_hw_init();
+       if (ret)
+               return ret;
+
+       if (!pt_cap_get(PT_CAP_topa_output)) {
+               pr_warn("ToPA output is not supported on this CPU\n");
+               return -ENODEV;
+       }
+
+       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+               pt_pmu.pmu.capabilities =
+                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+       pt_pmu.pmu.attr_groups  = pt_attr_groups;
+       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
+       pt_pmu.pmu.event_init   = pt_event_init;
+       pt_pmu.pmu.add          = pt_event_add;
+       pt_pmu.pmu.del          = pt_event_del;
+       pt_pmu.pmu.start        = pt_event_start;
+       pt_pmu.pmu.stop         = pt_event_stop;
+       pt_pmu.pmu.read         = pt_event_read;
+       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
+       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
+       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+       return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
new file mode 100644 (file)
index 0000000..336878a
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
+static inline unsigned int sizes(unsigned int tsz)
+{
+       return 1 << (tsz + TOPA_SHIFT);
+};
+
+struct topa_entry {
+       u64     end     : 1;
+       u64     rsvd0   : 1;
+       u64     intr    : 1;
+       u64     rsvd1   : 1;
+       u64     stop    : 1;
+       u64     rsvd2   : 1;
+       u64     size    : 4;
+       u64     rsvd3   : 2;
+       u64     base    : 36;
+       u64     rsvd4   : 16;
+};
+
+#define PT_CPUID_LEAVES                2
+#define PT_CPUID_REGS_NUM      4 /* number of regsters (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+       PT_CAP_max_subleaf = 0,
+       PT_CAP_cr3_filtering,
+       PT_CAP_psb_cyc,
+       PT_CAP_mtc,
+       PT_CAP_topa_output,
+       PT_CAP_topa_multiple_entries,
+       PT_CAP_single_range_output,
+       PT_CAP_payloads_lip,
+       PT_CAP_mtc_periods,
+       PT_CAP_cycle_thresholds,
+       PT_CAP_psb_periods,
+};
+
+struct pt_pmu {
+       struct pmu              pmu;
+       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *             cpu, depending on perf event configuration
+ * @cpu:       cpu for per-cpu allocation
+ * @tables:    list of ToPA tables in this buffer
+ * @first:     shorthand for first topa table
+ * @last:      shorthand for last topa table
+ * @cur:       current topa table
+ * @nr_pages:  buffer size in pages
+ * @cur_idx:   current output region's index within @cur table
+ * @output_off:        offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost:      if data was lost/truncated
+ * @head:      logical write offset inside the buffer
+ * @snapshot:  if this is for a snapshot/overwrite counter
+ * @stop_pos:  STOP topa entry in the buffer
+ * @intr_pos:  INT topa entry in the buffer
+ * @data_pages:        array of pages from perf
+ * @topa_index:        table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+       int                     cpu;
+       struct list_head        tables;
+       struct topa             *first, *last, *cur;
+       unsigned int            cur_idx;
+       size_t                  output_off;
+       unsigned long           nr_pages;
+       local_t                 data_size;
+       local_t                 lost;
+       local64_t               head;
+       bool                    snapshot;
+       unsigned long           stop_pos, intr_pos;
+       void                    **data_pages;
+       struct topa_entry       *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:    perf output handle
+ * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+       struct perf_output_handle handle;
+       int                     handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
new file mode 100644 (file)
index 0000000..b834a3f
--- /dev/null
@@ -0,0 +1,767 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ *  pp0 counter: consumption of all physical cores (power plane 0)
+ *       event: rapl_energy_cores
+ *    perf code: 0x1
+ *
+ *  pkg counter: consumption of the whole processor package
+ *       event: rapl_energy_pkg
+ *    perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ *       event: rapl_energy_dram
+ *    perf code: 0x3
+ *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ *       event: rapl_energy_gpu
+ *    perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
+#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
+#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
+#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
+#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
+
+#define NR_RAPL_DOMAINS         0x4
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+       "pp0-core",
+       "package",
+       "dram",
+       "pp1-gpu",
+};
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
+                        1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT|\
+                        1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
+                        1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK        0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
+                               struct kobj_attribute *attr,    \
+                               char *page)                     \
+{                                                              \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
+       return sprintf(page, _format "\n");                     \
+}                                                              \
+static struct kobj_attribute format_attr_##_var =              \
+       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_CNTR_WIDTH 32
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
+static struct perf_pmu_events_attr event_attr_##v = {                          \
+       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
+       .id             = 0,                                                    \
+       .event_str      = str,                                                  \
+};
+
+struct rapl_pmu {
+       raw_spinlock_t          lock;
+       int                     n_active;
+       int                     cpu;
+       struct list_head        active_list;
+       struct pmu              *pmu;
+       ktime_t                 timer_interval;
+       struct hrtimer          hrtimer;
+};
+
+struct rapl_pmus {
+       struct pmu              pmu;
+       unsigned int            maxpkg;
+       struct rapl_pmu         *pmus[];
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus;
+static cpumask_t rapl_cpu_mask;
+static unsigned int rapl_cntr_mask;
+static u64 rapl_timer_ms;
+
+static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
+{
+       return rapl_pmus->pmus[topology_logical_package_id(cpu)];
+}
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+       u64 raw;
+       rdmsrl(event->hw.event_base, raw);
+       return raw;
+}
+
+static inline u64 rapl_scale(u64 v, int cfg)
+{
+       if (cfg > NR_RAPL_DOMAINS) {
+               pr_warn("Invalid domain %d, failed to scale data\n", cfg);
+               return v;
+       }
+       /*
+        * scale delta to smallest unit (1/2^32)
+        * users must then scale back: count * 1/(1e9*2^32) to get Joules
+        * or use ldexp(count, -32).
+        * Watts = Joules/Time delta
+        */
+       return v << (32 - rapl_hw_unit[cfg - 1]);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta, sdelta;
+       int shift = RAPL_CNTR_WIDTH;
+
+again:
+       prev_raw_count = local64_read(&hwc->prev_count);
+       rdmsrl(event->hw.event_base, new_raw_count);
+
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                           new_raw_count) != prev_raw_count) {
+               cpu_relax();
+               goto again;
+       }
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (event-)time and add that to the generic event.
+        *
+        * Careful, not all hw sign-extends above the physical width
+        * of the count.
+        */
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
+
+       sdelta = rapl_scale(delta, event->hw.config);
+
+       local64_add(sdelta, &event->count);
+
+       return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+                    HRTIMER_MODE_REL_PINNED);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+       struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+       struct perf_event *event;
+       unsigned long flags;
+
+       if (!pmu->n_active)
+               return HRTIMER_NORESTART;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       list_for_each_entry(event, &pmu->active_list, active_entry)
+               rapl_event_update(event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+       return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+       struct hrtimer *hr = &pmu->hrtimer;
+
+       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+                                  struct perf_event *event)
+{
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+
+       list_add_tail(&event->active_entry, &pmu->active_list);
+
+       local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+       pmu->n_active++;
+       if (pmu->n_active == 1)
+               rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+       __rapl_pmu_event_start(pmu, event);
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       /* mark event as deactivated and stopped */
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               WARN_ON_ONCE(pmu->n_active <= 0);
+               pmu->n_active--;
+               if (pmu->n_active == 0)
+                       hrtimer_cancel(&pmu->hrtimer);
+
+               list_del(&event->active_entry);
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       /* check if update of sw counter is necessary */
+       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               rapl_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+       struct rapl_pmu *pmu = event->pmu_private;
+       struct hw_perf_event *hwc = &event->hw;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pmu->lock, flags);
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_START)
+               __rapl_pmu_event_start(pmu, event);
+
+       raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+       return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+       int bit, msr, ret = 0;
+       struct rapl_pmu *pmu;
+
+       /* only look at RAPL events */
+       if (event->attr.type != rapl_pmus->pmu.type)
+               return -ENOENT;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~RAPL_EVENT_MASK)
+               return -EINVAL;
+
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /*
+        * check event is known (determines counter)
+        */
+       switch (cfg) {
+       case INTEL_RAPL_PP0:
+               bit = RAPL_IDX_PP0_NRG_STAT;
+               msr = MSR_PP0_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PKG:
+               bit = RAPL_IDX_PKG_NRG_STAT;
+               msr = MSR_PKG_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_RAM:
+               bit = RAPL_IDX_RAM_NRG_STAT;
+               msr = MSR_DRAM_ENERGY_STATUS;
+               break;
+       case INTEL_RAPL_PP1:
+               bit = RAPL_IDX_PP1_NRG_STAT;
+               msr = MSR_PP1_ENERGY_STATUS;
+               break;
+       default:
+               return -EINVAL;
+       }
+       /* check event supported */
+       if (!(rapl_cntr_mask & (1 << bit)))
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /* must be done before validate_group */
+       pmu = cpu_to_rapl_pmu(event->cpu);
+       event->cpu = pmu->cpu;
+       event->pmu_private = pmu;
+       event->hw.event_base = msr;
+       event->hw.config = cfg;
+       event->hw.idx = bit;
+
+       return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+       rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+       .attrs = rapl_pmu_attrs,
+};
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+       EVENT_PTR(rapl_cores),
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_gpu),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_cores_unit),
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_gpu_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_cores_scale),
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_gpu_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute *rapl_events_knl_attr[] = {
+       EVENT_PTR(rapl_pkg),
+       EVENT_PTR(rapl_ram),
+
+       EVENT_PTR(rapl_pkg_unit),
+       EVENT_PTR(rapl_ram_unit),
+
+       EVENT_PTR(rapl_pkg_scale),
+       EVENT_PTR(rapl_ram_scale),
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+       .name = "events",
+       .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+       .name = "format",
+       .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+       &rapl_pmu_attr_group,
+       &rapl_pmu_format_group,
+       &rapl_pmu_events_group,
+       NULL,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /* Check if exiting cpu is used for collecting rapl events */
+       if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+               return;
+
+       pmu->cpu = -1;
+       /* Find a new cpu to collect rapl events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate rapl events to the new target */
+       if (target < nr_cpu_ids) {
+               cpumask_set_cpu(target, &rapl_cpu_mask);
+               pmu->cpu = target;
+               perf_pmu_migrate_context(pmu->pmu, cpu, target);
+       }
+}
+
+static void rapl_cpu_init(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package which collects rapl
+        * events already.
+        */
+       target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &rapl_cpu_mask);
+       pmu->cpu = cpu;
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+       struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+
+       if (pmu)
+               return 0;
+
+       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+       if (!pmu)
+               return -ENOMEM;
+
+       raw_spin_lock_init(&pmu->lock);
+       INIT_LIST_HEAD(&pmu->active_list);
+       pmu->pmu = &rapl_pmus->pmu;
+       pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+       pmu->cpu = -1;
+       rapl_hrtimer_init(pmu);
+       rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
+       return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+                            unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               rapl_cpu_prepare(cpu);
+               break;
+
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               rapl_cpu_init(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               rapl_cpu_exit(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int rapl_check_hw_unit(bool apply_quirk)
+{
+       u64 msr_rapl_power_unit_bits;
+       int i;
+
+       /* protect rdmsrl() to handle virtualization */
+       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+               return -1;
+       for (i = 0; i < NR_RAPL_DOMAINS; i++)
+               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+       /*
+        * DRAM domain on HSW server and KNL has fixed energy unit which can be
+        * different than the unit from power unit MSR. See
+        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+        * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+        */
+       if (apply_quirk)
+               rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+
+       /*
+        * Calculate the timer rate:
+        * Use reference of 200W for scaling the timeout to avoid counter
+        * overflows. 200W = 200 Joules/sec
+        * Divide interval by 2 to avoid lockstep (2 * 100)
+        * if hw unit is 32, then we use 2 ms 1/200/2
+        */
+       rapl_timer_ms = 2;
+       if (rapl_hw_unit[0] < 32) {
+               rapl_timer_ms = (1000 / (2 * 100));
+               rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+       }
+       return 0;
+}
+
+static void __init rapl_advertise(void)
+{
+       int i;
+
+       pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+               hweight32(rapl_cntr_mask), rapl_timer_ms);
+
+       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+               if (rapl_cntr_mask & (1 << i)) {
+                       pr_info("hw unit of domain %s 2^-%d Joules\n",
+                               rapl_domain_names[i], rapl_hw_unit[i]);
+               }
+       }
+}
+
+static int __init rapl_prepare_cpus(void)
+{
+       unsigned int cpu, pkg;
+       int ret;
+
+       for_each_online_cpu(cpu) {
+               pkg = topology_logical_package_id(cpu);
+               if (rapl_pmus->pmus[pkg])
+                       continue;
+
+               ret = rapl_cpu_prepare(cpu);
+               if (ret)
+                       return ret;
+               rapl_cpu_init(cpu);
+       }
+       return 0;
+}
+
+static void __init cleanup_rapl_pmus(void)
+{
+       int i;
+
+       for (i = 0; i < rapl_pmus->maxpkg; i++)
+               kfree(rapl_pmus->pmus + i);
+       kfree(rapl_pmus);
+}
+
+static int __init init_rapl_pmus(void)
+{
+       int maxpkg = topology_max_packages();
+       size_t size;
+
+       size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
+       rapl_pmus = kzalloc(size, GFP_KERNEL);
+       if (!rapl_pmus)
+               return -ENOMEM;
+
+       rapl_pmus->maxpkg               = maxpkg;
+       rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
+       rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
+       rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
+       rapl_pmus->pmu.add              = rapl_pmu_event_add;
+       rapl_pmus->pmu.del              = rapl_pmu_event_del;
+       rapl_pmus->pmu.start            = rapl_pmu_event_start;
+       rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
+       rapl_pmus->pmu.read             = rapl_pmu_event_read;
+       return 0;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
+       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+       [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+       bool apply_quirk = false;
+       int ret;
+
+       if (!x86_match_cpu(rapl_cpu_match))
+               return -ENODEV;
+
+       switch (boot_cpu_data.x86_model) {
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+               rapl_cntr_mask = RAPL_IDX_CLN;
+               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+               break;
+       case 63: /* Haswell-Server */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell-Celeron */
+       case 61: /* Broadwell */
+               rapl_cntr_mask = RAPL_IDX_HSW;
+               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
+               break;
+       case 45: /* Sandy Bridge-EP */
+       case 62: /* IvyTown */
+               rapl_cntr_mask = RAPL_IDX_SRV;
+               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+               break;
+       case 87: /* Knights Landing */
+               apply_quirk = true;
+               rapl_cntr_mask = RAPL_IDX_KNL;
+               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = rapl_check_hw_unit(apply_quirk);
+       if (ret)
+               return ret;
+
+       ret = init_rapl_pmus();
+       if (ret)
+               return ret;
+
+       cpu_notifier_register_begin();
+
+       ret = rapl_prepare_cpus();
+       if (ret)
+               goto out;
+
+       ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+       if (ret)
+               goto out;
+
+       __perf_cpu_notifier(rapl_cpu_notifier);
+       cpu_notifier_register_done();
+       rapl_advertise();
+       return 0;
+
+out:
+       pr_warn("Initialization failed (%d), disabled\n", ret);
+       cleanup_rapl_pmus();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
new file mode 100644 (file)
index 0000000..7012d18
--- /dev/null
@@ -0,0 +1,1412 @@
+#include "uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
+
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
+struct pci_extra_dev *uncore_extra_pci_dev;
+static int max_packages;
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint uncore_constraint_fixed =
+       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+struct event_constraint uncore_constraint_empty =
+       EVENT_CONSTRAINT(0, 0, 0);
+
+static int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+       struct pci2phy_map *map;
+       int phys_id = -1;
+
+       raw_spin_lock(&pci2phy_map_lock);
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == pci_domain_nr(bus)) {
+                       phys_id = map->pbus_to_physid[bus->number];
+                       break;
+               }
+       }
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       return phys_id;
+}
+
+static void uncore_free_pcibus_map(void)
+{
+       struct pci2phy_map *map, *tmp;
+
+       list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+               list_del(&map->list);
+               kfree(map);
+       }
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+       struct pci2phy_map *map, *alloc = NULL;
+       int i;
+
+       lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+       list_for_each_entry(map, &pci2phy_map_head, list) {
+               if (map->segment == segment)
+                       goto end;
+       }
+
+       if (!alloc) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+               raw_spin_lock(&pci2phy_map_lock);
+
+               if (!alloc)
+                       return NULL;
+
+               goto lookup;
+       }
+
+       map = alloc;
+       alloc = NULL;
+       map->segment = segment;
+       for (i = 0; i < 256; i++)
+               map->pbus_to_physid[i] = -1;
+       list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+       kfree(alloc);
+       return map;
+}
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf)
+{
+       struct uncore_event_desc *event =
+               container_of(attr, struct uncore_event_desc, attr);
+       return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+       return pmu->boxes[topology_logical_package_id(cpu)];
+}
+
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 count;
+
+       rdmsrl(event->hw.event_base, count);
+
+       return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       unsigned long flags;
+       bool ok = false;
+
+       /*
+        * reg->alloc can be set due to existing state, so for fake box we
+        * need to ignore this, otherwise we might fail to allocate proper
+        * fake state for this extra reg constraint.
+        */
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+
+       er = &box->shared_regs[reg1->idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!atomic_read(&er->ref) ||
+           (er->config1 == reg1->config && er->config2 == reg2->config)) {
+               atomic_inc(&er->ref);
+               er->config1 = reg1->config;
+               er->config2 = reg2->config;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (ok) {
+               if (!uncore_box_is_fake(box))
+                       reg1->alloc = 1;
+               return NULL;
+       }
+
+       return &uncore_constraint_empty;
+}
+
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+       /*
+        * Only put constraint if extra reg was actually allocated. Also
+        * takes care of event which do not use an extra shared reg.
+        *
+        * Also, if this is a fake box we shouldn't touch any event state
+        * (reg->alloc) and we don't care about leaving inconsistent box
+        * state either since it will be thrown out.
+        */
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       er = &box->shared_regs[reg1->idx];
+       atomic_dec(&er->ref);
+       reg1->alloc = 0;
+}
+
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       er = &box->shared_regs[idx];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return config;
+}
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+                                  struct perf_event *event, int idx)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = idx;
+       hwc->last_tag = ++box->tags[idx];
+
+       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+               hwc->event_base = uncore_fixed_ctr(box);
+               hwc->config_base = uncore_fixed_ctl(box);
+               return;
+       }
+
+       hwc->config_base = uncore_event_ctl(box, hwc->idx);
+       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
+}
+
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+       u64 prev_count, new_count, delta;
+       int shift;
+
+       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+               shift = 64 - uncore_fixed_ctr_bits(box);
+       else
+               shift = 64 - uncore_perf_ctr_bits(box);
+
+       /* the hrtimer might modify the previous event value */
+again:
+       prev_count = local64_read(&event->hw.prev_count);
+       new_count = uncore_read_counter(box, event);
+       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+               goto again;
+
+       delta = (new_count << shift) - (prev_count << shift);
+       delta >>= shift;
+
+       local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP, is broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+       struct intel_uncore_box *box;
+       struct perf_event *event;
+       unsigned long flags;
+       int bit;
+
+       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+       if (!box->n_active || box->cpu != smp_processor_id())
+               return HRTIMER_NORESTART;
+       /*
+        * disable local interrupt to prevent uncore_pmu_event_start/stop
+        * to interrupt the update process
+        */
+       local_irq_save(flags);
+
+       /*
+        * handle boxes with an active event list as opposed to active
+        * counters
+        */
+       list_for_each_entry(event, &box->active_list, active_entry) {
+               uncore_perf_event_update(box, event);
+       }
+
+       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+               uncore_perf_event_update(box, box->events[bit]);
+
+       local_irq_restore(flags);
+
+       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+       return HRTIMER_RESTART;
+}
+
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+                     HRTIMER_MODE_REL_PINNED);
+}
+
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+                                                int node)
+{
+       int i, size, numshared = type->num_shared_regs ;
+       struct intel_uncore_box *box;
+
+       size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
+
+       box = kzalloc_node(size, GFP_KERNEL, node);
+       if (!box)
+               return NULL;
+
+       for (i = 0; i < numshared; i++)
+               raw_spin_lock_init(&box->shared_regs[i].lock);
+
+       uncore_pmu_init_hrtimer(box);
+       box->cpu = -1;
+       box->pci_phys_id = -1;
+       box->pkgid = -1;
+
+       /* set default hrtimer timeout */
+       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+       INIT_LIST_HEAD(&box->active_list);
+
+       return box;
+}
+
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_uncore_event(struct perf_event *event)
+{
+       return event->pmu->event_init == uncore_pmu_event_init;
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+                     bool dogrp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = box->pmu->type->num_counters;
+       if (box->pmu->type->fixed_ctl)
+               max_count++;
+
+       if (box->n_events >= max_count)
+               return -EINVAL;
+
+       n = box->n_events;
+
+       if (is_uncore_event(leader)) {
+               box->event_list[n] = leader;
+               n++;
+       }
+
+       if (!dogrp)
+               return n;
+
+       list_for_each_entry(event, &leader->sibling_list, group_entry) {
+               if (!is_uncore_event(event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               box->event_list[n] = event;
+               n++;
+       }
+       return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct event_constraint *c;
+
+       if (type->ops->get_constraint) {
+               c = type->ops->get_constraint(box, event);
+               if (c)
+                       return c;
+       }
+
+       if (event->attr.config == UNCORE_FIXED_EVENT)
+               return &uncore_constraint_fixed;
+
+       if (type->constraints) {
+               for_each_event_constraint(c, type->constraints) {
+                       if ((event->hw.config & c->cmask) == c->code)
+                               return c;
+               }
+       }
+
+       return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       if (box->pmu->type->ops->put_constraint)
+               box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       struct event_constraint *c;
+       int i, wmin, wmax, ret = 0;
+       struct hw_perf_event *hwc;
+
+       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+               c = uncore_get_event_constraint(box, box->event_list[i]);
+               box->event_constraint[i] = c;
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
+       }
+
+       /* fastpath, try to reuse previous register */
+       for (i = 0; i < n; i++) {
+               hwc = &box->event_list[i]->hw;
+               c = box->event_constraint[i];
+
+               /* never assigned */
+               if (hwc->idx == -1)
+                       break;
+
+               /* constraint still honored */
+               if (!test_bit(hwc->idx, c->idxmsk))
+                       break;
+
+               /* not already used */
+               if (test_bit(hwc->idx, used_mask))
+                       break;
+
+               __set_bit(hwc->idx, used_mask);
+               if (assign)
+                       assign[i] = hwc->idx;
+       }
+       /* slow path */
+       if (i != n)
+               ret = perf_assign_events(box->event_constraint, n,
+                                        wmin, wmax, n, assign);
+
+       if (!assign || ret) {
+               for (i = 0; i < n; i++)
+                       uncore_put_event_constraint(box, box->event_list[i]);
+       }
+       return ret ? -EINVAL : 0;
+}
+
+static void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int idx = event->hw.idx;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+               return;
+
+       event->hw.state = 0;
+       box->events[idx] = event;
+       box->n_active++;
+       __set_bit(idx, box->active_mask);
+
+       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+       uncore_enable_event(box, event);
+
+       if (box->n_active == 1) {
+               uncore_enable_box(box);
+               uncore_pmu_start_hrtimer(box);
+       }
+}
+
+static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+               uncore_disable_event(box, event);
+               box->n_active--;
+               box->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               if (box->n_active == 0) {
+                       uncore_disable_box(box);
+                       uncore_pmu_cancel_hrtimer(box);
+               }
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+       int assign[UNCORE_PMC_IDX_MAX];
+       int i, n, ret;
+
+       if (!box)
+               return -ENODEV;
+
+       ret = n = uncore_collect_events(box, event, false);
+       if (ret < 0)
+               return ret;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       ret = uncore_assign_events(box, assign, n);
+       if (ret)
+               return ret;
+
+       /* save events moving to new counters */
+       for (i = 0; i < box->n_events; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx == assign[i] &&
+                       hwc->last_tag == box->tags[assign[i]])
+                       continue;
+               /*
+                * Ensure we don't accidentally enable a stopped
+                * counter simply because we rescheduled.
+                */
+               if (hwc->state & PERF_HES_STOPPED)
+                       hwc->state |= PERF_HES_ARCH;
+
+               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+       }
+
+       /* reprogram moved events into new counters */
+       for (i = 0; i < n; i++) {
+               event = box->event_list[i];
+               hwc = &event->hw;
+
+               if (hwc->idx != assign[i] ||
+                       hwc->last_tag != box->tags[assign[i]])
+                       uncore_assign_hw_event(box, event, assign[i]);
+               else if (i < box->n_events)
+                       continue;
+
+               if (hwc->state & PERF_HES_ARCH)
+                       continue;
+
+               uncore_pmu_event_start(event, 0);
+       }
+       box->n_events = n;
+
+       return 0;
+}
+
+static void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       uncore_put_event_constraint(box, event);
+
+                       for (++i; i < box->n_events; i++)
+                               box->event_list[i - 1] = box->event_list[i];
+
+                       --box->n_events;
+                       break;
+               }
+       }
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+}
+
+void uncore_pmu_event_read(struct perf_event *event)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+                               struct perf_event *event)
+{
+       struct perf_event *leader = event->group_leader;
+       struct intel_uncore_box *fake_box;
+       int ret = -EINVAL, n;
+
+       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+       if (!fake_box)
+               return -ENOMEM;
+
+       fake_box->pmu = pmu;
+       /*
+        * the event is not yet connected with its
+        * siblings therefore we must first collect
+        * existing siblings, then add the new event
+        * before we can simulate the scheduling
+        */
+       n = uncore_collect_events(fake_box, leader, true);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+       n = uncore_collect_events(fake_box, event, false);
+       if (n < 0)
+               goto out;
+
+       fake_box->n_events = n;
+
+       ret = uncore_assign_events(fake_box, NULL, n);
+out:
+       kfree(fake_box);
+       return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       int ret;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /*
+        * Uncore PMU does measure at all privilege level all the time.
+        * So it doesn't make sense to specify any exclude bits.
+        */
+       if (event->attr.exclude_user || event->attr.exclude_kernel ||
+                       event->attr.exclude_hv || event->attr.exclude_idle)
+               return -EINVAL;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+       if (event->attr.config == UNCORE_FIXED_EVENT) {
+               /* no fixed counter */
+               if (!pmu->type->fixed_ctl)
+                       return -EINVAL;
+               /*
+                * if there is only one fixed counter, only the first pmu
+                * can access the fixed counter
+                */
+               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+                       return -EINVAL;
+
+               /* fixed counters have event field hardcoded to zero */
+               hwc->config = 0ULL;
+       } else {
+               hwc->config = event->attr.config & pmu->type->event_mask;
+               if (pmu->type->ops->hw_config) {
+                       ret = pmu->type->ops->hw_config(box, event);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       if (event->group_leader != event)
+               ret = uncore_validate_group(pmu, event);
+       else
+               ret = 0;
+
+       return ret;
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+       .attrs = uncore_pmu_attrs,
+};
+
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+       int ret;
+
+       if (!pmu->type->pmu) {
+               pmu->pmu = (struct pmu) {
+                       .attr_groups    = pmu->type->attr_groups,
+                       .task_ctx_nr    = perf_invalid_context,
+                       .event_init     = uncore_pmu_event_init,
+                       .add            = uncore_pmu_event_add,
+                       .del            = uncore_pmu_event_del,
+                       .start          = uncore_pmu_event_start,
+                       .stop           = uncore_pmu_event_stop,
+                       .read           = uncore_pmu_event_read,
+               };
+       } else {
+               pmu->pmu = *pmu->type->pmu;
+               pmu->pmu.attr_groups = pmu->type->attr_groups;
+       }
+
+       if (pmu->type->num_boxes == 1) {
+               if (strlen(pmu->type->name) > 0)
+                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
+               else
+                       sprintf(pmu->name, "uncore");
+       } else {
+               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+                       pmu->pmu_idx);
+       }
+
+       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+       if (!ret)
+               pmu->registered = true;
+       return ret;
+}
+
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+       if (!pmu->registered)
+               return;
+       perf_pmu_unregister(&pmu->pmu);
+       pmu->registered = false;
+}
+
+static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       if (pmu) {
+               pkg = topology_physical_package_id(cpu);
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void __init uncore_exit_boxes(void *dummy)
+{
+       struct intel_uncore_type **types;
+
+       for (types = uncore_msr_uncores; *types; types++)
+               __uncore_exit_boxes(*types++, smp_processor_id());
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+       int pkg;
+
+       for (pkg = 0; pkg < max_packages; pkg++)
+               kfree(pmu->boxes[pkg]);
+       kfree(pmu->boxes);
+}
+
+static void __init uncore_type_exit(struct intel_uncore_type *type)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       int i;
+
+       if (pmu) {
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       uncore_pmu_unregister(pmu);
+                       uncore_free_boxes(pmu);
+               }
+               kfree(type->pmus);
+               type->pmus = NULL;
+       }
+       kfree(type->events_group);
+       type->events_group = NULL;
+}
+
+static void __init uncore_types_exit(struct intel_uncore_type **types)
+{
+       for (; *types; types++)
+               uncore_type_exit(*types);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
+{
+       struct intel_uncore_pmu *pmus;
+       struct attribute_group *attr_group;
+       struct attribute **attrs;
+       size_t size;
+       int i, j;
+
+       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
+       if (!pmus)
+               return -ENOMEM;
+
+       size = max_packages * sizeof(struct intel_uncore_box *);
+
+       for (i = 0; i < type->num_boxes; i++) {
+               pmus[i].func_id = setid ? i : -1;
+               pmus[i].pmu_idx = i;
+               pmus[i].type    = type;
+               pmus[i].boxes   = kzalloc(size, GFP_KERNEL);
+               if (!pmus[i].boxes)
+                       return -ENOMEM;
+       }
+
+       type->pmus = pmus;
+       type->unconstrainted = (struct event_constraint)
+               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+                               0, type->num_counters, 0, 0);
+
+       if (type->event_descs) {
+               for (i = 0; type->event_descs[i].attr.attr.name; i++);
+
+               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
+                                       sizeof(*attr_group), GFP_KERNEL);
+               if (!attr_group)
+                       return -ENOMEM;
+
+               attrs = (struct attribute **)(attr_group + 1);
+               attr_group->name = "events";
+               attr_group->attrs = attrs;
+
+               for (j = 0; j < i; j++)
+                       attrs[j] = &type->event_descs[j].attr.attr;
+
+               type->events_group = attr_group;
+       }
+
+       type->pmu_group = &uncore_pmu_attr_group;
+       return 0;
+}
+
+static int __init
+uncore_types_init(struct intel_uncore_type **types, bool setid)
+{
+       int ret;
+
+       for (; *types; types++) {
+               ret = uncore_type_init(*types, setid);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct intel_uncore_type *type;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int phys_id, pkg, ret;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       if (phys_id < 0)
+               return -ENODEV;
+
+       pkg = topology_phys_to_logical_pkg(phys_id);
+       if (WARN_ON_ONCE(pkg < 0))
+               return -EINVAL;
+
+       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+
+               uncore_extra_pci_dev[pkg].dev[idx] = pdev;
+               pci_set_drvdata(pdev, NULL);
+               return 0;
+       }
+
+       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+       /*
+        * for performance monitoring unit with multiple boxes,
+        * each box has a different function id.
+        */
+       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+       /* Knights Landing uses a common PCI device ID for multiple instances of
+        * an uncore PMU device type. There is only one entry per device type in
+        * the knl_uncore_pci_ids table inspite of multiple devices present for
+        * some device types. Hence PCI device idx would be 0 for all devices.
+        * So increment pmu pointer to point to an unused array element.
+        */
+       if (boot_cpu_data.x86_model == 87) {
+               while (pmu->func_id >= 0)
+                       pmu++;
+       }
+
+       if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
+               return -EINVAL;
+
+       box = uncore_alloc_box(type, NUMA_NO_NODE);
+       if (!box)
+               return -ENOMEM;
+
+       if (pmu->func_id < 0)
+               pmu->func_id = pdev->devfn;
+       else
+               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+       atomic_inc(&box->refcnt);
+       box->pci_phys_id = phys_id;
+       box->pkgid = pkg;
+       box->pci_dev = pdev;
+       box->pmu = pmu;
+       uncore_box_init(box);
+       pci_set_drvdata(pdev, box);
+
+       pmu->boxes[pkg] = box;
+       if (atomic_inc_return(&pmu->activeboxes) > 1)
+               return 0;
+
+       /* First active box registers the pmu */
+       ret = uncore_pmu_register(pmu);
+       if (ret) {
+               pci_set_drvdata(pdev, NULL);
+               pmu->boxes[pkg] = NULL;
+               uncore_box_exit(box);
+               kfree(box);
+       }
+       return ret;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+       struct intel_uncore_box *box = pci_get_drvdata(pdev);
+       struct intel_uncore_pmu *pmu;
+       int i, phys_id, pkg;
+
+       phys_id = uncore_pcibus_to_physid(pdev->bus);
+       pkg = topology_phys_to_logical_pkg(phys_id);
+
+       box = pci_get_drvdata(pdev);
+       if (!box) {
+               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+                       if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
+                               uncore_extra_pci_dev[pkg].dev[i] = NULL;
+                               break;
+                       }
+               }
+               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+               return;
+       }
+
+       pmu = box->pmu;
+       if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
+               return;
+
+       pci_set_drvdata(pdev, NULL);
+       pmu->boxes[pkg] = NULL;
+       if (atomic_dec_return(&pmu->activeboxes) == 0)
+               uncore_pmu_unregister(pmu);
+       uncore_box_exit(box);
+       kfree(box);
+}
+
+static int __init uncore_pci_init(void)
+{
+       size_t size;
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 45: /* Sandy Bridge-EP */
+               ret = snbep_uncore_pci_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ret = ivbep_uncore_pci_init();
+               break;
+       case 63: /* Haswell-EP */
+               ret = hswep_uncore_pci_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               ret = bdx_uncore_pci_init();
+               break;
+       case 42: /* Sandy Bridge */
+               ret = snb_uncore_pci_init();
+               break;
+       case 58: /* Ivy Bridge */
+               ret = ivb_uncore_pci_init();
+               break;
+       case 60: /* Haswell */
+       case 69: /* Haswell Celeron */
+               ret = hsw_uncore_pci_init();
+               break;
+       case 61: /* Broadwell */
+               ret = bdw_uncore_pci_init();
+               break;
+       case 87: /* Knights Landing */
+               ret = knl_uncore_pci_init();
+               break;
+       case 94: /* SkyLake */
+               ret = skl_uncore_pci_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       if (ret)
+               return ret;
+
+       size = max_packages * sizeof(struct pci_extra_dev);
+       uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+       if (!uncore_extra_pci_dev) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = uncore_types_init(uncore_pci_uncores, false);
+       if (ret)
+               goto errtype;
+
+       uncore_pci_driver->probe = uncore_pci_probe;
+       uncore_pci_driver->remove = uncore_pci_remove;
+
+       ret = pci_register_driver(uncore_pci_driver);
+       if (ret)
+               goto errtype;
+
+       pcidrv_registered = true;
+       return 0;
+
+errtype:
+       uncore_types_exit(uncore_pci_uncores);
+       kfree(uncore_extra_pci_dev);
+       uncore_extra_pci_dev = NULL;
+       uncore_free_pcibus_map();
+err:
+       uncore_pci_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_pci_exit(void)
+{
+       if (pcidrv_registered) {
+               pcidrv_registered = false;
+               pci_unregister_driver(uncore_pci_driver);
+               uncore_types_exit(uncore_pci_uncores);
+               kfree(uncore_extra_pci_dev);
+               uncore_free_pcibus_map();
+       }
+}
+
+static void uncore_cpu_dying(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (box && atomic_dec_return(&box->refcnt) == 0)
+                               uncore_box_exit(box);
+               }
+       }
+}
+
+static void uncore_cpu_starting(int cpu, bool init)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg, ncpus = 1;
+
+       if (init) {
+               /*
+                * On init we get the number of online cpus in the package
+                * and set refcount for all of them.
+                */
+               ncpus = cpumask_weight(topology_core_cpumask(cpu));
+       }
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       box = pmu->boxes[pkg];
+                       if (!box)
+                               continue;
+                       /* The first cpu on a package activates the box */
+                       if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
+                               uncore_box_init(box);
+               }
+       }
+}
+
+static int uncore_cpu_prepare(int cpu)
+{
+       struct intel_uncore_type *type, **types = uncore_msr_uncores;
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(cpu);
+       for (; *types; types++) {
+               type = *types;
+               pmu = type->pmus;
+               for (i = 0; i < type->num_boxes; i++, pmu++) {
+                       if (pmu->boxes[pkg])
+                               continue;
+                       /* First cpu of a package allocates the box */
+                       box = uncore_alloc_box(type, cpu_to_node(cpu));
+                       if (!box)
+                               return -ENOMEM;
+                       box->pmu = pmu;
+                       box->pkgid = pkg;
+                       pmu->boxes[pkg] = box;
+               }
+       }
+       return 0;
+}
+
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+                                  int new_cpu)
+{
+       struct intel_uncore_pmu *pmu = type->pmus;
+       struct intel_uncore_box *box;
+       int i, pkg;
+
+       pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
+       for (i = 0; i < type->num_boxes; i++, pmu++) {
+               box = pmu->boxes[pkg];
+               if (!box)
+                       continue;
+
+               if (old_cpu < 0) {
+                       WARN_ON_ONCE(box->cpu != -1);
+                       box->cpu = new_cpu;
+                       continue;
+               }
+
+               WARN_ON_ONCE(box->cpu != old_cpu);
+               box->cpu = -1;
+               if (new_cpu < 0)
+                       continue;
+
+               uncore_pmu_cancel_hrtimer(box);
+               perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+               box->cpu = new_cpu;
+       }
+}
+
+static void uncore_change_context(struct intel_uncore_type **uncores,
+                                 int old_cpu, int new_cpu)
+{
+       for (; *uncores; uncores++)
+               uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
+static void uncore_event_exit_cpu(int cpu)
+{
+       int target;
+
+       /* Check if exiting cpu is used for collecting uncore events */
+       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+               return;
+
+       /* Find a new cpu to collect uncore events */
+       target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+       /* Migrate uncore events to the new target */
+       if (target < nr_cpu_ids)
+               cpumask_set_cpu(target, &uncore_cpu_mask);
+       else
+               target = -1;
+
+       uncore_change_context(uncore_msr_uncores, cpu, target);
+       uncore_change_context(uncore_pci_uncores, cpu, target);
+}
+
+static void uncore_event_init_cpu(int cpu)
+{
+       int target;
+
+       /*
+        * Check if there is an online cpu in the package
+        * which collects uncore events already.
+        */
+       target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
+       if (target < nr_cpu_ids)
+               return;
+
+       cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+       uncore_change_context(uncore_msr_uncores, -1, cpu);
+       uncore_change_context(uncore_pci_uncores, -1, cpu);
+}
+
+static int uncore_cpu_notifier(struct notifier_block *self,
+                              unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               return notifier_from_errno(uncore_cpu_prepare(cpu));
+
+       case CPU_STARTING:
+               uncore_cpu_starting(cpu, false);
+       case CPU_DOWN_FAILED:
+               uncore_event_init_cpu(cpu);
+               break;
+
+       case CPU_UP_CANCELED:
+       case CPU_DYING:
+               uncore_cpu_dying(cpu);
+               break;
+
+       case CPU_DOWN_PREPARE:
+               uncore_event_exit_cpu(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_cpu_nb = {
+       .notifier_call  = uncore_cpu_notifier,
+       /*
+        * to migrate uncore events, our notifier should be executed
+        * before perf core's notifier.
+        */
+       .priority       = CPU_PRI_PERF + 1,
+};
+
+static int __init type_pmu_register(struct intel_uncore_type *type)
+{
+       int i, ret;
+
+       for (i = 0; i < type->num_boxes; i++) {
+               ret = uncore_pmu_register(&type->pmus[i]);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+       struct intel_uncore_type **types = uncore_msr_uncores;
+       int ret;
+
+       for (; *types; types++) {
+               ret = type_pmu_register(*types);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int __init uncore_cpu_init(void)
+{
+       int ret;
+
+       switch (boot_cpu_data.x86_model) {
+       case 26: /* Nehalem */
+       case 30:
+       case 37: /* Westmere */
+       case 44:
+               nhm_uncore_cpu_init();
+               break;
+       case 42: /* Sandy Bridge */
+       case 58: /* Ivy Bridge */
+       case 60: /* Haswell */
+       case 69: /* Haswell */
+       case 70: /* Haswell */
+       case 61: /* Broadwell */
+       case 71: /* Broadwell */
+               snb_uncore_cpu_init();
+               break;
+       case 45: /* Sandy Bridge-EP */
+               snbep_uncore_cpu_init();
+               break;
+       case 46: /* Nehalem-EX */
+       case 47: /* Westmere-EX aka. Xeon E7 */
+               nhmex_uncore_cpu_init();
+               break;
+       case 62: /* Ivy Bridge-EP */
+               ivbep_uncore_cpu_init();
+               break;
+       case 63: /* Haswell-EP */
+               hswep_uncore_cpu_init();
+               break;
+       case 79: /* BDX-EP */
+       case 86: /* BDX-DE */
+               bdx_uncore_cpu_init();
+               break;
+       case 87: /* Knights Landing */
+               knl_uncore_cpu_init();
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       ret = uncore_types_init(uncore_msr_uncores, true);
+       if (ret)
+               goto err;
+
+       ret = uncore_msr_pmus_register();
+       if (ret)
+               goto err;
+       return 0;
+err:
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_msr_uncores = empty_uncore;
+       return ret;
+}
+
+static void __init uncore_cpu_setup(void *dummy)
+{
+       uncore_cpu_starting(smp_processor_id(), true);
+}
+
+/* Lazy to avoid allocation of a few bytes for the normal case */
+static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC);
+
+static int __init uncore_cpumask_init(bool msr)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               unsigned int pkg = topology_logical_package_id(cpu);
+               int ret;
+
+               if (test_and_set_bit(pkg, packages))
+                       continue;
+               /*
+                * The first online cpu of each package allocates and takes
+                * the refcounts for all other online cpus in that package.
+                * If msrs are not enabled no allocation is required.
+                */
+               if (msr) {
+                       ret = uncore_cpu_prepare(cpu);
+                       if (ret)
+                               return ret;
+               }
+               uncore_event_init_cpu(cpu);
+               smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
+       }
+       __register_cpu_notifier(&uncore_cpu_nb);
+       return 0;
+}
+
+static int __init intel_uncore_init(void)
+{
+       int pret, cret, ret;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return -ENODEV;
+
+       if (cpu_has_hypervisor)
+               return -ENODEV;
+
+       max_packages = topology_max_packages();
+
+       pret = uncore_pci_init();
+       cret = uncore_cpu_init();
+
+       if (cret && pret)
+               return -ENODEV;
+
+       cpu_notifier_register_begin();
+       ret = uncore_cpumask_init(!cret);
+       if (ret)
+               goto err;
+       cpu_notifier_register_done();
+       return 0;
+
+err:
+       /* Undo box->init_box() */
+       on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1);
+       uncore_types_exit(uncore_msr_uncores);
+       uncore_pci_exit();
+       cpu_notifier_register_done();
+       return ret;
+}
+device_initcall(intel_uncore_init);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
new file mode 100644 (file)
index 0000000..79766b9
--- /dev/null
@@ -0,0 +1,378 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <asm/apicdef.h>
+
+#include <linux/perf_event.h>
+#include "../perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN            32
+#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT             0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC     8
+#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
+
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV           0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX       3
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+struct pci_extra_dev {
+       struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+
+struct intel_uncore_type {
+       const char *name;
+       int num_counters;
+       int num_boxes;
+       int perf_ctr_bits;
+       int fixed_ctr_bits;
+       unsigned perf_ctr;
+       unsigned event_ctl;
+       unsigned event_mask;
+       unsigned fixed_ctr;
+       unsigned fixed_ctl;
+       unsigned box_ctl;
+       unsigned msr_offset;
+       unsigned num_shared_regs:8;
+       unsigned single_fixed:1;
+       unsigned pair_ctr_ctl:1;
+       unsigned *msr_offsets;
+       struct event_constraint unconstrainted;
+       struct event_constraint *constraints;
+       struct intel_uncore_pmu *pmus;
+       struct intel_uncore_ops *ops;
+       struct uncore_event_desc *event_descs;
+       const struct attribute_group *attr_groups[4];
+       struct pmu *pmu; /* for custom pmu ops */
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+       void (*init_box)(struct intel_uncore_box *);
+       void (*exit_box)(struct intel_uncore_box *);
+       void (*disable_box)(struct intel_uncore_box *);
+       void (*enable_box)(struct intel_uncore_box *);
+       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+                                                  struct perf_event *);
+       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+       struct pmu                      pmu;
+       char                            name[UNCORE_PMU_NAME_LEN];
+       int                             pmu_idx;
+       int                             func_id;
+       bool                            registered;
+       atomic_t                        activeboxes;
+       struct intel_uncore_type        *type;
+       struct intel_uncore_box         **boxes;
+};
+
+struct intel_uncore_extra_reg {
+       raw_spinlock_t lock;
+       u64 config, config1, config2;
+       atomic_t ref;
+};
+
+struct intel_uncore_box {
+       int pci_phys_id;
+       int pkgid;
+       int n_active;   /* number of active events */
+       int n_events;
+       int cpu;        /* cpu to collect events */
+       unsigned long flags;
+       atomic_t refcnt;
+       struct perf_event *events[UNCORE_PMC_IDX_MAX];
+       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
+       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+       u64 tags[UNCORE_PMC_IDX_MAX];
+       struct pci_dev *pci_dev;
+       struct intel_uncore_pmu *pmu;
+       u64 hrtimer_duration; /* hrtimer timeout for this box */
+       struct hrtimer hrtimer;
+       struct list_head list;
+       struct list_head active_list;
+       void *io_addr;
+       struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED      0
+
+struct uncore_event_desc {
+       struct kobj_attribute attr;
+       const char *config;
+};
+
+struct pci2phy_map {
+       struct list_head list;
+       int segment;
+       int pbus_to_physid[256];
+};
+
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
+ssize_t uncore_event_show(struct kobject *kobj,
+                         struct kobj_attribute *attr, char *buf);
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
+{                                                              \
+       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
+       .config = _config,                                      \
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
+static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
+                               struct kobj_attribute *attr,            \
+                               char *page)                             \
+{                                                                      \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
+       return sprintf(page, _format "\n");                             \
+}                                                                      \
+static struct kobj_attribute format_attr_##_var =                      \
+       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+       struct intel_uncore_pmu *pmu = box->pmu;
+       return pmu->type->msr_offsets ?
+               pmu->type->msr_offsets[pmu->pmu_idx] :
+               pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->box_ctl)
+               return 0;
+       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (!box->pmu->type->fixed_ctl)
+               return 0;
+       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->event_ctl +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       return box->pmu->type->perf_ctr +
+               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+               uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctl(box);
+       else
+               return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+       if (box->pci_dev)
+               return uncore_pci_fixed_ctr(box);
+       else
+               return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_event_ctl(box, idx);
+       else
+               return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+       if (box->pci_dev)
+               return uncore_pci_perf_ctr(box, idx);
+       else
+               return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+       return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+       return box->pmu->type->num_counters;
+}
+
+static inline void uncore_disable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->disable_box)
+               box->pmu->type->ops->disable_box(box);
+}
+
+static inline void uncore_enable_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->type->ops->enable_box)
+               box->pmu->type->ops->enable_box(box);
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+                               struct perf_event *event)
+{
+       return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->init_box)
+                       box->pmu->type->ops->init_box(box);
+       }
+}
+
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+       if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->exit_box)
+                       box->pmu->type->ops->exit_box(box);
+       }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+       return (box->pkgid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+       return event->pmu_private;
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
+extern struct pci_extra_dev *uncore_extra_pci_dev;
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
new file mode 100644 (file)
index 0000000..cda5693
--- /dev/null
@@ -0,0 +1,1227 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
+#define NHMEX_PMON_CTL_INVERT          (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
+                                        NHMEX_PMON_CTL_UMASK_MASK | \
+                                        NHMEX_PMON_CTL_EDGE_DET | \
+                                        NHMEX_PMON_CTL_INVERT | \
+                                        NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define NHMEX_U_MSR_PMON_CTR                   0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK            \
+               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
+                NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
+#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
+#define NHMEX_C_MSR_OFFSET                     0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
+#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
+#define NHMEX_B_MSR_OFFSET                     0x40
+#define NHMEX_B0_MSR_MATCH                     0xe45
+#define NHMEX_B0_MSR_MASK                      0xe46
+#define NHMEX_B1_MSR_MATCH                     0xe4d
+#define NHMEX_B1_MSR_MASK                      0xe4e
+
+#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT         6
+#define NHMEX_B_PMON_CTR_MASK          \
+               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK            \
+               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+                NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
+#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
+#define NHMEX_S_MSR_OFFSET                     0x80
+#define NHMEX_S0_MSR_MM_CFG                    0xe48
+#define NHMEX_S0_MSR_MATCH                     0xe49
+#define NHMEX_S0_MSR_MASK                      0xe4a
+#define NHMEX_S1_MSR_MM_CFG                    0xe58
+#define NHMEX_S1_MSR_MATCH                     0xe59
+#define NHMEX_S1_MSR_MASK                      0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV             0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
+#define NHMEX_M0_MSR_PMU_DSP                   0xca5
+#define NHMEX_M0_MSR_PMU_ISS                   0xca6
+#define NHMEX_M0_MSR_PMU_MAP                   0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
+#define NHMEX_M0_MSR_PMU_PGT                   0xca9
+#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
+#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
+#define NHMEX_M_MSR_OFFSET                     0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
+
+#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
+       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
+       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
+       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
+       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
+               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
+                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
+                NHMEX_M_PMON_CTL_WRAP_MODE |           \
+                NHMEX_M_PMON_CTL_FLAG_MODE |           \
+                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
+                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * use the 9~13 bits to select event If the 7th bit is not set,
+ * otherwise use the 19~21 bits to select event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+                          NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+                               NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+                               MBOX_SET_FLAG_SEL_MASK, \
+                               (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
+#define NHMEX_R_MSR_PMON_CTL0                  0xe10
+#define NHMEX_R_MSR_PMON_CNT0                  0xe11
+#define NHMEX_R_MSR_OFFSET                     0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
+               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
+               (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
+               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
+               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
+               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
+               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
+#define NHMEX_W_MSR_PMON_CNT0                  0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~((1ULL << uncore_num_counters(box)) - 1);
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       u64 config;
+
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= (1ULL << uncore_num_counters(box)) - 1;
+               /* WBox has a fixed counter */
+               if (uncore_msr_fixed_ctl(box))
+                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+               wrmsrl(msr, config);
+       }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+       else
+               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
+       .init_box       = nhmex_uncore_msr_init_box,            \
+       .exit_box       = nhmex_uncore_msr_exit_box,            \
+       .disable_box    = nhmex_uncore_msr_disable_box,         \
+       .enable_box     = nhmex_uncore_msr_enable_box,          \
+       .disable_event  = nhmex_uncore_msr_disable_event,       \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_edge.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 1,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
+       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
+       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
+       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+       .ops            = &nhmex_uncore_ops,
+       .format_group   = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 6,
+       .num_boxes              = 10,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
+       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+       .msr_offsets            = nhmex_cbox_msr_offsets,
+       .pair_ctr_ctl           = 1,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+       .name                   = "wbox",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
+       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
+       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
+       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
+       .pair_ctr_ctl           = 1,
+       .event_descs            = nhmex_uncore_wbox_events,
+       .ops                    = &nhmex_uncore_ops,
+       .format_group           = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int ctr, ev_sel;
+
+       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+               NHMEX_B_PMON_CTR_SHIFT;
+       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+       /* events that do not use the match/mask registers */
+       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_B0_MSR_MATCH;
+       else
+               reg1->reg = NHMEX_B1_MSR_MATCH;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, reg1->config);
+               wrmsrl(reg1->reg + 1, reg2->config);
+       }
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+       EVENT_CONSTRAINT(0 , 1, 0xc0),
+       EVENT_CONSTRAINT(0x40, 2, 0xc0),
+       EVENT_CONSTRAINT(0x80, 4, 0xc0),
+       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+       EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_counter.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_bbox_msr_enable_event,
+       .hw_config              = nhmex_bbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+       .name                   = "bbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_B_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .constraints            = nhmex_uncore_bbox_constraints,
+       .ops                    = &nhmex_uncore_bbox_ops,
+       .format_group           = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       /* only TO_R_PROG_EV event uses the match/mask register */
+       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+           NHMEX_S_EVENT_TO_R_PROG_EV)
+               return 0;
+
+       if (box->pmu->pmu_idx == 0)
+               reg1->reg = NHMEX_S0_MSR_MM_CFG;
+       else
+               reg1->reg = NHMEX_S1_MSR_MM_CFG;
+       reg1->idx = 0;
+       reg1->config = event->attr.config1;
+       reg2->config = event->attr.config2;
+       return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg1->reg, 0);
+               wrmsrl(reg1->reg + 1, reg1->config);
+               wrmsrl(reg1->reg + 2, reg2->config);
+               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+       }
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match.attr,
+       &format_attr_mask.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+       .name                   = "format",
+       .attrs                  = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_sbox_msr_enable_event,
+       .hw_config              = nhmex_sbox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
+       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+       .msr_offset             = NHMEX_S_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 1,
+       .ops                    = &nhmex_uncore_sbox_ops,
+       .format_group           = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+       EXTRA_REG_NHMEX_M_FILTER,
+       EXTRA_REG_NHMEX_M_DSP,
+       EXTRA_REG_NHMEX_M_ISS,
+       EXTRA_REG_NHMEX_M_MAP,
+       EXTRA_REG_NHMEX_M_MSC_THR,
+       EXTRA_REG_NHMEX_M_PGT,
+       EXTRA_REG_NHMEX_M_PLD,
+       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+       /* event 0xa uses two extra registers */
+       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+       /* events 0xd ~ 0x10 use the same extra register */
+       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+       EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       bool ret = false;
+       u64 mask;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               raw_spin_lock_irqsave(&er->lock, flags);
+               if (!atomic_read(&er->ref) || er->config == config) {
+                       atomic_inc(&er->ref);
+                       er->config = config;
+                       ret = true;
+               }
+               raw_spin_unlock_irqrestore(&er->lock, flags);
+
+               return ret;
+       }
+       /*
+        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+        * fields which are shared.
+        */
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (WARN_ON_ONCE(idx >= 4))
+               return false;
+
+       /* mask of the shared fields */
+       if (uncore_nhmex)
+               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       else
+               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       /* add mask of the non-shared field if it's in use */
+       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+               if (uncore_nhmex)
+                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       }
+
+       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               if (uncore_nhmex)
+                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               else
+                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+               er->config &= ~mask;
+               er->config |= (config & mask);
+               ret = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               er = &box->shared_regs[idx];
+               atomic_dec(&er->ref);
+               return;
+       }
+
+       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+       u64 config = reg1->config;
+
+       /* get the non-shared control bits and shift them */
+       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+       if (uncore_nhmex)
+               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       else
+               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+       if (new_idx > orig_idx) {
+               idx = new_idx - orig_idx;
+               config <<= 3 * idx;
+       } else {
+               idx = orig_idx - new_idx;
+               config >>= 3 * idx;
+       }
+
+       /* add the shared control bits back */
+       if (uncore_nhmex)
+               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       else
+               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+       if (modify) {
+               /* adjust the main event selector */
+               if (new_idx > orig_idx)
+                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               else
+                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+               reg1->config = config;
+               reg1->idx = ~0xff | new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int i, idx[2], alloc = 0;
+       u64 config1 = reg1->config;
+
+       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+       for (i = 0; i < 2; i++) {
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       idx[i] = 0xff;
+
+               if (idx[i] == 0xff)
+                       continue;
+
+               if (!nhmex_mbox_get_shared_reg(box, idx[i],
+                               __BITS_VALUE(config1, i, 32)))
+                       goto fail;
+               alloc |= (0x1 << i);
+       }
+
+       /* for the match/mask registers */
+       if (reg2->idx != EXTRA_REG_NONE &&
+           (uncore_box_is_fake(box) || !reg2->alloc) &&
+           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+               goto fail;
+
+       /*
+        * If it's a fake box -- as per validate_{group,event}() we
+        * shouldn't touch event state and we can avoid doing so
+        * since both will only call get_event_constraints() once
+        * on each event, this avoids the need for reg->alloc.
+        */
+       if (!uncore_box_is_fake(box)) {
+               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+                       nhmex_mbox_alter_er(event, idx[0], true);
+               reg1->alloc |= alloc;
+               if (reg2->idx != EXTRA_REG_NONE)
+                       reg2->alloc = 1;
+       }
+       return NULL;
+fail:
+       if (idx[0] != 0xff && !(alloc & 0x1) &&
+           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+               /*
+                * events 0xd ~ 0x10 are functional identical, but are
+                * controlled by different fields in the ZDP_CTL_FVC
+                * register. If we failed to take one field, try the
+                * rest 3 choices.
+                */
+               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               idx[0] = (idx[0] + 1) % 4;
+               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
+                       goto again;
+               }
+       }
+
+       if (alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, idx[0]);
+       if (alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, idx[1]);
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       if (reg1->alloc & 0x1)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+       if (reg1->alloc & 0x2)
+               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+       reg1->alloc = 0;
+
+       if (reg2->alloc) {
+               nhmex_mbox_put_shared_reg(box, reg2->idx);
+               reg2->alloc = 0;
+       }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return er->idx;
+       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_type *type = box->pmu->type;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       struct extra_reg *er;
+       unsigned msr;
+       int reg_idx = 0;
+       /*
+        * The mbox events may require 2 extra MSRs at the most. But only
+        * the lower 32 bits in these MSRs are significant, so we can use
+        * config1 to pass two MSRs' config.
+        */
+       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               if (event->attr.config1 & ~er->valid_mask)
+                       return -EINVAL;
+
+               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+                       return -EINVAL;
+
+               /* always use the 32~63 bits to pass the PLD config */
+               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+                       reg_idx = 1;
+               else if (WARN_ON_ONCE(reg_idx > 0))
+                       return -EINVAL;
+
+               reg1->idx &= ~(0xff << (reg_idx * 8));
+               reg1->reg &= ~(0xffff << (reg_idx * 16));
+               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+               reg1->reg |= msr << (reg_idx * 16);
+               reg1->config = event->attr.config1;
+               reg_idx++;
+       }
+       /*
+        * The mbox only provides ability to perform address matching
+        * for the PLD events.
+        */
+       if (reg_idx == 2) {
+               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+                       reg2->config = event->attr.config2;
+               else
+                       reg2->config = ~0ULL;
+               if (box->pmu->pmu_idx == 0)
+                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+               else
+                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+       }
+       return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       u64 config;
+
+       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+               return box->shared_regs[idx].config;
+
+       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       config = er->config;
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx;
+
+       idx = __BITS_VALUE(reg1->idx, 0, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+       idx = __BITS_VALUE(reg1->idx, 1, 8);
+       if (idx != 0xff)
+               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+                       nhmex_mbox_shared_reg_config(box, idx));
+
+       if (reg2->idx != EXTRA_REG_NONE) {
+               wrmsrl(reg2->reg, 0);
+               if (reg2->config != ~0ULL) {
+                       wrmsrl(reg2->reg + 1,
+                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+               }
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+       &format_attr_count_mode.attr,
+       &format_attr_storage_mode.attr,
+       &format_attr_wrap_mode.attr,
+       &format_attr_flag_mode.attr,
+       &format_attr_inc_sel.attr,
+       &format_attr_set_flag_sel.attr,
+       &format_attr_filter_cfg_en.attr,
+       &format_attr_filter_match.attr,
+       &format_attr_filter_mask.attr,
+       &format_attr_dsp.attr,
+       &format_attr_thr.attr,
+       &format_attr_fvc.attr,
+       &format_attr_pgt.attr,
+       &format_attr_map.attr,
+       &format_attr_iss.attr,
+       &format_attr_pld.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+       .name           = "format",
+       .attrs          = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event   = nhmex_mbox_msr_enable_event,
+       .hw_config      = nhmex_mbox_hw_config,
+       .get_constraint = nhmex_mbox_get_constraint,
+       .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+       .name                   = "mbox",
+       .num_counters           = 6,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
+       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
+       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_M_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 8,
+       .event_descs            = nhmex_uncore_mbox_events,
+       .ops                    = &nhmex_uncore_mbox_ops,
+       .format_group           = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       /* adjust the main event selector and extra register index */
+       if (reg1->idx % 2) {
+               reg1->idx--;
+               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       } else {
+               reg1->idx++;
+               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       }
+
+       /* adjust extra register config */
+       switch (reg1->idx % 6) {
+       case 2:
+               /* shift the 8~15 bits to the 0~7 bits */
+               reg1->config >>= 8;
+               break;
+       case 3:
+               /* shift the 0~7 bits to the 8~15 bits */
+               reg1->config <<= 8;
+               break;
+       }
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       struct intel_uncore_extra_reg *er;
+       unsigned long flags;
+       int idx, er_idx;
+       u64 config1;
+       bool ok = false;
+
+       if (!uncore_box_is_fake(box) && reg1->alloc)
+               return NULL;
+
+       idx = reg1->idx % 6;
+       config1 = reg1->config;
+again:
+       er_idx = idx;
+       /* the 3rd and 4th events use the same extra register */
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (idx < 2) {
+               if (!atomic_read(&er->ref) || er->config == reg1->config) {
+                       atomic_inc(&er->ref);
+                       er->config = reg1->config;
+                       ok = true;
+               }
+       } else if (idx == 2 || idx == 3) {
+               /*
+                * these two events use different fields in a extra register,
+                * the 0~7 bits and the 8~15 bits respectively.
+                */
+               u64 mask = 0xff << ((idx - 2) * 8);
+               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+                               !((er->config ^ config1) & mask)) {
+                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= config1 & mask;
+                       ok = true;
+               }
+       } else {
+               if (!atomic_read(&er->ref) ||
+                               (er->config == (hwc->config >> 32) &&
+                                er->config1 == reg1->config &&
+                                er->config2 == reg2->config)) {
+                       atomic_inc(&er->ref);
+                       er->config = (hwc->config >> 32);
+                       er->config1 = reg1->config;
+                       er->config2 = reg2->config;
+                       ok = true;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               /*
+                * The Rbox events are always in pairs. The paired
+                * events are functional identical, but use different
+                * extra registers. If we failed to take an extra
+                * register, try the alternative.
+                */
+               idx ^= 1;
+               if (idx != reg1->idx % 6) {
+                       if (idx == 2)
+                               config1 >>= 8;
+                       else if (idx == 3)
+                               config1 <<= 8;
+                       goto again;
+               }
+       } else {
+               if (!uncore_box_is_fake(box)) {
+                       if (idx != reg1->idx % 6)
+                               nhmex_rbox_alter_er(box, event);
+                       reg1->alloc = 1;
+               }
+               return NULL;
+       }
+       return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct intel_uncore_extra_reg *er;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       int idx, er_idx;
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       idx = reg1->idx % 6;
+       er_idx = idx;
+       if (er_idx > 2)
+               er_idx--;
+       er_idx += (reg1->idx / 6) * 5;
+
+       er = &box->shared_regs[er_idx];
+       if (idx == 2 || idx == 3)
+               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+       else
+               atomic_dec(&er->ref);
+
+       reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+       int idx;
+
+       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+       if (idx >= 0x18)
+               return -EINVAL;
+
+       reg1->idx = idx;
+       reg1->config = event->attr.config1;
+
+       switch (idx % 6) {
+       case 4:
+       case 5:
+               hwc->config |= event->attr.config & (~0ULL << 32);
+               reg2->config = event->attr.config2;
+               break;
+       }
+       return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+       int idx, port;
+
+       idx = reg1->idx;
+       port = idx / 6 + box->pmu->pmu_idx * 4;
+
+       switch (idx % 6) {
+       case 0:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+               break;
+       case 1:
+               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+               break;
+       case 2:
+       case 3:
+               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+               break;
+       case 4:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+               break;
+       case 5:
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+                       hwc->config >> 32);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+               break;
+       }
+
+       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+       &format_attr_event5.attr,
+       &format_attr_xbr_mm_cfg.attr,
+       &format_attr_xbr_match.attr,
+       &format_attr_xbr_mask.attr,
+       &format_attr_qlx_cfg.attr,
+       &format_attr_iperf_cfg.attr,
+       NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+       .name = "format",
+       .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
+       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
+       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+       NHMEX_UNCORE_OPS_COMMON_INIT(),
+       .enable_event           = nhmex_rbox_msr_enable_event,
+       .hw_config              = nhmex_rbox_hw_config,
+       .get_constraint         = nhmex_rbox_get_constraint,
+       .put_constraint         = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+       .name                   = "rbox",
+       .num_counters           = 8,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
+       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
+       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
+       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
+       .msr_offset             = NHMEX_R_MSR_OFFSET,
+       .pair_ctr_ctl           = 1,
+       .num_shared_regs        = 20,
+       .event_descs            = nhmex_uncore_rbox_events,
+       .ops                    = &nhmex_uncore_rbox_ops,
+       .format_group           = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+       &nhmex_uncore_ubox,
+       &nhmex_uncore_cbox,
+       &nhmex_uncore_bbox,
+       &nhmex_uncore_sbox,
+       &nhmex_uncore_mbox,
+       &nhmex_uncore_rbox,
+       &nhmex_uncore_wbox,
+       NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+       if (boot_cpu_data.x86_model == 46)
+               uncore_nhmex = true;
+       else
+               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
new file mode 100644 (file)
index 0000000..96531d2
--- /dev/null
@@ -0,0 +1,731 @@
+/* Nehalem/SandBridge/Haswell uncore support */
+#include "uncore.h"
+
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_IMC    0x191f
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
+#define SNB_UNC_CTL_EN                         (1 << 22)
+#define SNB_UNC_CTL_INVERT                     (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
+                                                SNB_UNC_CTL_UMASK_MASK | \
+                                                SNB_UNC_CTL_EDGE_DET | \
+                                                SNB_UNC_CTL_INVERT | \
+                                                NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
+#define SNB_UNC_FIXED_CTR_CTRL                  0x394
+#define SNB_UNC_FIXED_CTR                       0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
+#define SNB_UNC_CBO_0_PER_CTR0                  0x706
+#define SNB_UNC_CBO_MSR_OFFSET                  0x10
+
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0                   0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET                 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
+#define NHM_UNC_FIXED_CTR                       0x394
+#define NHM_UNC_FIXED_CTR_CTRL                  0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0                     0x3c0
+#define NHM_UNC_UNCORE_PMC0                     0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0) {
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+       }
+}
+
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+       if (box->pmu->pmu_idx == 0)
+               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask5.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+       .name           = "format",
+       .attrs          = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+       .init_box       = snb_uncore_msr_init_box,
+       .exit_box       = snb_uncore_msr_exit_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = snb_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_arb_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+       .name           = "cbox",
+       .num_counters   = 2,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
+       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
+       .fixed_ctr      = SNB_UNC_FIXED_CTR,
+       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
+       .single_fixed   = 1,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+       .event_descs    = snb_uncore_events,
+};
+
+static struct intel_uncore_type snb_uncore_arb = {
+       .name           = "arb",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
+       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
+       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
+       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
+       .constraints    = snb_uncore_arb_constraints,
+       .ops            = &snb_uncore_msr_ops,
+       .format_group   = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+       &snb_uncore_cbox,
+       &snb_uncore_arb,
+       NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = snb_msr_uncores;
+       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+enum {
+       SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+       { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+       .name = "format",
+       .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+       resource_size_t addr;
+       u32 pci_dword;
+
+       pci_read_config_dword(pdev, where, &pci_dword);
+       addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+       pci_read_config_dword(pdev, where + 4, &pci_dword);
+       addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+       addr &= ~(PAGE_SIZE - 1);
+
+       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_exit_box(struct intel_uncore_box *box)
+{
+       iounmap(box->io_addr);
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+       struct intel_uncore_pmu *pmu;
+       struct intel_uncore_box *box;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+       int idx, base;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       pmu = uncore_event_to_pmu(event);
+       /* no device found for this pmu */
+       if (pmu->func_id < 0)
+               return -ENOENT;
+
+       /* Sampling not supported yet */
+       if (hwc->sample_period)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       /*
+        * Place all uncore events for a particular physical package
+        * onto a single cpu
+        */
+       if (event->cpu < 0)
+               return -EINVAL;
+
+       /* check only supported bits are set */
+       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+               return -EINVAL;
+
+       box = uncore_pmu_to_box(pmu, event->cpu);
+       if (!box || box->cpu < 0)
+               return -EINVAL;
+
+       event->cpu = box->cpu;
+       event->pmu_private = box;
+
+       event->hw.idx = -1;
+       event->hw.last_tag = ~0ULL;
+       event->hw.extra_reg.idx = EXTRA_REG_NONE;
+       event->hw.branch_reg.idx = EXTRA_REG_NONE;
+       /*
+        * check event is known (whitelist, determines counter)
+        */
+       switch (cfg) {
+       case SNB_UNCORE_PCI_IMC_DATA_READS:
+               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+               idx = UNCORE_PMC_IDX_FIXED;
+               break;
+       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+               idx = UNCORE_PMC_IDX_FIXED + 1;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* must be done before validate_group */
+       event->hw.event_base = base;
+       event->hw.config = cfg;
+       event->hw.idx = idx;
+
+       /* no group validation needed, we have free running counters */
+
+       return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       u64 count;
+
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       event->hw.state = 0;
+       box->n_active++;
+
+       list_add_tail(&event->active_entry, &box->active_list);
+
+       count = snb_uncore_imc_read_counter(box, event);
+       local64_set(&event->hw.prev_count, count);
+
+       if (box->n_active == 1)
+               uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               box->n_active--;
+
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+
+               list_del(&event->active_entry);
+
+               if (box->n_active == 0)
+                       uncore_pmu_cancel_hrtimer(box);
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               uncore_perf_event_update(box, event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (!box)
+               return -ENODEV;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
+
+       snb_uncore_imc_event_start(event, 0);
+
+       box->n_events++;
+
+       return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+       struct intel_uncore_box *box = uncore_event_to_box(event);
+       int i;
+
+       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+       for (i = 0; i < box->n_events; i++) {
+               if (event == box->event_list[i]) {
+                       --box->n_events;
+                       break;
+               }
+       }
+}
+
+int snb_pci2phy_map_init(int devid)
+{
+       struct pci_dev *dev = NULL;
+       struct pci2phy_map *map;
+       int bus, segment;
+
+       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+       if (!dev)
+               return -ENOTTY;
+
+       bus = dev->bus->number;
+       segment = pci_domain_nr(dev->bus);
+
+       raw_spin_lock(&pci2phy_map_lock);
+       map = __find_pci2phy_map(segment);
+       if (!map) {
+               raw_spin_unlock(&pci2phy_map_lock);
+               pci_dev_put(dev);
+               return -ENOMEM;
+       }
+       map->pbus_to_physid[bus] = 0;
+       raw_spin_unlock(&pci2phy_map_lock);
+
+       pci_dev_put(dev);
+
+       return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+       .task_ctx_nr    = perf_invalid_context,
+       .event_init     = snb_uncore_imc_event_init,
+       .add            = snb_uncore_imc_event_add,
+       .del            = snb_uncore_imc_event_del,
+       .start          = snb_uncore_imc_event_start,
+       .stop           = snb_uncore_imc_event_stop,
+       .read           = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+       .init_box       = snb_uncore_imc_init_box,
+       .exit_box       = snb_uncore_imc_exit_box,
+       .enable_box     = snb_uncore_imc_enable_box,
+       .disable_box    = snb_uncore_imc_disable_box,
+       .disable_event  = snb_uncore_imc_disable_event,
+       .enable_event   = snb_uncore_imc_enable_event,
+       .hw_config      = snb_uncore_imc_hw_config,
+       .read_counter   = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .fixed_ctr_bits = 32,
+       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
+       .event_descs    = snb_uncore_imc_events,
+       .format_group   = &snb_uncore_imc_format_group,
+       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+       .ops            = &snb_uncore_imc_ops,
+       .pmu            = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
+       NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+       .name           = "snb_uncore",
+       .id_table       = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+       .name           = "ivb_uncore",
+       .id_table       = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+       .name           = "hsw_uncore",
+       .id_table       = hsw_uncore_pci_ids,
+};
+
+static struct pci_driver bdw_uncore_pci_driver = {
+       .name           = "bdw_uncore",
+       .id_table       = bdw_uncore_pci_ids,
+};
+
+static struct pci_driver skl_uncore_pci_driver = {
+       .name           = "skl_uncore",
+       .id_table       = skl_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+       __u32 pci_id;
+       struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
+       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
+       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
+       IMC_DEV(SKL_IMC, &skl_uncore_pci_driver),    /* 6th Gen Core */
+       {  /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+       for (x = (t); (x)->pci_id; x++)
+
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+       const struct imc_uncore_pci_dev *p;
+       int ret;
+
+       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
+               ret = snb_pci2phy_map_init(p->pci_id);
+               if (ret == 0)
+                       return p->driver;
+       }
+       return NULL;
+}
+
+static int imc_uncore_pci_init(void)
+{
+       struct pci_driver *imc_drv = imc_uncore_find_dev();
+
+       if (!imc_drv)
+               return -ENODEV;
+
+       uncore_pci_uncores = snb_pci_uncores;
+       uncore_pci_driver = imc_drv;
+
+       return 0;
+}
+
+int snb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+int hsw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int bdw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+int skl_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+       else
+               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_cmask8.attr,
+       NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+       .name = "format",
+       .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
+       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+       .disable_box    = nhm_uncore_msr_disable_box,
+       .enable_box     = nhm_uncore_msr_enable_box,
+       .disable_event  = snb_uncore_msr_disable_event,
+       .enable_event   = nhm_uncore_msr_enable_event,
+       .read_counter   = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+       .name           = "",
+       .num_counters   = 8,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .event_ctl      = NHM_UNC_PERFEVTSEL0,
+       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
+       .fixed_ctr      = NHM_UNC_FIXED_CTR,
+       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
+       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
+       .event_descs    = nhm_uncore_events,
+       .ops            = &nhm_uncore_msr_ops,
+       .format_group   = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+       &nhm_uncore,
+       NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
new file mode 100644 (file)
index 0000000..93f6bd9
--- /dev/null
@@ -0,0 +1,3135 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "uncore.h"
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
+                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
+#define SNBEP_PMON_CTL_RST             (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
+#define SNBEP_PMON_CTL_EN              (1 << 22)
+#define SNBEP_PMON_CTL_INVERT          (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_INVERT | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
+#define SNBEP_PCI_PMON_CTL0                    0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0                    0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0                  0xc16
+#define SNBEP_U_MSR_PMON_CTL0                  0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
+#define SNBEP_CBO_MSR_OFFSET                   0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
+       .event = (e),                           \
+       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
+       .config_mask = (m),                     \
+       .idx = (i)                              \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                        SNBEP_PMON_CTL_UMASK_MASK | \
+                                        SNBEP_PMON_CTL_EDGE_DET | \
+                                        SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_UMASK_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
+                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
+                               (IVBEP_PMON_RAW_EVENT_MASK | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+                               ((1ULL << (n)) - 1)))
+
+/* Haswell-EP Ubox */
+#define HSWEP_U_MSR_PMON_CTR0                  0x709
+#define HSWEP_U_MSR_PMON_CTL0                  0x705
+#define HSWEP_U_MSR_PMON_FILTER                        0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
+#define HSWEP_CBO_MSR_OFFSET                   0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0                 0x726
+#define HSWEP_S0_MSR_PMON_CTL0                 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
+#define HSWEP_SBOX_MSR_OFFSET                  0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
+#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
+
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+                                               SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET                     0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+                                        KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
+#define KNL_UCLK_MSR_PMON_CTL0                 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
+#define KNL_PMON_FIXED_CTL_EN                  0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
+                                                KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
+                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+                                SNBEP_PMON_CTL_EDGE_DET | \
+                                SNBEP_CBO_PMON_CTL_TID_EN | \
+                                SNBEP_PMON_CTL_EV_SEL_EXT | \
+                                SNBEP_PMON_CTL_INVERT | \
+                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+       u32 config = 0;
+
+       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               pci_write_config_dword(pdev, box_ctl, config);
+       }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config |= SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+       u64 config;
+       unsigned msr;
+
+       msr = uncore_msr_box_ctl(box);
+       if (msr) {
+               rdmsrl(msr, config);
+               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+               wrmsrl(msr, config);
+       }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE)
+               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr)
+               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_nid.attr,
+       &format_attr_filter_state.attr,
+       &format_attr_filter_opc.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
+       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
+       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
+       { /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
+       .init_box       = snbep_uncore_msr_init_box             \
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
+       .init_box       = snbep_uncore_pci_init_box,            \
+       .disable_box    = snbep_uncore_pci_disable_box,         \
+       .enable_box     = snbep_uncore_pci_enable_box,          \
+       .disable_event  = snbep_uncore_pci_disable_event,       \
+       .read_counter   = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event   = snbep_uncore_pci_enable_event,        \
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &snbep_uncore_msr_ops,
+       .format_group   = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+       EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i;
+
+       if (uncore_box_is_fake(box))
+               return;
+
+       for (i = 0; i < 5; i++) {
+               if (reg1->alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+                           u64 (*cbox_filter_mask)(int fields))
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       int i, alloc = 0;
+       unsigned long flags;
+       u64 mask;
+
+       if (reg1->idx == EXTRA_REG_NONE)
+               return NULL;
+
+       raw_spin_lock_irqsave(&er->lock, flags);
+       for (i = 0; i < 5; i++) {
+               if (!(reg1->idx & (0x1 << i)))
+                       continue;
+               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+                       continue;
+
+               mask = cbox_filter_mask(0x1 << i);
+               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+                   !((reg1->config ^ er->config) & mask)) {
+                       atomic_add(1 << (i * 6), &er->ref);
+                       er->config &= ~mask;
+                       er->config |= reg1->config & mask;
+                       alloc |= (0x1 << i);
+               } else {
+                       break;
+               }
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+       if (i < 5)
+               goto fail;
+
+       if (!uncore_box_is_fake(box))
+               reg1->alloc |= alloc;
+
+       return NULL;
+fail:
+       for (; i >= 0; i--) {
+               if (alloc & (0x1 << i))
+                       atomic_sub(1 << (i * 6), &er->ref);
+       }
+       return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x4)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+       return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_cbox_hw_config,
+       .get_constraint         = snbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &snbep_uncore_cbox_ops,
+       .format_group           = &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       u64 config = reg1->config;
+
+       if (new_idx > reg1->idx)
+               config <<= 8 * (new_idx - reg1->idx);
+       else
+               config >>= 8 * (reg1->idx - new_idx);
+
+       if (modify) {
+               hwc->config += new_idx - reg1->idx;
+               reg1->config = config;
+               reg1->idx = new_idx;
+       }
+       return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+       unsigned long flags;
+       int idx = reg1->idx;
+       u64 mask, config1 = reg1->config;
+       bool ok = false;
+
+       if (reg1->idx == EXTRA_REG_NONE ||
+           (!uncore_box_is_fake(box) && reg1->alloc))
+               return NULL;
+again:
+       mask = 0xffULL << (idx * 8);
+       raw_spin_lock_irqsave(&er->lock, flags);
+       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+           !((config1 ^ er->config) & mask)) {
+               atomic_add(1 << (idx * 8), &er->ref);
+               er->config &= ~mask;
+               er->config |= config1 & mask;
+               ok = true;
+       }
+       raw_spin_unlock_irqrestore(&er->lock, flags);
+
+       if (!ok) {
+               idx = (idx + 1) % 4;
+               if (idx != reg1->idx) {
+                       config1 = snbep_pcu_alter_er(event, idx, false);
+                       goto again;
+               }
+               return &uncore_constraint_empty;
+       }
+
+       if (!uncore_box_is_fake(box)) {
+               if (idx != reg1->idx)
+                       snbep_pcu_alter_er(event, idx, true);
+               reg1->alloc = 1;
+       }
+       return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+       if (uncore_box_is_fake(box) || !reg1->alloc)
+               return;
+
+       atomic_sub(1 << (reg1->idx * 8), &er->ref);
+       reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+       &snbep_uncore_ubox,
+       &snbep_uncore_cbox,
+       &snbep_uncore_pcu,
+       NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+       SNBEP_PCI_QPI_PORT0_FILTER,
+       SNBEP_PCI_QPI_PORT1_FILTER,
+       HSWEP_PCI_PCU_3,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+               reg1->idx = 0;
+               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+               reg1->config = event->attr.config1;
+               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+               reg2->config = event->attr.config2;
+       }
+       return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+               int pkg = topology_phys_to_logical_pkg(box->pci_phys_id);
+               struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
+
+               if (filter_pdev) {
+                       pci_write_config_dword(filter_pdev, reg1->reg,
+                                               (u32)reg1->config);
+                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
+                                               (u32)(reg1->config >> 32));
+                       pci_write_config_dword(filter_pdev, reg2->reg,
+                                               (u32)reg2->config);
+                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
+                                               (u32)(reg2->config >> 32));
+               }
+       }
+
+       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+       .enable_event           = snbep_qpi_enable_event,
+       .hw_config              = snbep_qpi_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &snbep_uncore_pci_ops,                \
+       .format_group   = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 4,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .event_descs            = snbep_uncore_qpi_events,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       SNBEP_PCI_UNCORE_HA,
+       SNBEP_PCI_UNCORE_IMC,
+       SNBEP_PCI_UNCORE_QPI,
+       SNBEP_PCI_UNCORE_R2PCIE,
+       SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
+       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
+       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
+       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
+       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+       { /* Home Agent */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* MC Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* QPI Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+       .name           = "snbep_uncore",
+       .id_table       = snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+       struct pci_dev *ubox_dev = NULL;
+       int i, bus, nodeid, segment;
+       struct pci2phy_map *map;
+       int err = 0;
+       u32 config = 0;
+
+       while (1) {
+               /* find the UBOX device */
+               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+               if (!ubox_dev)
+                       break;
+               bus = ubox_dev->bus->number;
+               /* get the Node ID of the local register */
+               err = pci_read_config_dword(ubox_dev, 0x40, &config);
+               if (err)
+                       break;
+               nodeid = config;
+               /* get the Node ID mapping */
+               err = pci_read_config_dword(ubox_dev, 0x54, &config);
+               if (err)
+                       break;
+
+               segment = pci_domain_nr(ubox_dev->bus);
+               raw_spin_lock(&pci2phy_map_lock);
+               map = __find_pci2phy_map(segment);
+               if (!map) {
+                       raw_spin_unlock(&pci2phy_map_lock);
+                       err = -ENOMEM;
+                       break;
+               }
+
+               /*
+                * every three bits in the Node ID mapping register maps
+                * to a particular node.
+                */
+               for (i = 0; i < 8; i++) {
+                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
+                               map->pbus_to_physid[bus] = i;
+                               break;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       if (!err) {
+               /*
+                * For PCI bus with no UBOX device, find the next bus
+                * that has UBOX device and use its mapping.
+                */
+               raw_spin_lock(&pci2phy_map_lock);
+               list_for_each_entry(map, &pci2phy_map_head, list) {
+                       i = -1;
+                       for (bus = 255; bus >= 0; bus--) {
+                               if (map->pbus_to_physid[bus] >= 0)
+                                       i = map->pbus_to_physid[bus];
+                               else
+                                       map->pbus_to_physid[bus] = i;
+                       }
+               }
+               raw_spin_unlock(&pci2phy_map_lock);
+       }
+
+       pci_dev_put(ubox_dev);
+
+       return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x3ce0);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = snbep_pci_uncores;
+       uncore_pci_driver = &snbep_uncore_pci_driver;
+       return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+       if (msr)
+               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+
+       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
+       .init_box       = ivbep_uncore_msr_init_box,            \
+       .disable_box    = snbep_uncore_msr_disable_box,         \
+       .enable_box     = snbep_uncore_msr_enable_box,          \
+       .disable_event  = snbep_uncore_msr_disable_event,       \
+       .enable_event   = snbep_uncore_msr_enable_event,        \
+       .read_counter   = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_uncore_pci_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
+       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
+       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
+       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
+       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
+       .ops            = &ivbep_uncore_pci_ops,                        \
+       .format_group   = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid.attr,
+       &format_attr_filter_link.attr,
+       &format_attr_filter_state2.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge.attr,
+       &format_attr_filter_band0.attr,
+       &format_attr_filter_band1.attr,
+       &format_attr_filter_band2.attr,
+       &format_attr_filter_band3.attr,
+       NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+       &format_attr_event_ext.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_match_rds.attr,
+       &format_attr_match_rnid30.attr,
+       &format_attr_match_rnid4.attr,
+       &format_attr_match_dnid.attr,
+       &format_attr_match_mc.attr,
+       &format_attr_match_opc.attr,
+       &format_attr_match_vnw.attr,
+       &format_attr_match0.attr,
+       &format_attr_match1.attr,
+       &format_attr_mask_rds.attr,
+       &format_attr_mask_rnid30.attr,
+       &format_attr_mask_rnid4.attr,
+       &format_attr_mask_dnid.attr,
+       &format_attr_mask_mc.attr,
+       &format_attr_mask_opc.attr,
+       &format_attr_mask_vnw.attr,
+       &format_attr_mask0.attr,
+       &format_attr_mask1.attr,
+       NULL,
+};
+
+static struct attribute_group ivbep_uncore_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+       .name = "format",
+       .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+       .name           = "ubox",
+       .num_counters   = 2,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .fixed_ctr_bits = 48,
+       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
+       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
+       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops            = &ivbep_uncore_msr_ops,
+       .format_group   = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+
+       return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 6, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+       .init_box               = ivbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = ivbep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = ivbep_cbox_hw_config,
+       .get_constraint         = ivbep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 15,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
+       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = snbep_uncore_cbox_constraints,
+       .ops                    = &ivbep_uncore_cbox_ops,
+       .format_group           = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = snbep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_pcu_ops,
+       .format_group           = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+       &ivbep_uncore_ubox,
+       &ivbep_uncore_cbox,
+       &ivbep_uncore_pcu,
+       NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 4,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = snbep_uncore_imc_events,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+                              hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &ivbep_uncore_irp_ops,
+       .format_group           = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+       .init_box       = ivbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = snbep_uncore_pci_disable_event,
+       .enable_event   = snbep_qpi_enable_event,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .hw_config      = snbep_qpi_hw_config,
+       .get_constraint = uncore_get_constraint,
+       .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_qpi_ops,
+       .format_group           = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r2pcie_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 44,
+       .constraints    = snbep_uncore_r3qpi_constraints,
+       IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       IVBEP_PCI_UNCORE_HA,
+       IVBEP_PCI_UNCORE_IMC,
+       IVBEP_PCI_UNCORE_IRP,
+       IVBEP_PCI_UNCORE_QPI,
+       IVBEP_PCI_UNCORE_R2PCIE,
+       IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
+       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
+       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
+       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
+       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
+       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 4 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+       .name           = "ivbep_uncore",
+       .id_table       = ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x0e1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = ivbep_pci_uncores;
+       uncore_pci_driver = &ivbep_uncore_pci_driver;
+       return 0;
+}
+/* end of IvyTown uncore support */
+
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid4.attr,
+       &format_attr_filter_link3.attr,
+       &format_attr_filter_state4.attr,
+       &format_attr_filter_local.attr,
+       &format_attr_filter_all_op.attr,
+       &format_attr_filter_nnm.attr,
+       &format_attr_filter_opc3.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+       EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+       u64 mask = 0;
+
+       if (fields & 0x1)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x4)
+               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+       return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+                            struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                   struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = knl_cha_hw_config,
+       .get_constraint         = knl_cha_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+       .name                   = "cha",
+       .num_counters           = 4,
+       .num_boxes              = 38,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = KNL_CHA_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = knl_uncore_cha_constraints,
+       .ops                    = &knl_uncore_cha_ops,
+       .format_group           = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+       &format_attr_event2.attr,
+       &format_attr_use_occ_ctr.attr,
+       &format_attr_occ_sel.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh6.attr,
+       &format_attr_occ_invert.attr,
+       &format_attr_occ_edge_det.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_msr_ops,
+       .format_group           = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+       &knl_uncore_ubox,
+       &knl_uncore_cha,
+       &knl_uncore_pcu,
+       NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+       uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       int box_ctl = uncore_pci_box_ctl(box);
+
+       pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+                                       struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+
+       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+                                                       == UNCORE_FIXED_EVENT)
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
+       else
+               pci_write_config_dword(pdev, hwc->config_base,
+                                      hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = knl_uncore_imc_enable_box,
+       .read_counter   = snbep_uncore_pci_read_counter,
+       .enable_event   = knl_uncore_imc_enable_event,
+       .disable_event  = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+       .name                   = "imc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 2,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+       .name                   = "imc",
+       .num_counters           = 4,
+       .num_boxes              = 6,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+       .name                   = "edc_uclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+       .name                   = "edc_eclk",
+       .num_counters           = 4,
+       .num_boxes              = 8,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+       .ops                    = &knl_uncore_imc_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+       .name           = "m2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = knl_uncore_m2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_qor.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+       .name = "format",
+       .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
+       .ops                    = &snbep_uncore_pci_ops,
+       .format_group           = &knl_uncore_irp_format_group,
+};
+
+enum {
+       KNL_PCI_UNCORE_MC_UCLK,
+       KNL_PCI_UNCORE_MC_DCLK,
+       KNL_PCI_UNCORE_EDC_UCLK,
+       KNL_PCI_UNCORE_EDC_ECLK,
+       KNL_PCI_UNCORE_M2PCIE,
+       KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
+       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
+       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
+       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
+       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
+       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
+       NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ *     PCI Device ID   Uncore PMU Devices
+ *     ----------------------------------
+ *     0x7841          MC0 UClk, MC1 UClk
+ *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ *     0x7817          M2PCIe
+ *     0x7814          IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+       { /* MC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+       },
+       { /* MC DClk Channel */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+       },
+       { /* EDC UClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+       },
+       { /* EDC EClk */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+       },
+       { /* M2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+       .name           = "knl_uncore",
+       .id_table       = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+       int ret;
+
+       /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+       ret = snb_pci2phy_map_init(0x7814); /* IRP */
+       if (ret)
+               return ret;
+       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+       if (ret)
+               return ret;
+       uncore_pci_uncores = knl_pci_uncores;
+       uncore_pci_driver = &knl_uncore_pci_driver;
+       return 0;
+}
+
+/* end of KNL uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh5.attr,
+       &format_attr_filter_tid2.attr,
+       &format_attr_filter_cid.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_ubox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
+       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+       reg1->idx = 0;
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_ubox_hw_config,
+       .get_constraint         = uncore_get_constraint,
+       .put_constraint         = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 44,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_ubox_ops,
+       .format_group           = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_thresh8.attr,
+       &format_attr_filter_tid3.attr,
+       &format_attr_filter_link2.attr,
+       &format_attr_filter_state3.attr,
+       &format_attr_filter_nid2.attr,
+       &format_attr_filter_opc2.attr,
+       &format_attr_filter_nc.attr,
+       &format_attr_filter_c6.attr,
+       &format_attr_filter_isoc.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_cbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+       EVENT_EXTRA_END
+};
+
+static u64 hswep_cbox_filter_mask(int fields)
+{
+       u64 mask = 0;
+       if (fields & 0x1)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
+       if (fields & 0x2)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+       if (fields & 0x4)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+       if (fields & 0x8)
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
+       if (fields & 0x10) {
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
+               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+       }
+       return mask;
+}
+
+static struct event_constraint *
+hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+}
+
+static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+       struct extra_reg *er;
+       int idx = 0;
+
+       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
+               if (er->event != (event->hw.config & er->config_mask))
+                       continue;
+               idx |= er->idx;
+       }
+
+       if (idx) {
+               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
+               reg1->idx = idx;
+       }
+       return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+                                 struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+       if (reg1->idx != EXTRA_REG_NONE) {
+               u64 filter = uncore_shared_reg_config(box, 0);
+               wrmsrl(reg1->reg, filter & 0xffffffff);
+               wrmsrl(reg1->reg + 1, filter >> 32);
+       }
+
+       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+       .init_box               = snbep_uncore_msr_init_box,
+       .disable_box            = snbep_uncore_msr_disable_box,
+       .enable_box             = snbep_uncore_msr_enable_box,
+       .disable_event          = snbep_uncore_msr_disable_event,
+       .enable_event           = hswep_cbox_enable_event,
+       .read_counter           = uncore_msr_read_counter,
+       .hw_config              = hswep_cbox_hw_config,
+       .get_constraint         = hswep_cbox_get_constraint,
+       .put_constraint         = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 18,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = hswep_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+       unsigned msr = uncore_msr_box_ctl(box);
+
+       if (msr) {
+               u64 init = SNBEP_PMON_BOX_CTL_INT;
+               u64 flags = 0;
+               int i;
+
+               for_each_set_bit(i, (unsigned long *)&init, 64) {
+                       flags |= (1ULL << i);
+                       wrmsrl(msr, flags);
+               }
+       }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .init_box               = hswep_uncore_sbox_msr_init_box
+};
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_umask.attr,
+       &format_attr_edge.attr,
+       &format_attr_tid_en.attr,
+       &format_attr_inv.attr,
+       &format_attr_thresh8.attr,
+       NULL,
+};
+
+static struct attribute_group hswep_uncore_sbox_format_group = {
+       .name = "format",
+       .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 44,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
+static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+       if (ev_sel >= 0xb && ev_sel <= 0xe) {
+               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+               reg1->idx = ev_sel - 0xb;
+               reg1->config = event->attr.config1 & (0xff << reg1->idx);
+       }
+       return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+       .hw_config              = hswep_pcu_hw_config,
+       .get_constraint         = snbep_pcu_get_constraint,
+       .put_constraint         = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+       .name                   = "pcu",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &hswep_uncore_pcu_ops,
+       .format_group           = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+       &hswep_uncore_ubox,
+       &hswep_uncore_cbox,
+       &hswep_uncore_sbox,
+       &hswep_uncore_pcu,
+       NULL,
+};
+
+void hswep_uncore_cpu_init(void)
+{
+       int pkg = topology_phys_to_logical_pkg(0);
+
+       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+       /* Detect 6-8 core systems with only two SBOXes */
+       if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
+               u32 capid4;
+
+               pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
+                                     0x94, &capid4);
+               if (((capid4 >> 6) & 0x3) == 0)
+                       hswep_uncore_sbox.num_boxes = 2;
+       }
+
+       uncore_msr_uncores = hswep_msr_uncores;
+}
+
+static struct intel_uncore_type hswep_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 5,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+       { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+       struct pci_dev *pdev = box->pci_dev;
+       struct hw_perf_event *hwc = &event->hw;
+       u64 count = 0;
+
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+       return count;
+}
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+       .init_box       = snbep_uncore_pci_init_box,
+       .disable_box    = snbep_uncore_pci_disable_box,
+       .enable_box     = snbep_uncore_pci_enable_box,
+       .disable_event  = ivbep_uncore_irp_disable_event,
+       .enable_event   = ivbep_uncore_irp_enable_event,
+       .read_counter   = hswep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 5,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = hswep_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 4,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 44,
+       .constraints    = hswep_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       HSWEP_PCI_UNCORE_HA,
+       HSWEP_PCI_UNCORE_IMC,
+       HSWEP_PCI_UNCORE_IRP,
+       HSWEP_PCI_UNCORE_QPI,
+       HSWEP_PCI_UNCORE_R2PCIE,
+       HSWEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *hswep_pci_uncores[] = {
+       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
+       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
+       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
+       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
+       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
+       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT0_FILTER),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  SNBEP_PCI_QPI_PORT1_FILTER),
+       },
+       { /* PCU.3 (for Capability registers) */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+                                                  HSWEP_PCI_PCU_3),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver hswep_uncore_pci_driver = {
+       .name           = "hswep_uncore",
+       .id_table       = hswep_uncore_pci_ids,
+};
+
+int hswep_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x2f1e);
+       if (ret)
+               return ret;
+       uncore_pci_uncores = hswep_pci_uncores;
+       uncore_pci_driver = &hswep_uncore_pci_driver;
+       return 0;
+}
+/* end of Haswell-EP uncore support */
+
+/* BDX uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+       .name                   = "ubox",
+       .num_counters           = 2,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .fixed_ctr_bits         = 48,
+       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
+       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
+       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &ivbep_uncore_msr_ops,
+       .format_group           = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+       .name                   = "cbox",
+       .num_counters           = 4,
+       .num_boxes              = 24,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
+       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
+       .num_shared_regs        = 1,
+       .constraints            = bdx_uncore_cbox_constraints,
+       .ops                    = &hswep_uncore_cbox_ops,
+       .format_group           = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_sbox = {
+       .name                   = "sbox",
+       .num_counters           = 4,
+       .num_boxes              = 4,
+       .perf_ctr_bits          = 48,
+       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
+       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
+       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
+       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
+       .ops                    = &hswep_uncore_sbox_msr_ops,
+       .format_group           = &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX    3
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+       &bdx_uncore_ubox,
+       &bdx_uncore_cbox,
+       &hswep_uncore_pcu,
+       &bdx_uncore_sbox,
+       NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+       uncore_msr_uncores = bdx_msr_uncores;
+
+       /* BDX-DE doesn't have SBOX */
+       if (boot_cpu_data.x86_model == 86)
+               uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+       .name           = "ha",
+       .num_counters   = 4,
+       .num_boxes      = 2,
+       .perf_ctr_bits  = 48,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+       .name           = "imc",
+       .num_counters   = 5,
+       .num_boxes      = 8,
+       .perf_ctr_bits  = 48,
+       .fixed_ctr_bits = 48,
+       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+       .event_descs    = hswep_uncore_imc_events,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+       .name                   = "irp",
+       .num_counters           = 4,
+       .num_boxes              = 1,
+       .perf_ctr_bits          = 48,
+       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .ops                    = &hswep_uncore_irp_ops,
+       .format_group           = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_qpi = {
+       .name                   = "qpi",
+       .num_counters           = 4,
+       .num_boxes              = 3,
+       .perf_ctr_bits          = 48,
+       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
+       .event_ctl              = SNBEP_PCI_PMON_CTL0,
+       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
+       .num_shared_regs        = 1,
+       .ops                    = &snbep_uncore_qpi_ops,
+       .format_group           = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+       .name           = "r2pcie",
+       .num_counters   = 4,
+       .num_boxes      = 1,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r2pcie_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+       EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+       .name           = "r3qpi",
+       .num_counters   = 3,
+       .num_boxes      = 3,
+       .perf_ctr_bits  = 48,
+       .constraints    = bdx_uncore_r3qpi_constraints,
+       SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+       BDX_PCI_UNCORE_HA,
+       BDX_PCI_UNCORE_IMC,
+       BDX_PCI_UNCORE_IRP,
+       BDX_PCI_UNCORE_QPI,
+       BDX_PCI_UNCORE_R2PCIE,
+       BDX_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
+       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
+       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
+       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
+       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
+       NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+       { /* Home Agent 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+       },
+       { /* Home Agent 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+       },
+       { /* MC0 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+       },
+       { /* MC0 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+       },
+       { /* MC0 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+       },
+       { /* MC0 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+       },
+       { /* MC1 Channel 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+       },
+       { /* MC1 Channel 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+       },
+       { /* MC1 Channel 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+       },
+       { /* MC1 Channel 3 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+       },
+       { /* IRP */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+       },
+       { /* QPI0 Port 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+       },
+       { /* QPI0 Port 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+       },
+       { /* QPI1 Port 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+       },
+       { /* R2PCIe */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+       },
+       { /* R3QPI0 Link 0 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+       },
+       { /* R3QPI0 Link 1 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+       },
+       { /* R3QPI1 Link 2 */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+       },
+       { /* QPI Port 0 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+       },
+       { /* QPI Port 1 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+       },
+       { /* QPI Port 2 filter  */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+       },
+       { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+       .name           = "bdx_uncore",
+       .id_table       = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+       int ret = snbep_pci2phy_map_init(0x6f1e);
+
+       if (ret)
+               return ret;
+       uncore_pci_uncores = bdx_pci_uncores;
+       uncore_pci_driver = &bdx_uncore_pci_driver;
+       return 0;
+}
+
+/* end of BDX uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
new file mode 100644 (file)
index 0000000..ec863b9
--- /dev/null
@@ -0,0 +1,241 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+       PERF_MSR_TSC                    = 0,
+       PERF_MSR_APERF                  = 1,
+       PERF_MSR_MPERF                  = 2,
+       PERF_MSR_PPERF                  = 3,
+       PERF_MSR_SMI                    = 4,
+
+       PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+       return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_intel(int idx)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+           boot_cpu_data.x86 != 6)
+               return false;
+
+       switch (boot_cpu_data.x86_model) {
+       case 30: /* 45nm Nehalem    */
+       case 26: /* 45nm Nehalem-EP */
+       case 46: /* 45nm Nehalem-EX */
+
+       case 37: /* 32nm Westmere    */
+       case 44: /* 32nm Westmere-EP */
+       case 47: /* 32nm Westmere-EX */
+
+       case 42: /* 32nm SandyBridge         */
+       case 45: /* 32nm SandyBridge-E/EN/EP */
+
+       case 58: /* 22nm IvyBridge       */
+       case 62: /* 22nm IvyBridge-EP/EX */
+
+       case 60: /* 22nm Haswell Core */
+       case 63: /* 22nm Haswell Server */
+       case 69: /* 22nm Haswell ULT */
+       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+       case 61: /* 14nm Broadwell Core-M */
+       case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
+
+       case 55: /* 22nm Atom "Silvermont"                */
+       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+       case 76: /* 14nm Atom "Airmont"                   */
+               if (idx == PERF_MSR_SMI)
+                       return true;
+               break;
+
+       case 78: /* 14nm Skylake Mobile */
+       case 94: /* 14nm Skylake Desktop */
+               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+                       return true;
+               break;
+       }
+
+       return false;
+}
+
+struct perf_msr {
+       u64     msr;
+       struct  perf_pmu_events_attr *attr;
+       bool    (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
+
+static struct perf_msr msr[] = {
+       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
+       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
+       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
+       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
+       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+       NULL,
+};
+
+static struct attribute_group events_attr_group = {
+       .name = "events",
+       .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+       &format_attr_event.attr,
+       NULL,
+};
+static struct attribute_group format_attr_group = {
+       .name = "format",
+       .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+       &events_attr_group,
+       &format_attr_group,
+       NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+       u64 cfg = event->attr.config;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       if (cfg >= PERF_MSR_EVENT_MAX)
+               return -EINVAL;
+
+       /* unsupported modes and filters */
+       if (event->attr.exclude_user   ||
+           event->attr.exclude_kernel ||
+           event->attr.exclude_hv     ||
+           event->attr.exclude_idle   ||
+           event->attr.exclude_host   ||
+           event->attr.exclude_guest  ||
+           event->attr.sample_period) /* no sampling */
+               return -EINVAL;
+
+       if (!msr[cfg].attr)
+               return -EINVAL;
+
+       event->hw.idx = -1;
+       event->hw.event_base = msr[cfg].msr;
+       event->hw.config = cfg;
+
+       return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+       u64 now;
+
+       if (event->hw.event_base)
+               rdmsrl(event->hw.event_base, now);
+       else
+               rdtscll(now);
+
+       return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+       u64 prev, now;
+       s64 delta;
+
+       /* Careful, an NMI might modify the previous event value. */
+again:
+       prev = local64_read(&event->hw.prev_count);
+       now = msr_read_counter(event);
+
+       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+               goto again;
+
+       delta = now - prev;
+       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+               delta = sign_extend64(delta, 31);
+
+       local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+       u64 now;
+
+       now = msr_read_counter(event);
+       local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+       msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+       msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               msr_event_start(event, flags);
+
+       return 0;
+}
+
+static struct pmu pmu_msr = {
+       .task_ctx_nr    = perf_sw_context,
+       .attr_groups    = attr_groups,
+       .event_init     = msr_event_init,
+       .add            = msr_event_add,
+       .del            = msr_event_del,
+       .start          = msr_event_start,
+       .stop           = msr_event_stop,
+       .read           = msr_event_update,
+       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+       int i, j = 0;
+
+       if (!boot_cpu_has(X86_FEATURE_TSC)) {
+               pr_cont("no MSR PMU driver.\n");
+               return 0;
+       }
+
+       /* Probe the MSRs. */
+       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+               u64 val;
+
+               /*
+                * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+                */
+               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+                       msr[i].attr = NULL;
+       }
+
+       /* List remaining MSRs in the sysfs attrs. */
+       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+               if (msr[i].attr)
+                       events_attrs[j++] = &msr[i].attr->attr.attr;
+       }
+       events_attrs[j] = NULL;
+
+       perf_pmu_register(&pmu_msr, "msr", -1);
+
+       return 0;
+}
+device_initcall(msr_init);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
new file mode 100644 (file)
index 0000000..68155ca
--- /dev/null
@@ -0,0 +1,960 @@
+/*
+ * Performance events x86 architecture header
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+/* To enable MSR tracing please use the generic trace points. */
+
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+       EXTRA_REG_NONE  = -1,   /* not used */
+
+       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
+       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
+       EXTRA_REG_LBR   = 2,    /* lbr_select */
+       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
+       EXTRA_REG_FE    = 4,    /* fe_* */
+
+       EXTRA_REG_MAX           /* number of entries needed */
+};
+
+struct event_constraint {
+       union {
+               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+               u64             idxmsk64;
+       };
+       u64     code;
+       u64     cmask;
+       int     weight;
+       int     overlap;
+       int     flags;
+};
+/*
+ * struct hw_perf_event.flags flags
+ */
+#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
+
+
+struct amd_nb {
+       int nb_id;  /* NorthBridge id */
+       int refcnt; /* reference count */
+       struct perf_event *owners[X86_PMC_IDX_MAX];
+       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS                8
+
+/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+       PERF_SAMPLE_TRANSACTION)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+       u64     bts_buffer_base;
+       u64     bts_index;
+       u64     bts_absolute_maximum;
+       u64     bts_interrupt_threshold;
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+/*
+ * Per register state.
+ */
+struct er_account {
+       raw_spinlock_t          lock;   /* per-core: protect structure */
+       u64                 config;     /* extra MSR config */
+       u64                 reg;        /* extra MSR number */
+       atomic_t            ref;        /* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+       struct er_account       regs[EXTRA_REG_MAX];
+       int                     refcnt;         /* per-core: #HT threads */
+       unsigned                core_id;        /* per-core: core id */
+};
+
+enum intel_excl_state_type {
+       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+       bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+       raw_spinlock_t  lock;
+
+       struct intel_excl_states states[2];
+
+       union {
+               u16     has_exclusive[2];
+               u32     exclusive_present;
+       };
+
+       int             refcnt;         /* per-core: #HT threads */
+       unsigned        core_id;        /* per-core: core id */
+};
+
+#define MAX_LBR_ENTRIES                32
+
+enum {
+       X86_PERF_KFREE_SHARED = 0,
+       X86_PERF_KFREE_EXCL   = 1,
+       X86_PERF_KFREE_MAX
+};
+
+struct cpu_hw_events {
+       /*
+        * Generic x86 PMC bits
+        */
+       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
+       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       int                     enabled;
+
+       int                     n_events; /* the # of events in the below arrays */
+       int                     n_added;  /* the # last events in the below arrays;
+                                            they've never been enabled yet */
+       int                     n_txn;    /* the # last events in the below arrays;
+                                            added in the current transaction */
+       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+       u64                     tags[X86_PMC_IDX_MAX];
+
+       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+       int                     n_excl; /* the number of exclusive events */
+
+       unsigned int            txn_flags;
+       int                     is_fake;
+
+       /*
+        * Intel DebugStore bits
+        */
+       struct debug_store      *ds;
+       u64                     pebs_enabled;
+
+       /*
+        * Intel LBR bits
+        */
+       int                             lbr_users;
+       void                            *lbr_context;
+       struct perf_branch_stack        lbr_stack;
+       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
+       struct er_account               *lbr_sel;
+       u64                             br_sel;
+
+       /*
+        * Intel host/guest exclude bits
+        */
+       u64                             intel_ctrl_guest_mask;
+       u64                             intel_ctrl_host_mask;
+       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
+
+       /*
+        * Intel checkpoint mask
+        */
+       u64                             intel_cp_status;
+
+       /*
+        * manage shared (per-core, per-cpu) registers
+        * used on Intel NHM/WSM/SNB
+        */
+       struct intel_shared_regs        *shared_regs;
+       /*
+        * manage exclusive counter access between hyperthread
+        */
+       struct event_constraint *constraint_list; /* in enable order */
+       struct intel_excl_cntrs         *excl_cntrs;
+       int excl_thread_id; /* 0 or 1 */
+
+       /*
+        * AMD specific bits
+        */
+       struct amd_nb                   *amd_nb;
+       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+       u64                             perf_ctr_virt_mask;
+
+       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+       { .idxmsk64 = (n) },            \
+       .code = (c),                    \
+       .cmask = (m),                   \
+       .weight = (w),                  \
+       .overlap = (o),                 \
+       .flags = f,                     \
+}
+
+#define EVENT_CONSTRAINT(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+                          0, PERF_X86_EVENT_EXCL)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-commited system
+ * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  - in_tx
+ *  - in_tx_checkpointed
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
+#define FIXED_EVENT_CONSTRAINT(c, n)   \
+       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n)  \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
+#define INTEL_PLD_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+
+#define INTEL_PST_CONSTRAINT(c, n)     \
+       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
+       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, \
+                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+       __EVENT_CONSTRAINT(code, n,                     \
+                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
+#define for_each_event_constraint(e, c)        \
+       for ((e) = (c); (e)->weight != -1; (e)++)
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+       unsigned int            event;
+       unsigned int            msr;
+       u64                     config_mask;
+       u64                     valid_mask;
+       int                     idx;  /* per_xxx->regs[] reg index */
+       bool                    extra_msr_access;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
+       .event = (e),                   \
+       .msr = (ms),                    \
+       .config_mask = (m),             \
+       .valid_mask = (vm),             \
+       .idx = EXTRA_REG_##i,           \
+       .extra_msr_access = true,       \
+       }
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+       INTEL_UEVENT_EXTRA_REG(c, \
+                              MSR_PEBS_LD_LAT_THRESHOLD, \
+                              0xffff, \
+                              LDLAT)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
+
+union perf_capabilities {
+       struct {
+               u64     lbr_format:6;
+               u64     pebs_trap:1;
+               u64     pebs_arch_reg:1;
+               u64     pebs_format:4;
+               u64     smm_freeze:1;
+               /*
+                * PMU supports separate counter range for writing
+                * values > 32bit.
+                */
+               u64     full_width_write:1;
+       };
+       u64     capabilities;
+};
+
+struct x86_pmu_quirk {
+       struct x86_pmu_quirk *next;
+       void (*func)(void);
+};
+
+union x86_pmu_config {
+       struct {
+               u64 event:8,
+                   umask:8,
+                   usr:1,
+                   os:1,
+                   edge:1,
+                   pc:1,
+                   interrupt:1,
+                   __reserved1:1,
+                   en:1,
+                   inv:1,
+                   cmask:8,
+                   event2:4,
+                   __reserved2:4,
+                   go:1,
+                   ho:1;
+       } bits;
+       u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
+enum {
+       x86_lbr_exclusive_lbr,
+       x86_lbr_exclusive_bts,
+       x86_lbr_exclusive_pt,
+       x86_lbr_exclusive_max,
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+       /*
+        * Generic x86 PMC bits
+        */
+       const char      *name;
+       int             version;
+       int             (*handle_irq)(struct pt_regs *);
+       void            (*disable_all)(void);
+       void            (*enable_all)(int added);
+       void            (*enable)(struct perf_event *);
+       void            (*disable)(struct perf_event *);
+       int             (*hw_config)(struct perf_event *event);
+       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+       unsigned        eventsel;
+       unsigned        perfctr;
+       int             (*addr_offset)(int index, bool eventsel);
+       int             (*rdpmc_index)(int index);
+       u64             (*event_map)(int);
+       int             max_events;
+       int             num_counters;
+       int             num_counters_fixed;
+       int             cntval_bits;
+       u64             cntval_mask;
+       union {
+                       unsigned long events_maskl;
+                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+       };
+       int             events_mask_len;
+       int             apic;
+       u64             max_period;
+       struct event_constraint *
+                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
+                                                int idx,
+                                                struct perf_event *event);
+
+       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
+                                                struct perf_event *event);
+
+       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
+       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
+       struct event_constraint *event_constraints;
+       struct x86_pmu_quirk *quirks;
+       int             perfctr_second_write;
+       bool            late_ack;
+       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
+
+       /*
+        * sysfs attrs
+        */
+       int             attr_rdpmc_broken;
+       int             attr_rdpmc;
+       struct attribute **format_attrs;
+       struct attribute **event_attrs;
+
+       ssize_t         (*events_sysfs_show)(char *page, u64 config);
+       struct attribute **cpu_events;
+
+       /*
+        * CPU Hotplug hooks
+        */
+       int             (*cpu_prepare)(int cpu);
+       void            (*cpu_starting)(int cpu);
+       void            (*cpu_dying)(int cpu);
+       void            (*cpu_dead)(int cpu);
+
+       void            (*check_microcode)(void);
+       void            (*sched_task)(struct perf_event_context *ctx,
+                                     bool sched_in);
+
+       /*
+        * Intel Arch Perfmon v2+
+        */
+       u64                     intel_ctrl;
+       union perf_capabilities intel_cap;
+
+       /*
+        * Intel DebugStore bits
+        */
+       unsigned int    bts             :1,
+                       bts_active      :1,
+                       pebs            :1,
+                       pebs_active     :1,
+                       pebs_broken     :1,
+                       pebs_prec_dist  :1;
+       int             pebs_record_size;
+       int             pebs_buffer_size;
+       void            (*drain_pebs)(struct pt_regs *regs);
+       struct event_constraint *pebs_constraints;
+       void            (*pebs_aliases)(struct perf_event *event);
+       int             max_pebs_events;
+       unsigned long   free_running_flags;
+
+       /*
+        * Intel LBR
+        */
+       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+       int             lbr_nr;                    /* hardware stack size */
+       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
+       const int       *lbr_sel_map;              /* lbr_select mappings */
+       bool            lbr_double_abort;          /* duplicated lbr aborts */
+
+       /*
+        * Intel PT/LBR/BTS are exclusive
+        */
+       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
+
+       /*
+        * Extra registers for events
+        */
+       struct extra_reg *extra_regs;
+       unsigned int flags;
+
+       /*
+        * Intel host/guest support (KVM)
+        */
+       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+};
+
+struct x86_perf_task_context {
+       u64 lbr_from[MAX_LBR_ENTRIES];
+       u64 lbr_to[MAX_LBR_ENTRIES];
+       u64 lbr_info[MAX_LBR_ENTRIES];
+       int tos;
+       int lbr_callstack_users;
+       int lbr_stack_state;
+};
+
+#define x86_add_quirk(func_)                                           \
+do {                                                                   \
+       static struct x86_pmu_quirk __quirk __initdata = {              \
+               .func = func_,                                          \
+       };                                                              \
+       __quirk.next = x86_pmu.quirks;                                  \
+       x86_pmu.quirks = &__quirk;                                      \
+} while (0)
+
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
+
+#define EVENT_VAR(_id)  event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id)                                         \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = PERF_COUNT_HW_##_id,                          \
+       .event_str      = NULL,                                         \
+};
+
+#define EVENT_ATTR_STR(_name, v, str)                                  \
+static struct perf_pmu_events_attr event_attr_##v = {                  \
+       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+       .id             = 0,                                            \
+       .event_str      = str,                                          \
+};
+
+extern struct x86_pmu x86_pmu __read_mostly;
+
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+       return  x86_pmu.lbr_sel_map &&
+               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+                               [PERF_COUNT_HW_CACHE_MAX]
+                               [PERF_COUNT_HW_CACHE_OP_MAX]
+                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+                                  x86_pmu.addr_offset(index, true) : index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+                                 x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+                                         u64 enable_mask)
+{
+       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
+       if (hwc->extra_reg.reg)
+               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign);
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       wrmsrl(hwc->config_base, hwc->config);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+       return ip > PAGE_OFFSET;
+#else
+       return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+       if (regs->flags & X86_VM_MASK)
+               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+       regs->ip = ip;
+}
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+       return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+           !event->attr.freq && event->hw.sample_period == 1)
+               return true;
+
+       return false;
+}
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+                         struct perf_event *event);
+
+struct intel_shared_regs *allocate_shared_regs(int cpu);
+
+int intel_pmu_init(void);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+extern struct event_constraint bts_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_ds_init(void);
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_enable(struct perf_event *event);
+
+void intel_pmu_lbr_disable(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(bool pmi);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+void intel_pmu_lbr_init_snb(void);
+
+void intel_pmu_lbr_init_hsw(void);
+
+void intel_pmu_lbr_init_skl(void);
+
+void intel_pmu_lbr_init_knl(void);
+
+void intel_pmu_pebs_data_source_nhm(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+int knc_pmu_init(void);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+                         char *page);
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+       return 0;
+}
+
+static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+       return NULL;
+}
+
+static inline int is_ht_workaround_enabled(void)
+{
+       return 0;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
index 7bfc85b..99afb66 100644 (file)
@@ -151,12 +151,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
        ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)                    \
        ".popsection"
 
-/*
- * This must be included *after* the definition of ALTERNATIVE due to
- * <asm/arch_hweight.h>
- */
-#include <asm/cpufeature.h>
-
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
index 3c56ef1..5e828da 100644 (file)
@@ -27,15 +27,23 @@ struct amd_l3_cache {
 };
 
 struct threshold_block {
-       unsigned int            block;
-       unsigned int            bank;
-       unsigned int            cpu;
-       u32                     address;
-       u16                     interrupt_enable;
-       bool                    interrupt_capable;
-       u16                     threshold_limit;
-       struct kobject          kobj;
-       struct list_head        miscj;
+       unsigned int     block;                 /* Number within bank */
+       unsigned int     bank;                  /* MCA bank the block belongs to */
+       unsigned int     cpu;                   /* CPU which controls MCA bank */
+       u32              address;               /* MSR address for the block */
+       u16              interrupt_enable;      /* Enable/Disable APIC interrupt */
+       bool             interrupt_capable;     /* Bank can generate an interrupt. */
+
+       u16              threshold_limit;       /*
+                                                * Value upon which threshold
+                                                * interrupt is generated.
+                                                */
+
+       struct kobject   kobj;                  /* sysfs object */
+       struct list_head miscj;                 /*
+                                                * List of threshold blocks
+                                                * within a bank.
+                                                */
 };
 
 struct threshold_bank {
index c80f6b6..0899cfc 100644 (file)
@@ -6,7 +6,6 @@
 
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
-#include <asm/processor.h>
 #include <asm/apicdef.h>
 #include <linux/atomic.h>
 #include <asm/fixmap.h>
index 259a7c1..02e799f 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_HWEIGHT_H
 #define _ASM_X86_HWEIGHT_H
 
+#include <asm/cpufeatures.h>
+
 #ifdef CONFIG_64BIT
 /* popcnt %edi, %eax -- redundant REX prefix for alignment */
 #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
index 189679a..f5063b6 100644 (file)
 
 /* Exception table entry */
 #ifdef __ASSEMBLY__
-# define _ASM_EXTABLE(from,to)                                 \
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
        .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
+       .balign 4 ;                                             \
        .long (from) - . ;                                      \
        .long (to) - . ;                                        \
+       .long (handler) - . ;                                   \
        .popsection
 
-# define _ASM_EXTABLE_EX(from,to)                              \
-       .pushsection "__ex_table","a" ;                         \
-       .balign 8 ;                                             \
-       .long (from) - . ;                                      \
-       .long (to) - . + 0x7ffffff0 ;                           \
-       .popsection
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
 # define _ASM_NOKPROBE(entry)                                  \
        .pushsection "_kprobe_blacklist","aw" ;                 \
        .endm
 
 #else
-# define _ASM_EXTABLE(from,to)                                 \
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)                        \
        " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
+       " .balign 4\n"                                          \
        " .long (" #from ") - .\n"                              \
        " .long (" #to ") - .\n"                                \
+       " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"    \
        " .popsection\n"
 
-# define _ASM_EXTABLE_EX(from,to)                              \
-       " .pushsection \"__ex_table\",\"a\"\n"                  \
-       " .balign 8\n"                                          \
-       " .long (" #from ") - .\n"                              \
-       " .long (" #to ") - . + 0x7ffffff0\n"                   \
-       " .popsection\n"
+# define _ASM_EXTABLE(from, to)                                        \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to)                          \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)                             \
+       _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
index a584e1c..bfb28ca 100644 (file)
@@ -6,18 +6,17 @@
 
 /*
  * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
+ * And yes, this might be required on UP too when we're talking
  * to devices.
  */
 
 #ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \
+                                     X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \
+                                      X86_FEATURE_XMM2) ::: "memory", "cc")
 #else
 #define mb()   asm volatile("mfence":::"memory")
 #define rmb()  asm volatile("lfence":::"memory")
index cfe3b95..7766d1c 100644 (file)
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
  * clear_bit() is atomic and implies release semantics before the memory
  * operation. It can be used for an unlock.
  */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        barrier();
        clear_bit(nr, addr);
 }
 
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
 }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
  * No memory barrier is required here, because x86 cannot reorder stores past
  * older loads. Same principle as spin_unlock.
  */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        barrier();
        __clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
 {
        asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
 }
@@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
 {
        if (IS_IMMEDIATE(nr)) {
                asm volatile(LOCK_PREFIX "xorb %1,%0"
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
 }
@@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
 }
@@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
  * accessed from a hypervisor on the same CPU if running in a VM: don't change
  * this without also updating arch/x86/kernel/kvm.c
  */
-static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
 {
        int oldbit;
 
@@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  * This operation is atomic and cannot be reordered.
  * It also implies a memory barrier.
  */
-static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr)
 {
        GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c");
 }
@@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo
                (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
-static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
+static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr)
 {
        int oldbit;
 
@@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
 {
        asm("rep; bsf %1,%0"
                : "=r" (word)
@@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word)
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static inline unsigned long ffz(unsigned long word)
+static __always_inline unsigned long ffz(unsigned long word)
 {
        asm("rep; bsf %1,%0"
                : "=r" (word)
@@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word)
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
 {
        asm("bsr %1,%0"
            : "=r" (word)
@@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word)
  * set bit if value is nonzero. The first (least significant) bit
  * is at position 1.
  */
-static inline int ffs(int x)
+static __always_inline int ffs(int x)
 {
        int r;
 
@@ -434,7 +434,7 @@ static inline int ffs(int x)
  * set bit if value is nonzero. The last (most significant) bit is
  * at position 32.
  */
-static inline int fls(int x)
+static __always_inline int fls(int x)
 {
        int r;
 
index e63aa38..61518cf 100644 (file)
@@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size);
 
 #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
 
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
 extern const int rodata_test_data;
 extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
 
 #ifdef CONFIG_DEBUG_RODATA_TEST
 int rodata_test(void);
index eda81dc..d194266 100644 (file)
@@ -3,10 +3,11 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#define VCLOCK_NONE 0  /* No vDSO clock available.     */
-#define VCLOCK_TSC  1  /* vDSO should use vread_tsc.   */
-#define VCLOCK_HPET 2  /* vDSO should use vread_hpet.  */
-#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
+#define VCLOCK_NONE    0  /* No vDSO clock available.  */
+#define VCLOCK_TSC     1  /* vDSO should use vread_tsc.        */
+#define VCLOCK_HPET    2  /* vDSO should use vread_hpet.       */
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
+#define VCLOCK_MAX     3
 
 struct arch_clocksource_data {
        int vclock_mode;
index ad19841..9733361 100644 (file)
@@ -2,6 +2,7 @@
 #define ASM_X86_CMPXCHG_H
 
 #include <linux/compiler.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
 
 /*
index 7ad8c94..68e4e82 100644 (file)
@@ -1,288 +1,7 @@
-/*
- * Defines x86 CPU feature bits
- */
 #ifndef _ASM_X86_CPUFEATURE_H
 #define _ASM_X86_CPUFEATURE_H
 
-#ifndef _ASM_X86_REQUIRED_FEATURES_H
-#include <asm/required-features.h>
-#endif
-
-#ifndef _ASM_X86_DISABLED_FEATURES_H
-#include <asm/disabled-features.h>
-#endif
-
-#define NCAPINTS       16      /* N 32-bit words worth of info */
-#define NBUGINTS       1       /* N 32-bit bug flags */
-
-/*
- * Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name.  If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
- */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU                ( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME                ( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE         ( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE                ( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC                ( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR                ( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE                ( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE                ( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8                ( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC       ( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP                ( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR       ( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE                ( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA                ( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV       ( 0*32+15) /* CMOV instructions */
-                                         /* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT                ( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36      ( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN         ( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH    ( 0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS         ( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI       ( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX                ( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR       ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM                ( 0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2       ( 0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP  ( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT         ( 0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC                ( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64       ( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE                ( 0*32+31) /* Pending Break Enable */
-
-/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL    ( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP         ( 1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX         ( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT     ( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT   ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES    ( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP     ( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM         ( 1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT   ( 1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW      ( 1*32+31) /* 3DNow! */
-
-/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY   ( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN    ( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI       ( 2*32+ 3) /* LongRun table interface */
-
-/* Other features, Linux-defined mapping, word 3 */
-/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX      ( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR    ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR  ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR        ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8         ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7         ( 3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3         ( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4         ( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP         ( 3*32+ 9) /* smp kernel running on up */
-/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS       ( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS                ( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32  ( 3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD   ( 3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-/* free, was #define X86_FEATURE_11AP  ( 3*32+19) * "" Bad local APIC aka 11AP */
-#define X86_FEATURE_NOPL       ( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS     ( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY  ( 3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC        ( 3*32+24) /* TSC does not stop in C states */
-/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
-#define X86_FEATURE_EXTD_APICID        ( 3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ  ( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64     ( 4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT      ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL      ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX                ( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX                ( 4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST                ( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2                ( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3      ( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID                ( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG       ( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA                ( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16       ( 4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR       ( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM       ( 4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID       ( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA                ( 4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1     ( 4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2     ( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC     ( 4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE      ( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT      ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES                ( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE      ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE    ( 4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX                ( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C       ( 4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND     ( 4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
-
-/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE     ( 5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN  ( 5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT     ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN  ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2       ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN    ( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE                ( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN     ( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM                ( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN     ( 5*32+13) /* PMM enabled */
-
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM    ( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM                ( 6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC    ( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM                ( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A      ( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW       ( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS                ( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP                ( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT     ( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT                ( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP                ( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4       ( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE                ( 6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM                ( 6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT    ( 6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT      (6*32+26) /* data breakpoint extension */
-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
-#define X86_FEATURE_MWAITX     ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
-
-/*
- * Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc, word 7.
- *
- * Reuse free bits when adding new feature flags!
- */
-
-#define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-
-#define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-
-#define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
-
-/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT         ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID        ( 8*32+ 4) /* Intel Virtual Processor ID */
-
-#define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
-#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
-
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE   ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1       ( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE                ( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2       ( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP       ( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2       ( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS       ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID    ( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM                ( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM                ( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_MPX                ( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_AVX512F    ( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_RDSEED     ( 9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX                ( 9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP       ( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_PCOMMIT    ( 9*32+22) /* PCOMMIT instruction */
-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB       ( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-
-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
-#define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
-#define X86_FEATURE_XSAVEC     (10*32+ 1) /* XSAVEC */
-#define X86_FEATURE_XGETBV1    (10*32+ 2) /* XGETBV with ECX = 1 */
-#define X86_FEATURE_XSAVES     (10*32+ 3) /* XSAVES/XRSTORS */
-
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
-#define X86_FEATURE_CQM_LLC    (11*32+ 1) /* LLC QoS if 1 */
-
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
-
-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
-#define X86_FEATURE_CLZERO     (13*32+0) /* CLZERO instruction */
-
-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
-#define X86_FEATURE_DTHERM     (14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA                (14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT       (14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN                (14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS                (14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP                (14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP    (14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-
-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
-#define X86_FEATURE_NPT                (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV       (15*32+ 1) /* LBR Virtualization support */
-#define X86_FEATURE_SVML       (15*32+ 2) /* "svm_lock" SVM locking MSR */
-#define X86_FEATURE_NRIPS      (15*32+ 3) /* "nrip_save" SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR  (15*32+ 4) /* "tsc_scale" TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN   (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-
-/*
- * BUG word(s)
- */
-#define X86_BUG(x)             (NCAPINTS*32 + (x))
-
-#define X86_BUG_F00F           X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV           X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA           X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E   X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP           X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK    X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR        X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS        X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#include <asm/processor.h>
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
@@ -369,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
  * is not relevant.
  */
 #define cpu_feature_enabled(bit)       \
-       (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 :  \
-        cpu_has(&boot_cpu_data, bit))
+       (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
 
 #define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
 
@@ -406,106 +124,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_has_osxsave                boot_cpu_has(X86_FEATURE_OSXSAVE)
 #define cpu_has_hypervisor     boot_cpu_has(X86_FEATURE_HYPERVISOR)
 /*
- * Do not add any more of those clumsy macros - use static_cpu_has_safe() for
+ * Do not add any more of those clumsy macros - use static_cpu_has() for
  * fast paths and boot_cpu_has() otherwise!
  */
 
-#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-extern void warn_pre_alternatives(void);
-extern bool __static_cpu_has_safe(u16 bit);
-
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These are only valid after alternatives have run, but will statically
- * patch the target code for additional performance.
+ * These will statically patch the target code for additional
+ * performance.
  */
-static __always_inline __pure bool __static_cpu_has(u16 bit)
-{
-#ifdef CC_HAVE_ASM_GOTO
-
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-
-               /*
-                * Catch too early usage of this before alternatives
-                * have run.
-                */
-               asm_volatile_goto("1: jmp %l[t_warn]\n"
-                        "2:\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"
-                        " .long 0\n"           /* no replacement */
-                        " .word %P0\n"         /* 1: do replace */
-                        " .byte 2b - 1b\n"     /* source len */
-                        " .byte 0\n"           /* replacement len */
-                        " .byte 0\n"           /* pad len */
-                        ".previous\n"
-                        /* skipping size check since replacement size = 0 */
-                        : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
-
-#endif
-
-               asm_volatile_goto("1: jmp %l[t_no]\n"
-                        "2:\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"
-                        " .long 0\n"           /* no replacement */
-                        " .word %P0\n"         /* feature bit */
-                        " .byte 2b - 1b\n"     /* source len */
-                        " .byte 0\n"           /* replacement len */
-                        " .byte 0\n"           /* pad len */
-                        ".previous\n"
-                        /* skipping size check since replacement size = 0 */
-                        : : "i" (bit) : : t_no);
-               return true;
-       t_no:
-               return false;
-
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-       t_warn:
-               warn_pre_alternatives();
-               return false;
-#endif
-
-#else /* CC_HAVE_ASM_GOTO */
-
-               u8 flag;
-               /* Open-coded due to __stringify() in ALTERNATIVE() */
-               asm volatile("1: movb $0,%0\n"
-                            "2:\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"
-                            " .long 3f - .\n"
-                            " .word %P1\n"             /* feature bit */
-                            " .byte 2b - 1b\n"         /* source len */
-                            " .byte 4f - 3f\n"         /* replacement len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "3: movb $1,%0\n"
-                            "4:\n"
-                            ".previous\n"
-                            : "=qm" (flag) : "i" (bit));
-               return flag;
-
-#endif /* CC_HAVE_ASM_GOTO */
-}
-
-#define static_cpu_has(bit)                                    \
-(                                                              \
-       __builtin_constant_p(boot_cpu_has(bit)) ?               \
-               boot_cpu_has(bit) :                             \
-       __builtin_constant_p(bit) ?                             \
-               __static_cpu_has(bit) :                         \
-               boot_cpu_has(bit)                               \
-)
-
-static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
+static __always_inline __pure bool _static_cpu_has(u16 bit)
 {
-#ifdef CC_HAVE_ASM_GOTO
-               asm_volatile_goto("1: jmp %l[t_dynamic]\n"
+               asm_volatile_goto("1: jmp 6f\n"
                         "2:\n"
                         ".skip -(((5f-4f) - (2b-1b)) > 0) * "
                                 "((5f-4f) - (2b-1b)),0x90\n"
@@ -530,66 +161,34 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                         " .byte 0\n"                   /* repl len */
                         " .byte 0\n"                   /* pad len */
                         ".previous\n"
-                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
-                        : : t_dynamic, t_no);
+                        ".section .altinstr_aux,\"ax\"\n"
+                        "6:\n"
+                        " testb %[bitnum],%[cap_byte]\n"
+                        " jnz %l[t_yes]\n"
+                        " jmp %l[t_no]\n"
+                        ".previous\n"
+                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+                            [bitnum] "i" (1 << (bit & 7)),
+                            [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+                        : : t_yes, t_no);
+       t_yes:
                return true;
        t_no:
                return false;
-       t_dynamic:
-               return __static_cpu_has_safe(bit);
-#else
-               u8 flag;
-               /* Open-coded due to __stringify() in ALTERNATIVE() */
-               asm volatile("1: movb $2,%0\n"
-                            "2:\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"          /* src offset */
-                            " .long 3f - .\n"          /* repl offset */
-                            " .word %P2\n"             /* always replace */
-                            " .byte 2b - 1b\n"         /* source len */
-                            " .byte 4f - 3f\n"         /* replacement len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "3: movb $0,%0\n"
-                            "4:\n"
-                            ".previous\n"
-                            ".section .altinstructions,\"a\"\n"
-                            " .long 1b - .\n"          /* src offset */
-                            " .long 5f - .\n"          /* repl offset */
-                            " .word %P1\n"             /* feature bit */
-                            " .byte 4b - 3b\n"         /* src len */
-                            " .byte 6f - 5f\n"         /* repl len */
-                            " .byte 0\n"               /* pad len */
-                            ".previous\n"
-                            ".section .discard,\"aw\",@progbits\n"
-                            " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
-                            ".previous\n"
-                            ".section .altinstr_replacement,\"ax\"\n"
-                            "5: movb $1,%0\n"
-                            "6:\n"
-                            ".previous\n"
-                            : "=qm" (flag)
-                            : "i" (bit), "i" (X86_FEATURE_ALWAYS));
-               return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
-#endif /* CC_HAVE_ASM_GOTO */
 }
 
-#define static_cpu_has_safe(bit)                               \
+#define static_cpu_has(bit)                                    \
 (                                                              \
        __builtin_constant_p(boot_cpu_has(bit)) ?               \
                boot_cpu_has(bit) :                             \
-               _static_cpu_has_safe(bit)                       \
+               _static_cpu_has(bit)                            \
 )
 #else
 /*
- * gcc 3.x is too stupid to do the static test; fall back to dynamic.
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
  */
 #define static_cpu_has(bit)            boot_cpu_has(bit)
-#define static_cpu_has_safe(bit)       boot_cpu_has(bit)
 #endif
 
 #define cpu_has_bug(c, bit)            cpu_has(c, (bit))
@@ -597,7 +196,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 #define clear_cpu_bug(c, bit)          clear_cpu_cap(c, (bit))
 
 #define static_cpu_has_bug(bit)                static_cpu_has((bit))
-#define static_cpu_has_bug_safe(bit)   static_cpu_has_safe((bit))
 #define boot_cpu_has_bug(bit)          cpu_has_bug(&boot_cpu_data, (bit))
 
 #define MAX_CPU_FEATURES               (NCAPINTS * 32)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
new file mode 100644 (file)
index 0000000..074b760
--- /dev/null
@@ -0,0 +1,300 @@
+#ifndef _ASM_X86_CPUFEATURES_H
+#define _ASM_X86_CPUFEATURES_H
+
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#include <asm/required-features.h>
+#endif
+
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#include <asm/disabled-features.h>
+#endif
+
+/*
+ * Defines x86 CPU feature bits
+ */
+#define NCAPINTS       16      /* N 32-bit words worth of info */
+#define NBUGINTS       1       /* N 32-bit bug flags */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name.  If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU                ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME                ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE         ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE                ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC                ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR                ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE                ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE                ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8                ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC       ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP                ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR       ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE                ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA                ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV       ( 0*32+15) /* CMOV instructions */
+                                         /* (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT                ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36      ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN         ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH    ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS         ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI       ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX                ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR       ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM                ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2       ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP  ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT         ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC                ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64       ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE                ( 0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL    ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP         ( 1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX         ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT     ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT   ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES    ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP     ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM         ( 1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT   ( 1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW      ( 1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY   ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN    ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI       ( 2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX      ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR    ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR  ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR        ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8         ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7         ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3         ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4         ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP         ( 3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_ART                ( 3*32+10) /* Platform has always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS       ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS                ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32  ( 3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD   ( 3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_11AP  ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL       ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS     ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY  ( 3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC        ( 3*32+24) /* TSC does not stop in C states */
+/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID        ( 3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
+#define X86_FEATURE_EAGER_FPU  ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3       ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ  ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64     ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT      ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL      ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX                ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX                ( 4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST                ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2                ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3      ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID                ( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG       ( 4*32+11) /* Silicon Debug */
+#define X86_FEATURE_FMA                ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16       ( 4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR       ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM       ( 4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_PCID       ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA                ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1     ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2     ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC     ( 4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE      ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT      ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
+#define X86_FEATURE_AES                ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE      ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE    ( 4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX                ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C       ( 4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND     ( 4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE     ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN  ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT     ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN  ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2       ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN    ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE                ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN     ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM                ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN     ( 5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM    ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM                ( 6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC    ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM                ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A      ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW       ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS                ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP                ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT     ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT                ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP                ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4       ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE                ( 6*32+17) /* translation cache extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM                ( 6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT    ( 6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT      (6*32+26) /* data breakpoint extension */
+#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_MWAITX     ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+
+#define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+
+#define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+
+#define X86_FEATURE_INTEL_PT   ( 7*32+15) /* Intel Processor Trace */
+
+/* Virtualization flags: Linux defined, word 8 */
+#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT         ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID        ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+#define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
+
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE   ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
+#define X86_FEATURE_BMI1       ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE                ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2       ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP       ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2       ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS       ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID    ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM                ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM                ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_MPX                ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F    ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ   ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED     ( 9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX                ( 9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP       ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT    ( 9*32+22) /* PCOMMIT instruction */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB       ( 9*32+24) /* CLWB instruction */
+#define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW   ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL   ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+
+/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+#define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC     (10*32+ 1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1    (10*32+ 2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES     (10*32+ 3) /* XSAVES/XRSTORS */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC    (11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
+/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
+#define X86_FEATURE_CLZERO     (13*32+0) /* CLZERO instruction */
+
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
+#define X86_FEATURE_DTHERM     (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA                (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT       (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN                (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS                (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP                (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP    (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
+#define X86_FEATURE_NPT                (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV       (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML       (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS      (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR  (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN   (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC       (15*32+13) /* Virtual Interrupt Controller */
+
+/*
+ * BUG word(s)
+ */
+#define X86_BUG(x)             (NCAPINTS*32 + (x))
+
+#define X86_BUG_F00F           X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV           X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA           X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E   X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP           X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK    X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR        X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS        X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX.  Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX         X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+
+#endif /* _ASM_X86_CPUFEATURES_H */
index 278441f..eb5deb4 100644 (file)
@@ -98,4 +98,27 @@ struct desc_ptr {
 
 #endif /* !__ASSEMBLY__ */
 
+/* Access rights as returned by LAR */
+#define AR_TYPE_RODATA         (0 * (1 << 9))
+#define AR_TYPE_RWDATA         (1 * (1 << 9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9))
+#define AR_TYPE_XOCODE         (4 * (1 << 9))
+#define AR_TYPE_XRCODE         (5 * (1 << 9))
+#define AR_TYPE_XOCODE_CONF    (6 * (1 << 9))
+#define AR_TYPE_XRCODE_CONF    (7 * (1 << 9))
+#define AR_TYPE_MASK           (7 * (1 << 9))
+
+#define AR_DPL0                        (0 * (1 << 13))
+#define AR_DPL3                        (3 * (1 << 13))
+#define AR_DPL_MASK            (3 * (1 << 13))
+
+#define AR_A                   (1 << 8)   /* "Accessed" */
+#define AR_S                   (1 << 12)  /* If clear, "System" segment */
+#define AR_P                   (1 << 15)  /* "Present" */
+#define AR_AVL                 (1 << 20)  /* "AVaiLable" (no HW effect) */
+#define AR_L                   (1 << 21)  /* "Long mode" for code segments */
+#define AR_DB                  (1 << 22)  /* D/B, effect depends on type */
+#define AR_G                   (1 << 23)  /* "Granularity" (limit in pages) */
+
 #endif /* _ASM_X86_DESC_DEFS_H */
index 535192f..3c69fed 100644 (file)
@@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 /* Use early IO mappings for DMI because it's initialized early */
 #define dmi_early_remap                early_ioremap
 #define dmi_early_unmap                early_iounmap
-#define dmi_remap              ioremap
+#define dmi_remap              ioremap_cache
 #define dmi_unmap              iounmap
 
 #endif /* _ASM_X86_DMI_H */
index 1514753..15340e3 100644 (file)
@@ -256,7 +256,7 @@ extern int force_personality32;
    instruction set this CPU supports.  This could be done in user space,
    but it's not easy, and we've already done it here.  */
 
-#define ELF_HWCAP              (boot_cpu_data.x86_capability[0])
+#define ELF_HWCAP              (boot_cpu_data.x86_capability[CPUID_1_EDX])
 
 /* This yields a string that ld.so will use to load implementation
    specific libraries for optimization.  This is more specific in
index 6d7d0e5..8554f96 100644 (file)
@@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve);
 extern int fixmaps_set;
 
 extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
+#define kmap_prot PAGE_KERNEL
 extern pte_t *pkmap_page_table;
 
 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
index 0fd440d..a212434 100644 (file)
@@ -17,6 +17,7 @@
 #include <asm/user.h>
 #include <asm/fpu/api.h>
 #include <asm/fpu/xstate.h>
+#include <asm/cpufeature.h>
 
 /*
  * High level FPU state handling functions:
@@ -58,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
  */
 static __always_inline __pure bool use_eager_fpu(void)
 {
-       return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
+       return static_cpu_has(X86_FEATURE_EAGER_FPU);
 }
 
 static __always_inline __pure bool use_xsaveopt(void)
 {
-       return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
+       return static_cpu_has(X86_FEATURE_XSAVEOPT);
 }
 
 static __always_inline __pure bool use_xsave(void)
 {
-       return static_cpu_has_safe(X86_FEATURE_XSAVE);
+       return static_cpu_has(X86_FEATURE_XSAVE);
 }
 
 static __always_inline __pure bool use_fxsr(void)
 {
-       return static_cpu_has_safe(X86_FEATURE_FXSR);
+       return static_cpu_has(X86_FEATURE_FXSR);
 }
 
 /*
@@ -300,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
 
        WARN_ON(system_state != SYSTEM_BOOTING);
 
-       if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+       if (static_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
@@ -322,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
 
        WARN_ON(system_state != SYSTEM_BOOTING);
 
-       if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+       if (static_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
@@ -460,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
         * pending. Clear the x87 state here by setting it to fixed values.
         * "m" is a random variable that should be in L1.
         */
-       if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
+       if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
@@ -589,7 +590,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
         * If the task has used the math, pre-load the FPU on xsave processors
         * or if the past 5 consecutive context-switches used math.
         */
-       fpu.preload = new_fpu->fpstate_active &&
+       fpu.preload = static_cpu_has(X86_FEATURE_FPU) &&
+                     new_fpu->fpstate_active &&
                      (use_eager_fpu() || new_fpu->counter > 5);
 
        if (old_fpu->fpregs_active) {
index af30fde..f23cd8c 100644 (file)
 
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY     (XFEATURE_MASK_FP | \
-                                XFEATURE_MASK_SSE)
-
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | \
-                                XFEATURE_MASK_BNDCSR | \
+                                XFEATURE_MASK_SSE | \
                                 XFEATURE_MASK_YMM | \
                                 XFEATURE_MASK_OPMASK | \
                                 XFEATURE_MASK_ZMM_Hi256 | \
                                 XFEATURE_MASK_Hi16_ZMM)
 
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER    (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+
 /* All currently supported features */
 #define XCNTXT_MASK    (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
index 793179c..6e4d170 100644 (file)
@@ -1,23 +1,44 @@
-#ifdef __ASSEMBLY__
+#ifndef _ASM_X86_FRAME_H
+#define _ASM_X86_FRAME_H
 
 #include <asm/asm.h>
 
-/* The annotation hides the frame from the unwinder and makes it look
-   like a ordinary ebp save/restore. This avoids some special cases for
-   frame pointer later */
+/*
+ * These are stack frame creation macros.  They should be used by every
+ * callable non-leaf asm function to make kernel stack traces more reliable.
+ */
+
 #ifdef CONFIG_FRAME_POINTER
-       .macro FRAME
-       __ASM_SIZE(push,)       %__ASM_REG(bp)
-       __ASM_SIZE(mov)         %__ASM_REG(sp), %__ASM_REG(bp)
-       .endm
-       .macro ENDFRAME
-       __ASM_SIZE(pop,)        %__ASM_REG(bp)
-       .endm
-#else
-       .macro FRAME
-       .endm
-       .macro ENDFRAME
-       .endm
-#endif
-
-#endif  /*  __ASSEMBLY__  */
+
+#ifdef __ASSEMBLY__
+
+.macro FRAME_BEGIN
+       push %_ASM_BP
+       _ASM_MOV %_ASM_SP, %_ASM_BP
+.endm
+
+.macro FRAME_END
+       pop %_ASM_BP
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define FRAME_BEGIN                            \
+       "push %" _ASM_BP "\n"                   \
+       _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n"
+
+#define FRAME_END "pop %" _ASM_BP "\n"
+
+#endif /* __ASSEMBLY__ */
+
+#define FRAME_OFFSET __ASM_SEL(4, 8)
+
+#else /* !CONFIG_FRAME_POINTER */
+
+#define FRAME_BEGIN
+#define FRAME_END
+#define FRAME_OFFSET 0
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_FRAME_H */
index cd2ce40..ebea2c9 100644 (file)
@@ -53,7 +53,7 @@
 #define IMR_MASK               (IMR_ALIGN - 1)
 
 int imr_add_range(phys_addr_t base, size_t size,
-                 unsigned int rmask, unsigned int wmask, bool lock);
+                 unsigned int rmask, unsigned int wmask);
 
 int imr_remove_range(phys_addr_t base, size_t size);
 
index cfc9a0d..a4fe16e 100644 (file)
@@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void)
                cpu_relax();
 }
 
-static inline void
-__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
-{
-       /*
-        * Subtle. In the case of the 'never do double writes' workaround
-        * we have to lock out interrupts to be safe.  As we don't care
-        * of the value read we use an atomic rmw access to avoid costly
-        * cli/sti.  Otherwise we use an even cheaper single atomic write
-        * to the APIC.
-        */
-       unsigned int cfg;
-
-       /*
-        * Wait for idle.
-        */
-       __xapic_wait_icr_idle();
-
-       /*
-        * No need to touch the target chip field
-        */
-       cfg = __prepare_ICR(shortcut, vector, dest);
-
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       native_apic_mem_write(APIC_ICR, cfg);
-}
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
 
 /*
  * This is used to send an IPI with no shorthand notation (the destination is
  * specified in bits 56 to 63 of the ICR).
  */
-static inline void
- __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
-{
-       unsigned long cfg;
-
-       /*
-        * Wait for idle.
-        */
-       if (unlikely(vector == NMI_VECTOR))
-               safe_apic_wait_icr_idle();
-       else
-               __xapic_wait_icr_idle();
-
-       /*
-        * prepare target chip field
-        */
-       cfg = __prepare_ICR2(mask);
-       native_apic_mem_write(APIC_ICR2, cfg);
-
-       /*
-        * program the ICR
-        */
-       cfg = __prepare_ICR(0, vector, dest);
-
-       /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
-       native_apic_mem_write(APIC_ICR, cfg);
-}
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
 
 extern void default_send_IPI_single(int cpu, int vector);
 extern void default_send_IPI_single_phys(int cpu, int vector);
index 881b476..e7de5c9 100644 (file)
@@ -23,11 +23,13 @@ extern void irq_ctx_init(int cpu);
 
 #define __ARCH_HAS_DO_SOFTIRQ
 
+struct irq_desc;
+
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
 extern int check_irq_vectors_for_cpu_disable(void);
 extern void fixup_irqs(void);
-extern void irq_force_complete_move(int);
+extern void irq_force_complete_move(struct irq_desc *desc);
 #endif
 
 #ifdef CONFIG_HAVE_KVM
@@ -37,7 +39,6 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
 
-struct irq_desc;
 extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
 
 extern __visible unsigned int do_IRQ(struct pt_regs *regs);
index 78162f8..d0afb05 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _ASM_IRQ_WORK_H
 #define _ASM_IRQ_WORK_H
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 
 static inline bool arch_irq_work_has_interrupt(void)
 {
index c1adf33..bc62e7c 100644 (file)
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 }
 #endif /* CONFIG_KVM_GUEST */
 
-#ifdef CONFIG_DEBUG_RODATA
 #define KVM_HYPERCALL \
         ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
 
 /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
  * instruction.  The hypervisor may replace it with something else but only the
index 19c099a..e795f52 100644 (file)
@@ -41,7 +41,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
        regs->ip = ip;
 }
 #else
-#error Live patching support is disabled; check CONFIG_LIVEPATCH
+#error Include linux/livepatch.h, not asm/livepatch.h
 #endif
 
 #endif /* _ASM_X86_LIVEPATCH_H */
index 2ea4527..92b6f65 100644 (file)
 #define MCI_STATUS_AR   (1ULL<<55)  /* Action required */
 
 /* AMD-specific bits */
-#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED    (1ULL<<44)  /* uncorrected error, deferred exception */
 #define MCI_STATUS_POISON      (1ULL<<43)  /* access poisonous data */
+#define MCI_STATUS_TCC         (1ULL<<55)  /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ *  - Deferred error interrupt type is specifiable by bank.
+ *  - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ *    But should not be used to determine MSR numbers.
+ *  - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX                0x1
+#define MCI_IPID_MCATYPE       0xFFFF0000
+#define MCI_IPID_HWID          0xFFF
 
 /*
  * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 #define MCE_LOG_LEN 32
 #define MCE_LOG_SIGNATURE      "MACHINECHECK"
 
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0       0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG      0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID                0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1       0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x)     (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x)   (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x)     (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+
 /*
  * This structure contains all data related to the MCE log.  Also
  * carries a signature to make it easier to find from external
@@ -113,6 +135,7 @@ struct mca_config {
        bool ignore_ce;
        bool disabled;
        bool ser;
+       bool recovery;
        bool bios_cmci_threshold;
        u8 banks;
        s8 bootlog;
@@ -287,4 +310,49 @@ struct cper_sec_mem_err;
 extern void apei_mce_report_mem_error(int corrected,
                                      struct cper_sec_mem_err *mem_err);
 
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+       SMCA_F17H_CORE = 0,     /* Core errors */
+       SMCA_DF,                /* Data Fabric */
+       SMCA_UMC,               /* Unified Memory Controller */
+       SMCA_PB,                /* Parameter Block */
+       SMCA_PSP,               /* Platform Security Processor */
+       SMCA_SMU,               /* System Management Unit */
+       N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+       const char *name;
+       unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+       SMCA_LS = 0,    /* Load Store */
+       SMCA_IF,        /* Instruction Fetch */
+       SMCA_L2_CACHE,  /* L2 cache */
+       SMCA_DE,        /* Decoder unit */
+       RES,            /* Reserved */
+       SMCA_EX,        /* Execution unit */
+       SMCA_FP,        /* Floating Point */
+       SMCA_L3_CACHE,  /* L3 cache */
+       N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+       SMCA_CS = 0,    /* Coherent Slave */
+       SMCA_PIE,       /* Power management, Interrupts, etc */
+       N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
 #endif /* _ASM_X86_MCE_H */
index 1e1b07a..9d3a96c 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <asm/cpu.h>
 #include <linux/earlycpio.h>
+#include <linux/initrd.h>
 
 #define native_rdmsr(msr, val1, val2)                  \
 do {                                                   \
@@ -143,4 +144,29 @@ static inline void reload_early_microcode(void)                    { }
 static inline bool
 get_builtin_firmware(struct cpio_data *cd, const char *name)   { return false; }
 #endif
+
+static inline unsigned long get_initrd_start(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+       return initrd_start;
+#else
+       return 0;
+#endif
+}
+
+static inline unsigned long get_initrd_start_addr(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_X86_32
+       unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+
+       return (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+       return get_initrd_start();
+#endif
+#else /* CONFIG_BLK_DEV_INITRD */
+       return 0;
+#endif
+}
+
 #endif /* _ASM_X86_MICROCODE_H */
index 8559b01..603417f 100644 (file)
@@ -40,7 +40,6 @@ struct extended_sigtable {
 #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
 #define EXT_HEADER_SIZE                (sizeof(struct extended_sigtable))
 #define EXT_SIGNATURE_SIZE     (sizeof(struct extended_signature))
-#define DWSIZE                 (sizeof(u32))
 
 #define get_totalsize(mc) \
        (((struct microcode_intel *)mc)->hdr.datasize ? \
index 55234d5..1ea0bae 100644 (file)
@@ -19,7 +19,8 @@ typedef struct {
 #endif
 
        struct mutex lock;
-       void __user *vdso;
+       void __user *vdso;                      /* vdso base address */
+       const struct vdso_image *vdso_image;    /* vdso image in use */
 
        atomic_t perf_rdpmc_allowed;    /* nonzero if rdpmc is allowed */
 } mm_context_t;
index b05402e..984ab75 100644 (file)
@@ -1,7 +1,12 @@
 #ifndef _ASM_X86_MSR_INDEX_H
 #define _ASM_X86_MSR_INDEX_H
 
-/* CPU model specific register (MSR) numbers */
+/*
+ * CPU model specific register (MSR) numbers.
+ *
+ * Do not add new entries to this file unless the definitions are shared
+ * between multiple compilation units.
+ */
 
 /* x86-64 specific MSRs */
 #define MSR_EFER               0xc0000080 /* extended feature register */
index c70689b..0deeb2d 100644 (file)
@@ -3,6 +3,8 @@
 
 #include <linux/sched.h>
 
+#include <asm/cpufeature.h>
+
 #define MWAIT_SUBSTATE_MASK            0xf
 #define MWAIT_CSTATE_MASK              0xf
 #define MWAIT_SUBSTATE_SIZE            4
index 46873fb..d08eacd 100644 (file)
@@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock;
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
 
+extern bool mp_should_keep_irq(struct device *dev);
+
 struct pci_raw_ops {
        int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
                                                int reg, int len, u32 *val);
index 7bcb861..5a2ed3e 100644 (file)
@@ -165,6 +165,7 @@ struct x86_pmu_capability {
 #define GLOBAL_STATUS_ASIF                             BIT_ULL(60)
 #define GLOBAL_STATUS_COUNTERS_FROZEN                  BIT_ULL(59)
 #define GLOBAL_STATUS_LBRS_FROZEN                      BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI                    BIT_ULL(55)
 
 /*
  * IBS cpuid feature detection
index 04c27a0..4432ab7 100644 (file)
@@ -366,20 +366,18 @@ static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
 }
 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
 {
+       pgprotval_t val = pgprot_val(pgprot);
        pgprot_t new;
-       unsigned long val;
 
-       val = pgprot_val(pgprot);
        pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
                ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
        return new;
 }
 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
 {
+       pgprotval_t val = pgprot_val(pgprot);
        pgprot_t new;
-       unsigned long val;
 
-       val = pgprot_val(pgprot);
        pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
                          ((val & _PAGE_PAT_LARGE) >>
                           (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
index 2d5a50c..983738a 100644 (file)
@@ -13,7 +13,7 @@ struct vm86;
 #include <asm/types.h>
 #include <uapi/asm/sigcontext.h>
 #include <asm/current.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 #include <asm/percpu.h>
@@ -24,7 +24,6 @@ struct vm86;
 #include <asm/fpu/types.h>
 
 #include <linux/personality.h>
-#include <linux/cpumask.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <linux/math64.h>
@@ -129,6 +128,8 @@ struct cpuinfo_x86 {
        u16                     booted_cores;
        /* Physical processor id: */
        u16                     phys_proc_id;
+       /* Logical processor id: */
+       u16                     logical_proc_id;
        /* Core id: */
        u16                     cpu_core_id;
        /* Compute unit id */
@@ -298,10 +299,13 @@ struct tss_struct {
         */
        unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
 
+#ifdef CONFIG_X86_32
        /*
-        * Space for the temporary SYSENTER stack:
+        * Space for the temporary SYSENTER stack.
         */
+       unsigned long           SYSENTER_stack_canary;
        unsigned long           SYSENTER_stack[64];
+#endif
 
 } ____cacheline_aligned;
 
@@ -766,7 +770,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
  * Return saved PC of a blocked thread.
  * What is this good for? it will be always the scheduler or ret_from_fork.
  */
-#define thread_saved_pc(t)     (*(unsigned long *)((t)->thread.sp - 8))
+#define thread_saved_pc(t)     READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
 
 #define task_pt_regs(tsk)      ((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
index a4a7728..9b9b30b 100644 (file)
@@ -7,12 +7,23 @@
 
 void syscall_init(void);
 
+#ifdef CONFIG_X86_64
 void entry_SYSCALL_64(void);
-void entry_SYSCALL_compat(void);
+#endif
+
+#ifdef CONFIG_X86_32
 void entry_INT80_32(void);
-void entry_INT80_compat(void);
 void entry_SYSENTER_32(void);
+void __begin_SYSENTER_singlestep_region(void);
+void __end_SYSENTER_singlestep_region(void);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
 void entry_SYSENTER_compat(void);
+void __end_entry_SYSENTER_compat(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_compat(void);
+#endif
 
 void x86_configure_nx(void);
 void x86_report_nx(void);
index 0a52424..13b6cdd 100644 (file)
@@ -7,7 +7,7 @@
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
 #endif
 
index 89db467..452c88b 100644 (file)
@@ -13,7 +13,6 @@
                         X86_EFLAGS_CF | X86_EFLAGS_RF)
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
 int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                     struct pt_regs *regs, unsigned long mask);
 
index ba665eb..db33330 100644 (file)
@@ -15,7 +15,7 @@
 
 #include <linux/stringify.h>
 #include <asm/nops.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 
 /* "Raw" instruction opcodes */
 #define __ASM_CLAC     .byte 0x0f,0x01,0xca
index dfcf072..20a3de5 100644 (file)
@@ -16,7 +16,6 @@
 #endif
 #include <asm/thread_info.h>
 #include <asm/cpumask.h>
-#include <asm/cpufeature.h>
 
 extern int smp_num_siblings;
 extern unsigned int num_processors;
index ff8b9a1..ca6ba36 100644 (file)
@@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct);
 #define memset(s, c, n) __memset(s, c, n)
 #endif
 
+/**
+ * memcpy_mcsafe - copy memory with indication if a machine check happened
+ *
+ * @dst:       destination address
+ * @src:       source address
+ * @cnt:       number of bytes to copy
+ *
+ * Low level memory copy function that catches machine checks
+ *
+ * Return true for success, false for fail
+ */
+bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
index c7b5510..8286669 100644 (file)
@@ -49,7 +49,7 @@
  */
 #ifndef __ASSEMBLY__
 struct task_struct;
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <linux/atomic.h>
 
 struct thread_info {
@@ -134,10 +134,13 @@ struct thread_info {
 #define _TIF_ADDR32            (1 << TIF_ADDR32)
 #define _TIF_X32               (1 << TIF_X32)
 
-/* work to do in syscall_trace_enter() */
+/*
+ * work to do in syscall_trace_enter().  Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
 #define _TIF_WORK_SYSCALL_ENTRY        \
        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |   \
-        _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |     \
+        _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT |       \
         _TIF_NOHZ)
 
 /* work to do on any return to user space */
index 6df2029..c24b422 100644 (file)
@@ -5,8 +5,57 @@
 #include <linux/sched.h>
 
 #include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+                            unsigned long type)
+{
+       struct { u64 d[2]; } desc = { { pcid, addr } };
+
+       /*
+        * The memory clobber is because the whole point is to invalidate
+        * stale TLB entries and, especially if we're flushing global
+        * mappings, we don't want the compiler to reorder any subsequent
+        * memory accesses before the TLB flush.
+        *
+        * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+        * invpcid (%rcx), %rax in long mode.
+        */
+       asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+                     : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR                0
+#define INVPCID_TYPE_SINGLE_CTXT       1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL   2
+#define INVPCID_TYPE_ALL_NON_GLOBAL    3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+                                    unsigned long addr)
+{
+       __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+       __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+       __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+       __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -104,6 +153,15 @@ static inline void __native_flush_tlb_global(void)
 {
        unsigned long flags;
 
+       if (static_cpu_has(X86_FEATURE_INVPCID)) {
+               /*
+                * Using INVPCID is considerably faster than a pair of writes
+                * to CR4 sandwiched inside an IRQ flag save/restore.
+                */
+               invpcid_flush_all();
+               return;
+       }
+
        /*
         * Read-modify-write to CR4 - protect it from preemption and
         * from interrupts. (Use the raw variant because this code can
index 0fb4648..7f991bd 100644 (file)
@@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { }
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#define topology_logical_package_id(cpu)       (cpu_data(cpu).logical_proc_id)
 #define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
 #define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_core_cpumask(cpu)             (per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)          (per_cpu(cpu_sibling_map, cpu))
+
+extern unsigned int __max_logical_packages;
+#define topology_max_packages()                        (__max_logical_packages)
+int topology_update_package_map(unsigned int apicid, unsigned int cpu);
+extern int topology_phys_to_logical_pkg(unsigned int pkg);
+#else
+#define topology_max_packages()                        (1)
+static inline int
+topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
index 6d7c547..174c421 100644 (file)
@@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void)
        return rdtsc();
 }
 
+extern struct system_counterval_t convert_art_to_tsc(cycle_t art);
+
 extern void tsc_init(void);
 extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
index a4a30e4..c0f27d7 100644 (file)
@@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
        likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of pairs of addresses relative to the
- * exception table enty itself: the first is the address of an
- * instruction that is allowed to fault, and the second is the address
- * at which the program should continue.  No registers are modified,
- * so it is entirely up to the continuation code to figure out what to
- * do.
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  */
 
 struct exception_table_entry {
-       int insn, fixup;
+       int insn, fixup, handler;
 };
 /* This is not the generic standard exception_table_entry format */
 #define ARCH_HAS_SORT_EXTABLE
 #define ARCH_HAS_SEARCH_EXTABLE
 
-extern int fixup_exception(struct pt_regs *regs);
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
 extern int early_fixup_exception(unsigned long *ip);
 
 /*
index f5dcb52..3fe0eac 100644 (file)
@@ -48,20 +48,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 
                switch (n) {
                case 1:
+                       __uaccess_begin();
                        __put_user_size(*(u8 *)from, (u8 __user *)to,
                                        1, ret, 1);
+                       __uaccess_end();
                        return ret;
                case 2:
+                       __uaccess_begin();
                        __put_user_size(*(u16 *)from, (u16 __user *)to,
                                        2, ret, 2);
+                       __uaccess_end();
                        return ret;
                case 4:
+                       __uaccess_begin();
                        __put_user_size(*(u32 *)from, (u32 __user *)to,
                                        4, ret, 4);
+                       __uaccess_end();
                        return ret;
                case 8:
+                       __uaccess_begin();
                        __put_user_size(*(u64 *)from, (u64 __user *)to,
                                        8, ret, 8);
+                       __uaccess_end();
                        return ret;
                }
        }
@@ -103,13 +111,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 
                switch (n) {
                case 1:
+                       __uaccess_begin();
                        __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __uaccess_end();
                        return ret;
                case 2:
+                       __uaccess_begin();
                        __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __uaccess_end();
                        return ret;
                case 4:
+                       __uaccess_begin();
                        __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __uaccess_end();
                        return ret;
                }
        }
@@ -148,13 +162,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 
                switch (n) {
                case 1:
+                       __uaccess_begin();
                        __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __uaccess_end();
                        return ret;
                case 2:
+                       __uaccess_begin();
                        __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __uaccess_end();
                        return ret;
                case 4:
+                       __uaccess_begin();
                        __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __uaccess_end();
                        return ret;
                }
        }
@@ -170,13 +190,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
 
                switch (n) {
                case 1:
+                       __uaccess_begin();
                        __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __uaccess_end();
                        return ret;
                case 2:
+                       __uaccess_begin();
                        __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __uaccess_end();
                        return ret;
                case 4:
+                       __uaccess_begin();
                        __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __uaccess_end();
                        return ret;
                }
        }
index b89c34c..3076986 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/errno.h>
 #include <linux/lockdep.h>
 #include <asm/alternative.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/page.h>
 
 /*
index deabaf9..43dc55b 100644 (file)
@@ -13,9 +13,6 @@ struct vdso_image {
        void *data;
        unsigned long size;   /* Always a multiple of PAGE_SIZE */
 
-       /* text_mapping.pages is big enough for data/size page pointers */
-       struct vm_special_mapping text_mapping;
-
        unsigned long alt, alt_len;
 
        long sym_vvar_start;  /* Negative offset to the vvar area */
index f556c48..e728699 100644 (file)
@@ -37,6 +37,12 @@ struct vsyscall_gtod_data {
 };
 extern struct vsyscall_gtod_data vsyscall_gtod_data;
 
+extern int vclocks_used;
+static inline bool vclock_was_used(int vclock)
+{
+       return READ_ONCE(vclocks_used) & (1 << vclock);
+}
+
 static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
 {
        unsigned ret;
index 968d57d..f320ee3 100644 (file)
@@ -57,7 +57,7 @@ static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
 {
        if (xen_pci_frontend && xen_pci_frontend->enable_msi)
                return xen_pci_frontend->enable_msi(dev, vectors);
-       return -ENODEV;
+       return -ENOSYS;
 }
 static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
 {
@@ -69,7 +69,7 @@ static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
 {
        if (xen_pci_frontend && xen_pci_frontend->enable_msix)
                return xen_pci_frontend->enable_msix(dev, vectors, nvec);
-       return -ENODEV;
+       return -ENOSYS;
 }
 static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
 {
index d485232..62d4111 100644 (file)
@@ -256,7 +256,7 @@ struct sigcontext_64 {
        __u16                           cs;
        __u16                           gs;
        __u16                           fs;
-       __u16                           __pad0;
+       __u16                           ss;
        __u64                           err;
        __u64                           trapno;
        __u64                           oldmask;
@@ -341,9 +341,37 @@ struct sigcontext {
        __u64                           rip;
        __u64                           eflags;         /* RFLAGS */
        __u16                           cs;
+
+       /*
+        * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+        * Linux saved and restored fs and gs in these slots.  This
+        * was counterproductive, as fsbase and gsbase were never
+        * saved, so arch_prctl was presumably unreliable.
+        *
+        * These slots should never be reused without extreme caution:
+        *
+        *  - Some DOSEMU versions stash fs and gs in these slots manually,
+        *    thus overwriting anything the kernel expects to be preserved
+        *    in these slots.
+        *
+        *  - If these slots are ever needed for any other purpose,
+        *    there is some risk that very old 64-bit binaries could get
+        *    confused.  I doubt that many such binaries still work,
+        *    though, since the same patch in 2.5.64 also removed the
+        *    64-bit set_thread_area syscall, so it appears that there
+        *    is no TLS API beyond modify_ldt that works in both pre-
+        *    and post-2.5.64 kernels.
+        *
+        * If the kernel ever adds explicit fs, gs, fsbase, and gsbase
+        * save/restore, it will most likely need to be opt-in and use
+        * different context slots.
+        */
        __u16                           gs;
        __u16                           fs;
-       __u16                           __pad0;
+       union {
+               __u16                   ss;     /* If UC_SIGCONTEXT_SS */
+               __u16                   __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */
+       };
        __u64                           err;
        __u64                           trapno;
        __u64                           oldmask;
index b7c29c8..e3d1ec9 100644 (file)
@@ -1,11 +1,54 @@
 #ifndef _ASM_X86_UCONTEXT_H
 #define _ASM_X86_UCONTEXT_H
 
-#define UC_FP_XSTATE   0x1     /* indicates the presence of extended state
-                                * information in the memory layout pointed
-                                * by the fpstate pointer in the ucontext's
-                                * sigcontext struct (uc_mcontext).
-                                */
+/*
+ * Indicates the presence of extended state information in the memory
+ * layout pointed by the fpstate pointer in the ucontext's sigcontext
+ * struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE   0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext.  All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ *     saved CS is not 64-bit)
+ *         new SS = saved SS  (will fail IRET and signal if invalid)
+ * else
+ *         new SS = a flat 32-bit data segment
+ *
+ * This behavior serves three purposes:
+ *
+ * - Legacy programs that construct a 64-bit sigcontext from scratch
+ *   with zero or garbage in the SS slot (e.g. old CRIU) and call
+ *   sigreturn will still work.
+ *
+ * - Old DOSEMU versions sometimes catch a signal from a segmented
+ *   context, delete the old SS segment (with modify_ldt), and change
+ *   the saved CS to a 64-bit segment.  These DOSEMU versions expect
+ *   sigreturn to send them back to 64-bit mode without killing them,
+ *   despite the fact that the SS selector when the signal was raised is
+ *   no longer valid.  UC_STRICT_RESTORE_SS will be clear, so the kernel
+ *   will fix up SS for these DOSEMU versions.
+ *
+ * - Old and new programs that catch a signal and return without
+ *   modifying the saved context will end up in exactly the state they
+ *   started in, even if they were running in a segmented context when
+ *   the signal was raised..  Old kernels would lose track of the
+ *   previous SS value.
+ */
+#define UC_SIGCONTEXT_SS       0x2
+#define UC_STRICT_RESTORE_SS   0x4
+#endif
 
 #include <asm-generic/ucontext.h>
 
index d1daead..adb3eaf 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/cacheflush.h>
 #include <asm/realmode.h>
 
+#include <linux/ftrace.h>
 #include "../../realmode/rm/wakeup.h"
 #include "sleep.h"
 
@@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void)
        saved_magic = 0x123456789abcdef0L;
 #endif /* CONFIG_64BIT */
 
+       /*
+        * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+        * inconsistent call/return info after it jumps to the wakeup vector.
+        */
+       pause_graph_tracing();
        do_suspend_lowlevel();
+       unpause_graph_tracing();
        return 0;
 }
 
index 8a5cdda..531b961 100644 (file)
@@ -2077,6 +2077,20 @@ int generic_processor_info(int apicid, int version)
        } else
                cpu = cpumask_next_zero(-1, cpu_present_mask);
 
+       /*
+        * This can happen on physical hotplug. The sanity check at boot time
+        * is done from native_smp_prepare_cpus() after num_possible_cpus() is
+        * established.
+        */
+       if (topology_update_package_map(apicid, cpu) < 0) {
+               int thiscpu = max + disabled_cpus;
+
+               pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+                          thiscpu, apicid);
+               disabled_cpus++;
+               return -ENOSPC;
+       }
+
        /*
         * Validate version
         */
index 9968f30..76f89e2 100644 (file)
@@ -53,7 +53,7 @@ void flat_init_apic_ldr(void)
        apic_write(APIC_LDR, val);
 }
 
-static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
+static void _flat_send_IPI_mask(unsigned long mask, int vector)
 {
        unsigned long flags;
 
index c80c02c..ab5c2c6 100644 (file)
@@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
        unsigned long value;
        unsigned int id = (x >> 24) & 0xff;
 
-       if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+       if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
                rdmsrl(MSR_FAM10H_NODE_ID, value);
                id |= (value << 2) & 0xff00;
        }
@@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
        this_cpu_write(cpu_llc_id, node);
 
        /* Account for nodes per socket in multi-core-module processors */
-       if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+       if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
                rdmsrl(MSR_FAM10H_NODE_ID, val);
                nodes = ((val >> 3) & 7) + 1;
        }
index f253218..fdb0fbf 100644 (file)
@@ -2521,6 +2521,7 @@ void __init setup_ioapic_dest(void)
 {
        int pin, ioapic, irq, irq_entry;
        const struct cpumask *mask;
+       struct irq_desc *desc;
        struct irq_data *idata;
        struct irq_chip *chip;
 
@@ -2536,7 +2537,9 @@ void __init setup_ioapic_dest(void)
                if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
                        continue;
 
-               idata = irq_get_irq_data(irq);
+               desc = irq_to_desc(irq);
+               raw_spin_lock_irq(&desc->lock);
+               idata = irq_desc_get_irq_data(desc);
 
                /*
                 * Honour affinities which have been set in early boot
@@ -2550,6 +2553,7 @@ void __init setup_ioapic_dest(void)
                /* Might be lapic_chip for irq 0 */
                if (chip->irq_set_affinity)
                        chip->irq_set_affinity(idata, mask, false);
+               raw_spin_unlock_irq(&desc->lock);
        }
 }
 #endif
index eb45fc9..28bde88 100644 (file)
 #include <asm/proto.h>
 #include <asm/ipi.h>
 
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+{
+       /*
+        * Subtle. In the case of the 'never do double writes' workaround
+        * we have to lock out interrupts to be safe.  As we don't care
+        * of the value read we use an atomic rmw access to avoid costly
+        * cli/sti.  Otherwise we use an even cheaper single atomic write
+        * to the APIC.
+        */
+       unsigned int cfg;
+
+       /*
+        * Wait for idle.
+        */
+       __xapic_wait_icr_idle();
+
+       /*
+        * No need to touch the target chip field
+        */
+       cfg = __prepare_ICR(shortcut, vector, dest);
+
+       /*
+        * Send the IPI. The write to APIC_ICR fires this off.
+        */
+       native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+{
+       unsigned long cfg;
+
+       /*
+        * Wait for idle.
+        */
+       if (unlikely(vector == NMI_VECTOR))
+               safe_apic_wait_icr_idle();
+       else
+               __xapic_wait_icr_idle();
+
+       /*
+        * prepare target chip field
+        */
+       cfg = __prepare_ICR2(mask);
+       native_apic_mem_write(APIC_ICR2, cfg);
+
+       /*
+        * program the ICR
+        */
+       cfg = __prepare_ICR(0, vector, dest);
+
+       /*
+        * Send the IPI. The write to APIC_ICR fires this off.
+        */
+       native_apic_mem_write(APIC_ICR, cfg);
+}
+
 void default_send_IPI_single_phys(int cpu, int vector)
 {
        unsigned long flags;
index 908cb37..3b670df 100644 (file)
@@ -31,7 +31,7 @@ struct apic_chip_data {
 struct irq_domain *x86_vector_domain;
 EXPORT_SYMBOL_GPL(x86_vector_domain);
 static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask;
+static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
 static struct irq_chip lapic_controller;
 #ifdef CONFIG_X86_IO_APIC
 static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -118,35 +118,47 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
         */
        static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
        static int current_offset = VECTOR_OFFSET_START % 16;
-       int cpu, err;
+       int cpu, vector;
 
-       if (d->move_in_progress)
+       /*
+        * If there is still a move in progress or the previous move has not
+        * been cleaned up completely, tell the caller to come back later.
+        */
+       if (d->move_in_progress ||
+           cpumask_intersects(d->old_domain, cpu_online_mask))
                return -EBUSY;
 
        /* Only try and allocate irqs on cpus that are present */
-       err = -ENOSPC;
        cpumask_clear(d->old_domain);
+       cpumask_clear(searched_cpumask);
        cpu = cpumask_first_and(mask, cpu_online_mask);
        while (cpu < nr_cpu_ids) {
-               int new_cpu, vector, offset;
+               int new_cpu, offset;
 
+               /* Get the possible target cpus for @mask/@cpu from the apic */
                apic->vector_allocation_domain(cpu, vector_cpumask, mask);
 
+               /*
+                * Clear the offline cpus from @vector_cpumask for searching
+                * and verify whether the result overlaps with @mask. If true,
+                * then the call to apic->cpu_mask_to_apicid_and() will
+                * succeed as well. If not, no point in trying to find a
+                * vector in this mask.
+                */
+               cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
+               if (!cpumask_intersects(vector_searchmask, mask))
+                       goto next_cpu;
+
                if (cpumask_subset(vector_cpumask, d->domain)) {
-                       err = 0;
                        if (cpumask_equal(vector_cpumask, d->domain))
-                               break;
+                               goto success;
                        /*
-                        * New cpumask using the vector is a proper subset of
-                        * the current in use mask. So cleanup the vector
-                        * allocation for the members that are not used anymore.
+                        * Mark the cpus which are not longer in the mask for
+                        * cleanup.
                         */
-                       cpumask_andnot(d->old_domain, d->domain,
-                                      vector_cpumask);
-                       d->move_in_progress =
-                          cpumask_intersects(d->old_domain, cpu_online_mask);
-                       cpumask_and(d->domain, d->domain, vector_cpumask);
-                       break;
+                       cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
+                       vector = d->cfg.vector;
+                       goto update;
                }
 
                vector = current_vector;
@@ -158,45 +170,60 @@ next:
                        vector = FIRST_EXTERNAL_VECTOR + offset;
                }
 
-               if (unlikely(current_vector == vector)) {
-                       cpumask_or(d->old_domain, d->old_domain,
-                                  vector_cpumask);
-                       cpumask_andnot(vector_cpumask, mask, d->old_domain);
-                       cpu = cpumask_first_and(vector_cpumask,
-                                               cpu_online_mask);
-                       continue;
-               }
+               /* If the search wrapped around, try the next cpu */
+               if (unlikely(current_vector == vector))
+                       goto next_cpu;
 
                if (test_bit(vector, used_vectors))
                        goto next;
 
-               for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
+               for_each_cpu(new_cpu, vector_searchmask) {
                        if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
                                goto next;
                }
                /* Found one! */
                current_vector = vector;
                current_offset = offset;
-               if (d->cfg.vector) {
+               /* Schedule the old vector for cleanup on all cpus */
+               if (d->cfg.vector)
                        cpumask_copy(d->old_domain, d->domain);
-                       d->move_in_progress =
-                          cpumask_intersects(d->old_domain, cpu_online_mask);
-               }
-               for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
+               for_each_cpu(new_cpu, vector_searchmask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
-               d->cfg.vector = vector;
-               cpumask_copy(d->domain, vector_cpumask);
-               err = 0;
-               break;
-       }
+               goto update;
 
-       if (!err) {
-               /* cache destination APIC IDs into cfg->dest_apicid */
-               err = apic->cpu_mask_to_apicid_and(mask, d->domain,
-                                                  &d->cfg.dest_apicid);
+next_cpu:
+               /*
+                * We exclude the current @vector_cpumask from the requested
+                * @mask and try again with the next online cpu in the
+                * result. We cannot modify @mask, so we use @vector_cpumask
+                * as a temporary buffer here as it will be reassigned when
+                * calling apic->vector_allocation_domain() above.
+                */
+               cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
+               cpumask_andnot(vector_cpumask, mask, searched_cpumask);
+               cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
+               continue;
        }
+       return -ENOSPC;
 
-       return err;
+update:
+       /*
+        * Exclude offline cpus from the cleanup mask and set the
+        * move_in_progress flag when the result is not empty.
+        */
+       cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+       d->move_in_progress = !cpumask_empty(d->old_domain);
+       d->cfg.vector = vector;
+       cpumask_copy(d->domain, vector_cpumask);
+success:
+       /*
+        * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
+        * as we already established, that mask & d->domain & cpu_online_mask
+        * is not empty.
+        */
+       BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
+                                           &d->cfg.dest_apicid));
+       return 0;
 }
 
 static int assign_irq_vector(int irq, struct apic_chip_data *data,
@@ -226,10 +253,8 @@ static int assign_irq_vector_policy(int irq, int node,
 static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
        struct irq_desc *desc;
-       unsigned long flags;
        int cpu, vector;
 
-       raw_spin_lock_irqsave(&vector_lock, flags);
        BUG_ON(!data->cfg.vector);
 
        vector = data->cfg.vector;
@@ -239,10 +264,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
        data->cfg.vector = 0;
        cpumask_clear(data->domain);
 
-       if (likely(!data->move_in_progress)) {
-               raw_spin_unlock_irqrestore(&vector_lock, flags);
+       /*
+        * If move is in progress or the old_domain mask is not empty,
+        * i.e. the cleanup IPI has not been processed yet, we need to remove
+        * the old references to desc from all cpus vector tables.
+        */
+       if (!data->move_in_progress && cpumask_empty(data->old_domain))
                return;
-       }
 
        desc = irq_to_desc(irq);
        for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
@@ -255,7 +283,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
                }
        }
        data->move_in_progress = 0;
-       raw_spin_unlock_irqrestore(&vector_lock, flags);
 }
 
 void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -276,19 +303,24 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
 static void x86_vector_free_irqs(struct irq_domain *domain,
                                 unsigned int virq, unsigned int nr_irqs)
 {
+       struct apic_chip_data *apic_data;
        struct irq_data *irq_data;
+       unsigned long flags;
        int i;
 
        for (i = 0; i < nr_irqs; i++) {
                irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
                if (irq_data && irq_data->chip_data) {
+                       raw_spin_lock_irqsave(&vector_lock, flags);
                        clear_irq_vector(virq + i, irq_data->chip_data);
-                       free_apic_chip_data(irq_data->chip_data);
+                       apic_data = irq_data->chip_data;
+                       irq_domain_reset_irq_data(irq_data);
+                       raw_spin_unlock_irqrestore(&vector_lock, flags);
+                       free_apic_chip_data(apic_data);
 #ifdef CONFIG_X86_IO_APIC
                        if (virq + i < nr_legacy_irqs())
                                legacy_irq_data[virq + i] = NULL;
 #endif
-                       irq_domain_reset_irq_data(irq_data);
                }
        }
 }
@@ -406,6 +438,8 @@ int __init arch_early_irq_init(void)
        arch_init_htirq_domain(x86_vector_domain);
 
        BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+       BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+       BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
 
        return arch_early_ioapic_init();
 }
@@ -494,14 +528,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
                return -EINVAL;
 
        err = assign_irq_vector(irq, data, dest);
-       if (err) {
-               if (assign_irq_vector(irq, data,
-                                     irq_data_get_affinity_mask(irq_data)))
-                       pr_err("Failed to recover vector for irq %d\n", irq);
-               return err;
-       }
-
-       return IRQ_SET_MASK_OK;
+       return err ? err : IRQ_SET_MASK_OK;
 }
 
 static struct irq_chip lapic_controller = {
@@ -513,20 +540,12 @@ static struct irq_chip lapic_controller = {
 #ifdef CONFIG_SMP
 static void __send_cleanup_vector(struct apic_chip_data *data)
 {
-       cpumask_var_t cleanup_mask;
-
-       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-               unsigned int i;
-
-               for_each_cpu_and(i, data->old_domain, cpu_online_mask)
-                       apic->send_IPI_mask(cpumask_of(i),
-                                           IRQ_MOVE_CLEANUP_VECTOR);
-       } else {
-               cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
-               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               free_cpumask_var(cleanup_mask);
-       }
+       raw_spin_lock(&vector_lock);
+       cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
        data->move_in_progress = 0;
+       if (!cpumask_empty(data->old_domain))
+               apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+       raw_spin_unlock(&vector_lock);
 }
 
 void send_cleanup_vector(struct irq_cfg *cfg)
@@ -570,12 +589,25 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
                        goto unlock;
 
                /*
-                * Check if the irq migration is in progress. If so, we
-                * haven't received the cleanup request yet for this irq.
+                * Nothing to cleanup if irq migration is in progress
+                * or this cpu is not set in the cleanup mask.
                 */
-               if (data->move_in_progress)
+               if (data->move_in_progress ||
+                   !cpumask_test_cpu(me, data->old_domain))
                        goto unlock;
 
+               /*
+                * We have two cases to handle here:
+                * 1) vector is unchanged but the target mask got reduced
+                * 2) vector and the target mask has changed
+                *
+                * #1 is obvious, but in #2 we have two vectors with the same
+                * irq descriptor: the old and the new vector. So we need to
+                * make sure that we only cleanup the old vector. The new
+                * vector has the current @vector number in the config and
+                * this cpu is part of the target mask. We better leave that
+                * one alone.
+                */
                if (vector == data->cfg.vector &&
                    cpumask_test_cpu(me, data->domain))
                        goto unlock;
@@ -593,6 +625,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
                        goto unlock;
                }
                __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+               cpumask_clear_cpu(me, data->old_domain);
 unlock:
                raw_spin_unlock(&desc->lock);
        }
@@ -621,12 +654,48 @@ void irq_complete_move(struct irq_cfg *cfg)
        __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
 }
 
-void irq_force_complete_move(int irq)
+/*
+ * Called with @desc->lock held and interrupts disabled.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
 {
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_data *irqdata = irq_desc_get_irq_data(desc);
+       struct apic_chip_data *data = apic_chip_data(irqdata);
+       struct irq_cfg *cfg = data ? &data->cfg : NULL;
 
-       if (cfg)
-               __irq_complete_move(cfg, cfg->vector);
+       if (!cfg)
+               return;
+
+       __irq_complete_move(cfg, cfg->vector);
+
+       /*
+        * This is tricky. If the cleanup of @data->old_domain has not been
+        * done yet, then the following setaffinity call will fail with
+        * -EBUSY. This can leave the interrupt in a stale state.
+        *
+        * The cleanup cannot make progress because we hold @desc->lock. So in
+        * case @data->old_domain is not yet cleaned up, we need to drop the
+        * lock and acquire it again. @desc cannot go away, because the
+        * hotplug code holds the sparse irq lock.
+        */
+       raw_spin_lock(&vector_lock);
+       /* Clean out all offline cpus (including ourself) first. */
+       cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
+       while (!cpumask_empty(data->old_domain)) {
+               raw_spin_unlock(&vector_lock);
+               raw_spin_unlock(&desc->lock);
+               cpu_relax();
+               raw_spin_lock(&desc->lock);
+               /*
+                * Reevaluate apic_chip_data. It might have been cleared after
+                * we dropped @desc->lock.
+                */
+               data = apic_chip_data(irqdata);
+               if (!data)
+                       return;
+               raw_spin_lock(&vector_lock);
+       }
+       raw_spin_unlock(&vector_lock);
 }
 #endif
 
index d760c6b..624db00 100644 (file)
@@ -889,7 +889,10 @@ void __init uv_system_init(void)
                return;
        }
        pr_info("UV: Found %s hub\n", hub);
-       map_low_mmrs();
+
+       /* We now only need to map the MMRs on UV1 */
+       if (is_uv1_hub())
+               map_low_mmrs();
 
        m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
        m_val = m_n_config.s.m_skt;
index 84a7524..5c04246 100644 (file)
@@ -59,7 +59,6 @@ void common(void) {
 
 #ifdef CONFIG_PARAVIRT
        BLANK();
-       OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
        OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
        OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
        OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
index 6ce3902..ecdc1d2 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
 };
@@ -52,6 +52,11 @@ void foo(void)
        DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
               offsetofend(struct tss_struct, SYSENTER_stack));
 
+       /* Offset from cpu_tss to SYSENTER_stack */
+       OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+       /* Size of SYSENTER_stack */
+       DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
        BLANK();
        OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
index f2edafb..d875f97 100644 (file)
@@ -4,17 +4,11 @@
 
 #include <asm/ia32.h>
 
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
 static char syscalls_64[] = {
 #include <asm/syscalls_64.h>
 };
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls_ia32[] = {
 #include <asm/syscalls_32.h>
 };
index 5803130..0d373d7 100644 (file)
@@ -30,33 +30,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)               += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
 
-obj-$(CONFIG_PERF_EVENTS)              += perf_event.o
-
-ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd.o perf_event_amd_uncore.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd_iommu.o
-endif
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_p6.o perf_event_knc.o perf_event_p4.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_pt.o perf_event_intel_bts.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_cstate.o
-
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
-                                          perf_event_intel_uncore_snb.o \
-                                          perf_event_intel_uncore_snbep.o \
-                                          perf_event_intel_uncore_nhmex.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_msr.o
-obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_msr.o
-endif
-
-
 obj-$(CONFIG_X86_MCE)                  += mcheck/
 obj-$(CONFIG_MTRR)                     += mtrr/
 obj-$(CONFIG_MICROCODE)                        += microcode/
 
-obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o
 
 obj-$(CONFIG_HYPERVISOR_GUEST)         += vmware.o hypervisor.o mshyperv.o
 
@@ -64,7 +42,7 @@ ifdef CONFIG_X86_FEATURE_NAMES
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
 
-cpufeature = $(src)/../../include/asm/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeatures.h
 
 targets += capflags.c
 $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
index a07956a..97c59fd 100644 (file)
@@ -117,7 +117,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                void (*f_vide)(void);
                u64 d, d2;
 
-               printk(KERN_INFO "AMD K6 stepping B detected - ");
+               pr_info("AMD K6 stepping B detected - ");
 
                /*
                 * It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -133,10 +133,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                d = d2-d;
 
                if (d > 20*K6_BUG_LOOP)
-                       printk(KERN_CONT
-                               "system stability may be impaired when more than 32 MB are used.\n");
+                       pr_cont("system stability may be impaired when more than 32 MB are used.\n");
                else
-                       printk(KERN_CONT "probably OK (after B9730xxxx).\n");
+                       pr_cont("probably OK (after B9730xxxx).\n");
        }
 
        /* K6 with old style WHCR */
@@ -154,7 +153,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                        wbinvd();
                        wrmsr(MSR_K6_WHCR, l, h);
                        local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling old style K6 write allocation for %d Mb\n",
                                mbytes);
                }
                return;
@@ -175,7 +174,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
                        wbinvd();
                        wrmsr(MSR_K6_WHCR, l, h);
                        local_irq_restore(flags);
-                       printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+                       pr_info("Enabling new style K6 write allocation for %d Mb\n",
                                mbytes);
                }
 
@@ -202,7 +201,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
         */
        if (c->x86_model >= 6 && c->x86_model <= 10) {
                if (!cpu_has(c, X86_FEATURE_XMM)) {
-                       printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+                       pr_info("Enabling disabled K7/SSE Support.\n");
                        msr_clear_bit(MSR_K7_HWCR, 15);
                        set_cpu_cap(c, X86_FEATURE_XMM);
                }
@@ -216,9 +215,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
        if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
                rdmsr(MSR_K7_CLK_CTL, l, h);
                if ((l & 0xfff00000) != 0x20000000) {
-                       printk(KERN_INFO
-                           "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
-                                       l, ((l & 0x000fffff)|0x20000000));
+                       pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+                               l, ((l & 0x000fffff)|0x20000000));
                        wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
                }
        }
@@ -485,7 +483,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
                if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
                        unsigned long pfn = tseg >> PAGE_SHIFT;
 
-                       printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+                       pr_debug("tseg: %010llx\n", tseg);
                        if (pfn_range_is_mapped(pfn, pfn + 1))
                                set_memory_4k((unsigned long)__va(tseg), 1);
                }
@@ -500,8 +498,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
                        rdmsrl(MSR_K7_HWCR, val);
                        if (!(val & BIT(24)))
-                               printk(KERN_WARNING FW_BUG "TSC doesn't count "
-                                       "with P0 frequency!\n");
+                               pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
                }
        }
 
index 04f0fe5..a972ac4 100644 (file)
@@ -15,7 +15,7 @@ void __init check_bugs(void)
 {
        identify_boot_cpu();
 #if !defined(CONFIG_SMP)
-       printk(KERN_INFO "CPU: ");
+       pr_info("CPU: ");
        print_cpu_info(&boot_cpu_data);
 #endif
        alternative_instructions();
index ae20be6..1661d8e 100644 (file)
@@ -1,7 +1,7 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/e820.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
@@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                        rdmsr(MSR_VIA_FCR, lo, hi);
                        lo |= ACE_FCR;          /* enable ACE unit */
                        wrmsr(MSR_VIA_FCR, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+                       pr_info("CPU: Enabled ACE h/w crypto\n");
                }
 
                /* enable RNG unit, if present and disabled */
@@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c)
                        rdmsr(MSR_VIA_RNG, lo, hi);
                        lo |= RNG_ENABLE;       /* enable RNG unit */
                        wrmsr(MSR_VIA_RNG, lo, hi);
-                       printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+                       pr_info("CPU: Enabled h/w RNG\n");
                }
 
                /* store Centaur Extended Feature Flags as
@@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
                        name = "C6";
                        fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
                        fcr_clr = DPDC;
-                       printk(KERN_NOTICE "Disabling bugged TSC.\n");
+                       pr_notice("Disabling bugged TSC.\n");
                        clear_cpu_cap(c, X86_FEATURE_TSC);
                        break;
                case 8:
@@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c)
                newlo = (lo|fcr_set) & (~fcr_clr);
 
                if (newlo != lo) {
-                       printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+                       pr_info("Centaur FCR was 0x%X now 0x%X\n",
                                lo, newlo);
                        wrmsr(MSR_IDT_FCR1, newlo, hi);
                } else {
-                       printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+                       pr_info("Centaur FCR is 0x%X\n", lo);
                }
                /* Emulate MTRRs using Centaur's MCR. */
                set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
index 37830de..249461f 100644 (file)
@@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+static int __init x86_noinvpcid_setup(char *s)
+{
+       /* noinvpcid doesn't accept parameters */
+       if (s)
+               return -EINVAL;
+
+       /* do not emit a message if the feature is not present */
+       if (!boot_cpu_has(X86_FEATURE_INVPCID))
+               return 0;
+
+       setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+       pr_info("noinvpcid: INVPCID feature disabled\n");
+       return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
 static int disable_x86_serial_nr = 1;
@@ -228,7 +244,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
        lo |= 0x200000;
        wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
 
-       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       pr_notice("CPU serial number disabled.\n");
        clear_cpu_cap(c, X86_FEATURE_PN);
 
        /* Disabling the serial number may affect the cpuid level */
@@ -329,9 +345,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
                if (!warn)
                        continue;
 
-               printk(KERN_WARNING
-                      "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
-                               x86_cap_flag(df->feature), df->level);
+               pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+                       x86_cap_flag(df->feature), df->level);
        }
 }
 
@@ -510,7 +525,7 @@ void detect_ht(struct cpuinfo_x86 *c)
        smp_num_siblings = (ebx & 0xff0000) >> 16;
 
        if (smp_num_siblings == 1) {
-               printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
+               pr_info_once("CPU0: Hyper-Threading is disabled\n");
                goto out;
        }
 
@@ -531,10 +546,10 @@ void detect_ht(struct cpuinfo_x86 *c)
 
 out:
        if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-                      c->phys_proc_id);
-               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-                      c->cpu_core_id);
+               pr_info("CPU: Physical Processor ID: %d\n",
+                       c->phys_proc_id);
+               pr_info("CPU: Processor Core ID: %d\n",
+                       c->cpu_core_id);
                printed = 1;
        }
 #endif
@@ -559,9 +574,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c)
                }
        }
 
-       printk_once(KERN_ERR
-                       "CPU: vendor_id '%s' unknown, using generic init.\n" \
-                       "CPU: Your system may be unstable.\n", v);
+       pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+                   "CPU: Your system may be unstable.\n", v);
 
        c->x86_vendor = X86_VENDOR_UNKNOWN;
        this_cpu = &default_cpu;
@@ -760,7 +774,7 @@ void __init early_cpu_init(void)
        int count = 0;
 
 #ifdef CONFIG_PROCESSOR_SELECT
-       printk(KERN_INFO "KERNEL supported cpus:\n");
+       pr_info("KERNEL supported cpus:\n");
 #endif
 
        for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
@@ -778,7 +792,7 @@ void __init early_cpu_init(void)
                        for (j = 0; j < 2; j++) {
                                if (!cpudev->c_ident[j])
                                        continue;
-                               printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
+                               pr_info("  %s %s\n", cpudev->c_vendor,
                                        cpudev->c_ident[j]);
                        }
                }
@@ -802,6 +816,31 @@ static void detect_nopl(struct cpuinfo_x86 *c)
        clear_cpu_cap(c, X86_FEATURE_NOPL);
 #else
        set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
+
+       /*
+        * ESPFIX is a strange bug.  All real CPUs have it.  Paravirt
+        * systems that run Linux at CPL > 0 may or may not have the
+        * issue, but, even if they have the issue, there's absolutely
+        * nothing we can do about it because we can't use the real IRET
+        * instruction.
+        *
+        * NB: For the time being, only 32-bit kernels support
+        * X86_BUG_ESPFIX as such.  64-bit kernels directly choose
+        * whether to apply espfix using paravirt hooks.  If any
+        * non-paravirt system ever shows up that does *not* have the
+        * ESPFIX issue, we can change this.
+        */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_PARAVIRT
+       do {
+               extern void native_iret(void);
+               if (pv_cpu_ops.iret == native_iret)
+                       set_cpu_bug(c, X86_BUG_ESPFIX);
+       } while (0);
+#else
+       set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
 #endif
 }
 
@@ -977,6 +1016,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
        numa_add_cpu(smp_processor_id());
 #endif
+       /* The boot/hotplug time assigment got cleared, restore it */
+       c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id);
 }
 
 /*
@@ -1061,7 +1102,7 @@ static void __print_cpu_msr(void)
                for (index = index_min; index < index_max; index++) {
                        if (rdmsrl_safe(index, &val))
                                continue;
-                       printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+                       pr_info(" MSR%08x: %016llx\n", index, val);
                }
        }
 }
@@ -1100,19 +1141,19 @@ void print_cpu_info(struct cpuinfo_x86 *c)
        }
 
        if (vendor && !strstr(c->x86_model_id, vendor))
-               printk(KERN_CONT "%s ", vendor);
+               pr_cont("%s ", vendor);
 
        if (c->x86_model_id[0])
-               printk(KERN_CONT "%s", c->x86_model_id);
+               pr_cont("%s", c->x86_model_id);
        else
-               printk(KERN_CONT "%d86", c->x86);
+               pr_cont("%d86", c->x86);
 
-       printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+       pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
 
        if (c->x86_mask || c->cpuid_level >= 0)
-               printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
+               pr_cont(", stepping: 0x%x)\n", c->x86_mask);
        else
-               printk(KERN_CONT ")\n");
+               pr_cont(")\n");
 
        print_cpu_msr(c);
 }
@@ -1438,7 +1479,7 @@ void cpu_init(void)
 
        show_ucode_info_early();
 
-       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+       pr_info("Initializing CPU#%d\n", cpu);
 
        if (cpu_feature_enabled(X86_FEATURE_VME) ||
            cpu_has_tsc ||
@@ -1475,20 +1516,6 @@ void cpu_init(void)
 }
 #endif
 
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-void warn_pre_alternatives(void)
-{
-       WARN(1, "You're using static_cpu_has before alternatives have run!\n");
-}
-EXPORT_SYMBOL_GPL(warn_pre_alternatives);
-#endif
-
-inline bool __static_cpu_has_safe(u16 bit)
-{
-       return boot_cpu_has(bit);
-}
-EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
-
 static void bsp_resume(void)
 {
        if (this_cpu->c_bsp_resume)
index aaf152e..6adef9c 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/timer.h>
 #include <asm/pci-direct.h>
 #include <asm/tsc.h>
+#include <asm/cpufeature.h>
 
 #include "cpu.h"
 
@@ -103,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c)
                local_irq_restore(flags);
 
                if (ccr5 & 2) { /* possible wrong calibration done */
-                       printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+                       pr_info("Recalibrating delay loop with SLOP bit reset\n");
                        calibrate_delay();
                        c->loops_per_jiffy = loops_per_jiffy;
                }
@@ -115,7 +116,7 @@ static void set_cx86_reorder(void)
 {
        u8 ccr3;
 
-       printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
        ccr3 = getCx86(CX86_CCR3);
        setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
 
@@ -128,7 +129,7 @@ static void set_cx86_reorder(void)
 
 static void set_cx86_memwb(void)
 {
-       printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+       pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
 
        /* CCR2 bit 2: unlock NW bit */
        setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
@@ -268,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
                 *  VSA1 we work around however.
                 */
 
-               printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+               pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
                isa_dma_bridge_buggy = 2;
 
                /* We do this before the PCI layer is running. However we
@@ -426,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
                if (dir0 == 5 || dir0 == 3) {
                        unsigned char ccr3;
                        unsigned long flags;
-                       printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+                       pr_info("Enabling CPUID on Cyrix processor.\n");
                        local_irq_save(flags);
                        ccr3 = getCx86(CX86_CCR3);
                        /* enable MAPEN  */
index d820d8e..73d391a 100644 (file)
@@ -56,7 +56,7 @@ detect_hypervisor_vendor(void)
        }
 
        if (max_pri)
-               printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
+               pr_info("Hypervisor detected: %s\n", x86_hyper->name);
 }
 
 void init_hypervisor(struct cpuinfo_x86 *c)
index 565648b..1f7fdb9 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/bugs.h>
@@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
         */
        if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
            c->microcode < 0x20e) {
-               printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+               pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
                clear_cpu_cap(c, X86_FEATURE_PSE);
        }
 
@@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
        if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
                rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
-                       printk(KERN_INFO "Disabled fast string operations\n");
+                       pr_info("Disabled fast string operations\n");
                        setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
                        setup_clear_cpu_cap(X86_FEATURE_ERMS);
                }
@@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                pr_info("Disabling PGE capability bit\n");
                setup_clear_cpu_cap(X86_FEATURE_PGE);
        }
+
+       if (c->cpuid_level >= 0x00000001) {
+               u32 eax, ebx, ecx, edx;
+
+               cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+               /*
+                * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+                * apicids which are reserved per package. Store the resulting
+                * shift value for the package management code.
+                */
+               if (edx & (1U << 28))
+                       c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+       }
 }
 
 #ifdef CONFIG_X86_32
@@ -176,7 +189,7 @@ int ppro_with_ram_bug(void)
            boot_cpu_data.x86 == 6 &&
            boot_cpu_data.x86_model == 1 &&
            boot_cpu_data.x86_mask < 8) {
-               printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+               pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
                return 1;
        }
        return 0;
@@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 
                set_cpu_bug(c, X86_BUG_F00F);
                if (!f00f_workaround_enabled) {
-                       printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+                       pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
                        f00f_workaround_enabled = 1;
                }
        }
@@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
         * Forcefully enable PAE if kernel parameter "forcepae" is present.
         */
        if (forcepae) {
-               printk(KERN_WARNING "PAE forced!\n");
+               pr_warn("PAE forced!\n");
                set_cpu_cap(c, X86_FEATURE_PAE);
                add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
        }
index 0b6c523..de6626c 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/sysfs.h>
 #include <linux/pci.h>
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/amd_nb.h>
 #include <asm/smp.h>
 
@@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
        err = amd_set_l3_disable_slot(nb, cpu, slot, val);
        if (err) {
                if (err == -EEXIST)
-                       pr_warning("L3 slot %d in use/index already disabled!\n",
+                       pr_warn("L3 slot %d in use/index already disabled!\n",
                                   slot);
                return err;
        }
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
deleted file mode 100644 (file)
index 336878a..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#ifndef __INTEL_PT_H__
-#define __INTEL_PT_H__
-
-/*
- * Single-entry ToPA: when this close to region boundary, switch
- * buffers to avoid losing data.
- */
-#define TOPA_PMI_MARGIN 512
-
-#define TOPA_SHIFT 12
-
-static inline unsigned int sizes(unsigned int tsz)
-{
-       return 1 << (tsz + TOPA_SHIFT);
-};
-
-struct topa_entry {
-       u64     end     : 1;
-       u64     rsvd0   : 1;
-       u64     intr    : 1;
-       u64     rsvd1   : 1;
-       u64     stop    : 1;
-       u64     rsvd2   : 1;
-       u64     size    : 4;
-       u64     rsvd3   : 2;
-       u64     base    : 36;
-       u64     rsvd4   : 16;
-};
-
-#define PT_CPUID_LEAVES                2
-#define PT_CPUID_REGS_NUM      4 /* number of regsters (eax, ebx, ecx, edx) */
-
-enum pt_capabilities {
-       PT_CAP_max_subleaf = 0,
-       PT_CAP_cr3_filtering,
-       PT_CAP_psb_cyc,
-       PT_CAP_mtc,
-       PT_CAP_topa_output,
-       PT_CAP_topa_multiple_entries,
-       PT_CAP_single_range_output,
-       PT_CAP_payloads_lip,
-       PT_CAP_mtc_periods,
-       PT_CAP_cycle_thresholds,
-       PT_CAP_psb_periods,
-};
-
-struct pt_pmu {
-       struct pmu              pmu;
-       u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
-};
-
-/**
- * struct pt_buffer - buffer configuration; one buffer per task_struct or
- *             cpu, depending on perf event configuration
- * @cpu:       cpu for per-cpu allocation
- * @tables:    list of ToPA tables in this buffer
- * @first:     shorthand for first topa table
- * @last:      shorthand for last topa table
- * @cur:       current topa table
- * @nr_pages:  buffer size in pages
- * @cur_idx:   current output region's index within @cur table
- * @output_off:        offset within the current output region
- * @data_size: running total of the amount of data in this buffer
- * @lost:      if data was lost/truncated
- * @head:      logical write offset inside the buffer
- * @snapshot:  if this is for a snapshot/overwrite counter
- * @stop_pos:  STOP topa entry in the buffer
- * @intr_pos:  INT topa entry in the buffer
- * @data_pages:        array of pages from perf
- * @topa_index:        table of topa entries indexed by page offset
- */
-struct pt_buffer {
-       int                     cpu;
-       struct list_head        tables;
-       struct topa             *first, *last, *cur;
-       unsigned int            cur_idx;
-       size_t                  output_off;
-       unsigned long           nr_pages;
-       local_t                 data_size;
-       local_t                 lost;
-       local64_t               head;
-       bool                    snapshot;
-       unsigned long           stop_pos, intr_pos;
-       void                    **data_pages;
-       struct topa_entry       *topa_index[0];
-};
-
-/**
- * struct pt - per-cpu pt context
- * @handle:    perf output handle
- * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
- */
-struct pt {
-       struct perf_output_handle handle;
-       int                     handle_nmi;
-};
-
-#endif /* __INTEL_PT_H__ */
index afa9f0d..fbb5e90 100644 (file)
@@ -1,5 +1,5 @@
 #include <asm/cpu_device_id.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/slab.h>
index 4cfba43..517619e 100644 (file)
@@ -115,7 +115,7 @@ static int raise_local(void)
        int cpu = m->extcpu;
 
        if (m->inject_flags & MCJ_EXCEPTION) {
-               printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+               pr_info("Triggering MCE exception on CPU %d\n", cpu);
                switch (context) {
                case MCJ_CTX_IRQ:
                        /*
@@ -128,15 +128,15 @@ static int raise_local(void)
                        raise_exception(m, NULL);
                        break;
                default:
-                       printk(KERN_INFO "Invalid MCE context\n");
+                       pr_info("Invalid MCE context\n");
                        ret = -EINVAL;
                }
-               printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+               pr_info("MCE exception done on CPU %d\n", cpu);
        } else if (m->status) {
-               printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+               pr_info("Starting machine check poll CPU %d\n", cpu);
                raise_poll(m);
                mce_notify_irq();
-               printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+               pr_info("Machine check poll done on CPU %d\n", cpu);
        } else
                m->finished = 0;
 
@@ -183,8 +183,7 @@ static void raise_mce(struct mce *m)
                start = jiffies;
                while (!cpumask_empty(mce_inject_cpumask)) {
                        if (!time_before(jiffies, start + 2*HZ)) {
-                               printk(KERN_ERR
-                               "Timeout waiting for mce inject %lx\n",
+                               pr_err("Timeout waiting for mce inject %lx\n",
                                        *cpumask_bits(mce_inject_cpumask));
                                break;
                        }
@@ -241,7 +240,7 @@ static int inject_init(void)
 {
        if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
                return -ENOMEM;
-       printk(KERN_INFO "Machine check injector initialized\n");
+       pr_info("Machine check injector initialized\n");
        register_mce_write_callback(mce_write);
        register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
                                "mce_notify");
index 9c682c2..5119766 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/debugfs.h>
 #include <asm/mce.h>
+#include <asm/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -29,7 +30,7 @@
  * panic situations)
  */
 
-enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
 enum ser { SER_REQUIRED = 1, NO_SER = 2 };
 enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
 
@@ -48,6 +49,7 @@ static struct severity {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
 #define  KERNEL                .context = IN_KERNEL
 #define  USER          .context = IN_USER
+#define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
 #define  SER           .ser = SER_REQUIRED
 #define  NOSER         .ser = NO_SER
 #define  EXCP          .excp = EXCP_CONTEXT
@@ -86,6 +88,10 @@ static struct severity {
                PANIC, "In kernel and no restart IP",
                EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
                ),
+       MCESEV(
+               PANIC, "In kernel and no restart IP",
+               EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+               ),
        MCESEV(
                DEFERRED, "Deferred error",
                NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
@@ -122,6 +128,11 @@ static struct severity {
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
                MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
                ),
+       MCESEV(
+               AR, "Action required: data load in error recoverable area of kernel",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               KERNEL_RECOV
+               ),
        MCESEV(
                AR, "Action required: data load error in a user process",
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
@@ -170,6 +181,9 @@ static struct severity {
                )       /* always matches. keep at end */
 };
 
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+                               (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
 /*
  * If mcgstatus indicated that ip/cs on the stack were
  * no good, then "m->cs" will be zero and we will have
@@ -183,7 +197,11 @@ static struct severity {
  */
 static int error_context(struct mce *m)
 {
-       return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+       if ((m->cs & 3) == 3)
+               return IN_USER;
+       if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+               return IN_KERNEL_RECOV;
+       return IN_KERNEL;
 }
 
 /*
index a006f4c..f0c921b 100644 (file)
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
        }
 }
 
+static int do_memory_failure(struct mce *m)
+{
+       int flags = MF_ACTION_REQUIRED;
+       int ret;
+
+       pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+       if (!(m->mcgstatus & MCG_STATUS_RIPV))
+               flags |= MF_MUST_KILL;
+       ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+       if (ret)
+               pr_err("Memory error not recovered");
+       return ret;
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
        char *msg = "Unknown";
-       u64 recover_paddr = ~0ull;
-       int flags = MF_ACTION_REQUIRED;
        int lmce = 0;
 
        /* If this CPU is offline, just bail out. */
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        }
 
        /*
-        * At insane "tolerant" levels we take no action. Otherwise
-        * we only die if we have no other choice. For less serious
-        * issues we try to recover, or limit damage to the current
-        * process.
+        * If tolerant is at an insane level we drop requests to kill
+        * processes and continue even when there is no way out.
         */
-       if (cfg->tolerant < 3) {
-               if (no_way_out)
-                       mce_panic("Fatal machine check on current CPU", &m, msg);
-               if (worst == MCE_AR_SEVERITY) {
-                       recover_paddr = m.addr;
-                       if (!(m.mcgstatus & MCG_STATUS_RIPV))
-                               flags |= MF_MUST_KILL;
-               } else if (kill_it) {
-                       force_sig(SIGBUS, current);
-               }
-       }
+       if (cfg->tolerant == 3)
+               kill_it = 0;
+       else if (no_way_out)
+               mce_panic("Fatal machine check on current CPU", &m, msg);
 
        if (worst > 0)
                mce_report_event(regs);
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 out:
        sync_core();
 
-       if (recover_paddr == ~0ull)
-               goto done;
+       if (worst != MCE_AR_SEVERITY && !kill_it)
+               goto out_ist;
 
-       pr_err("Uncorrected hardware memory error in user-access at %llx",
-                recover_paddr);
-       /*
-        * We must call memory_failure() here even if the current process is
-        * doomed. We still need to mark the page as poisoned and alert any
-        * other users of the page.
-        */
-       ist_begin_non_atomic(regs);
-       local_irq_enable();
-       if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
-               pr_err("Memory error not recovered");
-               force_sig(SIGBUS, current);
+       /* Fault was in user mode and we need to take some action */
+       if ((m.cs & 3) == 3) {
+               ist_begin_non_atomic(regs);
+               local_irq_enable();
+
+               if (kill_it || do_memory_failure(&m))
+                       force_sig(SIGBUS, current);
+               local_irq_disable();
+               ist_end_non_atomic();
+       } else {
+               if (!fixup_exception(regs, X86_TRAP_MC))
+                       mce_panic("Failed kernel mode recovery", &m, NULL);
        }
-       local_irq_disable();
-       ist_end_non_atomic();
-done:
+
+out_ist:
        ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1576,6 +1578,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 
                if (c->x86 == 6 && c->x86_model == 45)
                        quirk_no_way_out = quirk_sandybridge_ifu;
+               /*
+                * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
+                * whether this processor will actually generate recoverable
+                * machine checks. Check to see if this is an E7 model Xeon.
+                * We can't do a model number check because E5 and E7 use the
+                * same model number. E5 doesn't support recovery, E7 does.
+                */
+               if (mca_cfg.recovery || (mca_cfg.ser &&
+                       !strncmp(c->x86_model_id,
+                                "Intel(R) Xeon(R) CPU E7-", 24)))
+                       set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
        }
        if (cfg->monarch_timeout < 0)
                cfg->monarch_timeout = 0;
@@ -1617,10 +1630,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
        case X86_VENDOR_AMD: {
                u32 ebx = cpuid_ebx(0x80000007);
 
-               mce_amd_feature_init(c);
                mce_flags.overflow_recov = !!(ebx & BIT(0));
                mce_flags.succor         = !!(ebx & BIT(1));
                mce_flags.smca           = !!(ebx & BIT(3));
+               mce_amd_feature_init(c);
 
                break;
                }
@@ -2028,6 +2041,8 @@ static int __init mcheck_enable(char *str)
                cfg->bootlog = (str[0] == 'b');
        else if (!strcmp(str, "bios_cmci_threshold"))
                cfg->bios_cmci_threshold = true;
+       else if (!strcmp(str, "recovery"))
+               cfg->recovery = true;
        else if (isdigit(str[0])) {
                if (get_option(&str, &cfg->tolerant) == 2)
                        get_option(&str, &(cfg->monarch_timeout));
index e99b150..9d656fd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *  (c) 2005-2015 Advanced Micro Devices, Inc.
+ *  (c) 2005-2016 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -28,7 +28,7 @@
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
 
-#define NR_BLOCKS         9
+#define NR_BLOCKS         5
 #define THRESHOLD_MAX     0xFFF
 #define INT_TYPE_APIC     0x00020000
 #define MASK_VALID_HI     0x80000000
 #define DEF_LVT_OFF            0x2
 #define DEF_INT_TYPE_APIC      0x2
 
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF       0xF000
+
+/*
+ * OS is required to set the MCAX bit to acknowledge that it is now using the
+ * new MSR ranges and new registers under each bank. It also means that the OS
+ * will configure deferred errors in the new MCx_CONFIG register. If the bit is
+ * not set, uncorrectable errors will cause a system panic.
+ */
+#define SMCA_MCAX_EN_OFF       0x1
+
 static const char * const th_names[] = {
        "load_store",
        "insn_fetch",
@@ -58,6 +71,35 @@ static const char * const th_names[] = {
        "execution_unit",
 };
 
+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+       [SMCA_F17H_CORE]        = { "f17h_core",        0xB0 },
+       [SMCA_DF]               = { "data_fabric",      0x2E },
+       [SMCA_UMC]              = { "umc",              0x96 },
+       [SMCA_PB]               = { "param_block",      0x5 },
+       [SMCA_PSP]              = { "psp",              0xFF },
+       [SMCA_SMU]              = { "smu",              0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+       [SMCA_LS]               = "load_store",
+       [SMCA_IF]               = "insn_fetch",
+       [SMCA_L2_CACHE]         = "l2_cache",
+       [SMCA_DE]               = "decode_unit",
+       [RES]                   = "",
+       [SMCA_EX]               = "execution_unit",
+       [SMCA_FP]               = "floating_point",
+       [SMCA_L3_CACHE]         = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+       [SMCA_CS]               = "coherent_slave",
+       [SMCA_PIE]              = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
 
@@ -84,6 +126,13 @@ struct thresh_restart {
 
 static inline bool is_shared_bank(int bank)
 {
+       /*
+        * Scalable MCA provides for only one core to have access to the MSRs of
+        * a shared bank.
+        */
+       if (mce_flags.smca)
+               return false;
+
        /* Bank 4 is for northbridge reporting and is thus shared */
        return (bank == 4);
 }
@@ -135,6 +184,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
        }
 
        if (apic != msr) {
+               /*
+                * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+                * the BIOS provides the value. The original field where LVT offset
+                * was set is reserved. Return early here:
+                */
+               if (mce_flags.smca)
+                       return 0;
+
                pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
                       "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
                       b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@@ -144,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
        return 1;
 };
 
-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
 static void threshold_restart_bank(void *_tr)
 {
        struct thresh_restart *tr = _tr;
@@ -247,27 +301,116 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
        wrmsr(MSR_CU_DEF_ERR, low, high);
 }
 
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+                            unsigned int bank, unsigned int block)
+{
+       u32 addr = 0, offset = 0;
+
+       if (mce_flags.smca) {
+               if (!block) {
+                       addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+               } else {
+                       /*
+                        * For SMCA enabled processors, BLKPTR field of the
+                        * first MISC register (MCx_MISC0) indicates presence of
+                        * additional MISC register set (MISC1-4).
+                        */
+                       u32 low, high;
+
+                       if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+                               return addr;
+
+                       if (!(low & MCI_CONFIG_MCAX))
+                               return addr;
+
+                       if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+                           (low & MASK_BLKPTR_LO))
+                               addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+               }
+               return addr;
+       }
+
+       /* Fall back to method we used for older processors: */
+       switch (block) {
+       case 0:
+               addr = MSR_IA32_MCx_MISC(bank);
+               break;
+       case 1:
+               offset = ((low & MASK_BLKPTR_LO) >> 21);
+               if (offset)
+                       addr = MCG_XBLK_ADDR + offset;
+               break;
+       default:
+               addr = ++current_addr;
+       }
+       return addr;
+}
+
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+                       int offset, u32 misc_high)
+{
+       unsigned int cpu = smp_processor_id();
+       struct threshold_block b;
+       int new;
+
+       if (!block)
+               per_cpu(bank_map, cpu) |= (1 << bank);
+
+       memset(&b, 0, sizeof(b));
+       b.cpu                   = cpu;
+       b.bank                  = bank;
+       b.block                 = block;
+       b.address               = addr;
+       b.interrupt_capable     = lvt_interrupt_supported(bank, misc_high);
+
+       if (!b.interrupt_capable)
+               goto done;
+
+       b.interrupt_enable = 1;
+
+       if (mce_flags.smca) {
+               u32 smca_low, smca_high;
+               u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+               if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
+                       smca_high |= SMCA_MCAX_EN_OFF;
+                       wrmsr(smca_addr, smca_low, smca_high);
+               }
+
+               /* Gather LVT offset for thresholding: */
+               if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+                       goto out;
+
+               new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+       } else {
+               new = (misc_high & MASK_LVTOFF_HI) >> 20;
+       }
+
+       offset = setup_APIC_mce_threshold(offset, new);
+
+       if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt))
+               mce_threshold_vector = amd_threshold_interrupt;
+
+done:
+       mce_threshold_block_init(&b, offset);
+
+out:
+       return offset;
+}
+
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
-       struct threshold_block b;
-       unsigned int cpu = smp_processor_id();
        u32 low = 0, high = 0, address = 0;
        unsigned int bank, block;
-       int offset = -1, new;
+       int offset = -1;
 
        for (bank = 0; bank < mca_cfg.banks; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0)
-                               address = MSR_IA32_MCx_MISC(bank);
-                       else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-
-                               address += MCG_XBLK_ADDR;
-                       } else
-                               ++address;
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
 
                        if (rdmsr_safe(address, &low, &high))
                                break;
@@ -279,29 +422,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                             (high & MASK_LOCKED_HI))
                                continue;
 
-                       if (!block)
-                               per_cpu(bank_map, cpu) |= (1 << bank);
-
-                       memset(&b, 0, sizeof(b));
-                       b.cpu                   = cpu;
-                       b.bank                  = bank;
-                       b.block                 = block;
-                       b.address               = address;
-                       b.interrupt_capable     = lvt_interrupt_supported(bank, high);
-
-                       if (!b.interrupt_capable)
-                               goto init;
-
-                       b.interrupt_enable = 1;
-                       new     = (high & MASK_LVTOFF_HI) >> 20;
-                       offset  = setup_APIC_mce_threshold(offset, new);
-
-                       if ((offset == new) &&
-                           (mce_threshold_vector != amd_threshold_interrupt))
-                               mce_threshold_vector = amd_threshold_interrupt;
-
-init:
-                       mce_threshold_block_init(&b, offset);
+                       offset = prepare_threshold_block(bank, block, address, offset, high);
                }
        }
 
@@ -394,16 +515,9 @@ static void amd_threshold_interrupt(void)
                if (!(per_cpu(bank_map, cpu) & (1 << bank)))
                        continue;
                for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0) {
-                               address = MSR_IA32_MCx_MISC(bank);
-                       } else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-                               address += MCG_XBLK_ADDR;
-                       } else {
-                               ++address;
-                       }
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
 
                        if (rdmsr_safe(address, &low, &high))
                                break;
@@ -623,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
        if (err)
                goto out_free;
 recurse:
-       if (!block) {
-               address = (low & MASK_BLKPTR_LO) >> 21;
-               if (!address)
-                       return 0;
-               address += MCG_XBLK_ADDR;
-       } else {
-               ++address;
-       }
+       address = get_block_address(address, low, high, bank, ++block);
+       if (!address)
+               return 0;
 
-       err = allocate_threshold_blocks(cpu, bank, ++block, address);
+       err = allocate_threshold_blocks(cpu, bank, block, address);
        if (err)
                goto out_free;
 
index 12402e1..2a0717b 100644 (file)
@@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
        rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
 
-       printk(KERN_EMERG
-               "CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
-               smp_processor_id(), loaddr, lotype);
+       pr_emerg("CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
+                smp_processor_id(), loaddr, lotype);
 
        if (lotype & (1<<5)) {
-               printk(KERN_EMERG
-                       "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
-                       smp_processor_id());
+               pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+                        smp_processor_id());
        }
 
        add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
        /* Read registers before enabling: */
        rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
        rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-       printk(KERN_INFO
-              "Intel old style machine check architecture supported.\n");
+       pr_info("Intel old style machine check architecture supported.\n");
 
        /* Enable MCE: */
        cr4_set_bits(X86_CR4_MCE);
-       printk(KERN_INFO
-              "Intel old style machine check reporting enabled on CPU#%d.\n",
-              smp_processor_id());
+       pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+               smp_processor_id());
 }
index 2c5aaf8..0b445c2 100644 (file)
@@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level)
        /* if we just entered the thermal event */
        if (new_event) {
                if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+                       pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
                                this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package",
                                state->count);
@@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level)
        }
        if (old_event) {
                if (event == THERMAL_THROTTLING_EVENT)
-                       printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
-                               this_cpu,
+                       pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
                                level == CORE_LEVEL ? "Core" : "Package");
                return 1;
        }
@@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void)
 
 static void unexpected_thermal_interrupt(void)
 {
-       printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
-                       smp_processor_id());
+       pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+               smp_processor_id());
 }
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 
        if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
                if (system_state == SYSTEM_BOOTING)
-                       printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+                       pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
                return;
        }
 
@@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
        l = apic_read(APIC_LVTTHMR);
        apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 
-       printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
-                      tm2 ? "TM2" : "TM1");
+       pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+                     tm2 ? "TM2" : "TM1");
 
        /* enable thermal throttle processing */
        atomic_set(&therm_throt_en, 1);
index 7245980..fcf9ae9 100644 (file)
@@ -12,8 +12,8 @@
 
 static void default_threshold_interrupt(void)
 {
-       printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
-                        THRESHOLD_APIC_VECTOR);
+       pr_err("Unexpected threshold interrupt at vector %x\n",
+               THRESHOLD_APIC_VECTOR);
 }
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
index 01dd870..c6a722e 100644 (file)
@@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
        ist_enter(regs);
 
-       printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
+       pr_emerg("CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
        ist_exit(regs);
@@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
 
        cr4_set_bits(X86_CR4_MCE);
 
-       printk(KERN_INFO
-              "Winchip machine check reporting enabled on CPU#0.\n");
+       pr_info("Winchip machine check reporting enabled on CPU#0.\n");
 }
index 2233f8a..8581963 100644 (file)
@@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void)
        else
                container = cont_va;
 
-       if (ucode_new_rev)
-               pr_info("microcode: updated early to new patch_level=0x%08x\n",
-                       ucode_new_rev);
-
        eax   = cpuid_eax(0x00000001);
        eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 
@@ -469,8 +465,7 @@ void reload_ucode_amd(void)
        if (mc && rev < mc->hdr.patch_id) {
                if (!__apply_microcode_amd(mc)) {
                        ucode_new_rev = mc->hdr.patch_id;
-                       pr_info("microcode: reload patch_level=0x%08x\n",
-                               ucode_new_rev);
+                       pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
                }
        }
 }
@@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
                return -EINVAL;
        }
 
-       patch->data = kzalloc(patch_size, GFP_KERNEL);
+       patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL);
        if (!patch->data) {
                pr_err("Patch data allocation failure.\n");
                kfree(patch);
                return -EINVAL;
        }
 
-       /* All looks ok, copy patch... */
-       memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
        INIT_LIST_HEAD(&patch->plist);
        patch->patch_id  = mc_hdr->patch_id;
        patch->equiv_cpu = proc_id;
@@ -953,10 +946,14 @@ struct microcode_ops * __init init_amd_microcode(void)
        struct cpuinfo_x86 *c = &boot_cpu_data;
 
        if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-               pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+               pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
                return NULL;
        }
 
+       if (ucode_new_rev)
+               pr_info_once("microcode updated early to new patch_level=0x%08x\n",
+                            ucode_new_rev);
+
        return &microcode_amd_ops;
 }
 
index faec712..ac360bf 100644 (file)
 #define MICROCODE_VERSION      "2.01"
 
 static struct microcode_ops    *microcode_ops;
-
 static bool dis_ucode_ldr;
 
-static int __init disable_loader(char *str)
-{
-       dis_ucode_ldr = true;
-       return 1;
-}
-__setup("dis_ucode_ldr", disable_loader);
-
 /*
  * Synchronization.
  *
@@ -81,15 +73,16 @@ struct cpu_info_ctx {
 
 static bool __init check_loader_disabled_bsp(void)
 {
+       static const char *__dis_opt_str = "dis_ucode_ldr";
+
 #ifdef CONFIG_X86_32
        const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
-       const char *opt     = "dis_ucode_ldr";
-       const char *option  = (const char *)__pa_nodebug(opt);
+       const char *option  = (const char *)__pa_nodebug(__dis_opt_str);
        bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
 
 #else /* CONFIG_X86_64 */
        const char *cmdline = boot_command_line;
-       const char *option  = "dis_ucode_ldr";
+       const char *option  = __dis_opt_str;
        bool *res = &dis_ucode_ldr;
 #endif
 
@@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
        enum ucode_state ustate;
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-       if (uci && uci->valid)
+       if (uci->valid)
                return UCODE_OK;
 
        if (collect_cpu_info(cpu))
@@ -630,7 +623,7 @@ int __init microcode_init(void)
        struct cpuinfo_x86 *c = &boot_cpu_data;
        int error;
 
-       if (paravirt_enabled() || dis_ucode_ldr)
+       if (dis_ucode_ldr)
                return -EINVAL;
 
        if (c->x86_vendor == X86_VENDOR_INTEL)
index ee81c54..cbb3cf0 100644 (file)
 #include <asm/setup.h>
 #include <asm/msr.h>
 
-static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+/*
+ * Temporary microcode blobs pointers storage. We note here the pointers to
+ * microcode blobs we've got from whatever storage (detached initrd, builtin).
+ * Later on, we put those into final storage mc_saved_data.mc_saved.
+ */
+static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
+
 static struct mc_saved_data {
-       unsigned int mc_saved_count;
+       unsigned int num_saved;
        struct microcode_intel **mc_saved;
 } mc_saved_data;
 
@@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved,
 }
 
 static inline void
-copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
-                 unsigned long off, int num_saved)
+copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
+         unsigned long off, int num_saved)
 {
        int i;
 
        for (i = 0; i < num_saved; i++)
-               mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
+               mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
 }
 
 #ifdef CONFIG_X86_32
 static void
-microcode_phys(struct microcode_intel **mc_saved_tmp,
-              struct mc_saved_data *mc_saved_data)
+microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
 {
        int i;
        struct microcode_intel ***mc_saved;
 
-       mc_saved = (struct microcode_intel ***)
-                  __pa_nodebug(&mc_saved_data->mc_saved);
-       for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+       mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+
+       for (i = 0; i < mcs->num_saved; i++) {
                struct microcode_intel *p;
 
-               p = *(struct microcode_intel **)
-                       __pa_nodebug(mc_saved_data->mc_saved + i);
+               p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
                mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
        }
 }
 #endif
 
 static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
-              unsigned long initrd_start, struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
+              unsigned long offset, struct ucode_cpu_info *uci)
 {
        struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-       unsigned int count = mc_saved_data->mc_saved_count;
+       unsigned int count = mcs->num_saved;
 
-       if (!mc_saved_data->mc_saved) {
-               copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
+       if (!mcs->mc_saved) {
+               copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
 
                return load_microcode_early(mc_saved_tmp, count, uci);
        } else {
 #ifdef CONFIG_X86_32
-               microcode_phys(mc_saved_tmp, mc_saved_data);
+               microcode_phys(mc_saved_tmp, mcs);
                return load_microcode_early(mc_saved_tmp, count, uci);
 #else
-               return load_microcode_early(mc_saved_data->mc_saved,
-                                                   count, uci);
+               return load_microcode_early(mcs->mc_saved, count, uci);
 #endif
        }
 }
@@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 }
 
 static int
-save_microcode(struct mc_saved_data *mc_saved_data,
+save_microcode(struct mc_saved_data *mcs,
               struct microcode_intel **mc_saved_src,
-              unsigned int mc_saved_count)
+              unsigned int num_saved)
 {
        int i, j;
        struct microcode_intel **saved_ptr;
        int ret;
 
-       if (!mc_saved_count)
+       if (!num_saved)
                return -EINVAL;
 
        /*
         * Copy new microcode data.
         */
-       saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+       saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
        if (!saved_ptr)
                return -ENOMEM;
 
-       for (i = 0; i < mc_saved_count; i++) {
+       for (i = 0; i < num_saved; i++) {
                struct microcode_header_intel *mc_hdr;
                struct microcode_intel *mc;
                unsigned long size;
@@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data,
                mc_hdr = &mc->hdr;
                size   = get_totalsize(mc_hdr);
 
-               saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+               saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
                if (!saved_ptr[i]) {
                        ret = -ENOMEM;
                        goto err;
                }
-
-               memcpy(saved_ptr[i], mc, size);
        }
 
        /*
         * Point to newly saved microcode.
         */
-       mc_saved_data->mc_saved = saved_ptr;
-       mc_saved_data->mc_saved_count = mc_saved_count;
+       mcs->mc_saved  = saved_ptr;
+       mcs->num_saved = num_saved;
 
        return 0;
 
@@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
  * BSP can stay in the platform.
  */
 static enum ucode_state __init
-get_matching_model_microcode(int cpu, unsigned long start,
-                            void *data, size_t size,
-                            struct mc_saved_data *mc_saved_data,
-                            unsigned long *mc_saved_in_initrd,
+get_matching_model_microcode(unsigned long start, void *data, size_t size,
+                            struct mc_saved_data *mcs, unsigned long *mc_ptrs,
                             struct ucode_cpu_info *uci)
 {
-       u8 *ucode_ptr = data;
-       unsigned int leftover = size;
+       struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+       struct microcode_header_intel *mc_header;
+       unsigned int num_saved = mcs->num_saved;
        enum ucode_state state = UCODE_OK;
+       unsigned int leftover = size;
+       u8 *ucode_ptr = data;
        unsigned int mc_size;
-       struct microcode_header_intel *mc_header;
-       struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-       unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
        int i;
 
-       while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+       while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
 
                if (leftover < sizeof(mc_header))
                        break;
@@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start,
                 * the platform, we need to find and save microcode patches
                 * with the same family and model as the BSP.
                 */
-               if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
-                        UCODE_OK) {
+               if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
                        ucode_ptr += mc_size;
                        continue;
                }
 
-               mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
+               num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
 
                ucode_ptr += mc_size;
        }
 
        if (leftover) {
                state = UCODE_ERROR;
-               goto out;
+               return state;
        }
 
-       if (mc_saved_count == 0) {
+       if (!num_saved) {
                state = UCODE_NFOUND;
-               goto out;
+               return state;
        }
 
-       for (i = 0; i < mc_saved_count; i++)
-               mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+       for (i = 0; i < num_saved; i++)
+               mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+       mcs->num_saved = num_saved;
 
-       mc_saved_data->mc_saved_count = mc_saved_count;
-out:
        return state;
 }
 
@@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
                native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
                csig.pf = 1 << ((val[1] >> 18) & 7);
        }
-       native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+       native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
        /* As documented in the SDM: Do a CPUID 1 here */
        sync_core();
@@ -396,11 +394,11 @@ static void show_saved_mc(void)
        unsigned int sig, pf, rev, total_size, data_size, date;
        struct ucode_cpu_info uci;
 
-       if (mc_saved_data.mc_saved_count == 0) {
+       if (!mc_saved_data.num_saved) {
                pr_debug("no microcode data saved.\n");
                return;
        }
-       pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+       pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
 
        collect_cpu_info_early(&uci);
 
@@ -409,7 +407,7 @@ static void show_saved_mc(void)
        rev = uci.cpu_sig.rev;
        pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
 
-       for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+       for (i = 0; i < mc_saved_data.num_saved; i++) {
                struct microcode_header_intel *mc_saved_header;
                struct extended_sigtable *ext_header;
                int ext_sigcount;
@@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc)
 {
        struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
        unsigned int mc_saved_count_init;
-       unsigned int mc_saved_count;
+       unsigned int num_saved;
        struct microcode_intel **mc_saved;
        int ret = 0;
        int i;
@@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc)
         */
        mutex_lock(&x86_cpu_microcode_mutex);
 
-       mc_saved_count_init = mc_saved_data.mc_saved_count;
-       mc_saved_count = mc_saved_data.mc_saved_count;
+       mc_saved_count_init = mc_saved_data.num_saved;
+       num_saved = mc_saved_data.num_saved;
        mc_saved = mc_saved_data.mc_saved;
 
-       if (mc_saved && mc_saved_count)
+       if (mc_saved && num_saved)
                memcpy(mc_saved_tmp, mc_saved,
-                      mc_saved_count * sizeof(struct microcode_intel *));
+                      num_saved * sizeof(struct microcode_intel *));
        /*
         * Save the microcode patch mc in mc_save_tmp structure if it's a newer
         * version.
         */
-       mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
+       num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
 
        /*
         * Save the mc_save_tmp in global mc_saved_data.
         */
-       ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+       ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
        if (ret) {
                pr_err("Cannot save microcode patch.\n");
                goto out;
@@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
 
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
               unsigned long start, unsigned long size,
               struct ucode_cpu_info *uci)
 {
@@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
        cd.data = NULL;
        cd.size = 0;
 
-       cd = find_cpio_data(p, (void *)start, size, &offset);
-       if (!cd.data) {
+       /* try built-in microcode if no initrd */
+       if (!size) {
                if (!load_builtin_intel_microcode(&cd))
                        return UCODE_ERROR;
+       } else {
+               cd = find_cpio_data(p, (void *)start, size, &offset);
+               if (!cd.data)
+                       return UCODE_ERROR;
        }
 
-       return get_matching_model_microcode(0, start, cd.data, cd.size,
-                                           mc_saved_data, initrd, uci);
+       return get_matching_model_microcode(start, cd.data, cd.size,
+                                           mcs, mc_ptrs, uci);
 }
 
 /*
@@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
 static void
 print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
 {
-       int cpu = smp_processor_id();
-
-       pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
-               cpu,
-               uci->cpu_sig.rev,
-               date & 0xffff,
-               date >> 24,
-               (date >> 16) & 0xff);
+       pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+                    uci->cpu_sig.rev,
+                    date & 0xffff,
+                    date >> 24,
+                    (date >> 16) & 0xff);
 }
 
 #ifdef CONFIG_X86_32
@@ -603,19 +602,19 @@ void show_ucode_info_early(void)
  */
 static void print_ucode(struct ucode_cpu_info *uci)
 {
-       struct microcode_intel *mc_intel;
+       struct microcode_intel *mc;
        int *delay_ucode_info_p;
        int *current_mc_date_p;
 
-       mc_intel = uci->mc;
-       if (mc_intel == NULL)
+       mc = uci->mc;
+       if (!mc)
                return;
 
        delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
        current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
 
        *delay_ucode_info_p = 1;
-       *current_mc_date_p = mc_intel->hdr.date;
+       *current_mc_date_p = mc->hdr.date;
 }
 #else
 
@@ -630,37 +629,35 @@ static inline void flush_tlb_early(void)
 
 static inline void print_ucode(struct ucode_cpu_info *uci)
 {
-       struct microcode_intel *mc_intel;
+       struct microcode_intel *mc;
 
-       mc_intel = uci->mc;
-       if (mc_intel == NULL)
+       mc = uci->mc;
+       if (!mc)
                return;
 
-       print_ucode_info(uci, mc_intel->hdr.date);
+       print_ucode_info(uci, mc->hdr.date);
 }
 #endif
 
 static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 {
-       struct microcode_intel *mc_intel;
+       struct microcode_intel *mc;
        unsigned int val[2];
 
-       mc_intel = uci->mc;
-       if (mc_intel == NULL)
+       mc = uci->mc;
+       if (!mc)
                return 0;
 
        /* write microcode via MSR 0x79 */
-       native_wrmsr(MSR_IA32_UCODE_WRITE,
-             (unsigned long) mc_intel->bits,
-             (unsigned long) mc_intel->bits >> 16 >> 16);
-       native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+       native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+       native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
        /* As documented in the SDM: Do a CPUID 1 here */
        sync_core();
 
        /* get the current revision from MSR 0x8B */
        native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-       if (val[1] != mc_intel->hdr.rev)
+       if (val[1] != mc->hdr.rev)
                return -1;
 
 #ifdef CONFIG_X86_64
@@ -672,25 +669,26 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
        if (early)
                print_ucode(uci);
        else
-               print_ucode_info(uci, mc_intel->hdr.date);
+               print_ucode_info(uci, mc->hdr.date);
 
        return 0;
 }
 
 /*
  * This function converts microcode patch offsets previously stored in
- * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
  */
 int __init save_microcode_in_initrd_intel(void)
 {
-       unsigned int count = mc_saved_data.mc_saved_count;
+       unsigned int count = mc_saved_data.num_saved;
        struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
        int ret = 0;
 
-       if (count == 0)
+       if (!count)
                return ret;
 
-       copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+       copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count);
+
        ret = save_microcode(&mc_saved_data, mc_saved, count);
        if (ret)
                pr_err("Cannot save microcode patches from initrd.\n");
@@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void)
 }
 
 static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
-                     unsigned long *initrd,
+_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
                      unsigned long start, unsigned long size)
 {
        struct ucode_cpu_info uci;
@@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
 
        collect_cpu_info_early(&uci);
 
-       ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+       ret = scan_microcode(mcs, mc_ptrs, start, size, &uci);
        if (ret != UCODE_OK)
                return;
 
-       ret = load_microcode(mc_saved_data, initrd, start, &uci);
+       ret = load_microcode(mcs, mc_ptrs, start, &uci);
        if (ret != UCODE_OK)
                return;
 
@@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void)
        struct boot_params *p;
 
        p       = (struct boot_params *)__pa_nodebug(&boot_params);
-       start   = p->hdr.ramdisk_image;
        size    = p->hdr.ramdisk_size;
 
-       _load_ucode_intel_bsp(
-                       (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
-                       (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
-                       start, size);
+       /*
+        * Set start only if we have an initrd image. We cannot use initrd_start
+        * because it is not set that early yet.
+        */
+       start   = (size ? p->hdr.ramdisk_image : 0);
+
+       _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+                             (unsigned long *)__pa_nodebug(&mc_tmp_ptrs),
+                             start, size);
 #else
-       start   = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
        size    = boot_params.hdr.ramdisk_size;
+       start   = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
 
-       _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
+       _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size);
 #endif
 }
 
 void load_ucode_intel_ap(void)
 {
-       struct mc_saved_data *mc_saved_data_p;
+       unsigned long *mcs_tmp_p;
+       struct mc_saved_data *mcs_p;
        struct ucode_cpu_info uci;
-       unsigned long *mc_saved_in_initrd_p;
-       unsigned long initrd_start_addr;
        enum ucode_state ret;
 #ifdef CONFIG_X86_32
-       unsigned long *initrd_start_p;
 
-       mc_saved_in_initrd_p =
-               (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
-       mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-       initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
-       initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+       mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
+       mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
 #else
-       mc_saved_data_p = &mc_saved_data;
-       mc_saved_in_initrd_p = mc_saved_in_initrd;
-       initrd_start_addr = initrd_start;
+       mcs_tmp_p = mc_tmp_ptrs;
+       mcs_p = &mc_saved_data;
 #endif
 
        /*
         * If there is no valid ucode previously saved in memory, no need to
         * update ucode on this AP.
         */
-       if (mc_saved_data_p->mc_saved_count == 0)
+       if (!mcs_p->num_saved)
                return;
 
        collect_cpu_info_early(&uci);
-       ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
-                            initrd_start_addr, &uci);
-
+       ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci);
        if (ret != UCODE_OK)
                return;
 
@@ -786,13 +779,13 @@ void reload_ucode_intel(void)
        struct ucode_cpu_info uci;
        enum ucode_state ret;
 
-       if (!mc_saved_data.mc_saved_count)
+       if (!mc_saved_data.num_saved)
                return;
 
        collect_cpu_info_early(&uci);
 
        ret = load_microcode_early(mc_saved_data.mc_saved,
-                                  mc_saved_data.mc_saved_count, &uci);
+                                  mc_saved_data.num_saved, &uci);
        if (ret != UCODE_OK)
                return;
 
@@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
  * return 0 - no update found
  * return 1 - found update
  */
-static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
+static int get_matching_mc(struct microcode_intel *mc, int cpu)
 {
        struct cpu_signature cpu_sig;
        unsigned int csig, cpf, crev;
@@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
        cpf = cpu_sig.pf;
        crev = cpu_sig.rev;
 
-       return has_newer_microcode(mc_intel, csig, cpf, crev);
+       return has_newer_microcode(mc, csig, cpf, crev);
 }
 
 static int apply_microcode_intel(int cpu)
 {
-       struct microcode_intel *mc_intel;
+       struct microcode_intel *mc;
        struct ucode_cpu_info *uci;
+       struct cpuinfo_x86 *c;
        unsigned int val[2];
-       int cpu_num = raw_smp_processor_id();
-       struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-
-       uci = ucode_cpu_info + cpu;
-       mc_intel = uci->mc;
 
        /* We should bind the task to the CPU */
-       BUG_ON(cpu_num != cpu);
+       if (WARN_ON(raw_smp_processor_id() != cpu))
+               return -1;
 
-       if (mc_intel == NULL)
+       uci = ucode_cpu_info + cpu;
+       mc = uci->mc;
+       if (!mc)
                return 0;
 
        /*
         * Microcode on this CPU could be updated earlier. Only apply the
-        * microcode patch in mc_intel when it is newer than the one on this
+        * microcode patch in mc when it is newer than the one on this
         * CPU.
         */
-       if (get_matching_mc(mc_intel, cpu) == 0)
+       if (!get_matching_mc(mc, cpu))
                return 0;
 
        /* write microcode via MSR 0x79 */
-       wrmsr(MSR_IA32_UCODE_WRITE,
-             (unsigned long) mc_intel->bits,
-             (unsigned long) mc_intel->bits >> 16 >> 16);
-       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+       wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+       wrmsrl(MSR_IA32_UCODE_REV, 0);
 
        /* As documented in the SDM: Do a CPUID 1 here */
        sync_core();
@@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu)
        /* get the current revision from MSR 0x8B */
        rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
-       if (val[1] != mc_intel->hdr.rev) {
+       if (val[1] != mc->hdr.rev) {
                pr_err("CPU%d update to revision 0x%x failed\n",
-                      cpu_num, mc_intel->hdr.rev);
+                      cpu, mc->hdr.rev);
                return -1;
        }
+
        pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
-               cpu_num, val[1],
-               mc_intel->hdr.date & 0xffff,
-               mc_intel->hdr.date >> 24,
-               (mc_intel->hdr.date >> 16) & 0xff);
+               cpu, val[1],
+               mc->hdr.date & 0xffff,
+               mc->hdr.date >> 24,
+               (mc->hdr.date >> 16) & 0xff);
+
+       c = &cpu_data(cpu);
 
        uci->cpu_sig.rev = val[1];
        c->microcode = val[1];
index b96896b..2ce1a7d 100644 (file)
@@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err)
        unsigned long total_size, data_size, ext_table_size;
        struct microcode_header_intel *mc_header = mc;
        struct extended_sigtable *ext_header = NULL;
-       int sum, orig_sum, ext_sigcount = 0, i;
+       u32 sum, orig_sum, ext_sigcount = 0, i;
        struct extended_signature *ext_sig;
 
        total_size = get_totalsize(mc_header);
@@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err)
 
        if (data_size + MC_HEADER_SIZE > total_size) {
                if (print_err)
-                       pr_err("error! Bad data size in microcode data file\n");
+                       pr_err("Error: bad microcode data file size.\n");
                return -EINVAL;
        }
 
        if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
                if (print_err)
-                       pr_err("error! Unknown microcode update format\n");
+                       pr_err("Error: invalid/unknown microcode update format.\n");
                return -EINVAL;
        }
+
        ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
        if (ext_table_size) {
+               u32 ext_table_sum = 0;
+               u32 *ext_tablep;
+
                if ((ext_table_size < EXT_HEADER_SIZE)
                 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
                        if (print_err)
-                               pr_err("error! Small exttable size in microcode data file\n");
+                               pr_err("Error: truncated extended signature table.\n");
                        return -EINVAL;
                }
+
                ext_header = mc + MC_HEADER_SIZE + data_size;
                if (ext_table_size != exttable_size(ext_header)) {
                        if (print_err)
-                               pr_err("error! Bad exttable size in microcode data file\n");
+                               pr_err("Error: extended signature table size mismatch.\n");
                        return -EFAULT;
                }
+
                ext_sigcount = ext_header->count;
-       }
 
-       /* check extended table checksum */
-       if (ext_table_size) {
-               int ext_table_sum = 0;
-               int *ext_tablep = (int *)ext_header;
+               /*
+                * Check extended table checksum: the sum of all dwords that
+                * comprise a valid table must be 0.
+                */
+               ext_tablep = (u32 *)ext_header;
 
-               i = ext_table_size / DWSIZE;
+               i = ext_table_size / sizeof(u32);
                while (i--)
                        ext_table_sum += ext_tablep[i];
+
                if (ext_table_sum) {
                        if (print_err)
-                               pr_warn("aborting, bad extended signature table checksum\n");
+                               pr_warn("Bad extended signature table checksum, aborting.\n");
                        return -EINVAL;
                }
        }
 
-       /* calculate the checksum */
+       /*
+        * Calculate the checksum of update data and header. The checksum of
+        * valid update data and header including the extended signature table
+        * must be 0.
+        */
        orig_sum = 0;
-       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+       i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
        while (i--)
-               orig_sum += ((int *)mc)[i];
+               orig_sum += ((u32 *)mc)[i];
+
        if (orig_sum) {
                if (print_err)
-                       pr_err("aborting, bad checksum\n");
+                       pr_err("Bad microcode data checksum, aborting.\n");
                return -EINVAL;
        }
+
        if (!ext_table_size)
                return 0;
-       /* check extended signature checksum */
+
+       /*
+        * Check extended signature checksum: 0 => valid.
+        */
        for (i = 0; i < ext_sigcount; i++) {
                ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
                          EXT_SIGNATURE_SIZE * i;
-               sum = orig_sum
-                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
-                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+
+               sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+                     (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
                if (sum) {
                        if (print_err)
-                               pr_err("aborting, bad checksum\n");
+                               pr_err("Bad extended signature checksum, aborting.\n");
                        return -EINVAL;
                }
        }
index 3f20710..6988c74 100644 (file)
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
 #
 
 IN=$1
@@ -49,8 +49,8 @@ dump_array()
 trap 'rm "$OUT"' EXIT
 
 (
-       echo "#ifndef _ASM_X86_CPUFEATURE_H"
-       echo "#include <asm/cpufeature.h>"
+       echo "#ifndef _ASM_X86_CPUFEATURES_H"
+       echo "#include <asm/cpufeatures.h>"
        echo "#endif"
        echo ""
 
index 20e242e..4e7c693 100644 (file)
@@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void)
        ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
        ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-       printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
-              ms_hyperv.features, ms_hyperv.hints);
+       pr_info("HyperV: features 0x%x, hints 0x%x\n",
+               ms_hyperv.features, ms_hyperv.hints);
 
 #ifdef CONFIG_X86_LOCAL_APIC
        if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
@@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void)
                rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
                hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
                lapic_timer_frequency = hv_lapic_frequency;
-               printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
-                               lapic_timer_frequency);
+               pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
+                       lapic_timer_frequency);
        }
 #endif
 
index 316fe3e..3d68993 100644 (file)
@@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
         */
        if (type != MTRR_TYPE_WRCOMB &&
            (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
-               pr_warning("mtrr: only write-combining%s supported\n",
+               pr_warn("mtrr: only write-combining%s supported\n",
                           centaur_mcr_type ? " and uncacheable are" : " is");
                return -EINVAL;
        }
index 0d98503..31e951c 100644 (file)
@@ -57,9 +57,9 @@ static int __initdata                         nr_range;
 static struct var_mtrr_range_state __initdata  range_state[RANGE_NUM];
 
 static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
 
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
        "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
 
 static int __init
@@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                                base, base + size);
        }
        if (debug_print) {
-               printk(KERN_DEBUG "After WB checking\n");
+               pr_debug("After WB checking\n");
                for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
        }
 
@@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                    (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
                    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
                        /* Var MTRR contains UC entry below 1M? Skip it: */
-                       printk(BIOS_BUG_MSG, i);
+                       pr_warn(BIOS_BUG_MSG, i);
                        if (base + size <= (1<<(20-PAGE_SHIFT)))
                                continue;
                        size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
                                 extra_remove_base + extra_remove_size);
 
        if  (debug_print) {
-               printk(KERN_DEBUG "After UC checking\n");
+               pr_debug("After UC checking\n");
                for (i = 0; i < RANGE_NUM; i++) {
                        if (!range[i].end)
                                continue;
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
                }
        }
@@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
        /* sort the ranges */
        nr_range = clean_sort_range(range, RANGE_NUM);
        if  (debug_print) {
-               printk(KERN_DEBUG "After sorting\n");
+               pr_debug("After sorting\n");
                for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+                       pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
                                 range[i].start, range[i].end);
        }
 
@@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void)
                start_base = to_size_factor(start_base, &start_factor),
                type = range_state[i].type;
 
-               printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+               pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
                        i, start_base, start_factor,
                        size_base, size_factor,
                        (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                return 0;
 
        /* Print original var MTRRs at first, for debugging: */
-       printk(KERN_DEBUG "original variable MTRRs\n");
+       pr_debug("original variable MTRRs\n");
        print_out_mtrr_range_state();
 
        memset(range, 0, sizeof(range));
@@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                          x_remove_base, x_remove_size);
 
        range_sums = sum_ranges(range, nr_range);
-       printk(KERN_INFO "total RAM covered: %ldM\n",
+       pr_info("total RAM covered: %ldM\n",
               range_sums >> (20 - PAGE_SHIFT));
 
        if (mtrr_chunk_size && mtrr_gran_size) {
@@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits)
 
                if (!result[i].bad) {
                        set_var_mtrr_all(address_bits);
-                       printk(KERN_DEBUG "New variable MTRRs\n");
+                       pr_debug("New variable MTRRs\n");
                        print_out_mtrr_range_state();
                        return 1;
                }
-               printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-                      "will find optimal one\n");
+               pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
        }
 
        i = 0;
@@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                                      x_remove_base, x_remove_size, i);
                        if (debug_print) {
                                mtrr_print_out_one_result(i);
-                               printk(KERN_INFO "\n");
+                               pr_info("\n");
                        }
 
                        i++;
@@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits)
        index_good = mtrr_search_optimal_index();
 
        if (index_good != -1) {
-               printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+               pr_info("Found optimal setting for mtrr clean up\n");
                i = index_good;
                mtrr_print_out_one_result(i);
 
@@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits)
                gran_size <<= 10;
                x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
                set_var_mtrr_all(address_bits);
-               printk(KERN_DEBUG "New variable MTRRs\n");
+               pr_debug("New variable MTRRs\n");
                print_out_mtrr_range_state();
                return 1;
        } else {
@@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits)
                        mtrr_print_out_one_result(i);
        }
 
-       printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
-       printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+       pr_info("mtrr_cleanup: can not find optimal value\n");
+       pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
 
        return 0;
 }
@@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 
        /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
        if (!highest_pfn) {
-               printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+               pr_info("CPU MTRRs all blank - virtualized system.\n");
                return 0;
        }
 
@@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
                                                         end_pfn);
 
        if (total_trim_size) {
-               pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+               pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+                       total_trim_size >> 20);
 
                if (!changed_by_mtrr_cleanup)
                        WARN_ON(1);
index c870af1..fcbcb2f 100644 (file)
@@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void)
 
        rdmsr(MSR_K8_SYSCFG, lo, hi);
        if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
-               printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+               pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
                       " not cleared by BIOS, clearing this bit\n",
                       smp_processor_id());
                lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
@@ -501,14 +501,14 @@ void __init mtrr_state_warn(void)
        if (!mask)
                return;
        if (mask & MTRR_CHANGE_MASK_FIXED)
-               pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
        if (mask & MTRR_CHANGE_MASK_VARIABLE)
-               pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
        if (mask & MTRR_CHANGE_MASK_DEFTYPE)
-               pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+               pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
 
-       printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
-       printk(KERN_INFO "mtrr: corrected configuration.\n");
+       pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+       pr_info("mtrr: corrected configuration.\n");
 }
 
 /*
@@ -519,8 +519,7 @@ void __init mtrr_state_warn(void)
 void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 {
        if (wrmsr_safe(msr, a, b) < 0) {
-               printk(KERN_ERR
-                       "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+               pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
                        smp_processor_id(), msr, a, b);
        }
 }
@@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
                tmp |= ~((1ULL<<(hi - 1)) - 1);
 
                if (tmp != mask) {
-                       printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+                       pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
                        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
                        mask = tmp;
                }
@@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
            boot_cpu_data.x86_model == 1 &&
            boot_cpu_data.x86_mask <= 7) {
                if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
-                       pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+                       pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
                        return -EINVAL;
                }
                if (!(base + size < 0x70000 || base > 0x7003F) &&
                    (type == MTRR_TYPE_WRCOMB
                     || type == MTRR_TYPE_WRBACK)) {
-                       pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+                       pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
                        return -EINVAL;
                }
        }
@@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
             lbase = lbase >> 1, last = last >> 1)
                ;
        if (lbase != last) {
-               pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+               pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
                return -EINVAL;
        }
        return 0;
index 5c3d149..10f8d47 100644 (file)
@@ -47,7 +47,7 @@
 #include <linux/smp.h>
 #include <linux/syscore_ops.h>
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/e820.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
@@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                return error;
 
        if (type >= MTRR_NUM_TYPES) {
-               pr_warning("mtrr: type: %u invalid\n", type);
+               pr_warn("mtrr: type: %u invalid\n", type);
                return -EINVAL;
        }
 
        /* If the type is WC, check that this processor supports it */
        if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
-               pr_warning("mtrr: your processor doesn't support write-combining\n");
+               pr_warn("mtrr: your processor doesn't support write-combining\n");
                return -ENOSYS;
        }
 
        if (!size) {
-               pr_warning("mtrr: zero sized request\n");
+               pr_warn("mtrr: zero sized request\n");
                return -EINVAL;
        }
 
        if ((base | (base + size - 1)) >>
            (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
-               pr_warning("mtrr: base or size exceeds the MTRR width\n");
+               pr_warn("mtrr: base or size exceeds the MTRR width\n");
                return -EINVAL;
        }
 
@@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                                } else if (types_compatible(type, ltype))
                                        continue;
                        }
-                       pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
+                       pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
                                " 0x%lx000,0x%lx000\n", base, size, lbase,
                                lsize);
                        goto out;
@@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                if (ltype != type) {
                        if (types_compatible(type, ltype))
                                continue;
-                       pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+                       pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
                                base, size, mtrr_attrib_to_str(ltype),
                                mtrr_attrib_to_str(type));
                        goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 static int mtrr_check(unsigned long base, unsigned long size)
 {
        if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-               pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+               pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
                pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
                dump_stack();
                return -1;
@@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
                }
        }
        if (reg >= max) {
-               pr_warning("mtrr: register: %d too big\n", reg);
+               pr_warn("mtrr: register: %d too big\n", reg);
                goto out;
        }
        mtrr_if->get(reg, &lbase, &lsize, &ltype);
        if (lsize < 1) {
-               pr_warning("mtrr: MTRR %d not used\n", reg);
+               pr_warn("mtrr: MTRR %d not used\n", reg);
                goto out;
        }
        if (mtrr_usage_table[reg] < 1) {
-               pr_warning("mtrr: reg: %d has count=0\n", reg);
+               pr_warn("mtrr: reg: %d has count=0\n", reg);
                goto out;
        }
        if (--mtrr_usage_table[reg] < 1)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
deleted file mode 100644 (file)
index 1b443db..0000000
+++ /dev/null
@@ -1,2428 +0,0 @@
-/*
- * Performance events x86 architecture code
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-#include <linux/device.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/alternative.h>
-#include <asm/mmu_context.h>
-#include <asm/tlbflush.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-#include <asm/ldt.h>
-
-#include "perf_event.h"
-
-struct x86_pmu x86_pmu __read_mostly;
-
-DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
-       .enabled = 1,
-};
-
-struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
-
-u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-/*
- * Propagate event elapsed time into the generic event.
- * Can only be executed on the CPU where the event is active.
- * Returns the delta events processed.
- */
-u64 x86_perf_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - x86_pmu.cntval_bits;
-       u64 prev_raw_count, new_raw_count;
-       int idx = hwc->idx;
-       s64 delta;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               goto again;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return new_raw_count;
-}
-
-/*
- * Find and validate any extra registers to set up.
- */
-static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       struct extra_reg *er;
-
-       reg = &event->hw.extra_reg;
-
-       if (!x86_pmu.extra_regs)
-               return 0;
-
-       for (er = x86_pmu.extra_regs; er->msr; er++) {
-               if (er->event != (config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-               /* Check if the extra msrs can be safely accessed*/
-               if (!er->extra_msr_access)
-                       return -ENXIO;
-
-               reg->idx = er->idx;
-               reg->config = event->attr.config1;
-               reg->reg = er->msr;
-               break;
-       }
-       return 0;
-}
-
-static atomic_t active_events;
-static atomic_t pmc_refcount;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static bool reserve_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
-                       goto perfctr_fail;
-       }
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
-                       goto eventsel_fail;
-       }
-
-       return true;
-
-eventsel_fail:
-       for (i--; i >= 0; i--)
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-
-       i = x86_pmu.num_counters;
-
-perfctr_fail:
-       for (i--; i >= 0; i--)
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-
-       return false;
-}
-
-static void release_pmc_hardware(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               release_perfctr_nmi(x86_pmu_event_addr(i));
-               release_evntsel_nmi(x86_pmu_config_addr(i));
-       }
-}
-
-#else
-
-static bool reserve_pmc_hardware(void) { return true; }
-static void release_pmc_hardware(void) {}
-
-#endif
-
-static bool check_hw_exists(void)
-{
-       u64 val, val_fail, val_new= ~0;
-       int i, reg, reg_fail, ret = 0;
-       int bios_fail = 0;
-       int reg_safe = -1;
-
-       /*
-        * Check to see if the BIOS enabled any of the counters, if so
-        * complain and bail.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
-                       bios_fail = 1;
-                       val_fail = val;
-                       reg_fail = reg;
-               } else {
-                       reg_safe = i;
-               }
-       }
-
-       if (x86_pmu.num_counters_fixed) {
-               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               ret = rdmsrl_safe(reg, &val);
-               if (ret)
-                       goto msr_fail;
-               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-                       if (val & (0x03 << i*4)) {
-                               bios_fail = 1;
-                               val_fail = val;
-                               reg_fail = reg;
-                       }
-               }
-       }
-
-       /*
-        * If all the counters are enabled, the below test will always
-        * fail.  The tools will also become useless in this scenario.
-        * Just fail and disable the hardware counters.
-        */
-
-       if (reg_safe == -1) {
-               reg = reg_safe;
-               goto msr_fail;
-       }
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       reg = x86_pmu_event_addr(reg_safe);
-       if (rdmsrl_safe(reg, &val))
-               goto msr_fail;
-       val ^= 0xffffUL;
-       ret = wrmsrl_safe(reg, val);
-       ret |= rdmsrl_safe(reg, &val_new);
-       if (ret || val != val_new)
-               goto msr_fail;
-
-       /*
-        * We still allow the PMU driver to operate:
-        */
-       if (bios_fail) {
-               printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
-               printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
-       }
-
-       return true;
-
-msr_fail:
-       printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
-       printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
-               boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
-               reg, val_new);
-
-       return false;
-}
-
-static void hw_perf_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       atomic_dec(&active_events);
-}
-
-void hw_perf_lbr_event_destroy(struct perf_event *event)
-{
-       hw_perf_event_destroy(event);
-
-       /* undo the lbr/bts event accounting */
-       x86_del_exclusive(x86_lbr_exclusive_lbr);
-}
-
-static inline int x86_pmu_initialized(void)
-{
-       return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       unsigned int cache_type, cache_op, cache_result;
-       u64 config, val;
-
-       config = attr->config;
-
-       cache_type = (config >>  0) & 0xff;
-       if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
-               return -EINVAL;
-
-       cache_op = (config >>  8) & 0xff;
-       if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
-               return -EINVAL;
-
-       cache_result = (config >> 16) & 0xff;
-       if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
-               return -EINVAL;
-
-       val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
-       if (val == 0)
-               return -ENOENT;
-
-       if (val == -1)
-               return -EINVAL;
-
-       hwc->config |= val;
-       attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
-       return x86_pmu_extra_regs(val, event);
-}
-
-int x86_reserve_hardware(void)
-{
-       int err = 0;
-
-       if (!atomic_inc_not_zero(&pmc_refcount)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&pmc_refcount) == 0) {
-                       if (!reserve_pmc_hardware())
-                               err = -EBUSY;
-                       else
-                               reserve_ds_buffers();
-               }
-               if (!err)
-                       atomic_inc(&pmc_refcount);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       return err;
-}
-
-void x86_release_hardware(void)
-{
-       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
-               release_pmc_hardware();
-               release_ds_buffers();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-}
-
-/*
- * Check if we can create event of a certain type (that no conflicting events
- * are present).
- */
-int x86_add_exclusive(unsigned int what)
-{
-       int i;
-
-       if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
-               mutex_lock(&pmc_reserve_mutex);
-               for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
-                       if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
-                               goto fail_unlock;
-               }
-               atomic_inc(&x86_pmu.lbr_exclusive[what]);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
-
-       atomic_inc(&active_events);
-       return 0;
-
-fail_unlock:
-       mutex_unlock(&pmc_reserve_mutex);
-       return -EBUSY;
-}
-
-void x86_del_exclusive(unsigned int what)
-{
-       atomic_dec(&x86_pmu.lbr_exclusive[what]);
-       atomic_dec(&active_events);
-}
-
-int x86_setup_perfctr(struct perf_event *event)
-{
-       struct perf_event_attr *attr = &event->attr;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       if (!is_sampling_event(event)) {
-               hwc->sample_period = x86_pmu.max_period;
-               hwc->last_period = hwc->sample_period;
-               local64_set(&hwc->period_left, hwc->sample_period);
-       }
-
-       if (attr->type == PERF_TYPE_RAW)
-               return x86_pmu_extra_regs(event->attr.config, event);
-
-       if (attr->type == PERF_TYPE_HW_CACHE)
-               return set_ext_hw_attr(hwc, event);
-
-       if (attr->config >= x86_pmu.max_events)
-               return -EINVAL;
-
-       /*
-        * The generic map:
-        */
-       config = x86_pmu.event_map(attr->config);
-
-       if (config == 0)
-               return -ENOENT;
-
-       if (config == -1LL)
-               return -EINVAL;
-
-       /*
-        * Branch tracing:
-        */
-       if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !attr->freq && hwc->sample_period == 1) {
-               /* BTS is not supported by this architecture. */
-               if (!x86_pmu.bts_active)
-                       return -EOPNOTSUPP;
-
-               /* BTS is currently only allowed for user-mode. */
-               if (!attr->exclude_kernel)
-                       return -EOPNOTSUPP;
-
-               /* disallow bts if conflicting events are present */
-               if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                       return -EBUSY;
-
-               event->destroy = hw_perf_lbr_event_destroy;
-       }
-
-       hwc->config |= config;
-
-       return 0;
-}
-
-/*
- * check that branch_sample_type is compatible with
- * settings needed for precise_ip > 1 which implies
- * using the LBR to capture ALL taken branches at the
- * priv levels of the measurement
- */
-static inline int precise_br_compat(struct perf_event *event)
-{
-       u64 m = event->attr.branch_sample_type;
-       u64 b = 0;
-
-       /* must capture all branches */
-       if (!(m & PERF_SAMPLE_BRANCH_ANY))
-               return 0;
-
-       m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_user)
-               b |= PERF_SAMPLE_BRANCH_USER;
-
-       if (!event->attr.exclude_kernel)
-               b |= PERF_SAMPLE_BRANCH_KERNEL;
-
-       /*
-        * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
-        */
-
-       return m == b;
-}
-
-int x86_pmu_hw_config(struct perf_event *event)
-{
-       if (event->attr.precise_ip) {
-               int precise = 0;
-
-               /* Support for constant skid */
-               if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
-                       precise++;
-
-                       /* Support for IP fixup */
-                       if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
-                               precise++;
-
-                       if (x86_pmu.pebs_prec_dist)
-                               precise++;
-               }
-
-               if (event->attr.precise_ip > precise)
-                       return -EOPNOTSUPP;
-       }
-       /*
-        * check that PEBS LBR correction does not conflict with
-        * whatever the user is asking with attr->branch_sample_type
-        */
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
-               u64 *br_type = &event->attr.branch_sample_type;
-
-               if (has_branch_stack(event)) {
-                       if (!precise_br_compat(event))
-                               return -EOPNOTSUPP;
-
-                       /* branch_sample_type is compatible */
-
-               } else {
-                       /*
-                        * user did not specify  branch_sample_type
-                        *
-                        * For PEBS fixups, we capture all
-                        * the branches at the priv level of the
-                        * event.
-                        */
-                       *br_type = PERF_SAMPLE_BRANCH_ANY;
-
-                       if (!event->attr.exclude_user)
-                               *br_type |= PERF_SAMPLE_BRANCH_USER;
-
-                       if (!event->attr.exclude_kernel)
-                               *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-               }
-       }
-
-       if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
-               event->attach_state |= PERF_ATTACH_TASK_DATA;
-
-       /*
-        * Generate PMC IRQs:
-        * (keep 'enabled' bit clear for now)
-        */
-       event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
-
-       /*
-        * Count user and OS events unless requested not to
-        */
-       if (!event->attr.exclude_user)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
-       if (!event->attr.exclude_kernel)
-               event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
-
-       if (event->attr.sample_period && x86_pmu.limit_period) {
-               if (x86_pmu.limit_period(event, event->attr.sample_period) >
-                               event->attr.sample_period)
-                       return -EINVAL;
-       }
-
-       return x86_setup_perfctr(event);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __x86_pmu_event_init(struct perf_event *event)
-{
-       int err;
-
-       if (!x86_pmu_initialized())
-               return -ENODEV;
-
-       err = x86_reserve_hardware();
-       if (err)
-               return err;
-
-       atomic_inc(&active_events);
-       event->destroy = hw_perf_event_destroy;
-
-       event->hw.idx = -1;
-       event->hw.last_cpu = -1;
-       event->hw.last_tag = ~0ULL;
-
-       /* mark unused */
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       return x86_pmu.hw_config(event);
-}
-
-void x86_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               u64 val;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               rdmsrl(x86_pmu_config_addr(idx), val);
-               if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
-                       continue;
-               val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               wrmsrl(x86_pmu_config_addr(idx), val);
-       }
-}
-
-static void x86_pmu_disable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (!cpuc->enabled)
-               return;
-
-       cpuc->n_added = 0;
-       cpuc->enabled = 0;
-       barrier();
-
-       x86_pmu.disable_all();
-}
-
-void x86_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static struct pmu pmu;
-
-static inline int is_x86_event(struct perf_event *event)
-{
-       return event->pmu == &pmu;
-}
-
-/*
- * Event scheduler state:
- *
- * Assign events iterating over all events and counters, beginning
- * with events with least weights first. Keep the current iterator
- * state in struct sched_state.
- */
-struct sched_state {
-       int     weight;
-       int     event;          /* event index */
-       int     counter;        /* counter index */
-       int     unassigned;     /* number of events to be assigned left */
-       int     nr_gp;          /* number of GP counters used */
-       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-};
-
-/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
-#define        SCHED_STATES_MAX        2
-
-struct perf_sched {
-       int                     max_weight;
-       int                     max_events;
-       int                     max_gp;
-       int                     saved_states;
-       struct event_constraint **constraints;
-       struct sched_state      state;
-       struct sched_state      saved[SCHED_STATES_MAX];
-};
-
-/*
- * Initialize interator that runs through all events and counters.
- */
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-                           int num, int wmin, int wmax, int gpmax)
-{
-       int idx;
-
-       memset(sched, 0, sizeof(*sched));
-       sched->max_events       = num;
-       sched->max_weight       = wmax;
-       sched->max_gp           = gpmax;
-       sched->constraints      = constraints;
-
-       for (idx = 0; idx < num; idx++) {
-               if (constraints[idx]->weight == wmin)
-                       break;
-       }
-
-       sched->state.event      = idx;          /* start with min weight */
-       sched->state.weight     = wmin;
-       sched->state.unassigned = num;
-}
-
-static void perf_sched_save_state(struct perf_sched *sched)
-{
-       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
-               return;
-
-       sched->saved[sched->saved_states] = sched->state;
-       sched->saved_states++;
-}
-
-static bool perf_sched_restore_state(struct perf_sched *sched)
-{
-       if (!sched->saved_states)
-               return false;
-
-       sched->saved_states--;
-       sched->state = sched->saved[sched->saved_states];
-
-       /* continue with next counter: */
-       clear_bit(sched->state.counter++, sched->state.used);
-
-       return true;
-}
-
-/*
- * Select a counter for the current event to schedule. Return true on
- * success.
- */
-static bool __perf_sched_find_counter(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-       int idx;
-
-       if (!sched->state.unassigned)
-               return false;
-
-       if (sched->state.event >= sched->max_events)
-               return false;
-
-       c = sched->constraints[sched->state.event];
-       /* Prefer fixed purpose counters */
-       if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
-               idx = INTEL_PMC_IDX_FIXED;
-               for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
-                       if (!__test_and_set_bit(idx, sched->state.used))
-                               goto done;
-               }
-       }
-
-       /* Grab the first unused counter starting with idx */
-       idx = sched->state.counter;
-       for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-               if (!__test_and_set_bit(idx, sched->state.used)) {
-                       if (sched->state.nr_gp++ >= sched->max_gp)
-                               return false;
-
-                       goto done;
-               }
-       }
-
-       return false;
-
-done:
-       sched->state.counter = idx;
-
-       if (c->overlap)
-               perf_sched_save_state(sched);
-
-       return true;
-}
-
-static bool perf_sched_find_counter(struct perf_sched *sched)
-{
-       while (!__perf_sched_find_counter(sched)) {
-               if (!perf_sched_restore_state(sched))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * Go through all unassigned events and find the next one to schedule.
- * Take events with the least weight first. Return true on success.
- */
-static bool perf_sched_next_event(struct perf_sched *sched)
-{
-       struct event_constraint *c;
-
-       if (!sched->state.unassigned || !--sched->state.unassigned)
-               return false;
-
-       do {
-               /* next event */
-               sched->state.event++;
-               if (sched->state.event >= sched->max_events) {
-                       /* next weight */
-                       sched->state.event = 0;
-                       sched->state.weight++;
-                       if (sched->state.weight > sched->max_weight)
-                               return false;
-               }
-               c = sched->constraints[sched->state.event];
-       } while (c->weight != sched->state.weight);
-
-       sched->state.counter = 0;       /* start with first counter */
-
-       return true;
-}
-
-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign)
-{
-       struct perf_sched sched;
-
-       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
-
-       do {
-               if (!perf_sched_find_counter(&sched))
-                       break;  /* failed */
-               if (assign)
-                       assign[sched.state.event] = sched.state.counter;
-       } while (perf_sched_next_event(&sched));
-
-       return sched.state.unassigned;
-}
-EXPORT_SYMBOL_GPL(perf_assign_events);
-
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       struct event_constraint *c;
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       struct perf_event *e;
-       int i, wmin, wmax, unsched = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
-       if (x86_pmu.start_scheduling)
-               x86_pmu.start_scheduling(cpuc);
-
-       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               cpuc->event_constraint[i] = NULL;
-               c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-               cpuc->event_constraint[i] = c;
-
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /*
-        * fastpath, try to reuse previous register
-        */
-       for (i = 0; i < n; i++) {
-               hwc = &cpuc->event_list[i]->hw;
-               c = cpuc->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-
-       /* slow path */
-       if (i != n) {
-               int gpmax = x86_pmu.num_counters;
-
-               /*
-                * Do not allow scheduling of more than half the available
-                * generic counters.
-                *
-                * This helps avoid counter starvation of sibling thread by
-                * ensuring at most half the counters cannot be in exclusive
-                * mode. There is no designated counters for the limits. Any
-                * N/2 counters can be used. This helps with events with
-                * specific counter constraints.
-                */
-               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
-                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
-                       gpmax /= 2;
-
-               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-                                            wmax, gpmax, assign);
-       }
-
-       /*
-        * In case of success (unsched = 0), mark events as committed,
-        * so we do not put_constraint() in case new events are added
-        * and fail to be scheduled
-        *
-        * We invoke the lower level commit callback to lock the resource
-        *
-        * We do not need to do all of this in case we are called to
-        * validate an event group (assign == NULL)
-        */
-       if (!unsched && assign) {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       e->hw.flags |= PERF_X86_EVENT_COMMITTED;
-                       if (x86_pmu.commit_scheduling)
-                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
-               }
-       } else {
-               for (i = 0; i < n; i++) {
-                       e = cpuc->event_list[i];
-                       /*
-                        * do not put_constraint() on comitted events,
-                        * because they are good to go
-                        */
-                       if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
-                               continue;
-
-                       /*
-                        * release events that failed scheduling
-                        */
-                       if (x86_pmu.put_event_constraints)
-                               x86_pmu.put_event_constraints(cpuc, e);
-               }
-       }
-
-       if (x86_pmu.stop_scheduling)
-               x86_pmu.stop_scheduling(cpuc);
-
-       return unsched ? -EINVAL : 0;
-}
-
-/*
- * dogrp: true if must collect siblings events (group)
- * returns total number of events and error code
- */
-static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
-
-       /* current number of events already accepted */
-       n = cpuc->n_events;
-
-       if (is_x86_event(leader)) {
-               if (n >= max_count)
-                       return -EINVAL;
-               cpuc->event_list[n] = leader;
-               n++;
-       }
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_x86_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               cpuc->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static inline void x86_assign_hw_event(struct perf_event *event,
-                               struct cpu_hw_events *cpuc, int i)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = cpuc->assign[i];
-       hwc->last_cpu = smp_processor_id();
-       hwc->last_tag = ++cpuc->tags[i];
-
-       if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
-               hwc->config_base = 0;
-               hwc->event_base = 0;
-       } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
-               hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-               hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
-               hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
-       } else {
-               hwc->config_base = x86_pmu_config_addr(hwc->idx);
-               hwc->event_base  = x86_pmu_event_addr(hwc->idx);
-               hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
-       }
-}
-
-static inline int match_prev_assignment(struct hw_perf_event *hwc,
-                                       struct cpu_hw_events *cpuc,
-                                       int i)
-{
-       return hwc->idx == cpuc->assign[i] &&
-               hwc->last_cpu == smp_processor_id() &&
-               hwc->last_tag == cpuc->tags[i];
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags);
-
-static void x86_pmu_enable(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int i, added = cpuc->n_added;
-
-       if (!x86_pmu_initialized())
-               return;
-
-       if (cpuc->enabled)
-               return;
-
-       if (cpuc->n_added) {
-               int n_running = cpuc->n_events - cpuc->n_added;
-               /*
-                * apply assignment obtained either from
-                * hw_perf_group_sched_in() or x86_pmu_enable()
-                *
-                * step1: save events moving to new counters
-                */
-               for (i = 0; i < n_running; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       /*
-                        * we can avoid reprogramming counter if:
-                        * - assigned same counter as last time
-                        * - running on same CPU as last time
-                        * - no other event has used the counter since
-                        */
-                       if (hwc->idx == -1 ||
-                           match_prev_assignment(hwc, cpuc, i))
-                               continue;
-
-                       /*
-                        * Ensure we don't accidentally enable a stopped
-                        * counter simply because we rescheduled.
-                        */
-                       if (hwc->state & PERF_HES_STOPPED)
-                               hwc->state |= PERF_HES_ARCH;
-
-                       x86_pmu_stop(event, PERF_EF_UPDATE);
-               }
-
-               /*
-                * step2: reprogram moved events into new counters
-                */
-               for (i = 0; i < cpuc->n_events; i++) {
-                       event = cpuc->event_list[i];
-                       hwc = &event->hw;
-
-                       if (!match_prev_assignment(hwc, cpuc, i))
-                               x86_assign_hw_event(event, cpuc, i);
-                       else if (i < n_running)
-                               continue;
-
-                       if (hwc->state & PERF_HES_ARCH)
-                               continue;
-
-                       x86_pmu_start(event, PERF_EF_RELOAD);
-               }
-               cpuc->n_added = 0;
-               perf_events_lapic_init();
-       }
-
-       cpuc->enabled = 1;
-       barrier();
-
-       x86_pmu.enable_all(added);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the event disabled in hw:
- */
-int x86_perf_event_set_period(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int ret = 0, idx = hwc->idx;
-
-       if (idx == INTEL_PMC_IDX_FIXED_BTS)
-               return 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-
-       if (unlikely(left <= 0)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               ret = 1;
-       }
-       /*
-        * Quirk: certain CPUs dont like it if just 1 hw_event is left:
-        */
-       if (unlikely(left < 2))
-               left = 2;
-
-       if (left > x86_pmu.max_period)
-               left = x86_pmu.max_period;
-
-       if (x86_pmu.limit_period)
-               left = x86_pmu.limit_period(event, left);
-
-       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
-       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
-           local64_read(&hwc->prev_count) != (u64)-left) {
-               /*
-                * The hw event starts counting from this event offset,
-                * mark it to be able to extra future deltas:
-                */
-               local64_set(&hwc->prev_count, (u64)-left);
-
-               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       /*
-        * Due to erratum on certan cpu we need
-        * a second write to be sure the register
-        * is updated properly
-        */
-       if (x86_pmu.perfctr_second_write) {
-               wrmsrl(hwc->event_base,
-                       (u64)(-left) & x86_pmu.cntval_mask);
-       }
-
-       perf_event_update_userpage(event);
-
-       return ret;
-}
-
-void x86_pmu_enable_event(struct perf_event *event)
-{
-       if (__this_cpu_read(cpu_hw_events.enabled))
-               __x86_pmu_enable_event(&event->hw,
-                                      ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Add a single event to the PMU.
- *
- * The event is added to the group of enabled events
- * but only if it can be scehduled with existing events.
- */
-static int x86_pmu_add(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc;
-       int assign[X86_PMC_IDX_MAX];
-       int n, n0, ret;
-
-       hwc = &event->hw;
-
-       n0 = cpuc->n_events;
-       ret = n = collect_events(cpuc, event, false);
-       if (ret < 0)
-               goto out;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       /*
-        * If group events scheduling transaction was started,
-        * skip the schedulability test here, it will be performed
-        * at commit time (->commit_txn) as a whole.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               goto done_collect;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               goto out;
-       /*
-        * copy new assignment, now we know it is possible
-        * will be used by hw_perf_enable()
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-done_collect:
-       /*
-        * Commit the collect_events() state. See x86_pmu_del() and
-        * x86_pmu_*_txn().
-        */
-       cpuc->n_events = n;
-       cpuc->n_added += n - n0;
-       cpuc->n_txn += n - n0;
-
-       ret = 0;
-out:
-       return ret;
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1))
-               return;
-
-       if (flags & PERF_EF_RELOAD) {
-               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-               x86_perf_event_set_period(event);
-       }
-
-       event->hw.state = 0;
-
-       cpuc->events[idx] = event;
-       __set_bit(idx, cpuc->active_mask);
-       __set_bit(idx, cpuc->running);
-       x86_pmu.enable(event);
-       perf_event_update_userpage(event);
-}
-
-void perf_event_print_debug(void)
-{
-       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-       u64 pebs, debugctl;
-       struct cpu_hw_events *cpuc;
-       unsigned long flags;
-       int cpu, idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.version >= 2) {
-               rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-               rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-               pr_info("\n");
-               pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-               pr_info("CPU#%d: status:     %016llx\n", cpu, status);
-               pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
-               pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-               if (x86_pmu.pebs_constraints) {
-                       rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
-                       pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
-               }
-               if (x86_pmu.lbr_nr) {
-                       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-                       pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
-               }
-       }
-       pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
-               rdmsrl(x86_pmu_event_addr(idx), pmc_count);
-
-               prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
-               pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
-                       cpu, idx, pmc_ctrl);
-               pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-               pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
-                       cpu, idx, prev_left);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-               rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
-               pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
-                       cpu, idx, pmc_count);
-       }
-       local_irq_restore(flags);
-}
-
-void x86_pmu_stop(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
-               x86_pmu.disable(event);
-               cpuc->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               x86_perf_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static void x86_pmu_del(struct perf_event *event, int flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int i;
-
-       /*
-        * event is descheduled
-        */
-       event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
-       /*
-        * If we're called during a txn, we don't need to do anything.
-        * The events never got scheduled and ->cancel_txn will truncate
-        * the event_list.
-        *
-        * XXX assumes any ->del() called during a TXN will only be on
-        * an event added during that same TXN.
-        */
-       if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Not a TXN, therefore cleanup properly.
-        */
-       x86_pmu_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < cpuc->n_events; i++) {
-               if (event == cpuc->event_list[i])
-                       break;
-       }
-
-       if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
-               return;
-
-       /* If we have a newly added event; make sure to decrease n_added. */
-       if (i >= cpuc->n_events - cpuc->n_added)
-               --cpuc->n_added;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(cpuc, event);
-
-       /* Delete the array entry. */
-       while (++i < cpuc->n_events) {
-               cpuc->event_list[i-1] = cpuc->event_list[i];
-               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
-       }
-       --cpuc->n_events;
-
-       perf_event_update_userpage(event);
-}
-
-int x86_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * Some chipsets need to unmask the LVTPC in a particular spot
-        * inside the nmi handler.  As a result, the unmasking was pushed
-        * into all the nmi handlers.
-        *
-        * This generic handler doesn't seem to have any issues where the
-        * unmasking occurs so it was left at the top.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /*
-                        * Though we deactivated the counter some cpus
-                        * might still deliver spurious interrupts still
-                        * in flight. Catch them:
-                        */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-
-               val = x86_perf_event_update(event);
-               if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
-                       continue;
-
-               /*
-                * event overflow
-                */
-               handled++;
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-
-void perf_events_lapic_init(void)
-{
-       if (!x86_pmu.apic || !x86_pmu_initialized())
-               return;
-
-       /*
-        * Always use NMI for PMU
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static int
-perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       u64 start_clock;
-       u64 finish_clock;
-       int ret;
-
-       /*
-        * All PMUs/events that share this PMI handler should make sure to
-        * increment active_events for their events.
-        */
-       if (!atomic_read(&active_events))
-               return NMI_DONE;
-
-       start_clock = sched_clock();
-       ret = x86_pmu.handle_irq(regs);
-       finish_clock = sched_clock();
-
-       perf_sample_event_took(finish_clock - start_clock);
-
-       return ret;
-}
-NOKPROBE_SYMBOL(perf_event_nmi_handler);
-
-struct event_constraint emptyconstraint;
-struct event_constraint unconstrained;
-
-static int
-x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int i, ret = NOTIFY_OK;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
-                       cpuc->kfree_on_online[i] = NULL;
-               if (x86_pmu.cpu_prepare)
-                       ret = x86_pmu.cpu_prepare(cpu);
-               break;
-
-       case CPU_STARTING:
-               if (x86_pmu.cpu_starting)
-                       x86_pmu.cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
-                       kfree(cpuc->kfree_on_online[i]);
-                       cpuc->kfree_on_online[i] = NULL;
-               }
-               break;
-
-       case CPU_DYING:
-               if (x86_pmu.cpu_dying)
-                       x86_pmu.cpu_dying(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               if (x86_pmu.cpu_dead)
-                       x86_pmu.cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return ret;
-}
-
-static void __init pmu_check_apic(void)
-{
-       if (cpu_has_apic)
-               return;
-
-       x86_pmu.apic = 0;
-       pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-       pr_info("no hardware sampling interrupt available.\n");
-
-       /*
-        * If we have a PMU initialized but no APIC
-        * interrupts, we cannot sample hardware
-        * events (user-space has to fall back and
-        * sample via a hrtimer based software event):
-        */
-       pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
-}
-
-static struct attribute_group x86_pmu_format_group = {
-       .name = "format",
-       .attrs = NULL,
-};
-
-/*
- * Remove all undefined events (x86_pmu.event_map(id) == 0)
- * out of events_attr attributes.
- */
-static void __init filter_events(struct attribute **attrs)
-{
-       struct device_attribute *d;
-       struct perf_pmu_events_attr *pmu_attr;
-       int offset = 0;
-       int i, j;
-
-       for (i = 0; attrs[i]; i++) {
-               d = (struct device_attribute *)attrs[i];
-               pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
-               /* str trumps id */
-               if (pmu_attr->event_str)
-                       continue;
-               if (x86_pmu.event_map(i + offset))
-                       continue;
-
-               for (j = i; attrs[j]; j++)
-                       attrs[j] = attrs[j + 1];
-
-               /* Check the shifted attr. */
-               i--;
-
-               /*
-                * event_map() is index based, the attrs array is organized
-                * by increasing event index. If we shift the events, then
-                * we need to compensate for the event_map(), otherwise
-                * we are looking up the wrong event in the map
-                */
-               offset++;
-       }
-}
-
-/* Merge two pointer arrays */
-__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
-{
-       struct attribute **new;
-       int j, i;
-
-       for (j = 0; a[j]; j++)
-               ;
-       for (i = 0; b[i]; i++)
-               j++;
-       j++;
-
-       new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
-       if (!new)
-               return NULL;
-
-       j = 0;
-       for (i = 0; a[i]; i++)
-               new[j++] = a[i];
-       for (i = 0; b[i]; i++)
-               new[j++] = b[i];
-       new[j] = NULL;
-
-       return new;
-}
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page)
-{
-       struct perf_pmu_events_attr *pmu_attr = \
-               container_of(attr, struct perf_pmu_events_attr, attr);
-       u64 config = x86_pmu.event_map(pmu_attr->id);
-
-       /* string trumps id */
-       if (pmu_attr->event_str)
-               return sprintf(page, "%s", pmu_attr->event_str);
-
-       return x86_pmu.events_sysfs_show(page, config);
-}
-
-EVENT_ATTR(cpu-cycles,                 CPU_CYCLES              );
-EVENT_ATTR(instructions,               INSTRUCTIONS            );
-EVENT_ATTR(cache-references,           CACHE_REFERENCES        );
-EVENT_ATTR(cache-misses,               CACHE_MISSES            );
-EVENT_ATTR(branch-instructions,                BRANCH_INSTRUCTIONS     );
-EVENT_ATTR(branch-misses,              BRANCH_MISSES           );
-EVENT_ATTR(bus-cycles,                 BUS_CYCLES              );
-EVENT_ATTR(stalled-cycles-frontend,    STALLED_CYCLES_FRONTEND );
-EVENT_ATTR(stalled-cycles-backend,     STALLED_CYCLES_BACKEND  );
-EVENT_ATTR(ref-cycles,                 REF_CPU_CYCLES          );
-
-static struct attribute *empty_attrs;
-
-static struct attribute *events_attr[] = {
-       EVENT_PTR(CPU_CYCLES),
-       EVENT_PTR(INSTRUCTIONS),
-       EVENT_PTR(CACHE_REFERENCES),
-       EVENT_PTR(CACHE_MISSES),
-       EVENT_PTR(BRANCH_INSTRUCTIONS),
-       EVENT_PTR(BRANCH_MISSES),
-       EVENT_PTR(BUS_CYCLES),
-       EVENT_PTR(STALLED_CYCLES_FRONTEND),
-       EVENT_PTR(STALLED_CYCLES_BACKEND),
-       EVENT_PTR(REF_CPU_CYCLES),
-       NULL,
-};
-
-static struct attribute_group x86_pmu_events_group = {
-       .name = "events",
-       .attrs = events_attr,
-};
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
-{
-       u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-       u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
-       bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
-       bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
-       bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
-       bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
-       ssize_t ret;
-
-       /*
-       * We have whole page size to spend and just little data
-       * to write, so we can safely use sprintf.
-       */
-       ret = sprintf(page, "event=0x%02llx", event);
-
-       if (umask)
-               ret += sprintf(page + ret, ",umask=0x%02llx", umask);
-
-       if (edge)
-               ret += sprintf(page + ret, ",edge");
-
-       if (pc)
-               ret += sprintf(page + ret, ",pc");
-
-       if (any)
-               ret += sprintf(page + ret, ",any");
-
-       if (inv)
-               ret += sprintf(page + ret, ",inv");
-
-       if (cmask)
-               ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
-
-       ret += sprintf(page + ret, "\n");
-
-       return ret;
-}
-
-static int __init init_hw_perf_events(void)
-{
-       struct x86_pmu_quirk *quirk;
-       int err;
-
-       pr_info("Performance Events: ");
-
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_INTEL:
-               err = intel_pmu_init();
-               break;
-       case X86_VENDOR_AMD:
-               err = amd_pmu_init();
-               break;
-       default:
-               err = -ENOTSUPP;
-       }
-       if (err != 0) {
-               pr_cont("no PMU driver, software events only.\n");
-               return 0;
-       }
-
-       pmu_check_apic();
-
-       /* sanity check that the hardware exists or is emulated */
-       if (!check_hw_exists())
-               return 0;
-
-       pr_cont("%s PMU driver.\n", x86_pmu.name);
-
-       x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
-
-       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
-               quirk->func();
-
-       if (!x86_pmu.intel_ctrl)
-               x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       perf_events_lapic_init();
-       register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
-
-       unconstrained = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
-                                  0, x86_pmu.num_counters, 0, 0);
-
-       x86_pmu_format_group.attrs = x86_pmu.format_attrs;
-
-       if (x86_pmu.event_attrs)
-               x86_pmu_events_group.attrs = x86_pmu.event_attrs;
-
-       if (!x86_pmu.events_sysfs_show)
-               x86_pmu_events_group.attrs = &empty_attrs;
-       else
-               filter_events(x86_pmu_events_group.attrs);
-
-       if (x86_pmu.cpu_events) {
-               struct attribute **tmp;
-
-               tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
-               if (!WARN_ON(!tmp))
-                       x86_pmu_events_group.attrs = tmp;
-       }
-
-       pr_info("... version:                %d\n",     x86_pmu.version);
-       pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
-       pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
-       pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
-       pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-       pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
-       pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
-
-       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
-       perf_cpu_notifier(x86_pmu_notifier);
-
-       return 0;
-}
-early_initcall(init_hw_perf_events);
-
-static inline void x86_pmu_read(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test, it will be performed at commit time
- *
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
- */
-static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
-
-       cpuc->txn_flags = txn_flags;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       perf_pmu_disable(pmu);
-       __this_cpu_write(cpu_hw_events.n_txn, 0);
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-static void x86_pmu_cancel_txn(struct pmu *pmu)
-{
-       unsigned int txn_flags;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       txn_flags = cpuc->txn_flags;
-       cpuc->txn_flags = 0;
-       if (txn_flags & ~PERF_PMU_TXN_ADD)
-               return;
-
-       /*
-        * Truncate collected array by the number of events added in this
-        * transaction. See x86_pmu_add() and x86_pmu_*_txn().
-        */
-       __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
-       __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
-       perf_pmu_enable(pmu);
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 if success
- *
- * Does not cancel the transaction on failure; expects the caller to do this.
- */
-static int x86_pmu_commit_txn(struct pmu *pmu)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int assign[X86_PMC_IDX_MAX];
-       int n, ret;
-
-       WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
-
-       if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
-               cpuc->txn_flags = 0;
-               return 0;
-       }
-
-       n = cpuc->n_events;
-
-       if (!x86_pmu_initialized())
-               return -EAGAIN;
-
-       ret = x86_pmu.schedule_events(cpuc, n, assign);
-       if (ret)
-               return ret;
-
-       /*
-        * copy new assignment, now we know it is possible
-        * will be used by hw_perf_enable()
-        */
-       memcpy(cpuc->assign, assign, n*sizeof(int));
-
-       cpuc->txn_flags = 0;
-       perf_pmu_enable(pmu);
-       return 0;
-}
-/*
- * a fake_cpuc is used to validate event groups. Due to
- * the extra reg logic, we need to also allocate a fake
- * per_core and per_cpu structure. Otherwise, group events
- * using extra reg may conflict without the kernel being
- * able to catch this when the last event gets added to
- * the group.
- */
-static void free_fake_cpuc(struct cpu_hw_events *cpuc)
-{
-       kfree(cpuc->shared_regs);
-       kfree(cpuc);
-}
-
-static struct cpu_hw_events *allocate_fake_cpuc(void)
-{
-       struct cpu_hw_events *cpuc;
-       int cpu = raw_smp_processor_id();
-
-       cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
-       if (!cpuc)
-               return ERR_PTR(-ENOMEM);
-
-       /* only needed, if we have extra_regs */
-       if (x86_pmu.extra_regs) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto error;
-       }
-       cpuc->is_fake = 1;
-       return cpuc;
-error:
-       free_fake_cpuc(cpuc);
-       return ERR_PTR(-ENOMEM);
-}
-
-/*
- * validate that we can schedule this event
- */
-static int validate_event(struct perf_event *event)
-{
-       struct cpu_hw_events *fake_cpuc;
-       struct event_constraint *c;
-       int ret = 0;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-
-       c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
-
-       if (!c || !c->weight)
-               ret = -EINVAL;
-
-       if (x86_pmu.put_event_constraints)
-               x86_pmu.put_event_constraints(fake_cpuc, event);
-
-       free_fake_cpuc(fake_cpuc);
-
-       return ret;
-}
-
-/*
- * validate a single event group
- *
- * validation include:
- *     - check events are compatible which each other
- *     - events do not compete for the same counter
- *     - number of events <= number of counters
- *
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int validate_group(struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct cpu_hw_events *fake_cpuc;
-       int ret = -EINVAL, n;
-
-       fake_cpuc = allocate_fake_cpuc();
-       if (IS_ERR(fake_cpuc))
-               return PTR_ERR(fake_cpuc);
-       /*
-        * the event is not yet connected with its
-        * siblings therefore we must first collect
-        * existing siblings, then add the new event
-        * before we can simulate the scheduling
-        */
-       n = collect_events(fake_cpuc, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-       n = collect_events(fake_cpuc, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_cpuc->n_events = n;
-
-       ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-
-out:
-       free_fake_cpuc(fake_cpuc);
-       return ret;
-}
-
-static int x86_pmu_event_init(struct perf_event *event)
-{
-       struct pmu *tmp;
-       int err;
-
-       switch (event->attr.type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               break;
-
-       default:
-               return -ENOENT;
-       }
-
-       err = __x86_pmu_event_init(event);
-       if (!err) {
-               /*
-                * we temporarily connect event to its pmu
-                * such that validate_group() can classify
-                * it as an x86 event using is_x86_event()
-                */
-               tmp = event->pmu;
-               event->pmu = &pmu;
-
-               if (event->group_leader != event)
-                       err = validate_group(event);
-               else
-                       err = validate_event(event);
-
-               event->pmu = tmp;
-       }
-       if (err) {
-               if (event->destroy)
-                       event->destroy(event);
-       }
-
-       if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
-               event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
-
-       return err;
-}
-
-static void refresh_pce(void *ignored)
-{
-       if (current->mm)
-               load_mm_cr4(current->mm);
-}
-
-static void x86_pmu_event_mapped(struct perf_event *event)
-{
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static void x86_pmu_event_unmapped(struct perf_event *event)
-{
-       if (!current->mm)
-               return;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return;
-
-       if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
-               on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
-}
-
-static int x86_pmu_event_idx(struct perf_event *event)
-{
-       int idx = event->hw.idx;
-
-       if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
-               return 0;
-
-       if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
-               idx -= INTEL_PMC_IDX_FIXED;
-               idx |= 1 << 30;
-       }
-
-       return idx + 1;
-}
-
-static ssize_t get_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
-}
-
-static ssize_t set_attr_rdpmc(struct device *cdev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t count)
-{
-       unsigned long val;
-       ssize_t ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-
-       if (val > 2)
-               return -EINVAL;
-
-       if (x86_pmu.attr_rdpmc_broken)
-               return -ENOTSUPP;
-
-       if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
-               /*
-                * Changing into or out of always available, aka
-                * perf-event-bypassing mode.  This path is extremely slow,
-                * but only root can trigger it, so it's okay.
-                */
-               if (val == 2)
-                       static_key_slow_inc(&rdpmc_always_available);
-               else
-                       static_key_slow_dec(&rdpmc_always_available);
-               on_each_cpu(refresh_pce, NULL, 1);
-       }
-
-       x86_pmu.attr_rdpmc = val;
-
-       return count;
-}
-
-static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
-
-static struct attribute *x86_pmu_attrs[] = {
-       &dev_attr_rdpmc.attr,
-       NULL,
-};
-
-static struct attribute_group x86_pmu_attr_group = {
-       .attrs = x86_pmu_attrs,
-};
-
-static const struct attribute_group *x86_pmu_attr_groups[] = {
-       &x86_pmu_attr_group,
-       &x86_pmu_format_group,
-       &x86_pmu_events_group,
-       NULL,
-};
-
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (x86_pmu.sched_task)
-               x86_pmu.sched_task(ctx, sched_in);
-}
-
-void perf_check_microcode(void)
-{
-       if (x86_pmu.check_microcode)
-               x86_pmu.check_microcode();
-}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
-
-static struct pmu pmu = {
-       .pmu_enable             = x86_pmu_enable,
-       .pmu_disable            = x86_pmu_disable,
-
-       .attr_groups            = x86_pmu_attr_groups,
-
-       .event_init             = x86_pmu_event_init,
-
-       .event_mapped           = x86_pmu_event_mapped,
-       .event_unmapped         = x86_pmu_event_unmapped,
-
-       .add                    = x86_pmu_add,
-       .del                    = x86_pmu_del,
-       .start                  = x86_pmu_start,
-       .stop                   = x86_pmu_stop,
-       .read                   = x86_pmu_read,
-
-       .start_txn              = x86_pmu_start_txn,
-       .cancel_txn             = x86_pmu_cancel_txn,
-       .commit_txn             = x86_pmu_commit_txn,
-
-       .event_idx              = x86_pmu_event_idx,
-       .sched_task             = x86_pmu_sched_task,
-       .task_ctx_size          = sizeof(struct x86_perf_task_context),
-};
-
-void arch_perf_update_userpage(struct perf_event *event,
-                              struct perf_event_mmap_page *userpg, u64 now)
-{
-       struct cyc2ns_data *data;
-
-       userpg->cap_user_time = 0;
-       userpg->cap_user_time_zero = 0;
-       userpg->cap_user_rdpmc =
-               !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
-       userpg->pmc_width = x86_pmu.cntval_bits;
-
-       if (!sched_clock_stable())
-               return;
-
-       data = cyc2ns_read_begin();
-
-       /*
-        * Internal timekeeping for enabled/running/stopped times
-        * is always in the local_clock domain.
-        */
-       userpg->cap_user_time = 1;
-       userpg->time_mult = data->cyc2ns_mul;
-       userpg->time_shift = data->cyc2ns_shift;
-       userpg->time_offset = data->cyc2ns_offset - now;
-
-       /*
-        * cap_user_time_zero doesn't make sense when we're using a different
-        * time base for the records.
-        */
-       if (event->clock == &local_clock) {
-               userpg->cap_user_time_zero = 1;
-               userpg->time_zero = data->cyc2ns_offset;
-       }
-
-       cyc2ns_read_end(data);
-}
-
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
-       return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       struct perf_callchain_entry *entry = data;
-
-       perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-       .stack                  = backtrace_stack,
-       .address                = backtrace_address,
-       .walk_stack             = print_context_stack_bp,
-};
-
-void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return;
-       }
-
-       perf_callchain_store(entry, regs->ip);
-
-       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-static inline int
-valid_user_frame(const void __user *fp, unsigned long size)
-{
-       return (__range_not_ok(fp, size, TASK_SIZE) == 0);
-}
-
-static unsigned long get_segment_base(unsigned int segment)
-{
-       struct desc_struct *desc;
-       int idx = segment >> 3;
-
-       if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
-               struct ldt_struct *ldt;
-
-               if (idx > LDT_ENTRIES)
-                       return 0;
-
-               /* IRQs are off, so this synchronizes with smp_store_release */
-               ldt = lockless_dereference(current->active_mm->context.ldt);
-               if (!ldt || idx > ldt->size)
-                       return 0;
-
-               desc = &ldt->entries[idx];
-#else
-               return 0;
-#endif
-       } else {
-               if (idx > GDT_ENTRIES)
-                       return 0;
-
-               desc = raw_cpu_ptr(gdt_page.gdt) + idx;
-       }
-
-       return get_desc_base(desc);
-}
-
-#ifdef CONFIG_IA32_EMULATION
-
-#include <asm/compat.h>
-
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       /* 32-bit process in 64-bit kernel. */
-       unsigned long ss_base, cs_base;
-       struct stack_frame_ia32 frame;
-       const void __user *fp;
-
-       if (!test_thread_flag(TIF_IA32))
-               return 0;
-
-       cs_base = get_segment_base(regs->cs);
-       ss_base = get_segment_base(regs->ss);
-
-       fp = compat_ptr(ss_base + regs->bp);
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame     = 0;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 8))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, cs_base + frame.return_address);
-               fp = compat_ptr(ss_base + frame.next_frame);
-       }
-       pagefault_enable();
-       return 1;
-}
-#else
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-    return 0;
-}
-#endif
-
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
-       struct stack_frame frame;
-       const void __user *fp;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return;
-       }
-
-       /*
-        * We don't know what to do with VM86 stacks.. ignore them for now.
-        */
-       if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
-               return;
-
-       fp = (void __user *)regs->bp;
-
-       perf_callchain_store(entry, regs->ip);
-
-       if (!current->mm)
-               return;
-
-       if (perf_callchain_user32(regs, entry))
-               return;
-
-       pagefault_disable();
-       while (entry->nr < PERF_MAX_STACK_DEPTH) {
-               unsigned long bytes;
-               frame.next_frame             = NULL;
-               frame.return_address = 0;
-
-               if (!access_ok(VERIFY_READ, fp, 16))
-                       break;
-
-               bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
-               if (bytes != 0)
-                       break;
-               bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
-               if (bytes != 0)
-                       break;
-
-               if (!valid_user_frame(fp, sizeof(frame)))
-                       break;
-
-               perf_callchain_store(entry, frame.return_address);
-               fp = (void __user *)frame.next_frame;
-       }
-       pagefault_enable();
-}
-
-/*
- * Deal with code segment offsets for the various execution modes:
- *
- *   VM86 - the good olde 16 bit days, where the linear address is
- *          20 bits and we use regs->ip + 0x10 * regs->cs.
- *
- *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
- *          to figure out what the 32bit base address is.
- *
- *    X32 - has TIF_X32 set, but is running in x86_64
- *
- * X86_64 - CS,DS,SS,ES are all zero based.
- */
-static unsigned long code_segment_base(struct pt_regs *regs)
-{
-       /*
-        * For IA32 we look at the GDT/LDT segment base to convert the
-        * effective IP to a linear address.
-        */
-
-#ifdef CONFIG_X86_32
-       /*
-        * If we are in VM86 mode, add the segment offset to convert to a
-        * linear address.
-        */
-       if (regs->flags & X86_VM_MASK)
-               return 0x10 * regs->cs;
-
-       if (user_mode(regs) && regs->cs != __USER_CS)
-               return get_segment_base(regs->cs);
-#else
-       if (user_mode(regs) && !user_64bit_mode(regs) &&
-           regs->cs != __USER32_CS)
-               return get_segment_base(regs->cs);
-#endif
-       return 0;
-}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-               return perf_guest_cbs->get_guest_ip();
-
-       return regs->ip + code_segment_base(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
-       int misc = 0;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               if (perf_guest_cbs->is_user_mode())
-                       misc |= PERF_RECORD_MISC_GUEST_USER;
-               else
-                       misc |= PERF_RECORD_MISC_GUEST_KERNEL;
-       } else {
-               if (user_mode(regs))
-                       misc |= PERF_RECORD_MISC_USER;
-               else
-                       misc |= PERF_RECORD_MISC_KERNEL;
-       }
-
-       if (regs->flags & PERF_EFLAGS_EXACT)
-               misc |= PERF_RECORD_MISC_EXACT_IP;
-
-       return misc;
-}
-
-void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
-{
-       cap->version            = x86_pmu.version;
-       cap->num_counters_gp    = x86_pmu.num_counters;
-       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
-       cap->bit_width_gp       = x86_pmu.cntval_bits;
-       cap->bit_width_fixed    = x86_pmu.cntval_bits;
-       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
-       cap->events_mask_len    = x86_pmu.events_mask_len;
-}
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
deleted file mode 100644 (file)
index 7bb61e3..0000000
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Performance events x86 architecture header
- *
- *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2009 Jaswinder Singh Rajput
- *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
- *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- *  Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-/* To enable MSR tracing please use the generic trace points. */
-
-/*
- *          |   NHM/WSM    |      SNB     |
- * register -------------------------------
- *          |  HT  | no HT |  HT  | no HT |
- *-----------------------------------------
- * offcore  | core | core  | cpu  | core  |
- * lbr_sel  | core | core  | cpu  | core  |
- * ld_lat   | cpu  | core  | cpu  | core  |
- *-----------------------------------------
- *
- * Given that there is a small number of shared regs,
- * we can pre-allocate their slot in the per-cpu
- * per-core reg tables.
- */
-enum extra_reg_type {
-       EXTRA_REG_NONE  = -1,   /* not used */
-
-       EXTRA_REG_RSP_0 = 0,    /* offcore_response_0 */
-       EXTRA_REG_RSP_1 = 1,    /* offcore_response_1 */
-       EXTRA_REG_LBR   = 2,    /* lbr_select */
-       EXTRA_REG_LDLAT = 3,    /* ld_lat_threshold */
-       EXTRA_REG_FE    = 4,    /* fe_* */
-
-       EXTRA_REG_MAX           /* number of entries needed */
-};
-
-struct event_constraint {
-       union {
-               unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-               u64             idxmsk64;
-       };
-       u64     code;
-       u64     cmask;
-       int     weight;
-       int     overlap;
-       int     flags;
-};
-/*
- * struct hw_perf_event.flags flags
- */
-#define PERF_X86_EVENT_PEBS_LDLAT      0x0001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST         0x0002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW     0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED       0x0008 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW     0x0010 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW     0x0020 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
-#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
-
-
-struct amd_nb {
-       int nb_id;  /* NorthBridge id */
-       int refcnt; /* reference count */
-       struct perf_event *owners[X86_PMC_IDX_MAX];
-       struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS                8
-
-/*
- * Flags PEBS can handle without an PMI.
- *
- * TID can only be handled by flushing at context switch.
- *
- */
-#define PEBS_FREERUNNING_FLAGS \
-       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
-       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
-       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-       PERF_SAMPLE_TRANSACTION)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-       u64     bts_buffer_base;
-       u64     bts_index;
-       u64     bts_absolute_maximum;
-       u64     bts_interrupt_threshold;
-       u64     pebs_buffer_base;
-       u64     pebs_index;
-       u64     pebs_absolute_maximum;
-       u64     pebs_interrupt_threshold;
-       u64     pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-/*
- * Per register state.
- */
-struct er_account {
-       raw_spinlock_t          lock;   /* per-core: protect structure */
-       u64                 config;     /* extra MSR config */
-       u64                 reg;        /* extra MSR number */
-       atomic_t            ref;        /* reference count */
-};
-
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-struct intel_shared_regs {
-       struct er_account       regs[EXTRA_REG_MAX];
-       int                     refcnt;         /* per-core: #HT threads */
-       unsigned                core_id;        /* per-core: core id */
-};
-
-enum intel_excl_state_type {
-       INTEL_EXCL_UNUSED    = 0, /* counter is unused */
-       INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
-       INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
-};
-
-struct intel_excl_states {
-       enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-       bool sched_started; /* true if scheduling has started */
-};
-
-struct intel_excl_cntrs {
-       raw_spinlock_t  lock;
-
-       struct intel_excl_states states[2];
-
-       union {
-               u16     has_exclusive[2];
-               u32     exclusive_present;
-       };
-
-       int             refcnt;         /* per-core: #HT threads */
-       unsigned        core_id;        /* per-core: core id */
-};
-
-#define MAX_LBR_ENTRIES                32
-
-enum {
-       X86_PERF_KFREE_SHARED = 0,
-       X86_PERF_KFREE_EXCL   = 1,
-       X86_PERF_KFREE_MAX
-};
-
-struct cpu_hw_events {
-       /*
-        * Generic x86 PMC bits
-        */
-       struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
-       unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       int                     enabled;
-
-       int                     n_events; /* the # of events in the below arrays */
-       int                     n_added;  /* the # last events in the below arrays;
-                                            they've never been enabled yet */
-       int                     n_txn;    /* the # last events in the below arrays;
-                                            added in the current transaction */
-       int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
-       u64                     tags[X86_PMC_IDX_MAX];
-
-       struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
-
-       int                     n_excl; /* the number of exclusive events */
-
-       unsigned int            txn_flags;
-       int                     is_fake;
-
-       /*
-        * Intel DebugStore bits
-        */
-       struct debug_store      *ds;
-       u64                     pebs_enabled;
-
-       /*
-        * Intel LBR bits
-        */
-       int                             lbr_users;
-       void                            *lbr_context;
-       struct perf_branch_stack        lbr_stack;
-       struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
-       struct er_account               *lbr_sel;
-       u64                             br_sel;
-
-       /*
-        * Intel host/guest exclude bits
-        */
-       u64                             intel_ctrl_guest_mask;
-       u64                             intel_ctrl_host_mask;
-       struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
-
-       /*
-        * Intel checkpoint mask
-        */
-       u64                             intel_cp_status;
-
-       /*
-        * manage shared (per-core, per-cpu) registers
-        * used on Intel NHM/WSM/SNB
-        */
-       struct intel_shared_regs        *shared_regs;
-       /*
-        * manage exclusive counter access between hyperthread
-        */
-       struct event_constraint *constraint_list; /* in enable order */
-       struct intel_excl_cntrs         *excl_cntrs;
-       int excl_thread_id; /* 0 or 1 */
-
-       /*
-        * AMD specific bits
-        */
-       struct amd_nb                   *amd_nb;
-       /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
-       u64                             perf_ctr_virt_mask;
-
-       void                            *kfree_on_online[X86_PERF_KFREE_MAX];
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
-       { .idxmsk64 = (n) },            \
-       .code = (c),                    \
-       .cmask = (m),                   \
-       .weight = (w),                  \
-       .overlap = (o),                 \
-       .flags = f,                     \
-}
-
-#define EVENT_CONSTRAINT(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
-
-#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
-       __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
-                          0, PERF_X86_EVENT_EXCL)
-
-/*
- * The overlap flag marks event constraints with overlapping counter
- * masks. This is the case if the counter mask of such an event is not
- * a subset of any other counter mask of a constraint with an equal or
- * higher weight, e.g.:
- *
- *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
- *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
- *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
- *
- * The event scheduler may not select the correct counter in the first
- * cycle because it needs to know which subsequent events will be
- * scheduled. It may fail to schedule the events then. So we set the
- * overlap flag for such constraints to give the scheduler a hint which
- * events to select for counter rescheduling.
- *
- * Care must be taken as the rescheduling algorithm is O(n!) which
- * will increase scheduling cycles for an over-commited system
- * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
- * and its counter masks must be kept at a minimum.
- */
-#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  - in_tx
- *  - in_tx_checkpointed
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
-#define FIXED_EVENT_CONSTRAINT(c, n)   \
-       EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define INTEL_UEVENT_CONSTRAINT(c, n)  \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-/* Constraint on specific umask bit only + event */
-#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)      \
-       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
-
-/* Like UEVENT_CONSTRAINT, but match flags too */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)    \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-#define INTEL_EXCLUEVT_CONSTRAINT(c, n)        \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
-
-#define INTEL_PLD_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                          HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
-
-#define INTEL_PST_CONSTRAINT(c, n)     \
-       __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-
-/* Event constraint, but match on all event flags too. */
-#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
-
-/* Check only flags, but allow all event/umask */
-#define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
-       EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
-
-/* Check flags and event code, and set the HSW store flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-/* Check flags and event code, and set the HSW load flag */
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW store flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW load flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
-
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, \
-                         PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
-
-/* Check flags and event code/umask, and set the HSW N/A flag */
-#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
-       __EVENT_CONSTRAINT(code, n,                     \
-                         INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
-                         HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
-
-
-/*
- * We define the end marker as having a weight of -1
- * to enable blacklisting of events using a counter bitmask
- * of zero and thus a weight of zero.
- * The end marker has a weight that cannot possibly be
- * obtained from counting the bits in the bitmask.
- */
-#define EVENT_CONSTRAINT_END { .weight = -1 }
-
-/*
- * Check for end marker with weight == -1
- */
-#define for_each_event_constraint(e, c)        \
-       for ((e) = (c); (e)->weight != -1; (e)++)
-
-/*
- * Extra registers for specific events.
- *
- * Some events need large masks and require external MSRs.
- * Those extra MSRs end up being shared for all events on
- * a PMU and sometimes between PMU of sibling HT threads.
- * In either case, the kernel needs to handle conflicting
- * accesses to those extra, shared, regs. The data structure
- * to manage those registers is stored in cpu_hw_event.
- */
-struct extra_reg {
-       unsigned int            event;
-       unsigned int            msr;
-       u64                     config_mask;
-       u64                     valid_mask;
-       int                     idx;  /* per_xxx->regs[] reg index */
-       bool                    extra_msr_access;
-};
-
-#define EVENT_EXTRA_REG(e, ms, m, vm, i) {     \
-       .event = (e),                   \
-       .msr = (ms),                    \
-       .config_mask = (m),             \
-       .valid_mask = (vm),             \
-       .idx = EXTRA_REG_##i,           \
-       .extra_msr_access = true,       \
-       }
-
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)     \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
-
-#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
-       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
-                       ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
-
-#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
-       INTEL_UEVENT_EXTRA_REG(c, \
-                              MSR_PEBS_LD_LAT_THRESHOLD, \
-                              0xffff, \
-                              LDLAT)
-
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
-
-union perf_capabilities {
-       struct {
-               u64     lbr_format:6;
-               u64     pebs_trap:1;
-               u64     pebs_arch_reg:1;
-               u64     pebs_format:4;
-               u64     smm_freeze:1;
-               /*
-                * PMU supports separate counter range for writing
-                * values > 32bit.
-                */
-               u64     full_width_write:1;
-       };
-       u64     capabilities;
-};
-
-struct x86_pmu_quirk {
-       struct x86_pmu_quirk *next;
-       void (*func)(void);
-};
-
-union x86_pmu_config {
-       struct {
-               u64 event:8,
-                   umask:8,
-                   usr:1,
-                   os:1,
-                   edge:1,
-                   pc:1,
-                   interrupt:1,
-                   __reserved1:1,
-                   en:1,
-                   inv:1,
-                   cmask:8,
-                   event2:4,
-                   __reserved2:4,
-                   go:1,
-                   ho:1;
-       } bits;
-       u64 value;
-};
-
-#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
-
-enum {
-       x86_lbr_exclusive_lbr,
-       x86_lbr_exclusive_bts,
-       x86_lbr_exclusive_pt,
-       x86_lbr_exclusive_max,
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-       /*
-        * Generic x86 PMC bits
-        */
-       const char      *name;
-       int             version;
-       int             (*handle_irq)(struct pt_regs *);
-       void            (*disable_all)(void);
-       void            (*enable_all)(int added);
-       void            (*enable)(struct perf_event *);
-       void            (*disable)(struct perf_event *);
-       int             (*hw_config)(struct perf_event *event);
-       int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
-       unsigned        eventsel;
-       unsigned        perfctr;
-       int             (*addr_offset)(int index, bool eventsel);
-       int             (*rdpmc_index)(int index);
-       u64             (*event_map)(int);
-       int             max_events;
-       int             num_counters;
-       int             num_counters_fixed;
-       int             cntval_bits;
-       u64             cntval_mask;
-       union {
-                       unsigned long events_maskl;
-                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
-       };
-       int             events_mask_len;
-       int             apic;
-       u64             max_period;
-       struct event_constraint *
-                       (*get_event_constraints)(struct cpu_hw_events *cpuc,
-                                                int idx,
-                                                struct perf_event *event);
-
-       void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
-                                                struct perf_event *event);
-
-       void            (*start_scheduling)(struct cpu_hw_events *cpuc);
-
-       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
-
-       void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
-
-       struct event_constraint *event_constraints;
-       struct x86_pmu_quirk *quirks;
-       int             perfctr_second_write;
-       bool            late_ack;
-       unsigned        (*limit_period)(struct perf_event *event, unsigned l);
-
-       /*
-        * sysfs attrs
-        */
-       int             attr_rdpmc_broken;
-       int             attr_rdpmc;
-       struct attribute **format_attrs;
-       struct attribute **event_attrs;
-
-       ssize_t         (*events_sysfs_show)(char *page, u64 config);
-       struct attribute **cpu_events;
-
-       /*
-        * CPU Hotplug hooks
-        */
-       int             (*cpu_prepare)(int cpu);
-       void            (*cpu_starting)(int cpu);
-       void            (*cpu_dying)(int cpu);
-       void            (*cpu_dead)(int cpu);
-
-       void            (*check_microcode)(void);
-       void            (*sched_task)(struct perf_event_context *ctx,
-                                     bool sched_in);
-
-       /*
-        * Intel Arch Perfmon v2+
-        */
-       u64                     intel_ctrl;
-       union perf_capabilities intel_cap;
-
-       /*
-        * Intel DebugStore bits
-        */
-       unsigned int    bts             :1,
-                       bts_active      :1,
-                       pebs            :1,
-                       pebs_active     :1,
-                       pebs_broken     :1,
-                       pebs_prec_dist  :1;
-       int             pebs_record_size;
-       void            (*drain_pebs)(struct pt_regs *regs);
-       struct event_constraint *pebs_constraints;
-       void            (*pebs_aliases)(struct perf_event *event);
-       int             max_pebs_events;
-       unsigned long   free_running_flags;
-
-       /*
-        * Intel LBR
-        */
-       unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
-       int             lbr_nr;                    /* hardware stack size */
-       u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
-       const int       *lbr_sel_map;              /* lbr_select mappings */
-       bool            lbr_double_abort;          /* duplicated lbr aborts */
-
-       /*
-        * Intel PT/LBR/BTS are exclusive
-        */
-       atomic_t        lbr_exclusive[x86_lbr_exclusive_max];
-
-       /*
-        * Extra registers for events
-        */
-       struct extra_reg *extra_regs;
-       unsigned int flags;
-
-       /*
-        * Intel host/guest support (KVM)
-        */
-       struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
-};
-
-struct x86_perf_task_context {
-       u64 lbr_from[MAX_LBR_ENTRIES];
-       u64 lbr_to[MAX_LBR_ENTRIES];
-       u64 lbr_info[MAX_LBR_ENTRIES];
-       int tos;
-       int lbr_callstack_users;
-       int lbr_stack_state;
-};
-
-#define x86_add_quirk(func_)                                           \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = x86_pmu.quirks;                                  \
-       x86_pmu.quirks = &__quirk;                                      \
-} while (0)
-
-/*
- * x86_pmu flags
- */
-#define PMU_FL_NO_HT_SHARING   0x1 /* no hyper-threading resource sharing */
-#define PMU_FL_HAS_RSP_1       0x2 /* has 2 equivalent offcore_rsp regs   */
-#define PMU_FL_EXCL_CNTRS      0x4 /* has exclusive counter requirements  */
-#define PMU_FL_EXCL_ENABLED    0x8 /* exclusive counter active */
-
-#define EVENT_VAR(_id)  event_attr_##_id
-#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-
-#define EVENT_ATTR(_name, _id)                                         \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = PERF_COUNT_HW_##_id,                          \
-       .event_str      = NULL,                                         \
-};
-
-#define EVENT_ATTR_STR(_name, v, str)                                  \
-static struct perf_pmu_events_attr event_attr_##v = {                  \
-       .attr           = __ATTR(_name, 0444, events_sysfs_show, NULL), \
-       .id             = 0,                                            \
-       .event_str      = str,                                          \
-};
-
-extern struct x86_pmu x86_pmu __read_mostly;
-
-static inline bool x86_pmu_has_lbr_callstack(void)
-{
-       return  x86_pmu.lbr_sel_map &&
-               x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
-}
-
-DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-extern u64 __read_mostly hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-extern u64 __read_mostly hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-u64 x86_perf_event_update(struct perf_event *event);
-
-static inline unsigned int x86_pmu_config_addr(int index)
-{
-       return x86_pmu.eventsel + (x86_pmu.addr_offset ?
-                                  x86_pmu.addr_offset(index, true) : index);
-}
-
-static inline unsigned int x86_pmu_event_addr(int index)
-{
-       return x86_pmu.perfctr + (x86_pmu.addr_offset ?
-                                 x86_pmu.addr_offset(index, false) : index);
-}
-
-static inline int x86_pmu_rdpmc_index(int index)
-{
-       return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
-}
-
-int x86_add_exclusive(unsigned int what);
-
-void x86_del_exclusive(unsigned int what);
-
-int x86_reserve_hardware(void);
-
-void x86_release_hardware(void);
-
-void hw_perf_lbr_event_destroy(struct perf_event *event);
-
-int x86_setup_perfctr(struct perf_event *event);
-
-int x86_pmu_hw_config(struct perf_event *event);
-
-void x86_pmu_disable_all(void);
-
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
-                                         u64 enable_mask)
-{
-       u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
-
-       if (hwc->extra_reg.reg)
-               wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-       wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
-}
-
-void x86_pmu_enable_all(int added);
-
-int perf_assign_events(struct event_constraint **constraints, int n,
-                       int wmin, int wmax, int gpmax, int *assign);
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
-
-void x86_pmu_stop(struct perf_event *event, int flags);
-
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-void x86_pmu_enable_event(struct perf_event *event);
-
-int x86_pmu_handle_irq(struct pt_regs *regs);
-
-extern struct event_constraint emptyconstraint;
-
-extern struct event_constraint unconstrained;
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
-       return ip > PAGE_OFFSET;
-#else
-       return (long)ip < 0;
-#endif
-}
-
-/*
- * Not all PMUs provide the right context information to place the reported IP
- * into full context. Specifically segment registers are typically not
- * supplied.
- *
- * Assuming the address is a linear address (it is for IBS), we fake the CS and
- * vm86 mode using the known zero-based code segment and 'fix up' the registers
- * to reflect this.
- *
- * Intel PEBS/LBR appear to typically provide the effective address, nothing
- * much we can do about that but pray and treat it like a linear address.
- */
-static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
-{
-       regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
-       if (regs->flags & X86_VM_MASK)
-               regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
-       regs->ip = ip;
-}
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
-ssize_t intel_event_sysfs_show(char *page, u64 config);
-
-struct attribute **merge_attr(struct attribute **a, struct attribute **b);
-
-#ifdef CONFIG_CPU_SUP_AMD
-
-int amd_pmu_init(void);
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static inline int amd_pmu_init(void)
-{
-       return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_AMD */
-
-#ifdef CONFIG_CPU_SUP_INTEL
-
-static inline bool intel_pmu_has_bts(struct perf_event *event)
-{
-       if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
-           !event->attr.freq && event->hw.sample_period == 1)
-               return true;
-
-       return false;
-}
-
-int intel_pmu_save_and_restart(struct perf_event *event);
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event);
-
-struct intel_shared_regs *allocate_shared_regs(int cpu);
-
-int intel_pmu_init(void);
-
-void init_debug_store_on_cpu(int cpu);
-
-void fini_debug_store_on_cpu(int cpu);
-
-void release_ds_buffers(void);
-
-void reserve_ds_buffers(void);
-
-extern struct event_constraint bts_constraint;
-
-void intel_pmu_enable_bts(u64 config);
-
-void intel_pmu_disable_bts(void);
-
-int intel_pmu_drain_bts_buffer(void);
-
-extern struct event_constraint intel_core2_pebs_event_constraints[];
-
-extern struct event_constraint intel_atom_pebs_event_constraints[];
-
-extern struct event_constraint intel_slm_pebs_event_constraints[];
-
-extern struct event_constraint intel_nehalem_pebs_event_constraints[];
-
-extern struct event_constraint intel_westmere_pebs_event_constraints[];
-
-extern struct event_constraint intel_snb_pebs_event_constraints[];
-
-extern struct event_constraint intel_ivb_pebs_event_constraints[];
-
-extern struct event_constraint intel_hsw_pebs_event_constraints[];
-
-extern struct event_constraint intel_skl_pebs_event_constraints[];
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event);
-
-void intel_pmu_pebs_enable(struct perf_event *event);
-
-void intel_pmu_pebs_disable(struct perf_event *event);
-
-void intel_pmu_pebs_enable_all(void);
-
-void intel_pmu_pebs_disable_all(void);
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_ds_init(void);
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
-
-void intel_pmu_lbr_reset(void);
-
-void intel_pmu_lbr_enable(struct perf_event *event);
-
-void intel_pmu_lbr_disable(struct perf_event *event);
-
-void intel_pmu_lbr_enable_all(bool pmi);
-
-void intel_pmu_lbr_disable_all(void);
-
-void intel_pmu_lbr_read(void);
-
-void intel_pmu_lbr_init_core(void);
-
-void intel_pmu_lbr_init_nhm(void);
-
-void intel_pmu_lbr_init_atom(void);
-
-void intel_pmu_lbr_init_snb(void);
-
-void intel_pmu_lbr_init_hsw(void);
-
-void intel_pmu_lbr_init_skl(void);
-
-void intel_pmu_lbr_init_knl(void);
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event);
-
-void intel_pt_interrupt(void);
-
-int intel_bts_interrupt(void);
-
-void intel_bts_enable_local(void);
-
-void intel_bts_disable_local(void);
-
-int p4_pmu_init(void);
-
-int p6_pmu_init(void);
-
-int knc_pmu_init(void);
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
-                         char *page);
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
-}
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static inline void reserve_ds_buffers(void)
-{
-}
-
-static inline void release_ds_buffers(void)
-{
-}
-
-static inline int intel_pmu_init(void)
-{
-       return 0;
-}
-
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       return NULL;
-}
-
-static inline int is_ht_workaround_enabled(void)
-{
-       return 0;
-}
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
deleted file mode 100644 (file)
index 5861053..0000000
+++ /dev/null
@@ -1,731 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/apicdef.h>
-
-#include "perf_event.h"
-
-static __initconst const u64 amd_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
-               [ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
-               [ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
-               [ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
-               [ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]                   = 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]                 = 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]             = 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]                 = 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]          = 0x00c2,
-  [PERF_COUNT_HW_BRANCH_MISSES]                        = 0x00c3,
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]      = 0x00d0, /* "Decoder empty" event */
-  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]       = 0x00d1, /* "Dispatch stalls" event */
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
-       return amd_perfmon_event_map[hw_event];
-}
-
-/*
- * Previously calculated offsets
- */
-static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
-static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
-
-/*
- * Legacy CPUs:
- *   4 counters starting at 0xc0010000 each offset by 1
- *
- * CPUs with core performance counter extensions:
- *   6 counters starting at 0xc0010200 each offset by 2
- */
-static inline int amd_pmu_addr_offset(int index, bool eventsel)
-{
-       int offset;
-
-       if (!index)
-               return index;
-
-       if (eventsel)
-               offset = event_offsets[index];
-       else
-               offset = count_offsets[index];
-
-       if (offset)
-               return offset;
-
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               offset = index;
-       else
-               offset = index << 1;
-
-       if (eventsel)
-               event_offsets[index] = offset;
-       else
-               count_offsets[index] = offset;
-
-       return offset;
-}
-
-static int amd_core_hw_config(struct perf_event *event)
-{
-       if (event->attr.exclude_host && event->attr.exclude_guest)
-               /*
-                * When HO == GO == 1 the hardware treats that as GO == HO == 0
-                * and will count in both modes. We don't want to count in that
-                * case so we emulate no-counting by setting US = OS = 0.
-                */
-               event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
-                                     ARCH_PERFMON_EVENTSEL_OS);
-       else if (event->attr.exclude_host)
-               event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
-       else if (event->attr.exclude_guest)
-               event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
-
-       return 0;
-}
-
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
-       return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
-}
-
-static inline int amd_is_nb_event(struct hw_perf_event *hwc)
-{
-       return (hwc->config & 0xe0) == 0xe0;
-}
-
-static inline int amd_has_nb(struct cpu_hw_events *cpuc)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-
-       return nb && nb->nb_id != -1;
-}
-
-static int amd_pmu_hw_config(struct perf_event *event)
-{
-       int ret;
-
-       /* pass precise event sampling to ibs: */
-       if (event->attr.precise_ip && get_ibs_caps())
-               return -ENOENT;
-
-       if (has_branch_stack(event))
-               return -EOPNOTSUPP;
-
-       ret = x86_pmu_hw_config(event);
-       if (ret)
-               return ret;
-
-       if (event->attr.type == PERF_TYPE_RAW)
-               event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
-
-       return amd_core_hw_config(event);
-}
-
-static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
-                                          struct perf_event *event)
-{
-       struct amd_nb *nb = cpuc->amd_nb;
-       int i;
-
-       /*
-        * need to scan whole list because event may not have
-        * been assigned during scheduling
-        *
-        * no race condition possible because event can only
-        * be removed on one CPU at a time AND PMU is disabled
-        * when we come here
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               if (cmpxchg(nb->owners + i, event, NULL) == event)
-                       break;
-       }
-}
-
- /*
-  * AMD64 NorthBridge events need special treatment because
-  * counter access needs to be synchronized across all cores
-  * of a package. Refer to BKDG section 3.12
-  *
-  * NB events are events measuring L3 cache, Hypertransport
-  * traffic. They are identified by an event code >= 0xe00.
-  * They measure events on the NorthBride which is shared
-  * by all cores on a package. NB events are counted on a
-  * shared set of counters. When a NB event is programmed
-  * in a counter, the data actually comes from a shared
-  * counter. Thus, access to those counters needs to be
-  * synchronized.
-  *
-  * We implement the synchronization such that no two cores
-  * can be measuring NB events using the same counters. Thus,
-  * we maintain a per-NB allocation table. The available slot
-  * is propagated using the event_constraint structure.
-  *
-  * We provide only one choice for each NB event based on
-  * the fact that only NB events have restrictions. Consequently,
-  * if a counter is available, there is a guarantee the NB event
-  * will be assigned to it. If no slot is available, an empty
-  * constraint is returned and scheduling will eventually fail
-  * for this event.
-  *
-  * Note that all cores attached the same NB compete for the same
-  * counters to host NB events, this is why we use atomic ops. Some
-  * multi-chip CPUs may have more than one NB.
-  *
-  * Given that resources are allocated (cmpxchg), they must be
-  * eventually freed for others to use. This is accomplished by
-  * calling __amd_put_nb_event_constraints()
-  *
-  * Non NB events are not impacted by this restriction.
-  */
-static struct event_constraint *
-__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                              struct event_constraint *c)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct amd_nb *nb = cpuc->amd_nb;
-       struct perf_event *old;
-       int idx, new = -1;
-
-       if (!c)
-               c = &unconstrained;
-
-       if (cpuc->is_fake)
-               return c;
-
-       /*
-        * detect if already present, if so reuse
-        *
-        * cannot merge with actual allocation
-        * because of possible holes
-        *
-        * event can already be present yet not assigned (in hwc->idx)
-        * because of successive calls to x86_schedule_events() from
-        * hw_perf_group_sched_in() without hw_perf_enable()
-        */
-       for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
-               if (new == -1 || hwc->idx == idx)
-                       /* assign free slot, prefer hwc->idx */
-                       old = cmpxchg(nb->owners + idx, NULL, event);
-               else if (nb->owners[idx] == event)
-                       /* event already present */
-                       old = event;
-               else
-                       continue;
-
-               if (old && old != event)
-                       continue;
-
-               /* reassign to this slot */
-               if (new != -1)
-                       cmpxchg(nb->owners + new, event, NULL);
-               new = idx;
-
-               /* already present, reuse */
-               if (old == event)
-                       break;
-       }
-
-       if (new == -1)
-               return &emptyconstraint;
-
-       return &nb->event_constraints[new];
-}
-
-static struct amd_nb *amd_alloc_nb(int cpu)
-{
-       struct amd_nb *nb;
-       int i;
-
-       nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
-       if (!nb)
-               return NULL;
-
-       nb->nb_id = -1;
-
-       /*
-        * initialize all possible NB constraints
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               __set_bit(i, nb->event_constraints[i].idxmsk);
-               nb->event_constraints[i].weight = 1;
-       }
-       return nb;
-}
-
-static int amd_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       WARN_ON_ONCE(cpuc->amd_nb);
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return NOTIFY_OK;
-
-       cpuc->amd_nb = amd_alloc_nb(cpu);
-       if (!cpuc->amd_nb)
-               return NOTIFY_BAD;
-
-       return NOTIFY_OK;
-}
-
-static void amd_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-       struct amd_nb *nb;
-       int i, nb_id;
-
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       nb_id = amd_get_nb_id(cpu);
-       WARN_ON_ONCE(nb_id == BAD_APICID);
-
-       for_each_online_cpu(i) {
-               nb = per_cpu(cpu_hw_events, i).amd_nb;
-               if (WARN_ON_ONCE(!nb))
-                       continue;
-
-               if (nb->nb_id == nb_id) {
-                       *onln = cpuc->amd_nb;
-                       cpuc->amd_nb = nb;
-                       break;
-               }
-       }
-
-       cpuc->amd_nb->nb_id = nb_id;
-       cpuc->amd_nb->refcnt++;
-}
-
-static void amd_pmu_cpu_dead(int cpu)
-{
-       struct cpu_hw_events *cpuhw;
-
-       if (boot_cpu_data.x86_max_cores < 2)
-               return;
-
-       cpuhw = &per_cpu(cpu_hw_events, cpu);
-
-       if (cpuhw->amd_nb) {
-               struct amd_nb *nb = cpuhw->amd_nb;
-
-               if (nb->nb_id == -1 || --nb->refcnt == 0)
-                       kfree(nb);
-
-               cpuhw->amd_nb = NULL;
-       }
-}
-
-static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       /*
-        * if not NB event or no NB, then no constraints
-        */
-       if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
-               return &unconstrained;
-
-       return __amd_get_nb_event_constraints(cpuc, event, NULL);
-}
-
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
-                                     struct perf_event *event)
-{
-       if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
-               __amd_put_nb_event_constraints(cpuc, event);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *amd_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-/* AMD Family 15h */
-
-#define AMD_EVENT_TYPE_MASK    0x000000F0ULL
-
-#define AMD_EVENT_FP           0x00000000ULL ... 0x00000010ULL
-#define AMD_EVENT_LS           0x00000020ULL ... 0x00000030ULL
-#define AMD_EVENT_DC           0x00000040ULL ... 0x00000050ULL
-#define AMD_EVENT_CU           0x00000060ULL ... 0x00000070ULL
-#define AMD_EVENT_IC_DE                0x00000080ULL ... 0x00000090ULL
-#define AMD_EVENT_EX_LS                0x000000C0ULL
-#define AMD_EVENT_DE           0x000000D0ULL
-#define AMD_EVENT_NB           0x000000E0ULL ... 0x000000F0ULL
-
-/*
- * AMD family 15h event code/PMC mappings:
- *
- * type = event_code & 0x0F0:
- *
- * 0x000       FP      PERF_CTL[5:3]
- * 0x010       FP      PERF_CTL[5:3]
- * 0x020       LS      PERF_CTL[5:0]
- * 0x030       LS      PERF_CTL[5:0]
- * 0x040       DC      PERF_CTL[5:0]
- * 0x050       DC      PERF_CTL[5:0]
- * 0x060       CU      PERF_CTL[2:0]
- * 0x070       CU      PERF_CTL[2:0]
- * 0x080       IC/DE   PERF_CTL[2:0]
- * 0x090       IC/DE   PERF_CTL[2:0]
- * 0x0A0       ---
- * 0x0B0       ---
- * 0x0C0       EX/LS   PERF_CTL[5:0]
- * 0x0D0       DE      PERF_CTL[2:0]
- * 0x0E0       NB      NB_PERF_CTL[3:0]
- * 0x0F0       NB      NB_PERF_CTL[3:0]
- *
- * Exceptions:
- *
- * 0x000       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x003       FP      PERF_CTL[3]
- * 0x004       FP      PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x00B       FP      PERF_CTL[3]
- * 0x00D       FP      PERF_CTL[3]
- * 0x023       DE      PERF_CTL[2:0]
- * 0x02D       LS      PERF_CTL[3]
- * 0x02E       LS      PERF_CTL[3,0]
- * 0x031       LS      PERF_CTL[2:0] (**)
- * 0x043       CU      PERF_CTL[2:0]
- * 0x045       CU      PERF_CTL[2:0]
- * 0x046       CU      PERF_CTL[2:0]
- * 0x054       CU      PERF_CTL[2:0]
- * 0x055       CU      PERF_CTL[2:0]
- * 0x08F       IC      PERF_CTL[0]
- * 0x187       DE      PERF_CTL[0]
- * 0x188       DE      PERF_CTL[0]
- * 0x0DB       EX      PERF_CTL[5:0]
- * 0x0DC       LS      PERF_CTL[5:0]
- * 0x0DD       LS      PERF_CTL[5:0]
- * 0x0DE       LS      PERF_CTL[5:0]
- * 0x0DF       LS      PERF_CTL[5:0]
- * 0x1C0       EX      PERF_CTL[5:3]
- * 0x1D6       EX      PERF_CTL[5:0]
- * 0x1D8       EX      PERF_CTL[5:0]
- *
- * (*)  depending on the umask all FPU counters may be used
- * (**) only one unitmask enabled at a time
- */
-
-static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
-static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
-static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
-static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
-static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
-
-static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
-                              struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int event_code = amd_get_event_code(hwc);
-
-       switch (event_code & AMD_EVENT_TYPE_MASK) {
-       case AMD_EVENT_FP:
-               switch (event_code) {
-               case 0x000:
-                       if (!(hwc->config & 0x0000F000ULL))
-                               break;
-                       if (!(hwc->config & 0x00000F00ULL))
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x004:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               break;
-                       return &amd_f15_PMC3;
-               case 0x003:
-               case 0x00B:
-               case 0x00D:
-                       return &amd_f15_PMC3;
-               }
-               return &amd_f15_PMC53;
-       case AMD_EVENT_LS:
-       case AMD_EVENT_DC:
-       case AMD_EVENT_EX_LS:
-               switch (event_code) {
-               case 0x023:
-               case 0x043:
-               case 0x045:
-               case 0x046:
-               case 0x054:
-               case 0x055:
-                       return &amd_f15_PMC20;
-               case 0x02D:
-                       return &amd_f15_PMC3;
-               case 0x02E:
-                       return &amd_f15_PMC30;
-               case 0x031:
-                       if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
-                               return &amd_f15_PMC20;
-                       return &emptyconstraint;
-               case 0x1C0:
-                       return &amd_f15_PMC53;
-               default:
-                       return &amd_f15_PMC50;
-               }
-       case AMD_EVENT_CU:
-       case AMD_EVENT_IC_DE:
-       case AMD_EVENT_DE:
-               switch (event_code) {
-               case 0x08F:
-               case 0x187:
-               case 0x188:
-                       return &amd_f15_PMC0;
-               case 0x0DB ... 0x0DF:
-               case 0x1D6:
-               case 0x1D8:
-                       return &amd_f15_PMC50;
-               default:
-                       return &amd_f15_PMC20;
-               }
-       case AMD_EVENT_NB:
-               /* moved to perf_event_amd_uncore.c */
-               return &emptyconstraint;
-       default:
-               return &emptyconstraint;
-       }
-}
-
-static ssize_t amd_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
-                   (config & AMD64_EVENTSEL_EVENT) >> 24;
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-static __initconst const struct x86_pmu amd_pmu = {
-       .name                   = "AMD",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = x86_pmu_enable_all,
-       .enable                 = x86_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = amd_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_K7_EVNTSEL0,
-       .perfctr                = MSR_K7_PERFCTR0,
-       .addr_offset            = amd_pmu_addr_offset,
-       .event_map              = amd_pmu_event_map,
-       .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
-       .num_counters           = AMD64_NUM_COUNTERS,
-       .cntval_bits            = 48,
-       .cntval_mask            = (1ULL << 48) - 1,
-       .apic                   = 1,
-       /* use highest bit to detect overflow */
-       .max_period             = (1ULL << 47) - 1,
-       .get_event_constraints  = amd_get_event_constraints,
-       .put_event_constraints  = amd_put_event_constraints,
-
-       .format_attrs           = amd_format_attr,
-       .events_sysfs_show      = amd_event_sysfs_show,
-
-       .cpu_prepare            = amd_pmu_cpu_prepare,
-       .cpu_starting           = amd_pmu_cpu_starting,
-       .cpu_dead               = amd_pmu_cpu_dead,
-};
-
-static int __init amd_core_pmu_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-               return 0;
-
-       switch (boot_cpu_data.x86) {
-       case 0x15:
-               pr_cont("Fam15h ");
-               x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
-               break;
-
-       default:
-               pr_err("core perfctr but no constraints; unknown hardware!\n");
-               return -ENODEV;
-       }
-
-       /*
-        * If core performance counter extensions exists, we must use
-        * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
-        * amd_pmu_addr_offset().
-        */
-       x86_pmu.eventsel        = MSR_F15H_PERF_CTL;
-       x86_pmu.perfctr         = MSR_F15H_PERF_CTR;
-       x86_pmu.num_counters    = AMD64_NUM_COUNTERS_CORE;
-
-       pr_cont("core perfctr, ");
-       return 0;
-}
-
-__init int amd_pmu_init(void)
-{
-       int ret;
-
-       /* Performance-monitoring supported from K7 and later: */
-       if (boot_cpu_data.x86 < 6)
-               return -ENODEV;
-
-       x86_pmu = amd_pmu;
-
-       ret = amd_core_pmu_init();
-       if (ret)
-               return ret;
-
-       /* Events are common for all AMDs */
-       memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-              sizeof(hw_cache_event_ids));
-
-       return 0;
-}
-
-void amd_pmu_enable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       cpuc->perf_ctr_virt_mask = 0;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
-
-void amd_pmu_disable_virt(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * We only mask out the Host-only bit so that host-only counting works
-        * when SVM is disabled. If someone sets up a guest-only counter when
-        * SVM is disabled the Guest-only bits still gets set and the counter
-        * will not count anything.
-        */
-       cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
-       /* Reload all events */
-       x86_pmu_disable_all();
-       x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
deleted file mode 100644 (file)
index 989d3c2..0000000
+++ /dev/null
@@ -1,959 +0,0 @@
-/*
- * Performance events - AMD IBS
- *
- *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ptrace.h>
-#include <linux/syscore_ops.h>
-
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-static u32 ibs_caps;
-
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-
-#include <linux/kprobes.h>
-#include <linux/hardirq.h>
-
-#include <asm/nmi.h>
-
-#define IBS_FETCH_CONFIG_MASK  (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
-#define IBS_OP_CONFIG_MASK     IBS_OP_MAX_CNT
-
-enum ibs_states {
-       IBS_ENABLED     = 0,
-       IBS_STARTED     = 1,
-       IBS_STOPPING    = 2,
-
-       IBS_MAX_STATES,
-};
-
-struct cpu_perf_ibs {
-       struct perf_event       *event;
-       unsigned long           state[BITS_TO_LONGS(IBS_MAX_STATES)];
-};
-
-struct perf_ibs {
-       struct pmu                      pmu;
-       unsigned int                    msr;
-       u64                             config_mask;
-       u64                             cnt_mask;
-       u64                             enable_mask;
-       u64                             valid_mask;
-       u64                             max_period;
-       unsigned long                   offset_mask[1];
-       int                             offset_max;
-       struct cpu_perf_ibs __percpu    *pcpu;
-
-       struct attribute                **format_attrs;
-       struct attribute_group          format_group;
-       const struct attribute_group    *attr_groups[2];
-
-       u64                             (*get_count)(u64 config);
-};
-
-struct perf_ibs_data {
-       u32             size;
-       union {
-               u32     data[0];        /* data buffer starts here */
-               u32     caps;
-       };
-       u64             regs[MSR_AMD64_IBS_REG_COUNT_MAX];
-};
-
-static int
-perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
-{
-       s64 left = local64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
-       int overflow = 0;
-
-       /*
-        * If we are way outside a reasonable range then just skip forward:
-        */
-       if (unlikely(left <= -period)) {
-               left = period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       if (unlikely(left < (s64)min)) {
-               left += period;
-               local64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-               overflow = 1;
-       }
-
-       /*
-        * If the hw period that triggers the sw overflow is too short
-        * we might hit the irq handler. This biases the results.
-        * Thus we shorten the next-to-last period and set the last
-        * period to the max period.
-        */
-       if (left > max) {
-               left -= max;
-               if (left > max)
-                       left = max;
-               else if (left < min)
-                       left = min;
-       }
-
-       *hw_period = (u64)left;
-
-       return overflow;
-}
-
-static  int
-perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int shift = 64 - width;
-       u64 prev_raw_count;
-       u64 delta;
-
-       /*
-        * Careful: an NMI might modify the previous event value.
-        *
-        * Our tactic to handle this is to first atomically read and
-        * exchange a new raw count - then add that new-prev delta
-        * count to the generic event atomically:
-        */
-       prev_raw_count = local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       new_raw_count) != prev_raw_count)
-               return 0;
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-       local64_sub(delta, &hwc->period_left);
-
-       return 1;
-}
-
-static struct perf_ibs perf_ibs_fetch;
-static struct perf_ibs perf_ibs_op;
-
-static struct perf_ibs *get_ibs_pmu(int type)
-{
-       if (perf_ibs_fetch.pmu.type == type)
-               return &perf_ibs_fetch;
-       if (perf_ibs_op.pmu.type == type)
-               return &perf_ibs_op;
-       return NULL;
-}
-
-/*
- * Use IBS for precise event sampling:
- *
- *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
- *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
- *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
- *
- * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
- * MSRC001_1033) is used to select either cycle or micro-ops counting
- * mode.
- *
- * The rip of IBS samples has skid 0. Thus, IBS supports precise
- * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
- * rip is invalid when IBS was not able to record the rip correctly.
- * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
- *
- */
-static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
-{
-       switch (event->attr.precise_ip) {
-       case 0:
-               return -ENOENT;
-       case 1:
-       case 2:
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-
-       switch (event->attr.type) {
-       case PERF_TYPE_HARDWARE:
-               switch (event->attr.config) {
-               case PERF_COUNT_HW_CPU_CYCLES:
-                       *config = 0;
-                       return 0;
-               }
-               break;
-       case PERF_TYPE_RAW:
-               switch (event->attr.config) {
-               case 0x0076:
-                       *config = 0;
-                       return 0;
-               case 0x00C1:
-                       *config = IBS_OP_CNT_CTL;
-                       return 0;
-               }
-               break;
-       default:
-               return -ENOENT;
-       }
-
-       return -EOPNOTSUPP;
-}
-
-static const struct perf_event_attr ibs_notsupp = {
-       .exclude_user   = 1,
-       .exclude_kernel = 1,
-       .exclude_hv     = 1,
-       .exclude_idle   = 1,
-       .exclude_host   = 1,
-       .exclude_guest  = 1,
-};
-
-static int perf_ibs_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs;
-       u64 max_cnt, config;
-       int ret;
-
-       perf_ibs = get_ibs_pmu(event->attr.type);
-       if (perf_ibs) {
-               config = event->attr.config;
-       } else {
-               perf_ibs = &perf_ibs_op;
-               ret = perf_ibs_precise_event(event, &config);
-               if (ret)
-                       return ret;
-       }
-
-       if (event->pmu != &perf_ibs->pmu)
-               return -ENOENT;
-
-       if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
-               return -EINVAL;
-
-       if (config & ~perf_ibs->config_mask)
-               return -EINVAL;
-
-       if (hwc->sample_period) {
-               if (config & perf_ibs->cnt_mask)
-                       /* raw max_cnt may not be set */
-                       return -EINVAL;
-               if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
-                       /*
-                        * lower 4 bits can not be set in ibs max cnt,
-                        * but allowing it in case we adjust the
-                        * sample period to set a frequency.
-                        */
-                       return -EINVAL;
-               hwc->sample_period &= ~0x0FULL;
-               if (!hwc->sample_period)
-                       hwc->sample_period = 0x10;
-       } else {
-               max_cnt = config & perf_ibs->cnt_mask;
-               config &= ~perf_ibs->cnt_mask;
-               event->attr.sample_period = max_cnt << 4;
-               hwc->sample_period = event->attr.sample_period;
-       }
-
-       if (!hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * If we modify hwc->sample_period, we also need to update
-        * hwc->last_period and hwc->period_left.
-        */
-       hwc->last_period = hwc->sample_period;
-       local64_set(&hwc->period_left, hwc->sample_period);
-
-       hwc->config_base = perf_ibs->msr;
-       hwc->config = config;
-
-       return 0;
-}
-
-static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
-                              struct hw_perf_event *hwc, u64 *period)
-{
-       int overflow;
-
-       /* ignore lower 4 bits in min count: */
-       overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
-       local64_set(&hwc->prev_count, 0);
-
-       return overflow;
-}
-
-static u64 get_ibs_fetch_count(u64 config)
-{
-       return (config & IBS_FETCH_CNT) >> 12;
-}
-
-static u64 get_ibs_op_count(u64 config)
-{
-       u64 count = 0;
-
-       if (config & IBS_OP_VAL)
-               count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
-
-       if (ibs_caps & IBS_CAPS_RDWROPCNT)
-               count += (config & IBS_OP_CUR_CNT) >> 32;
-
-       return count;
-}
-
-static void
-perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
-                     u64 *config)
-{
-       u64 count = perf_ibs->get_count(*config);
-
-       /*
-        * Set width to 64 since we do not overflow on max width but
-        * instead on max count. In perf_ibs_set_period() we clear
-        * prev count manually on overflow.
-        */
-       while (!perf_event_try_update(event, count, 64)) {
-               rdmsrl(event->hw.config_base, *config);
-               count = perf_ibs->get_count(*config);
-       }
-}
-
-static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
-                                        struct hw_perf_event *hwc, u64 config)
-{
-       wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
-}
-
-/*
- * Erratum #420 Instruction-Based Sampling Engine May Generate
- * Interrupt that Cannot Be Cleared:
- *
- * Must clear counter mask first, then clear the enable bit. See
- * Revision Guide for AMD Family 10h Processors, Publication #41322.
- */
-static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
-                                         struct hw_perf_event *hwc, u64 config)
-{
-       config &= ~perf_ibs->cnt_mask;
-       wrmsrl(hwc->config_base, config);
-       config &= ~perf_ibs->enable_mask;
-       wrmsrl(hwc->config_base, config);
-}
-
-/*
- * We cannot restore the ibs pmu state, so we always needs to update
- * the event while stopping it and then reset the state when starting
- * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
- * perf_ibs_start()/perf_ibs_stop() and instead always do it.
- */
-static void perf_ibs_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 period;
-
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       perf_ibs_set_period(perf_ibs, hwc, &period);
-       set_bit(IBS_STARTED, pcpu->state);
-       perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       u64 config;
-       int stopping;
-
-       stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
-
-       if (!stopping && (hwc->state & PERF_HES_UPTODATE))
-               return;
-
-       rdmsrl(hwc->config_base, config);
-
-       if (stopping) {
-               set_bit(IBS_STOPPING, pcpu->state);
-               perf_ibs_disable_event(perf_ibs, hwc, config);
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       /*
-        * Clear valid bit to not count rollovers on update, rollovers
-        * are only updated in the irq handler.
-        */
-       config &= ~perf_ibs->valid_mask;
-
-       perf_ibs_event_update(perf_ibs, event, &config);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_ibs_add(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (test_and_set_bit(IBS_ENABLED, pcpu->state))
-               return -ENOSPC;
-
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       pcpu->event = event;
-
-       if (flags & PERF_EF_START)
-               perf_ibs_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_ibs_del(struct perf_event *event, int flags)
-{
-       struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
-       if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
-               return;
-
-       perf_ibs_stop(event, PERF_EF_UPDATE);
-
-       pcpu->event = NULL;
-
-       perf_event_update_userpage(event);
-}
-
-static void perf_ibs_read(struct perf_event *event) { }
-
-PMU_FORMAT_ATTR(rand_en,       "config:57");
-PMU_FORMAT_ATTR(cnt_ctl,       "config:19");
-
-static struct attribute *ibs_fetch_format_attrs[] = {
-       &format_attr_rand_en.attr,
-       NULL,
-};
-
-static struct attribute *ibs_op_format_attrs[] = {
-       NULL,   /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
-       NULL,
-};
-
-static struct perf_ibs perf_ibs_fetch = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSFETCHCTL,
-       .config_mask            = IBS_FETCH_CONFIG_MASK,
-       .cnt_mask               = IBS_FETCH_MAX_CNT,
-       .enable_mask            = IBS_FETCH_ENABLE,
-       .valid_mask             = IBS_FETCH_VAL,
-       .max_period             = IBS_FETCH_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSFETCH_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSFETCH_REG_COUNT,
-       .format_attrs           = ibs_fetch_format_attrs,
-
-       .get_count              = get_ibs_fetch_count,
-};
-
-static struct perf_ibs perf_ibs_op = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-
-               .event_init     = perf_ibs_init,
-               .add            = perf_ibs_add,
-               .del            = perf_ibs_del,
-               .start          = perf_ibs_start,
-               .stop           = perf_ibs_stop,
-               .read           = perf_ibs_read,
-       },
-       .msr                    = MSR_AMD64_IBSOPCTL,
-       .config_mask            = IBS_OP_CONFIG_MASK,
-       .cnt_mask               = IBS_OP_MAX_CNT,
-       .enable_mask            = IBS_OP_ENABLE,
-       .valid_mask             = IBS_OP_VAL,
-       .max_period             = IBS_OP_MAX_CNT << 4,
-       .offset_mask            = { MSR_AMD64_IBSOP_REG_MASK },
-       .offset_max             = MSR_AMD64_IBSOP_REG_COUNT,
-       .format_attrs           = ibs_op_format_attrs,
-
-       .get_count              = get_ibs_op_count,
-};
-
-static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
-{
-       struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-       struct perf_event *event = pcpu->event;
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_sample_data data;
-       struct perf_raw_record raw;
-       struct pt_regs regs;
-       struct perf_ibs_data ibs_data;
-       int offset, size, check_rip, offset_max, throttle = 0;
-       unsigned int msr;
-       u64 *buf, *config, period;
-
-       if (!test_bit(IBS_STARTED, pcpu->state)) {
-               /*
-                * Catch spurious interrupts after stopping IBS: After
-                * disabling IBS there could be still incoming NMIs
-                * with samples that even have the valid bit cleared.
-                * Mark all this NMIs as handled.
-                */
-               return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
-       }
-
-       msr = hwc->config_base;
-       buf = ibs_data.regs;
-       rdmsrl(msr, *buf);
-       if (!(*buf++ & perf_ibs->valid_mask))
-               return 0;
-
-       config = &ibs_data.regs[0];
-       perf_ibs_event_update(perf_ibs, event, config);
-       perf_sample_data_init(&data, 0, hwc->last_period);
-       if (!perf_ibs_set_period(perf_ibs, hwc, &period))
-               goto out;       /* no sw counter overflow */
-
-       ibs_data.caps = ibs_caps;
-       size = 1;
-       offset = 1;
-       check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
-       if (event->attr.sample_type & PERF_SAMPLE_RAW)
-               offset_max = perf_ibs->offset_max;
-       else if (check_rip)
-               offset_max = 2;
-       else
-               offset_max = 1;
-       do {
-               rdmsrl(msr + offset, *buf++);
-               size++;
-               offset = find_next_bit(perf_ibs->offset_mask,
-                                      perf_ibs->offset_max,
-                                      offset + 1);
-       } while (offset < offset_max);
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               /*
-                * Read IbsBrTarget and IbsOpData4 separately
-                * depending on their availability.
-                * Can't add to offset_max as they are staggered
-                */
-               if (ibs_caps & IBS_CAPS_BRNTRGT) {
-                       rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
-                       size++;
-               }
-               if (ibs_caps & IBS_CAPS_OPDATA4) {
-                       rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
-                       size++;
-               }
-       }
-       ibs_data.size = sizeof(u64) * size;
-
-       regs = *iregs;
-       if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
-               regs.flags &= ~PERF_EFLAGS_EXACT;
-       } else {
-               set_linear_ip(&regs, ibs_data.regs[1]);
-               regs.flags |= PERF_EFLAGS_EXACT;
-       }
-
-       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               raw.size = sizeof(u32) + ibs_data.size;
-               raw.data = ibs_data.data;
-               data.raw = &raw;
-       }
-
-       throttle = perf_event_overflow(event, &data, &regs);
-out:
-       if (throttle)
-               perf_ibs_disable_event(perf_ibs, hwc, *config);
-       else
-               perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
-       perf_event_update_userpage(event);
-
-       return 1;
-}
-
-static int
-perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
-       int handled = 0;
-
-       handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
-       handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       return handled;
-}
-NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
-
-static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
-{
-       struct cpu_perf_ibs __percpu *pcpu;
-       int ret;
-
-       pcpu = alloc_percpu(struct cpu_perf_ibs);
-       if (!pcpu)
-               return -ENOMEM;
-
-       perf_ibs->pcpu = pcpu;
-
-       /* register attributes */
-       if (perf_ibs->format_attrs[0]) {
-               memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
-               perf_ibs->format_group.name     = "format";
-               perf_ibs->format_group.attrs    = perf_ibs->format_attrs;
-
-               memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
-               perf_ibs->attr_groups[0]        = &perf_ibs->format_group;
-               perf_ibs->pmu.attr_groups       = perf_ibs->attr_groups;
-       }
-
-       ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
-       if (ret) {
-               perf_ibs->pcpu = NULL;
-               free_percpu(pcpu);
-       }
-
-       return ret;
-}
-
-static __init int perf_event_ibs_init(void)
-{
-       struct attribute **attr = ibs_op_format_attrs;
-
-       if (!ibs_caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
-
-       if (ibs_caps & IBS_CAPS_OPCNT) {
-               perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
-               *attr++ = &format_attr_cnt_ctl.attr;
-       }
-       perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-
-       register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
-       printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
-
-       return 0;
-}
-
-#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-
-static __init int perf_event_ibs_init(void) { return 0; }
-
-#endif
-
-/* IBS - apic initialization, for perf and oprofile */
-
-static __init u32 __get_ibs_caps(void)
-{
-       u32 caps;
-       unsigned int max_level;
-
-       if (!boot_cpu_has(X86_FEATURE_IBS))
-               return 0;
-
-       /* check IBS cpuid feature flags */
-       max_level = cpuid_eax(0x80000000);
-       if (max_level < IBS_CPUID_FEATURES)
-               return IBS_CAPS_DEFAULT;
-
-       caps = cpuid_eax(IBS_CPUID_FEATURES);
-       if (!(caps & IBS_CAPS_AVAIL))
-               /* cpuid flags not valid */
-               return IBS_CAPS_DEFAULT;
-
-       return caps;
-}
-
-u32 get_ibs_caps(void)
-{
-       return ibs_caps;
-}
-
-EXPORT_SYMBOL(get_ibs_caps);
-
-static inline int get_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
-}
-
-static inline int put_eilvt(int offset)
-{
-       return !setup_APIC_eilvt(offset, 0, 0, 1);
-}
-
-/*
- * Check and reserve APIC extended interrupt LVT offset for IBS if available.
- */
-static inline int ibs_eilvt_valid(void)
-{
-       int offset;
-       u64 val;
-       int valid = 0;
-
-       preempt_disable();
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       offset = val & IBSCTL_LVT_OFFSET_MASK;
-
-       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
-               pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       if (!get_eilvt(offset)) {
-               pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
-                      smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-               goto out;
-       }
-
-       valid = 1;
-out:
-       preempt_enable();
-
-       return valid;
-}
-
-static int setup_ibs_ctl(int ibs_eilvt_off)
-{
-       struct pci_dev *cpu_cfg;
-       int nodes;
-       u32 value = 0;
-
-       nodes = 0;
-       cpu_cfg = NULL;
-       do {
-               cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
-                                        PCI_DEVICE_ID_AMD_10H_NB_MISC,
-                                        cpu_cfg);
-               if (!cpu_cfg)
-                       break;
-               ++nodes;
-               pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
-                                      | IBSCTL_LVT_OFFSET_VALID);
-               pci_read_config_dword(cpu_cfg, IBSCTL, &value);
-               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
-                       pci_dev_put(cpu_cfg);
-                       printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
-                              "IBSCTL = 0x%08x\n", value);
-                       return -EINVAL;
-               }
-       } while (1);
-
-       if (!nodes) {
-               printk(KERN_DEBUG "No CPU node configured for IBS\n");
-               return -ENODEV;
-       }
-
-       return 0;
-}
-
-/*
- * This runs only on the current cpu. We try to find an LVT offset and
- * setup the local APIC. For this we must disable preemption. On
- * success we initialize all nodes with this offset. This updates then
- * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
- * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
- * is using the new offset.
- */
-static void force_ibs_eilvt_setup(void)
-{
-       int offset;
-       int ret;
-
-       preempt_disable();
-       /* find the next free available EILVT entry, skip offset 0 */
-       for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
-               if (get_eilvt(offset))
-                       break;
-       }
-       preempt_enable();
-
-       if (offset == APIC_EILVT_NR_MAX) {
-               printk(KERN_DEBUG "No EILVT entry available\n");
-               return;
-       }
-
-       ret = setup_ibs_ctl(offset);
-       if (ret)
-               goto out;
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       pr_info("IBS: LVT offset %d assigned\n", offset);
-
-       return;
-out:
-       preempt_disable();
-       put_eilvt(offset);
-       preempt_enable();
-       return;
-}
-
-static void ibs_eilvt_setup(void)
-{
-       /*
-        * Force LVT offset assignment for family 10h: The offsets are
-        * not assigned by the BIOS for this family, so the OS is
-        * responsible for doing it. If the OS assignment fails, fall
-        * back to BIOS settings and try to setup this.
-        */
-       if (boot_cpu_data.x86 == 0x10)
-               force_ibs_eilvt_setup();
-}
-
-static inline int get_ibs_lvt_offset(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_AMD64_IBSCTL, val);
-       if (!(val & IBSCTL_LVT_OFFSET_VALID))
-               return -EINVAL;
-
-       return val & IBSCTL_LVT_OFFSET_MASK;
-}
-
-static void setup_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset < 0)
-               goto failed;
-
-       if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
-               return;
-failed:
-       pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
-               smp_processor_id());
-}
-
-static void clear_APIC_ibs(void *dummy)
-{
-       int offset;
-
-       offset = get_ibs_lvt_offset();
-       if (offset >= 0)
-               setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
-}
-
-#ifdef CONFIG_PM
-
-static int perf_ibs_suspend(void)
-{
-       clear_APIC_ibs(NULL);
-       return 0;
-}
-
-static void perf_ibs_resume(void)
-{
-       ibs_eilvt_setup();
-       setup_APIC_ibs(NULL);
-}
-
-static struct syscore_ops perf_ibs_syscore_ops = {
-       .resume         = perf_ibs_resume,
-       .suspend        = perf_ibs_suspend,
-};
-
-static void perf_ibs_pm_init(void)
-{
-       register_syscore_ops(&perf_ibs_syscore_ops);
-}
-
-#else
-
-static inline void perf_ibs_pm_init(void) { }
-
-#endif
-
-static int
-perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
-               setup_APIC_ibs(NULL);
-               break;
-       case CPU_DYING:
-               clear_APIC_ibs(NULL);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static __init int amd_ibs_init(void)
-{
-       u32 caps;
-       int ret = -EINVAL;
-
-       caps = __get_ibs_caps();
-       if (!caps)
-               return -ENODEV; /* ibs not supported by the cpu */
-
-       ibs_eilvt_setup();
-
-       if (!ibs_eilvt_valid())
-               goto out;
-
-       perf_ibs_pm_init();
-       cpu_notifier_register_begin();
-       ibs_caps = caps;
-       /* make ibs_caps visible to other cpus: */
-       smp_mb();
-       smp_call_function(setup_APIC_ibs, NULL, 1);
-       __perf_cpu_notifier(perf_ibs_cpu_notifier);
-       cpu_notifier_register_done();
-
-       ret = perf_event_ibs_init();
-out:
-       if (ret)
-               pr_err("Failed to setup IBS, %d\n", ret);
-       return ret;
-}
-
-/* Since we need the pci subsystem to init ibs we can't do this earlier: */
-device_initcall(amd_ibs_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
deleted file mode 100644 (file)
index 97242a9..0000000
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-
-#include "perf_event.h"
-#include "perf_event_amd_iommu.h"
-
-#define COUNTER_SHIFT          16
-
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
-
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
-
-static struct perf_amd_iommu __perf_iommu;
-
-struct perf_amd_iommu {
-       struct pmu pmu;
-       u8 max_banks;
-       u8 max_counters;
-       u64 cntr_assign_mask;
-       raw_spinlock_t lock;
-       const struct attribute_group *attr_groups[4];
-};
-
-#define format_group   attr_groups[0]
-#define cpumask_group  attr_groups[1]
-#define events_group   attr_groups[2]
-#define null_group     attr_groups[3]
-
-/*---------------------------------------------
- * sysfs format attributes
- *---------------------------------------------*/
-PMU_FORMAT_ATTR(csource,    "config:0-7");
-PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
-PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
-
-static struct attribute *iommu_format_attrs[] = {
-       &format_attr_csource.attr,
-       &format_attr_devid.attr,
-       &format_attr_pasid.attr,
-       &format_attr_domid.attr,
-       &format_attr_devid_mask.attr,
-       &format_attr_pasid_mask.attr,
-       &format_attr_domid_mask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_format_group = {
-       .name = "format",
-       .attrs = iommu_format_attrs,
-};
-
-/*---------------------------------------------
- * sysfs events attributes
- *---------------------------------------------*/
-struct amd_iommu_event_desc {
-       struct kobj_attribute attr;
-       const char *event;
-};
-
-static ssize_t _iommu_event_show(struct kobject *kobj,
-                               struct kobj_attribute *attr, char *buf)
-{
-       struct amd_iommu_event_desc *event =
-               container_of(attr, struct amd_iommu_event_desc, attr);
-       return sprintf(buf, "%s\n", event->event);
-}
-
-#define AMD_IOMMU_EVENT_DESC(_name, _event)                    \
-{                                                              \
-       .attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),  \
-       .event = _event,                                        \
-}
-
-static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
-       AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
-       AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
-       AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
-       AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
-       AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
-       AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
-       AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
-       AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
-       AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
-       AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
-       AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
-       { /* end: all zeroes */ },
-};
-
-/*---------------------------------------------
- * sysfs cpumask attributes
- *---------------------------------------------*/
-static cpumask_t iommu_cpumask;
-
-static ssize_t _iommu_cpumask_show(struct device *dev,
-                                  struct device_attribute *attr,
-                                  char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
-
-static struct attribute *iommu_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_iommu_cpumask_group = {
-       .attrs = iommu_cpumask_attrs,
-};
-
-/*---------------------------------------------*/
-
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
-{
-       unsigned long flags;
-       int shift, bank, cntr, retval;
-       int max_banks = perf_iommu->max_banks;
-       int max_cntrs = perf_iommu->max_counters;
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-
-       for (bank = 0, shift = 0; bank < max_banks; bank++) {
-               for (cntr = 0; cntr < max_cntrs; cntr++) {
-                       shift = bank + (bank*3) + cntr;
-                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
-                               continue;
-                       } else {
-                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
-                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
-                               goto out;
-                       }
-               }
-       }
-       retval = -ENOSPC;
-out:
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-       return retval;
-}
-
-static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
-                                       u8 bank, u8 cntr)
-{
-       unsigned long flags;
-       int max_banks, max_cntrs;
-       int shift = 0;
-
-       max_banks = perf_iommu->max_banks;
-       max_cntrs = perf_iommu->max_counters;
-
-       if ((bank > max_banks) || (cntr > max_cntrs))
-               return -EINVAL;
-
-       shift = bank + cntr + (bank*3);
-
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-       perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-
-       return 0;
-}
-
-static int perf_iommu_event_init(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct perf_amd_iommu *perf_iommu;
-       u64 config, config1;
-
-       /* test the event attr type check for PMU enumeration */
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * IOMMU counters are shared across all cores.
-        * Therefore, it does not support per-process mode.
-        * Also, it does not support event sampling mode.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* IOMMU counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       perf_iommu = &__perf_iommu;
-
-       if (event->pmu != &perf_iommu->pmu)
-               return -ENOENT;
-
-       if (perf_iommu) {
-               config = event->attr.config;
-               config1 = event->attr.config1;
-       } else {
-               return -EINVAL;
-       }
-
-       /* integrate with iommu base devid (0000), assume one iommu */
-       perf_iommu->max_banks =
-               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
-       perf_iommu->max_counters =
-               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
-       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
-               return -EINVAL;
-
-       /* update the hw_perf_event struct with the iommu config data */
-       hwc->config = config;
-       hwc->extra_reg.config = config1;
-
-       return 0;
-}
-
-static void perf_iommu_enable_event(struct perf_event *ev)
-{
-       u8 csource = _GET_CSOURCE(ev);
-       u16 devid = _GET_DEVID(ev);
-       u64 reg = 0ULL;
-
-       reg = csource;
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-
-       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
-
-       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
-       if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
-}
-
-static void perf_iommu_disable_event(struct perf_event *event)
-{
-       u64 reg = 0ULL;
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                       _GET_BANK(event), _GET_CNTR(event),
-                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-}
-
-static void perf_iommu_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       pr_debug("perf: amd_iommu:perf_iommu_start\n");
-       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-               return;
-
-       WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-       hwc->state = 0;
-
-       if (flags & PERF_EF_RELOAD) {
-               u64 prev_raw_count =  local64_read(&hwc->prev_count);
-               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
-       }
-
-       perf_iommu_enable_event(event);
-       perf_event_update_userpage(event);
-
-}
-
-static void perf_iommu_read(struct perf_event *event)
-{
-       u64 count = 0ULL;
-       u64 prev_raw_count = 0ULL;
-       u64 delta = 0ULL;
-       struct hw_perf_event *hwc = &event->hw;
-       pr_debug("perf: amd_iommu:perf_iommu_read\n");
-
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &count, false);
-
-       /* IOMMU pc counter register is only 48 bits */
-       count &= 0xFFFFFFFFFFFFULL;
-
-       prev_raw_count =  local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       count) != prev_raw_count)
-               return;
-
-       /* Handling 48-bit counter overflowing */
-       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-
-}
-
-static void perf_iommu_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       perf_iommu_disable_event(event);
-       WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if (hwc->state & PERF_HES_UPTODATE)
-               return;
-
-       config = hwc->config;
-       perf_iommu_read(event);
-       hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_iommu_add(struct perf_event *event, int flags)
-{
-       int retval;
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_add\n");
-       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       /* request an iommu bank/counter */
-       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-       if (retval != -ENOSPC)
-               event->hw.extra_reg.reg = (u16)retval;
-       else
-               return retval;
-
-       if (flags & PERF_EF_START)
-               perf_iommu_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void perf_iommu_del(struct perf_event *event, int flags)
-{
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
-
-       pr_debug("perf: amd_iommu:perf_iommu_del\n");
-       perf_iommu_stop(event, PERF_EF_UPDATE);
-
-       /* clear the assigned iommu bank/counter */
-       clear_avail_iommu_bnk_cntr(perf_iommu,
-                                    _GET_BANK(event),
-                                    _GET_CNTR(event));
-
-       perf_event_update_userpage(event);
-}
-
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
-{
-       struct attribute **attrs;
-       struct attribute_group *attr_group;
-       int i = 0, j;
-
-       while (amd_iommu_v2_event_descs[i].attr.attr.name)
-               i++;
-
-       attr_group = kzalloc(sizeof(struct attribute *)
-               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
-       if (!attr_group)
-               return -ENOMEM;
-
-       attrs = (struct attribute **)(attr_group + 1);
-       for (j = 0; j < i; j++)
-               attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
-
-       attr_group->name = "events";
-       attr_group->attrs = attrs;
-       perf_iommu->events_group = attr_group;
-
-       return 0;
-}
-
-static __init void amd_iommu_pc_exit(void)
-{
-       if (__perf_iommu.events_group != NULL) {
-               kfree(__perf_iommu.events_group);
-               __perf_iommu.events_group = NULL;
-       }
-}
-
-static __init int _init_perf_amd_iommu(
-       struct perf_amd_iommu *perf_iommu, char *name)
-{
-       int ret;
-
-       raw_spin_lock_init(&perf_iommu->lock);
-
-       /* Init format attributes */
-       perf_iommu->format_group = &amd_iommu_format_group;
-
-       /* Init cpumask attributes to only core 0 */
-       cpumask_set_cpu(0, &iommu_cpumask);
-       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
-       /* Init events attributes */
-       if (_init_events_attrs(perf_iommu) != 0)
-               pr_err("perf: amd_iommu: Only support raw events.\n");
-
-       /* Init null attributes */
-       perf_iommu->null_group = NULL;
-       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
-
-       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
-       if (ret) {
-               pr_err("perf: amd_iommu: Failed to initialized.\n");
-               amd_iommu_pc_exit();
-       } else {
-               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
-                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
-                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
-       }
-
-       return ret;
-}
-
-static struct perf_amd_iommu __perf_iommu = {
-       .pmu = {
-               .event_init     = perf_iommu_event_init,
-               .add            = perf_iommu_add,
-               .del            = perf_iommu_del,
-               .start          = perf_iommu_start,
-               .stop           = perf_iommu_stop,
-               .read           = perf_iommu_read,
-       },
-       .max_banks              = 0x00,
-       .max_counters           = 0x00,
-       .cntr_assign_mask       = 0ULL,
-       .format_group           = NULL,
-       .cpumask_group          = NULL,
-       .events_group           = NULL,
-       .null_group             = NULL,
-};
-
-static __init int amd_iommu_pc_init(void)
-{
-       /* Make sure the IOMMU PC resource is available */
-       if (!amd_iommu_pc_supported())
-               return -ENODEV;
-
-       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
-
-       return 0;
-}
-
-device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
deleted file mode 100644 (file)
index 845d173..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _PERF_EVENT_AMD_IOMMU_H_
-#define _PERF_EVENT_AMD_IOMMU_H_
-
-/* iommu pc mmio region register indexes */
-#define IOMMU_PC_COUNTER_REG                   0x00
-#define IOMMU_PC_COUNTER_SRC_REG               0x08
-#define IOMMU_PC_PASID_MATCH_REG               0x10
-#define IOMMU_PC_DOMID_MATCH_REG               0x18
-#define IOMMU_PC_DEVID_MATCH_REG               0x20
-#define IOMMU_PC_COUNTER_REPORT_REG            0x28
-
-/* maximun specified bank/counters */
-#define PC_MAX_SPEC_BNKS                       64
-#define PC_MAX_SPEC_CNTRS                      16
-
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID                       0x0000
-
-/* amd_iommu_init.c external support functions */
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
-                       u8 fxn, u64 *value, bool is_write);
-
-#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
deleted file mode 100644 (file)
index 4974274..0000000
+++ /dev/null
@@ -1,601 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/cpufeature.h>
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-
-#define NUM_COUNTERS_NB                4
-#define NUM_COUNTERS_L2                4
-#define MAX_COUNTERS           NUM_COUNTERS_NB
-
-#define RDPMC_BASE_NB          6
-#define RDPMC_BASE_L2          10
-
-#define COUNTER_SHIFT          16
-
-struct amd_uncore {
-       int id;
-       int refcnt;
-       int cpu;
-       int num_counters;
-       int rdpmc_base;
-       u32 msr_base;
-       cpumask_t *active_mask;
-       struct pmu *pmu;
-       struct perf_event *events[MAX_COUNTERS];
-       struct amd_uncore *free_when_cpu_online;
-};
-
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
-
-static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
-
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
-
-static bool is_nb_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_nb_pmu.type;
-}
-
-static bool is_l2_event(struct perf_event *event)
-{
-       return event->pmu->type == amd_l2_pmu.type;
-}
-
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
-{
-       if (is_nb_event(event) && amd_uncore_nb)
-               return *per_cpu_ptr(amd_uncore_nb, event->cpu);
-       else if (is_l2_event(event) && amd_uncore_l2)
-               return *per_cpu_ptr(amd_uncore_l2, event->cpu);
-
-       return NULL;
-}
-
-static void amd_uncore_read(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev, new;
-       s64 delta;
-
-       /*
-        * since we do not enable counter overflow interrupts,
-        * we do not have to worry about prev_count changing on us
-        */
-
-       prev = local64_read(&hwc->prev_count);
-       rdpmcl(hwc->event_base_rdpmc, new);
-       local64_set(&hwc->prev_count, new);
-       delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
-       delta >>= COUNTER_SHIFT;
-       local64_add(delta, &event->count);
-}
-
-static void amd_uncore_start(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (flags & PERF_EF_RELOAD)
-               wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
-
-       hwc->state = 0;
-       wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
-       perf_event_update_userpage(event);
-}
-
-static void amd_uncore_stop(struct perf_event *event, int flags)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-       hwc->state |= PERF_HES_STOPPED;
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               amd_uncore_read(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int amd_uncore_add(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       /* are we already assigned? */
-       if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
-               goto out;
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (uncore->events[i] == event) {
-                       hwc->idx = i;
-                       goto out;
-               }
-       }
-
-       /* if not, take the first available counter */
-       hwc->idx = -1;
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
-                       hwc->idx = i;
-                       break;
-               }
-       }
-
-out:
-       if (hwc->idx == -1)
-               return -EBUSY;
-
-       hwc->config_base = uncore->msr_base + (2 * hwc->idx);
-       hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
-       hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (flags & PERF_EF_START)
-               amd_uncore_start(event, PERF_EF_RELOAD);
-
-       return 0;
-}
-
-static void amd_uncore_del(struct perf_event *event, int flags)
-{
-       int i;
-       struct amd_uncore *uncore = event_to_amd_uncore(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       amd_uncore_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < uncore->num_counters; i++) {
-               if (cmpxchg(&uncore->events[i], event, NULL) == event)
-                       break;
-       }
-
-       hwc->idx = -1;
-}
-
-static int amd_uncore_event_init(struct perf_event *event)
-{
-       struct amd_uncore *uncore;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /*
-        * NB and L2 counters (MSRs) are shared across all cores that share the
-        * same NB / L2 cache. Interrupts can be directed to a single target
-        * core, however, event counts generated by processes running on other
-        * cores cannot be masked out. So we do not support sampling and
-        * per-thread events.
-        */
-       if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
-               return -EINVAL;
-
-       /* NB and L2 counters do not have usr/os/guest/host bits */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-           event->attr.exclude_host || event->attr.exclude_guest)
-               return -EINVAL;
-
-       /* and we do not enable counter overflow interrupts */
-       hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
-       hwc->idx = -1;
-
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       uncore = event_to_amd_uncore(event);
-       if (!uncore)
-               return -ENODEV;
-
-       /*
-        * since request can come in to any of the shared cores, we will remap
-        * to a single common cpu.
-        */
-       event->cpu = uncore->cpu;
-
-       return 0;
-}
-
-static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-       cpumask_t *active_mask;
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu->type == amd_nb_pmu.type)
-               active_mask = &amd_nb_active_mask;
-       else if (pmu->type == amd_l2_pmu.type)
-               active_mask = &amd_l2_active_mask;
-       else
-               return 0;
-
-       return cpumap_print_to_pagebuf(true, buf, active_mask);
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
-
-static struct attribute *amd_uncore_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_attr_group = {
-       .attrs = amd_uncore_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15");
-
-static struct attribute *amd_uncore_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       NULL,
-};
-
-static struct attribute_group amd_uncore_format_group = {
-       .name = "format",
-       .attrs = amd_uncore_format_attr,
-};
-
-static const struct attribute_group *amd_uncore_attr_groups[] = {
-       &amd_uncore_attr_group,
-       &amd_uncore_format_group,
-       NULL,
-};
-
-static struct pmu amd_nb_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_nb",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct pmu amd_l2_pmu = {
-       .attr_groups    = amd_uncore_attr_groups,
-       .name           = "amd_l2",
-       .event_init     = amd_uncore_event_init,
-       .add            = amd_uncore_add,
-       .del            = amd_uncore_del,
-       .start          = amd_uncore_start,
-       .stop           = amd_uncore_stop,
-       .read           = amd_uncore_read,
-};
-
-static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
-{
-       return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
-                       cpu_to_node(cpu));
-}
-
-static int amd_uncore_cpu_up_prepare(unsigned int cpu)
-{
-       struct amd_uncore *uncore_nb = NULL, *uncore_l2;
-
-       if (amd_uncore_nb) {
-               uncore_nb = amd_uncore_alloc(cpu);
-               if (!uncore_nb)
-                       goto fail;
-               uncore_nb->cpu = cpu;
-               uncore_nb->num_counters = NUM_COUNTERS_NB;
-               uncore_nb->rdpmc_base = RDPMC_BASE_NB;
-               uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
-               uncore_nb->active_mask = &amd_nb_active_mask;
-               uncore_nb->pmu = &amd_nb_pmu;
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
-       }
-
-       if (amd_uncore_l2) {
-               uncore_l2 = amd_uncore_alloc(cpu);
-               if (!uncore_l2)
-                       goto fail;
-               uncore_l2->cpu = cpu;
-               uncore_l2->num_counters = NUM_COUNTERS_L2;
-               uncore_l2->rdpmc_base = RDPMC_BASE_L2;
-               uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
-               uncore_l2->active_mask = &amd_l2_active_mask;
-               uncore_l2->pmu = &amd_l2_pmu;
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
-       }
-
-       return 0;
-
-fail:
-       kfree(uncore_nb);
-       return -ENOMEM;
-}
-
-static struct amd_uncore *
-amd_uncore_find_online_sibling(struct amd_uncore *this,
-                              struct amd_uncore * __percpu *uncores)
-{
-       unsigned int cpu;
-       struct amd_uncore *that;
-
-       for_each_online_cpu(cpu) {
-               that = *per_cpu_ptr(uncores, cpu);
-
-               if (!that)
-                       continue;
-
-               if (this == that)
-                       continue;
-
-               if (this->id == that->id) {
-                       that->free_when_cpu_online = this;
-                       this = that;
-                       break;
-               }
-       }
-
-       this->refcnt++;
-       return this;
-}
-
-static void amd_uncore_cpu_starting(unsigned int cpu)
-{
-       unsigned int eax, ebx, ecx, edx;
-       struct amd_uncore *uncore;
-
-       if (amd_uncore_nb) {
-               uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
-               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-               uncore->id = ecx & 0xff;
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
-               *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
-       }
-
-       if (amd_uncore_l2) {
-               unsigned int apicid = cpu_data(cpu).apicid;
-               unsigned int nshared;
-
-               uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
-               cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
-               nshared = ((eax >> 14) & 0xfff) + 1;
-               uncore->id = apicid - (apicid % nshared);
-
-               uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
-               *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
-       }
-}
-
-static void uncore_online(unsigned int cpu,
-                         struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       kfree(uncore->free_when_cpu_online);
-       uncore->free_when_cpu_online = NULL;
-
-       if (cpu == uncore->cpu)
-               cpumask_set_cpu(cpu, uncore->active_mask);
-}
-
-static void amd_uncore_cpu_online(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_online(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_online(cpu, amd_uncore_l2);
-}
-
-static void uncore_down_prepare(unsigned int cpu,
-                               struct amd_uncore * __percpu *uncores)
-{
-       unsigned int i;
-       struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
-
-       if (this->cpu != cpu)
-               return;
-
-       /* this cpu is going down, migrate to a shared sibling if possible */
-       for_each_online_cpu(i) {
-               struct amd_uncore *that = *per_cpu_ptr(uncores, i);
-
-               if (cpu == i)
-                       continue;
-
-               if (this == that) {
-                       perf_pmu_migrate_context(this->pmu, cpu, i);
-                       cpumask_clear_cpu(cpu, that->active_mask);
-                       cpumask_set_cpu(i, that->active_mask);
-                       that->cpu = i;
-                       break;
-               }
-       }
-}
-
-static void amd_uncore_cpu_down_prepare(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_down_prepare(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_down_prepare(cpu, amd_uncore_l2);
-}
-
-static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
-{
-       struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
-       if (cpu == uncore->cpu)
-               cpumask_clear_cpu(cpu, uncore->active_mask);
-
-       if (!--uncore->refcnt)
-               kfree(uncore);
-       *per_cpu_ptr(uncores, cpu) = NULL;
-}
-
-static void amd_uncore_cpu_dead(unsigned int cpu)
-{
-       if (amd_uncore_nb)
-               uncore_dead(cpu, amd_uncore_nb);
-
-       if (amd_uncore_l2)
-               uncore_dead(cpu, amd_uncore_l2);
-}
-
-static int
-amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
-                       void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               if (amd_uncore_cpu_up_prepare(cpu))
-                       return notifier_from_errno(-ENOMEM);
-               break;
-
-       case CPU_STARTING:
-               amd_uncore_cpu_starting(cpu);
-               break;
-
-       case CPU_ONLINE:
-               amd_uncore_cpu_online(cpu);
-               break;
-
-       case CPU_DOWN_PREPARE:
-               amd_uncore_cpu_down_prepare(cpu);
-               break;
-
-       case CPU_UP_CANCELED:
-       case CPU_DEAD:
-               amd_uncore_cpu_dead(cpu);
-               break;
-
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block amd_uncore_cpu_notifier_block = {
-       .notifier_call  = amd_uncore_cpu_notifier,
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init init_cpu_already_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_starting(cpu);
-       amd_uncore_cpu_online(cpu);
-}
-
-static void cleanup_cpu_online(void *dummy)
-{
-       unsigned int cpu = smp_processor_id();
-
-       amd_uncore_cpu_dead(cpu);
-}
-
-static int __init amd_uncore_init(void)
-{
-       unsigned int cpu, cpu2;
-       int ret = -ENODEV;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               goto fail_nodev;
-
-       if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
-               goto fail_nodev;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
-               amd_uncore_nb = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_nb) {
-                       ret = -ENOMEM;
-                       goto fail_nb;
-               }
-               ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
-               if (ret)
-                       goto fail_nb;
-
-               printk(KERN_INFO "perf: AMD NB counters detected\n");
-               ret = 0;
-       }
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
-               amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
-               if (!amd_uncore_l2) {
-                       ret = -ENOMEM;
-                       goto fail_l2;
-               }
-               ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
-               if (ret)
-                       goto fail_l2;
-
-               printk(KERN_INFO "perf: AMD L2I counters detected\n");
-               ret = 0;
-       }
-
-       if (ret)
-               goto fail_nodev;
-
-       cpu_notifier_register_begin();
-
-       /* init cpus already online before registering for hotplug notifier */
-       for_each_online_cpu(cpu) {
-               ret = amd_uncore_cpu_up_prepare(cpu);
-               if (ret)
-                       goto fail_online;
-               smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
-       }
-
-       __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
-       cpu_notifier_register_done();
-
-       return 0;
-
-
-fail_online:
-       for_each_online_cpu(cpu2) {
-               if (cpu2 == cpu)
-                       break;
-               smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
-       }
-       cpu_notifier_register_done();
-
-       /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
-       amd_uncore_nb = amd_uncore_l2 = NULL;
-
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
-               perf_pmu_unregister(&amd_l2_pmu);
-fail_l2:
-       if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
-               perf_pmu_unregister(&amd_nb_pmu);
-       if (amd_uncore_l2)
-               free_percpu(amd_uncore_l2);
-fail_nb:
-       if (amd_uncore_nb)
-               free_percpu(amd_uncore_nb);
-
-fail_nodev:
-       return ret;
-}
-device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
deleted file mode 100644 (file)
index a667078..0000000
+++ /dev/null
@@ -1,3774 +0,0 @@
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/nmi.h>
-
-#include <asm/cpufeature.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-/*
- * Intel PerfMon, used on Core and later.
- */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
-       [PERF_COUNT_HW_CPU_CYCLES]              = 0x003c,
-       [PERF_COUNT_HW_INSTRUCTIONS]            = 0x00c0,
-       [PERF_COUNT_HW_CACHE_REFERENCES]        = 0x4f2e,
-       [PERF_COUNT_HW_CACHE_MISSES]            = 0x412e,
-       [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]     = 0x00c4,
-       [PERF_COUNT_HW_BRANCH_MISSES]           = 0x00c5,
-       [PERF_COUNT_HW_BUS_CYCLES]              = 0x013c,
-       [PERF_COUNT_HW_REF_CPU_CYCLES]          = 0x0300, /* pseudo-encoding */
-};
-
-static struct event_constraint intel_core_event_constraints[] __read_mostly =
-{
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_core2_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
-       INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
-       INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
-       INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
-       INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
-       INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
-       INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
-       INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
-       INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
-       INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
-       INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_snb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
-       INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
-       INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
-       EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_v1_event_constraints[] __read_mostly =
-{
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_gen_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_slm_event_constraints[] __read_mostly =
-{
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),    /* INST_RETIRED.PREC_DIST */
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7,
-                              MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7,
-                              MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
-       INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
-       /*
-        * Note the low 8 bits eventsel code is not a continuous field, containing
-        * some #GPing bits. These are masked out.
-        */
-       INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
-       EVENT_EXTRA_END
-};
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
-
-struct attribute *nhm_events_attrs[] = {
-       EVENT_PTR(mem_ld_nhm),
-       NULL,
-};
-
-struct attribute *snb_events_attrs[] = {
-       EVENT_PTR(mem_ld_snb),
-       EVENT_PTR(mem_st_snb),
-       NULL,
-};
-
-static struct event_constraint intel_hsw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-       INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
-       /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
-       /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
-       /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
-
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_bdw_event_constraints[] = {
-       FIXED_EVENT_CONSTRAINT(0x00c0, 0),      /* INST_RETIRED.ANY */
-       FIXED_EVENT_CONSTRAINT(0x003c, 1),      /* CPU_CLK_UNHALTED.CORE */
-       FIXED_EVENT_CONSTRAINT(0x0300, 2),      /* CPU_CLK_UNHALTED.REF */
-       INTEL_UEVENT_CONSTRAINT(0x148, 0x4),    /* L1D_PEND_MISS.PENDING */
-       INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),        /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
-       EVENT_CONSTRAINT_END
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
-       return intel_perfmon_event_map[hw_event];
-}
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts.
- * - icache miss does not include decoded icache
- */
-
-#define SKL_DEMAND_DATA_RD             BIT_ULL(0)
-#define SKL_DEMAND_RFO                 BIT_ULL(1)
-#define SKL_ANY_RESPONSE               BIT_ULL(16)
-#define SKL_SUPPLIER_NONE              BIT_ULL(17)
-#define SKL_L3_MISS_LOCAL_DRAM         BIT_ULL(26)
-#define SKL_L3_MISS_REMOTE_HOP0_DRAM   BIT_ULL(27)
-#define SKL_L3_MISS_REMOTE_HOP1_DRAM   BIT_ULL(28)
-#define SKL_L3_MISS_REMOTE_HOP2P_DRAM  BIT_ULL(29)
-#define SKL_L3_MISS                    (SKL_L3_MISS_LOCAL_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-#define SKL_SPL_HIT                    BIT_ULL(30)
-#define SKL_SNOOP_NONE                 BIT_ULL(31)
-#define SKL_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define SKL_SNOOP_MISS                 BIT_ULL(33)
-#define SKL_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define SKL_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define SKL_SNOOP_HITM                 BIT_ULL(36)
-#define SKL_SNOOP_NON_DRAM             BIT_ULL(37)
-#define SKL_ANY_SNOOP                  (SKL_SPL_HIT|SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
-#define SKL_DEMAND_READ                        SKL_DEMAND_DATA_RD
-#define SKL_SNOOP_DRAM                 (SKL_SNOOP_NONE| \
-                                        SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
-                                        SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
-                                        SKL_SNOOP_HITM|SKL_SPL_HIT)
-#define SKL_DEMAND_WRITE               SKL_DEMAND_RFO
-#define SKL_LLC_ACCESS                 SKL_ANY_RESPONSE
-#define SKL_L3_MISS_REMOTE             (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP1_DRAM| \
-                                        SKL_L3_MISS_REMOTE_HOP2P_DRAM)
-
-static __initconst const u64 skl_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x283,   /* ICACHE_64B.MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_INST_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x608,   /* DTLB_LOAD_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_INST_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x649,   /* DTLB_STORE_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0xe85,   /* ITLB_MISSES.WALK_COMPLETED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 skl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_LLC_ACCESS|SKL_ANY_SNOOP,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS|SKL_ANY_SNOOP|
-                                      SKL_SUPPLIER_NONE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
-                                      SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-#define SNB_DMND_DATA_RD       (1ULL << 0)
-#define SNB_DMND_RFO           (1ULL << 1)
-#define SNB_DMND_IFETCH                (1ULL << 2)
-#define SNB_DMND_WB            (1ULL << 3)
-#define SNB_PF_DATA_RD         (1ULL << 4)
-#define SNB_PF_RFO             (1ULL << 5)
-#define SNB_PF_IFETCH          (1ULL << 6)
-#define SNB_LLC_DATA_RD                (1ULL << 7)
-#define SNB_LLC_RFO            (1ULL << 8)
-#define SNB_LLC_IFETCH         (1ULL << 9)
-#define SNB_BUS_LOCKS          (1ULL << 10)
-#define SNB_STRM_ST            (1ULL << 11)
-#define SNB_OTHER              (1ULL << 15)
-#define SNB_RESP_ANY           (1ULL << 16)
-#define SNB_NO_SUPP            (1ULL << 17)
-#define SNB_LLC_HITM           (1ULL << 18)
-#define SNB_LLC_HITE           (1ULL << 19)
-#define SNB_LLC_HITS           (1ULL << 20)
-#define SNB_LLC_HITF           (1ULL << 21)
-#define SNB_LOCAL              (1ULL << 22)
-#define SNB_REMOTE             (0xffULL << 23)
-#define SNB_SNP_NONE           (1ULL << 31)
-#define SNB_SNP_NOT_NEEDED     (1ULL << 32)
-#define SNB_SNP_MISS           (1ULL << 33)
-#define SNB_NO_FWD             (1ULL << 34)
-#define SNB_SNP_FWD            (1ULL << 35)
-#define SNB_HITM               (1ULL << 36)
-#define SNB_NON_DRAM           (1ULL << 37)
-
-#define SNB_DMND_READ          (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
-#define SNB_DMND_WRITE         (SNB_DMND_RFO|SNB_LLC_RFO)
-#define SNB_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SNB_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
-                                SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
-                                SNB_HITM)
-
-#define SNB_DRAM_ANY           (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE                (SNB_REMOTE|SNB_SNP_ANY)
-
-#define SNB_L3_ACCESS          SNB_RESP_ANY
-#define SNB_L3_MISS            (SNB_DRAM_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 snb_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
-               [ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 snb_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
-               [ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-
-};
-
-/*
- * Notes on the events:
- * - data reads do not include code reads (comparable to earlier tables)
- * - data counts include speculative execution (except L1 write, dtlb, bpu)
- * - remote node access includes remote memory, remote cache, remote mmio.
- * - prefetches are not included in the counts because they are not
- *   reliably counted.
- */
-
-#define HSW_DEMAND_DATA_RD             BIT_ULL(0)
-#define HSW_DEMAND_RFO                 BIT_ULL(1)
-#define HSW_ANY_RESPONSE               BIT_ULL(16)
-#define HSW_SUPPLIER_NONE              BIT_ULL(17)
-#define HSW_L3_MISS_LOCAL_DRAM         BIT_ULL(22)
-#define HSW_L3_MISS_REMOTE_HOP0                BIT_ULL(27)
-#define HSW_L3_MISS_REMOTE_HOP1                BIT_ULL(28)
-#define HSW_L3_MISS_REMOTE_HOP2P       BIT_ULL(29)
-#define HSW_L3_MISS                    (HSW_L3_MISS_LOCAL_DRAM| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_SNOOP_NONE                 BIT_ULL(31)
-#define HSW_SNOOP_NOT_NEEDED           BIT_ULL(32)
-#define HSW_SNOOP_MISS                 BIT_ULL(33)
-#define HSW_SNOOP_HIT_NO_FWD           BIT_ULL(34)
-#define HSW_SNOOP_HIT_WITH_FWD         BIT_ULL(35)
-#define HSW_SNOOP_HITM                 BIT_ULL(36)
-#define HSW_SNOOP_NON_DRAM             BIT_ULL(37)
-#define HSW_ANY_SNOOP                  (HSW_SNOOP_NONE| \
-                                        HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
-                                        HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
-                                        HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
-#define HSW_SNOOP_DRAM                 (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
-#define HSW_DEMAND_READ                        HSW_DEMAND_DATA_RD
-#define HSW_DEMAND_WRITE               HSW_DEMAND_RFO
-#define HSW_L3_MISS_REMOTE             (HSW_L3_MISS_REMOTE_HOP0|\
-                                        HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
-#define HSW_LLC_ACCESS                 HSW_ANY_RESPONSE
-
-#define BDW_L3_MISS_LOCAL              BIT(26)
-#define BDW_L3_MISS                    (BDW_L3_MISS_LOCAL| \
-                                        HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
-                                        HSW_L3_MISS_REMOTE_HOP2P)
-
-
-static __initconst const u64 hsw_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x151,   /* L1D.REPLACEMENT */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x280,   /* ICACHE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x81d0,  /* MEM_UOPS_RETIRED.ALL_LOADS */
-               [ C(RESULT_MISS)   ] = 0x108,   /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x82d0,  /* MEM_UOPS_RETIRED.ALL_STORES */
-               [ C(RESULT_MISS)   ] = 0x149,   /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x6085,  /* ITLB_MISSES.STLB_HIT */
-               [ C(RESULT_MISS)   ] = 0x185,   /* ITLB_MISSES.MISS_CAUSES_A_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0xc4,    /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0xc5,    /* BR_MISP_RETIRED.ALL_BRANCHES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x1b7,   /* OFFCORE_RESPONSE */
-               [ C(RESULT_MISS)   ] = 0x1b7,   /* OFFCORE_RESPONSE */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 hsw_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS|HSW_ANY_SNOOP,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_LOCAL_DRAM|
-                                      HSW_SNOOP_DRAM,
-               [ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
-                                      HSW_L3_MISS_REMOTE|
-                                      HSW_SNOOP_DRAM,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
-};
-
-static __initconst const u64 westmere_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.ANY              */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-/*
- * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
- * See IA32 SDM Vol 3B 30.6.1.3
- */
-
-#define NHM_DMND_DATA_RD       (1 << 0)
-#define NHM_DMND_RFO           (1 << 1)
-#define NHM_DMND_IFETCH                (1 << 2)
-#define NHM_DMND_WB            (1 << 3)
-#define NHM_PF_DATA_RD         (1 << 4)
-#define NHM_PF_DATA_RFO                (1 << 5)
-#define NHM_PF_IFETCH          (1 << 6)
-#define NHM_OFFCORE_OTHER      (1 << 7)
-#define NHM_UNCORE_HIT         (1 << 8)
-#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
-#define NHM_OTHER_CORE_HITM    (1 << 10)
-                               /* reserved */
-#define NHM_REMOTE_CACHE_FWD   (1 << 12)
-#define NHM_REMOTE_DRAM                (1 << 13)
-#define NHM_LOCAL_DRAM         (1 << 14)
-#define NHM_NON_DRAM           (1 << 15)
-
-#define NHM_LOCAL              (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_REMOTE             (NHM_REMOTE_DRAM)
-
-#define NHM_DMND_READ          (NHM_DMND_DATA_RD)
-#define NHM_DMND_WRITE         (NHM_DMND_RFO|NHM_DMND_WB)
-#define NHM_DMND_PREFETCH      (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
-
-#define NHM_L3_HIT     (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS    (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_L3_ACCESS  (NHM_L3_HIT|NHM_L3_MISS)
-
-static __initconst const u64 nehalem_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
-               [ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE,
-       },
- },
-};
-
-static __initconst const u64 nehalem_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
-               [ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES      */
-               [ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
-               [ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       /*
-        * Use RFO, not WRITEBACK, because a write miss would typically occur
-        * on RFO.
-        */
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
-               [ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-               [ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
-};
-
-static __initconst const u64 core2_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
-               [ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static __initconst const u64 atom_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
-               [ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
-               [ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
-               [ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
-               [ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static struct extra_reg intel_slm_extra_regs[] __read_mostly =
-{
-       /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
-       INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-       INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
-       EVENT_EXTRA_END
-};
-
-#define SLM_DMND_READ          SNB_DMND_DATA_RD
-#define SLM_DMND_WRITE         SNB_DMND_RFO
-#define SLM_DMND_PREFETCH      (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SLM_SNP_ANY            (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
-#define SLM_LLC_ACCESS         SNB_RESP_ANY
-#define SLM_LLC_MISS           (SLM_SNP_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 slm_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_WRITE|SLM_LLC_MISS,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
-               [ C(RESULT_MISS)   ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
-       },
- },
-};
-
-static __initconst const u64 slm_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0104, /* LD_DCU_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
-               [ C(RESULT_MISS)   ] = 0x0280, /* ICACGE.MISSES */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
-       [ C(OP_PREFETCH) ] = {
-               /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
-               [ C(RESULT_ACCESS) ] = 0x01b7,
-               /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
-               [ C(RESULT_MISS)   ] = 0x01b7,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0804, /* LD_DTLB_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
-               [ C(RESULT_MISS)   ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
-               [ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-#define KNL_OT_L2_HITE         BIT_ULL(19) /* Other Tile L2 Hit */
-#define KNL_OT_L2_HITF         BIT_ULL(20) /* Other Tile L2 Hit */
-#define KNL_MCDRAM_LOCAL       BIT_ULL(21)
-#define KNL_MCDRAM_FAR         BIT_ULL(22)
-#define KNL_DDR_LOCAL          BIT_ULL(23)
-#define KNL_DDR_FAR            BIT_ULL(24)
-#define KNL_DRAM_ANY           (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
-                                   KNL_DDR_LOCAL | KNL_DDR_FAR)
-#define KNL_L2_READ            SLM_DMND_READ
-#define KNL_L2_WRITE           SLM_DMND_WRITE
-#define KNL_L2_PREFETCH                SLM_DMND_PREFETCH
-#define KNL_L2_ACCESS          SLM_LLC_ACCESS
-#define KNL_L2_MISS            (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
-                                  KNL_DRAM_ANY | SNB_SNP_ANY | \
-                                                 SNB_NON_DRAM)
-
-static __initconst const u64 knl_hw_cache_extra_regs
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-       [C(LL)] = {
-               [C(OP_READ)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = 0,
-               },
-               [C(OP_WRITE)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
-               },
-               [C(OP_PREFETCH)] = {
-                       [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
-                       [C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
-               },
-       },
-};
-
-/*
- * Use from PMIs where the LBRs are already disabled.
- */
-static void __intel_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               intel_pmu_disable_bts();
-       else
-               intel_bts_disable_local();
-
-       intel_pmu_pebs_disable_all();
-}
-
-static void intel_pmu_disable_all(void)
-{
-       __intel_pmu_disable_all();
-       intel_pmu_lbr_disable_all();
-}
-
-static void __intel_pmu_enable_all(int added, bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       intel_pmu_pebs_enable_all();
-       intel_pmu_lbr_enable_all(pmi);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-                       x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
-               struct perf_event *event =
-                       cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-
-               if (WARN_ON_ONCE(!event))
-                       return;
-
-               intel_pmu_enable_bts(event->hw.config);
-       } else
-               intel_bts_enable_local();
-}
-
-static void intel_pmu_enable_all(int added)
-{
-       __intel_pmu_enable_all(added, false);
-}
-
-/*
- * Workaround for:
- *   Intel Errata AAK100 (model 26)
- *   Intel Errata AAP53  (model 30)
- *   Intel Errata BD53   (model 44)
- *
- * The official story:
- *   These chips need to be 'reset' when adding counters by programming the
- *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
- *   in sequence on the same PMC or on different PMCs.
- *
- * In practise it appears some of these events do in fact count, and
- * we need to programm all 4 events.
- */
-static void intel_pmu_nhm_workaround(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       static const unsigned long nhm_magic[4] = {
-               0x4300B5,
-               0x4300D2,
-               0x4300B1,
-               0x4300B1
-       };
-       struct perf_event *event;
-       int i;
-
-       /*
-        * The Errata requires below steps:
-        * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
-        * 2) Configure 4 PERFEVTSELx with the magic events and clear
-        *    the corresponding PMCx;
-        * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
-        * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
-        * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
-        */
-
-       /*
-        * The real steps we choose are a little different from above.
-        * A) To reduce MSR operations, we don't run step 1) as they
-        *    are already cleared before this function is called;
-        * B) Call x86_perf_event_update to save PMCx before configuring
-        *    PERFEVTSELx with magic number;
-        * C) With step 5), we do clear only when the PERFEVTSELx is
-        *    not used currently.
-        * D) Call x86_perf_event_set_period to restore PMCx;
-        */
-
-       /* We always operate 4 pairs of PERF Counters */
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-               if (event)
-                       x86_perf_event_update(event);
-       }
-
-       for (i = 0; i < 4; i++) {
-               wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
-               wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
-       }
-
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
-
-       for (i = 0; i < 4; i++) {
-               event = cpuc->events[i];
-
-               if (event) {
-                       x86_perf_event_set_period(event);
-                       __x86_pmu_enable_event(&event->hw,
-                                       ARCH_PERFMON_EVENTSEL_ENABLE);
-               } else
-                       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
-       }
-}
-
-static void intel_pmu_nhm_enable_all(int added)
-{
-       if (added)
-               intel_pmu_nhm_workaround();
-       intel_pmu_enable_all(added);
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, mask;
-
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline bool event_is_checkpointed(struct perf_event *event)
-{
-       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
-}
-
-static void intel_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               intel_pmu_disable_bts();
-               intel_pmu_drain_bts_buffer();
-               return;
-       }
-
-       cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
-
-       /*
-        * must disable before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_disable(event);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_disable_fixed(hwc);
-               return;
-       }
-
-       x86_pmu_disable_event(event);
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_disable(event);
-}
-
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-{
-       int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
-       u64 ctrl_val, bits, mask;
-
-       /*
-        * Enable IRQ generation (0x8),
-        * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
-        * if requested:
-        */
-       bits = 0x8ULL;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
-               bits |= 0x2;
-       if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
-               bits |= 0x1;
-
-       /*
-        * ANY bit is supported in v3 and up
-        */
-       if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
-               bits |= 0x4;
-
-       bits <<= (idx * 4);
-       mask = 0xfULL << (idx * 4);
-
-       rdmsrl(hwc->config_base, ctrl_val);
-       ctrl_val &= ~mask;
-       ctrl_val |= bits;
-       wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-               if (!__this_cpu_read(cpu_hw_events.enabled))
-                       return;
-
-               intel_pmu_enable_bts(hwc->config);
-               return;
-       }
-       /*
-        * must enabled before any actual event
-        * because any event may be combined with LBR
-        */
-       if (needs_branch_stack(event))
-               intel_pmu_lbr_enable(event);
-
-       if (event->attr.exclude_host)
-               cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
-       if (event->attr.exclude_guest)
-               cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
-
-       if (unlikely(event_is_checkpointed(event)))
-               cpuc->intel_cp_status |= (1ull << hwc->idx);
-
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-               intel_pmu_enable_fixed(hwc);
-               return;
-       }
-
-       if (unlikely(event->attr.precise_ip))
-               intel_pmu_pebs_enable(event);
-
-       __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Save and restart an expired event. Called by NMI contexts,
- * so it has to be careful about preempting normal event ops:
- */
-int intel_pmu_save_and_restart(struct perf_event *event)
-{
-       x86_perf_event_update(event);
-       /*
-        * For a checkpointed counter always reset back to 0.  This
-        * avoids a situation where the counter overflows, aborts the
-        * transaction and is then set back to shortly before the
-        * overflow, and overflows and aborts again.
-        */
-       if (unlikely(event_is_checkpointed(event))) {
-               /* No race with NMIs because the counter should not be armed */
-               wrmsrl(event->hw.event_base, 0);
-               local64_set(&event->hw.prev_count, 0);
-       }
-       return x86_perf_event_set_period(event);
-}
-
-static void intel_pmu_reset(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-       unsigned long flags;
-       int idx;
-
-       if (!x86_pmu.num_counters)
-               return;
-
-       local_irq_save(flags);
-
-       pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
-               wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
-       }
-       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
-               wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
-       if (ds)
-               ds->bts_index = ds->bts_buffer_base;
-
-       /* Ack all overflows and disable fixed counters */
-       if (x86_pmu.version >= 2) {
-               intel_pmu_ack_status(intel_pmu_get_status());
-               wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-       }
-
-       /* Reset LBRs and LBR freezing */
-       if (x86_pmu.lbr_nr) {
-               update_debugctlmsr(get_debugctlmsr() &
-                       ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
-       }
-
-       local_irq_restore(flags);
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int bit, loops;
-       u64 status;
-       int handled;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       /*
-        * No known reason to not always do late ACK,
-        * but just in case do it opt-in.
-        */
-       if (!x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       __intel_pmu_disable_all();
-       handled = intel_pmu_drain_bts_buffer();
-       handled += intel_bts_interrupt();
-       status = intel_pmu_get_status();
-       if (!status)
-               goto done;
-
-       loops = 0;
-again:
-       intel_pmu_lbr_read();
-       intel_pmu_ack_status(status);
-       if (++loops > 100) {
-               static bool warned = false;
-               if (!warned) {
-                       WARN(1, "perfevents: irq loop stuck!\n");
-                       perf_event_print_debug();
-                       warned = true;
-               }
-               intel_pmu_reset();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-
-       /*
-        * Ignore a range of extra bits in status that do not indicate
-        * overflow by themselves.
-        */
-       status &= ~(GLOBAL_STATUS_COND_CHG |
-                   GLOBAL_STATUS_ASIF |
-                   GLOBAL_STATUS_LBRS_FROZEN);
-       if (!status)
-               goto done;
-
-       /*
-        * PEBS overflow sets bit 62 in the global status register
-        */
-       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
-               handled++;
-               x86_pmu.drain_pebs(regs);
-       }
-
-       /*
-        * Intel PT
-        */
-       if (__test_and_clear_bit(55, (unsigned long *)&status)) {
-               handled++;
-               intel_pt_interrupt();
-       }
-
-       /*
-        * Checkpointed counters can lead to 'spurious' PMIs because the
-        * rollback caused by the PMI will have cleared the overflow status
-        * bit. Therefore always force probe these counters.
-        */
-       status |= cpuc->intel_cp_status;
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (has_branch_stack(event))
-                       data.br_stack = &cpuc->lbr_stack;
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = intel_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       __intel_pmu_enable_all(0, true);
-       /*
-        * Only unmask the NMI after the overflow counters
-        * have been reset. This avoids spurious NMIs on
-        * Haswell CPUs.
-        */
-       if (x86_pmu.late_ack)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       return handled;
-}
-
-static struct event_constraint *
-intel_bts_constraints(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned int hw_event, bts_event;
-
-       if (event->attr.freq)
-               return NULL;
-
-       hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
-       bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
-       if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
-               return &bts_constraint;
-
-       return NULL;
-}
-
-static int intel_alt_er(int idx, u64 config)
-{
-       int alt_idx;
-       if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
-               return idx;
-
-       if (idx == EXTRA_REG_RSP_0)
-               alt_idx = EXTRA_REG_RSP_1;
-
-       if (idx == EXTRA_REG_RSP_1)
-               alt_idx = EXTRA_REG_RSP_0;
-
-       if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
-               return idx;
-
-       return alt_idx;
-}
-
-static void intel_fixup_er(struct perf_event *event, int idx)
-{
-       event->hw.extra_reg.idx = idx;
-
-       if (idx == EXTRA_REG_RSP_0) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
-       } else if (idx == EXTRA_REG_RSP_1) {
-               event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-               event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
-               event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
-       }
-}
-
-/*
- * manage allocation of shared extra msr for certain events
- *
- * sharing can be:
- * per-cpu: to be shared between the various events on a single PMU
- * per-core: per-cpu + shared by HT threads
- */
-static struct event_constraint *
-__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
-                                  struct perf_event *event,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct event_constraint *c = &emptyconstraint;
-       struct er_account *era;
-       unsigned long flags;
-       int idx = reg->idx;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake cpuc we
-        * need to ignore this, otherwise we might fail to allocate proper fake
-        * state for this extra reg constraint. Also see the comment below.
-        */
-       if (reg->alloc && !cpuc->is_fake)
-               return NULL; /* call x86_get_event_constraint() */
-
-again:
-       era = &cpuc->shared_regs->regs[idx];
-       /*
-        * we use spin_lock_irqsave() to avoid lockdep issues when
-        * passing a fake cpuc
-        */
-       raw_spin_lock_irqsave(&era->lock, flags);
-
-       if (!atomic_read(&era->ref) || era->config == reg->config) {
-
-               /*
-                * If its a fake cpuc -- as per validate_{group,event}() we
-                * shouldn't touch event state and we can avoid doing so
-                * since both will only call get_event_constraints() once
-                * on each event, this avoids the need for reg->alloc.
-                *
-                * Not doing the ER fixup will only result in era->reg being
-                * wrong, but since we won't actually try and program hardware
-                * this isn't a problem either.
-                */
-               if (!cpuc->is_fake) {
-                       if (idx != reg->idx)
-                               intel_fixup_er(event, idx);
-
-                       /*
-                        * x86_schedule_events() can call get_event_constraints()
-                        * multiple times on events in the case of incremental
-                        * scheduling(). reg->alloc ensures we only do the ER
-                        * allocation once.
-                        */
-                       reg->alloc = 1;
-               }
-
-               /* lock in msr value */
-               era->config = reg->config;
-               era->reg = reg->reg;
-
-               /* one more user */
-               atomic_inc(&era->ref);
-
-               /*
-                * need to call x86_get_event_constraint()
-                * to check if associated event has constraints
-                */
-               c = NULL;
-       } else {
-               idx = intel_alt_er(idx, reg->config);
-               if (idx != reg->idx) {
-                       raw_spin_unlock_irqrestore(&era->lock, flags);
-                       goto again;
-               }
-       }
-       raw_spin_unlock_irqrestore(&era->lock, flags);
-
-       return c;
-}
-
-static void
-__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
-                                  struct hw_perf_event_extra *reg)
-{
-       struct er_account *era;
-
-       /*
-        * Only put constraint if extra reg was actually allocated. Also takes
-        * care of event which do not use an extra shared reg.
-        *
-        * Also, if this is a fake cpuc we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent cpuc state
-        * either since it'll be thrown out.
-        */
-       if (!reg->alloc || cpuc->is_fake)
-               return;
-
-       era = &cpuc->shared_regs->regs[reg->idx];
-
-       /* one fewer user */
-       atomic_dec(&era->ref);
-
-       /* allocate again next time */
-       reg->alloc = 0;
-}
-
-static struct event_constraint *
-intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
-                             struct perf_event *event)
-{
-       struct event_constraint *c = NULL, *d;
-       struct hw_perf_event_extra *xreg, *breg;
-
-       xreg = &event->hw.extra_reg;
-       if (xreg->idx != EXTRA_REG_NONE) {
-               c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
-               if (c == &emptyconstraint)
-                       return c;
-       }
-       breg = &event->hw.branch_reg;
-       if (breg->idx != EXTRA_REG_NONE) {
-               d = __intel_shared_reg_get_constraints(cpuc, event, breg);
-               if (d == &emptyconstraint) {
-                       __intel_shared_reg_put_constraints(cpuc, xreg);
-                       c = d;
-               }
-       }
-       return c;
-}
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (x86_pmu.event_constraints) {
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &unconstrained;
-}
-
-static struct event_constraint *
-__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_bts_constraints(event);
-       if (c)
-               return c;
-
-       c = intel_shared_regs_constraints(cpuc, event);
-       if (c)
-               return c;
-
-       c = intel_pebs_constraints(event);
-       if (c)
-               return c;
-
-       return x86_get_event_constraints(cpuc, idx, event);
-}
-
-static void
-intel_start_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = true;
-       /*
-        * lock shared state until we are done scheduling
-        * in stop_event_scheduling()
-        * makes scheduling appear as a transaction
-        */
-       raw_spin_lock(&excl_cntrs->lock);
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct event_constraint *c = cpuc->event_constraint[idx];
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       lockdep_assert_held(&excl_cntrs->lock);
-
-       if (c->flags & PERF_X86_EVENT_EXCL)
-               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
-       else
-               xl->state[cntr] = INTEL_EXCL_SHARED;
-}
-
-static void
-intel_stop_scheduling(struct cpu_hw_events *cpuc)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl;
-       int tid = cpuc->excl_thread_id;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return;
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       xl = &excl_cntrs->states[tid];
-
-       xl->sched_started = false;
-       /*
-        * release shared state lock (acquired in intel_start_scheduling())
-        */
-       raw_spin_unlock(&excl_cntrs->lock);
-}
-
-static struct event_constraint *
-intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
-                          int idx, struct event_constraint *c)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xlo;
-       int tid = cpuc->excl_thread_id;
-       int is_excl, i;
-
-       /*
-        * validating a group does not require
-        * enforcing cross-thread  exclusion
-        */
-       if (cpuc->is_fake || !is_ht_workaround_enabled())
-               return c;
-
-       /*
-        * no exclusion needed
-        */
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return c;
-
-       /*
-        * because we modify the constraint, we need
-        * to make a copy. Static constraints come
-        * from static const tables.
-        *
-        * only needed when constraint has not yet
-        * been cloned (marked dynamic)
-        */
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-               struct event_constraint *cx;
-
-               /*
-                * grab pre-allocated constraint entry
-                */
-               cx = &cpuc->constraint_list[idx];
-
-               /*
-                * initialize dynamic constraint
-                * with static constraint
-                */
-               *cx = *c;
-
-               /*
-                * mark constraint as dynamic, so we
-                * can free it later on
-                */
-               cx->flags |= PERF_X86_EVENT_DYNAMIC;
-               c = cx;
-       }
-
-       /*
-        * From here on, the constraint is dynamic.
-        * Either it was just allocated above, or it
-        * was allocated during a earlier invocation
-        * of this function
-        */
-
-       /*
-        * state of sibling HT
-        */
-       xlo = &excl_cntrs->states[tid ^ 1];
-
-       /*
-        * event requires exclusive counter access
-        * across HT threads
-        */
-       is_excl = c->flags & PERF_X86_EVENT_EXCL;
-       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
-               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
-               if (!cpuc->n_excl++)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
-       }
-
-       /*
-        * Modify static constraint with current dynamic
-        * state of thread
-        *
-        * EXCLUSIVE: sibling counter measuring exclusive event
-        * SHARED   : sibling counter measuring non-exclusive event
-        * UNUSED   : sibling counter unused
-        */
-       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
-               /*
-                * exclusive event in sibling counter
-                * our corresponding counter cannot be used
-                * regardless of our event
-                */
-               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
-                       __clear_bit(i, c->idxmsk);
-               /*
-                * if measuring an exclusive event, sibling
-                * measuring non-exclusive, then counter cannot
-                * be used
-                */
-               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
-                       __clear_bit(i, c->idxmsk);
-       }
-
-       /*
-        * recompute actual bit weight for scheduling algorithm
-        */
-       c->weight = hweight64(c->idxmsk64);
-
-       /*
-        * if we return an empty mask, then switch
-        * back to static empty constraint to avoid
-        * the cost of freeing later on
-        */
-       if (c->weight == 0)
-               c = &emptyconstraint;
-
-       return c;
-}
-
-static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                           struct perf_event *event)
-{
-       struct event_constraint *c1 = NULL;
-       struct event_constraint *c2;
-
-       if (idx >= 0) /* fake does < 0 */
-               c1 = cpuc->event_constraint[idx];
-
-       /*
-        * first time only
-        * - static constraint: no change across incremental scheduling calls
-        * - dynamic constraint: handled by intel_get_excl_constraints()
-        */
-       c2 = __intel_get_event_constraints(cpuc, idx, event);
-       if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
-               bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
-               c1->weight = c2->weight;
-               c2 = c1;
-       }
-
-       if (cpuc->excl_cntrs)
-               return intel_get_excl_constraints(cpuc, event, idx, c2);
-
-       return c2;
-}
-
-static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
-               struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       int tid = cpuc->excl_thread_id;
-       struct intel_excl_states *xl;
-
-       /*
-        * nothing needed if in group validation mode
-        */
-       if (cpuc->is_fake)
-               return;
-
-       if (WARN_ON_ONCE(!excl_cntrs))
-               return;
-
-       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
-               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
-               if (!--cpuc->n_excl)
-                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
-       }
-
-       /*
-        * If event was actually assigned, then mark the counter state as
-        * unused now.
-        */
-       if (hwc->idx >= 0) {
-               xl = &excl_cntrs->states[tid];
-
-               /*
-                * put_constraint may be called from x86_schedule_events()
-                * which already has the lock held so here make locking
-                * conditional.
-                */
-               if (!xl->sched_started)
-                       raw_spin_lock(&excl_cntrs->lock);
-
-               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
-
-               if (!xl->sched_started)
-                       raw_spin_unlock(&excl_cntrs->lock);
-       }
-}
-
-static void
-intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-
-       reg = &event->hw.extra_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-
-       reg = &event->hw.branch_reg;
-       if (reg->idx != EXTRA_REG_NONE)
-               __intel_shared_reg_put_constraints(cpuc, reg);
-}
-
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
-                                       struct perf_event *event)
-{
-       intel_put_shared_regs_event_constraints(cpuc, event);
-
-       /*
-        * is PMU has exclusive counter restrictions, then
-        * all events are subject to and must call the
-        * put_excl_constraints() routine
-        */
-       if (cpuc->excl_cntrs)
-               intel_put_excl_constraints(cpuc, event);
-}
-
-static void intel_pebs_aliases_core2(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.ANY_P
-                * (0x00c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * INST_RETIRED.ANY_P counts the number of cycles that retires
-                * CNTMASK instructions. By setting CNTMASK to a value (16)
-                * larger than the maximum number of instructions that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or less instructions, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_snb(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use UOPS_RETIRED.ALL
-                * (0x01c2), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * UOPS_RETIRED.ALL counts the number of cycles that retires
-                * CNTMASK micro-ops. By setting CNTMASK to a value (16)
-                * larger than the maximum number of micro-ops that can be
-                * retired per cycle (4) and then inverting the condition, we
-                * count all cycles that retire 16 or less micro-ops, which
-                * is every cycle.
-                *
-                * Thereby we gain a PEBS capable cycle counter.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_precdist(struct perf_event *event)
-{
-       if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
-               /*
-                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
-                * (0x003c) so that we can use it with PEBS.
-                *
-                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
-                * PEBS capable. However we can use INST_RETIRED.PREC_DIST
-                * (0x01c0), which is a PEBS capable event, to get the same
-                * count.
-                *
-                * The PREC_DIST event has special support to minimize sample
-                * shadowing effects. One drawback is that it can be
-                * only programmed on counter 1, but that seems like an
-                * acceptable trade off.
-                */
-               u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
-
-               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
-               event->hw.config = alt_config;
-       }
-}
-
-static void intel_pebs_aliases_ivb(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_snb(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static void intel_pebs_aliases_skl(struct perf_event *event)
-{
-       if (event->attr.precise_ip < 3)
-               return intel_pebs_aliases_core2(event);
-       return intel_pebs_aliases_precdist(event);
-}
-
-static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
-{
-       unsigned long flags = x86_pmu.free_running_flags;
-
-       if (event->attr.use_clockid)
-               flags &= ~PERF_SAMPLE_TIME;
-       return flags;
-}
-
-static int intel_pmu_hw_config(struct perf_event *event)
-{
-       int ret = x86_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-
-       if (event->attr.precise_ip) {
-               if (!event->attr.freq) {
-                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-                       if (!(event->attr.sample_type &
-                             ~intel_pmu_free_running_flags(event)))
-                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
-               }
-               if (x86_pmu.pebs_aliases)
-                       x86_pmu.pebs_aliases(event);
-       }
-
-       if (needs_branch_stack(event)) {
-               ret = intel_pmu_setup_lbr_filter(event);
-               if (ret)
-                       return ret;
-
-               /*
-                * BTS is set up earlier in this path, so don't account twice
-                */
-               if (!intel_pmu_has_bts(event)) {
-                       /* disallow lbr if conflicting events are present */
-                       if (x86_add_exclusive(x86_lbr_exclusive_lbr))
-                               return -EBUSY;
-
-                       event->destroy = hw_perf_lbr_event_destroy;
-               }
-       }
-
-       if (event->attr.type != PERF_TYPE_RAW)
-               return 0;
-
-       if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
-               return 0;
-
-       if (x86_pmu.version < 3)
-               return -EINVAL;
-
-       if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
-
-       return 0;
-}
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-       if (x86_pmu.guest_get_msrs)
-               return x86_pmu.guest_get_msrs(nr);
-       *nr = 0;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-
-       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
-       arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
-       arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-       /*
-        * If PMU counter has PEBS enabled it is not enough to disable counter
-        * on a guest entry since PEBS memory write can overshoot guest entry
-        * and corrupt guest memory. Disabling PEBS solves the problem.
-        */
-       arr[1].msr = MSR_IA32_PEBS_ENABLE;
-       arr[1].host = cpuc->pebs_enabled;
-       arr[1].guest = 0;
-
-       *nr = 2;
-       return arr;
-}
-
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
-               struct perf_event *event = cpuc->events[idx];
-
-               arr[idx].msr = x86_pmu_config_addr(idx);
-               arr[idx].host = arr[idx].guest = 0;
-
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-
-               arr[idx].host = arr[idx].guest =
-                       event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
-
-               if (event->attr.exclude_host)
-                       arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-               else if (event->attr.exclude_guest)
-                       arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       }
-
-       *nr = x86_pmu.num_counters;
-       return arr;
-}
-
-static void core_pmu_enable_event(struct perf_event *event)
-{
-       if (!event->attr.exclude_host)
-               x86_pmu_enable_event(event);
-}
-
-static void core_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
-               if (!test_bit(idx, cpuc->active_mask) ||
-                               cpuc->events[idx]->attr.exclude_host)
-                       continue;
-
-               __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-       }
-}
-
-static int hsw_hw_config(struct perf_event *event)
-{
-       int ret = intel_pmu_hw_config(event);
-
-       if (ret)
-               return ret;
-       if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
-               return 0;
-       event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
-
-       /*
-        * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
-        * PEBS or in ANY thread mode. Since the results are non-sensical forbid
-        * this combination.
-        */
-       if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
-            ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
-             event->attr.precise_ip > 0))
-               return -EOPNOTSUPP;
-
-       if (event_is_checkpointed(event)) {
-               /*
-                * Sampling of checkpointed events can cause situations where
-                * the CPU constantly aborts because of a overflow, which is
-                * then checkpointed back and ignored. Forbid checkpointing
-                * for sampling.
-                *
-                * But still allow a long sampling period, so that perf stat
-                * from KVM works.
-                */
-               if (event->attr.sample_period > 0 &&
-                   event->attr.sample_period < 0x7fffffff)
-                       return -EOPNOTSUPP;
-       }
-       return 0;
-}
-
-static struct event_constraint counter2_constraint =
-                       EVENT_CONSTRAINT(0, 0x4, 0);
-
-static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
-                         struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       c = intel_get_event_constraints(cpuc, idx, event);
-
-       /* Handle special quirk on in_tx_checkpointed only in counter 2 */
-       if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
-               if (c->idxmsk64 & (1U << 2))
-                       return &counter2_constraint;
-               return &emptyconstraint;
-       }
-
-       return c;
-}
-
-/*
- * Broadwell:
- *
- * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
- * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
- * the two to enforce a minimum period of 128 (the smallest value that has bits
- * 0-5 cleared and >= 100).
- *
- * Because of how the code in x86_perf_event_set_period() works, the truncation
- * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
- * to make up for the 'lost' events due to carrying the 'error' in period_left.
- *
- * Therefore the effective (average) period matches the requested period,
- * despite coarser hardware granularity.
- */
-static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
-{
-       if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
-                       X86_CONFIG(.event=0xc0, .umask=0x01)) {
-               if (left < 128)
-                       left = 128;
-               left &= ~0x3fu;
-       }
-       return left;
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(any,   "config:21"     ); /* v3 + */
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-PMU_FORMAT_ATTR(in_tx,  "config:32");
-PMU_FORMAT_ATTR(in_tx_cp, "config:33");
-
-static struct attribute *intel_arch_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-ssize_t intel_event_sysfs_show(char *page, u64 config)
-{
-       u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
-
-       return x86_event_sysfs_show(page, config, event);
-}
-
-struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-       struct intel_shared_regs *regs;
-       int i;
-
-       regs = kzalloc_node(sizeof(struct intel_shared_regs),
-                           GFP_KERNEL, cpu_to_node(cpu));
-       if (regs) {
-               /*
-                * initialize the locks to keep lockdep happy
-                */
-               for (i = 0; i < EXTRA_REG_MAX; i++)
-                       raw_spin_lock_init(&regs->regs[i].lock);
-
-               regs->core_id = -1;
-       }
-       return regs;
-}
-
-static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
-{
-       struct intel_excl_cntrs *c;
-
-       c = kzalloc_node(sizeof(struct intel_excl_cntrs),
-                        GFP_KERNEL, cpu_to_node(cpu));
-       if (c) {
-               raw_spin_lock_init(&c->lock);
-               c->core_id = -1;
-       }
-       return c;
-}
-
-static int intel_pmu_cpu_prepare(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
-       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
-               cpuc->shared_regs = allocate_shared_regs(cpu);
-               if (!cpuc->shared_regs)
-                       goto err;
-       }
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
-
-               cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
-               if (!cpuc->constraint_list)
-                       goto err_shared_regs;
-
-               cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
-               if (!cpuc->excl_cntrs)
-                       goto err_constraint_list;
-
-               cpuc->excl_thread_id = 0;
-       }
-
-       return NOTIFY_OK;
-
-err_constraint_list:
-       kfree(cpuc->constraint_list);
-       cpuc->constraint_list = NULL;
-
-err_shared_regs:
-       kfree(cpuc->shared_regs);
-       cpuc->shared_regs = NULL;
-
-err:
-       return NOTIFY_BAD;
-}
-
-static void intel_pmu_cpu_starting(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       int core_id = topology_core_id(cpu);
-       int i;
-
-       init_debug_store_on_cpu(cpu);
-       /*
-        * Deal with CPUs that don't clear their LBRs on power-up.
-        */
-       intel_pmu_lbr_reset();
-
-       cpuc->lbr_sel = NULL;
-
-       if (!cpuc->shared_regs)
-               return;
-
-       if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
-               void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_shared_regs *pc;
-
-                       pc = per_cpu(cpu_hw_events, i).shared_regs;
-                       if (pc && pc->core_id == core_id) {
-                               *onln = cpuc->shared_regs;
-                               cpuc->shared_regs = pc;
-                               break;
-                       }
-               }
-               cpuc->shared_regs->core_id = core_id;
-               cpuc->shared_regs->refcnt++;
-       }
-
-       if (x86_pmu.lbr_sel_map)
-               cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
-
-       if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
-                       struct intel_excl_cntrs *c;
-
-                       c = per_cpu(cpu_hw_events, i).excl_cntrs;
-                       if (c && c->core_id == core_id) {
-                               cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
-                               cpuc->excl_cntrs = c;
-                               cpuc->excl_thread_id = 1;
-                               break;
-                       }
-               }
-               cpuc->excl_cntrs->core_id = core_id;
-               cpuc->excl_cntrs->refcnt++;
-       }
-}
-
-static void free_excl_cntrs(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_excl_cntrs *c;
-
-       c = cpuc->excl_cntrs;
-       if (c) {
-               if (c->core_id == -1 || --c->refcnt == 0)
-                       kfree(c);
-               cpuc->excl_cntrs = NULL;
-               kfree(cpuc->constraint_list);
-               cpuc->constraint_list = NULL;
-       }
-}
-
-static void intel_pmu_cpu_dying(int cpu)
-{
-       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-       struct intel_shared_regs *pc;
-
-       pc = cpuc->shared_regs;
-       if (pc) {
-               if (pc->core_id == -1 || --pc->refcnt == 0)
-                       kfree(pc);
-               cpuc->shared_regs = NULL;
-       }
-
-       free_excl_cntrs(cpu);
-
-       fini_debug_store_on_cpu(cpu);
-}
-
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
-                                bool sched_in)
-{
-       if (x86_pmu.pebs_active)
-               intel_pmu_pebs_sched_task(ctx, sched_in);
-       if (x86_pmu.lbr_nr)
-               intel_pmu_lbr_sched_task(ctx, sched_in);
-}
-
-PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
-
-PMU_FORMAT_ATTR(ldlat, "config1:0-15");
-
-PMU_FORMAT_ATTR(frontend, "config1:0-23");
-
-static struct attribute *intel_arch3_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_any.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       &format_attr_in_tx.attr,
-       &format_attr_in_tx_cp.attr,
-
-       &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
-       &format_attr_ldlat.attr, /* PEBS load latency */
-       NULL,
-};
-
-static struct attribute *skl_format_attr[] = {
-       &format_attr_frontend.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu core_pmu = {
-       .name                   = "core",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = x86_pmu_disable_all,
-       .enable_all             = core_pmu_enable_all,
-       .enable                 = core_pmu_enable_event,
-       .disable                = x86_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-
-       /*
-        * Intel PMCs cannot be accessed sanely above 32-bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL<<31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .event_constraints      = intel_core_event_constraints,
-       .guest_get_msrs         = core_guest_get_msrs,
-       .format_attrs           = intel_arch_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       /*
-        * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
-        * together with PMU version 1 and thus be using core_pmu with
-        * shared_regs. We need following callbacks here to allocate
-        * it properly.
-        */
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-};
-
-static __initconst const struct x86_pmu intel_pmu = {
-       .name                   = "Intel",
-       .handle_irq             = intel_pmu_handle_irq,
-       .disable_all            = intel_pmu_disable_all,
-       .enable_all             = intel_pmu_enable_all,
-       .enable                 = intel_pmu_enable_event,
-       .disable                = intel_pmu_disable_event,
-       .hw_config              = intel_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
-       .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
-       .event_map              = intel_pmu_event_map,
-       .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
-       .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
-       /*
-        * Intel PMCs cannot be accessed sanely above 32 bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic event period:
-        */
-       .max_period             = (1ULL << 31) - 1,
-       .get_event_constraints  = intel_get_event_constraints,
-       .put_event_constraints  = intel_put_event_constraints,
-       .pebs_aliases           = intel_pebs_aliases_core2,
-
-       .format_attrs           = intel_arch3_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-       .cpu_prepare            = intel_pmu_cpu_prepare,
-       .cpu_starting           = intel_pmu_cpu_starting,
-       .cpu_dying              = intel_pmu_cpu_dying,
-       .guest_get_msrs         = intel_guest_get_msrs,
-       .sched_task             = intel_pmu_sched_task,
-};
-
-static __init void intel_clovertown_quirk(void)
-{
-       /*
-        * PEBS is unreliable due to:
-        *
-        *   AJ67  - PEBS may experience CPL leaks
-        *   AJ68  - PEBS PMI may be delayed by one event
-        *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
-        *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
-        *
-        * AJ67 could be worked around by restricting the OS/USR flags.
-        * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
-        *
-        * AJ106 could possibly be worked around by not allowing LBR
-        *       usage from PEBS, including the fixup.
-        * AJ68  could possibly be worked around by always programming
-        *       a pebs_event_reset[0] value and coping with the lost events.
-        *
-        * But taken together it might just make sense to not enable PEBS on
-        * these chips.
-        */
-       pr_warn("PEBS disabled due to CPU errata\n");
-       x86_pmu.pebs = 0;
-       x86_pmu.pebs_constraints = NULL;
-}
-
-static int intel_snb_pebs_broken(int cpu)
-{
-       u32 rev = UINT_MAX; /* default to broken for unknown models */
-
-       switch (cpu_data(cpu).x86_model) {
-       case 42: /* SNB */
-               rev = 0x28;
-               break;
-
-       case 45: /* SNB-EP */
-               switch (cpu_data(cpu).x86_mask) {
-               case 6: rev = 0x618; break;
-               case 7: rev = 0x70c; break;
-               }
-       }
-
-       return (cpu_data(cpu).microcode < rev);
-}
-
-static void intel_snb_check_microcode(void)
-{
-       int pebs_broken = 0;
-       int cpu;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               if ((pebs_broken = intel_snb_pebs_broken(cpu)))
-                       break;
-       }
-       put_online_cpus();
-
-       if (pebs_broken == x86_pmu.pebs_broken)
-               return;
-
-       /*
-        * Serialized by the microcode lock..
-        */
-       if (x86_pmu.pebs_broken) {
-               pr_info("PEBS enabled due to microcode update\n");
-               x86_pmu.pebs_broken = 0;
-       } else {
-               pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
-               x86_pmu.pebs_broken = 1;
-       }
-}
-
-/*
- * Under certain circumstances, access certain MSR may cause #GP.
- * The function tests if the input MSR can be safely accessed.
- */
-static bool check_msr(unsigned long msr, u64 mask)
-{
-       u64 val_old, val_new, val_tmp;
-
-       /*
-        * Read the current value, change it and read it back to see if it
-        * matches, this is needed to detect certain hardware emulators
-        * (qemu/kvm) that don't trap on the MSR access and always return 0s.
-        */
-       if (rdmsrl_safe(msr, &val_old))
-               return false;
-
-       /*
-        * Only change the bits which can be updated by wrmsrl.
-        */
-       val_tmp = val_old ^ mask;
-       if (wrmsrl_safe(msr, val_tmp) ||
-           rdmsrl_safe(msr, &val_new))
-               return false;
-
-       if (val_new != val_tmp)
-               return false;
-
-       /* Here it's sure that the MSR can be safely accessed.
-        * Restore the old value and return.
-        */
-       wrmsrl(msr, val_old);
-
-       return true;
-}
-
-static __init void intel_sandybridge_quirk(void)
-{
-       x86_pmu.check_microcode = intel_snb_check_microcode;
-       intel_snb_check_microcode();
-}
-
-static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
-       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
-       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
-       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
-       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
-       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
-       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
-       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
-};
-
-static __init void intel_arch_events_quirk(void)
-{
-       int bit;
-
-       /* disable event that reported as not presend by cpuid */
-       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
-               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
-               pr_warn("CPUID marked event: \'%s\' unavailable\n",
-                       intel_arch_events_map[bit].name);
-       }
-}
-
-static __init void intel_nehalem_quirk(void)
-{
-       union cpuid10_ebx ebx;
-
-       ebx.full = x86_pmu.events_maskl;
-       if (ebx.split.no_branch_misses_retired) {
-               /*
-                * Erratum AAJ80 detected, we work it around by using
-                * the BR_MISP_EXEC.ANY event. This will over-count
-                * branch-misses, but it's still much better than the
-                * architectural event which is often completely bogus:
-                */
-               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
-               ebx.split.no_branch_misses_retired = 0;
-               x86_pmu.events_maskl = ebx.full;
-               pr_info("CPU erratum AAJ80 worked around\n");
-       }
-}
-
-/*
- * enable software workaround for errata:
- * SNB: BJ122
- * IVB: BV98
- * HSW: HSD29
- *
- * Only needed when HT is enabled. However detecting
- * if HT is enabled is difficult (model specific). So instead,
- * we enable the workaround in the early boot, and verify if
- * it is needed in a later initcall phase once we have valid
- * topology information to check if HT is actually enabled
- */
-static __init void intel_ht_bug(void)
-{
-       x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
-
-       x86_pmu.start_scheduling = intel_start_scheduling;
-       x86_pmu.commit_scheduling = intel_commit_scheduling;
-       x86_pmu.stop_scheduling = intel_stop_scheduling;
-}
-
-EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
-
-/* Haswell special events */
-EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
-EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
-EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
-EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
-EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
-EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
-EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
-EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
-EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
-EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
-
-static struct attribute *hsw_events_attrs[] = {
-       EVENT_PTR(tx_start),
-       EVENT_PTR(tx_commit),
-       EVENT_PTR(tx_abort),
-       EVENT_PTR(tx_capacity),
-       EVENT_PTR(tx_conflict),
-       EVENT_PTR(el_start),
-       EVENT_PTR(el_commit),
-       EVENT_PTR(el_abort),
-       EVENT_PTR(el_capacity),
-       EVENT_PTR(el_conflict),
-       EVENT_PTR(cycles_t),
-       EVENT_PTR(cycles_ct),
-       EVENT_PTR(mem_ld_hsw),
-       EVENT_PTR(mem_st_hsw),
-       NULL
-};
-
-__init int intel_pmu_init(void)
-{
-       union cpuid10_edx edx;
-       union cpuid10_eax eax;
-       union cpuid10_ebx ebx;
-       struct event_constraint *c;
-       unsigned int unused;
-       struct extra_reg *er;
-       int version, i;
-
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-               switch (boot_cpu_data.x86) {
-               case 0x6:
-                       return p6_pmu_init();
-               case 0xb:
-                       return knc_pmu_init();
-               case 0xf:
-                       return p4_pmu_init();
-               }
-               return -ENODEV;
-       }
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Branch Misses Retired hw_event or not.
-        */
-       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
-       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
-               return -ENODEV;
-
-       version = eax.split.version_id;
-       if (version < 2)
-               x86_pmu = core_pmu;
-       else
-               x86_pmu = intel_pmu;
-
-       x86_pmu.version                 = version;
-       x86_pmu.num_counters            = eax.split.num_counters;
-       x86_pmu.cntval_bits             = eax.split.bit_width;
-       x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
-
-       x86_pmu.events_maskl            = ebx.full;
-       x86_pmu.events_mask_len         = eax.split.mask_length;
-
-       x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
-
-       /*
-        * Quirk: v2 perfmon does not report fixed-purpose events, so
-        * assume at least 3 events:
-        */
-       if (version > 1)
-               x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-       if (boot_cpu_has(X86_FEATURE_PDCM)) {
-               u64 capabilities;
-
-               rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-               x86_pmu.intel_cap.capabilities = capabilities;
-       }
-
-       intel_ds_init();
-
-       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
-
-       /*
-        * Install the hw-cache-events table:
-        */
-       switch (boot_cpu_data.x86_model) {
-       case 14: /* 65nm Core "Yonah" */
-               pr_cont("Core events, ");
-               break;
-
-       case 15: /* 65nm Core2 "Merom"          */
-               x86_add_quirk(intel_clovertown_quirk);
-       case 22: /* 65nm Core2 "Merom-L"        */
-       case 23: /* 45nm Core2 "Penryn"         */
-       case 29: /* 45nm Core2 "Dunnington (MP) */
-               memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_core();
-
-               x86_pmu.event_constraints = intel_core2_event_constraints;
-               x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
-               pr_cont("Core2 events, ");
-               break;
-
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-               memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_nehalem_event_constraints;
-               x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.extra_regs = intel_nehalem_extra_regs;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               x86_add_quirk(intel_nehalem_quirk);
-
-               pr_cont("Nehalem events, ");
-               break;
-
-       case 28: /* 45nm Atom "Pineview"   */
-       case 38: /* 45nm Atom "Lincroft"   */
-       case 39: /* 32nm Atom "Penwell"    */
-       case 53: /* 32nm Atom "Cloverview" */
-       case 54: /* 32nm Atom "Cedarview"  */
-               memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_gen_event_constraints;
-               x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
-               pr_cont("Atom events, ");
-               break;
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 76: /* 14nm Atom "Airmont"                   */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-               memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
-                       sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_atom();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_slm_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               pr_cont("Silvermont events, ");
-               break;
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_nhm();
-
-               x86_pmu.event_constraints = intel_westmere_event_constraints;
-               x86_pmu.enable_all = intel_pmu_nhm_enable_all;
-               x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_westmere_extra_regs;
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-
-               x86_pmu.cpu_events = nhm_events_attrs;
-
-               /* UOPS_ISSUED.STALLED_CYCLES */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
-               pr_cont("Westmere events, ");
-               break;
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-               x86_add_quirk(intel_sandybridge_quirk);
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_snb_event_constraints;
-               x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
-               if (boot_cpu_data.x86_model == 45)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-               /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-                       X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("SandyBridge events, ");
-               break;
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-               x86_add_quirk(intel_ht_bug);
-               memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
-                      sizeof(hw_cache_event_ids));
-               /* dTLB-load-misses on IVB is different than SNB */
-               hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
-
-               memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
-                      sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_snb();
-
-               x86_pmu.event_constraints = intel_ivb_event_constraints;
-               x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               if (boot_cpu_data.x86_model == 62)
-                       x86_pmu.extra_regs = intel_snbep_extra_regs;
-               else
-                       x86_pmu.extra_regs = intel_snb_extra_regs;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.cpu_events = snb_events_attrs;
-
-               /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-               intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-                       X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-
-               pr_cont("IvyBridge events, ");
-               break;
-
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-               x86_add_quirk(intel_ht_bug);
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_hsw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.lbr_double_abort = true;
-               pr_cont("Haswell events, ");
-               break;
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-               /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
-               hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
-                                                                        BDW_L3_MISS|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
-                                                                         HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
-                                                                            BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-               hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
-                                                                             BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
-
-               intel_pmu_lbr_init_hsw();
-
-               x86_pmu.event_constraints = intel_bdw_event_constraints;
-               x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_snbep_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.cpu_events = hsw_events_attrs;
-               x86_pmu.limit_period = bdw_limit_period;
-               pr_cont("Broadwell events, ");
-               break;
-
-       case 87: /* Knights Landing Xeon Phi */
-               memcpy(hw_cache_event_ids,
-                      slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs,
-                      knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_knl();
-
-               x86_pmu.event_constraints = intel_slm_event_constraints;
-               x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_knl_extra_regs;
-
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               pr_cont("Knights Landing events, ");
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               x86_pmu.late_ack = true;
-               memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-               memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-               intel_pmu_lbr_init_skl();
-
-               x86_pmu.event_constraints = intel_skl_event_constraints;
-               x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
-               x86_pmu.extra_regs = intel_skl_extra_regs;
-               x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
-               x86_pmu.pebs_prec_dist = true;
-               /* all extra regs are per-cpu when HT is on */
-               x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-               x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
-               x86_pmu.hw_config = hsw_hw_config;
-               x86_pmu.get_event_constraints = hsw_get_event_constraints;
-               x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
-                                                 skl_format_attr);
-               WARN_ON(!x86_pmu.format_attrs);
-               x86_pmu.cpu_events = hsw_events_attrs;
-               pr_cont("Skylake events, ");
-               break;
-
-       default:
-               switch (x86_pmu.version) {
-               case 1:
-                       x86_pmu.event_constraints = intel_v1_event_constraints;
-                       pr_cont("generic architected perfmon v1, ");
-                       break;
-               default:
-                       /*
-                        * default constraints for v2 and up
-                        */
-                       x86_pmu.event_constraints = intel_gen_event_constraints;
-                       pr_cont("generic architected perfmon, ");
-                       break;
-               }
-       }
-
-       if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-               WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-                    x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-               x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-       }
-       x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-       if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-               WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-                    x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-               x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-       }
-
-       x86_pmu.intel_ctrl |=
-               ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
-
-       if (x86_pmu.event_constraints) {
-               /*
-                * event on fixed counter2 (REF_CYCLES) only works on this
-                * counter, so do not extend mask to generic counters
-                */
-               for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if (c->cmask == FIXED_EVENT_FLAGS
-                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-                       }
-                       c->idxmsk64 &=
-                               ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
-                       c->weight = hweight64(c->idxmsk64);
-               }
-       }
-
-       /*
-        * Access LBR MSR may cause #GP under certain circumstances.
-        * E.g. KVM doesn't support LBR MSR
-        * Check all LBT MSR here.
-        * Disable LBR access if any LBR MSRs can not be accessed.
-        */
-       if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
-               x86_pmu.lbr_nr = 0;
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
-                     check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
-                       x86_pmu.lbr_nr = 0;
-       }
-
-       /*
-        * Access extra MSR may cause #GP under certain circumstances.
-        * E.g. KVM doesn't support offcore event
-        * Check all extra_regs here.
-        */
-       if (x86_pmu.extra_regs) {
-               for (er = x86_pmu.extra_regs; er->msr; er++) {
-                       er->extra_msr_access = check_msr(er->msr, 0x11UL);
-                       /* Disable LBR select mapping */
-                       if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-                               x86_pmu.lbr_sel_map = NULL;
-               }
-       }
-
-       /* Support full width counters using alternative MSR range */
-       if (x86_pmu.intel_cap.full_width_write) {
-               x86_pmu.max_period = x86_pmu.cntval_mask;
-               x86_pmu.perfctr = MSR_IA32_PMC0;
-               pr_cont("full-width counters, ");
-       }
-
-       return 0;
-}
-
-/*
- * HT bug: phase 2 init
- * Called once we have valid topology information to check
- * whether or not HT is enabled
- * If HT is off, then we disable the workaround
- */
-static __init int fixup_ht_bug(void)
-{
-       int cpu = smp_processor_id();
-       int w, c;
-       /*
-        * problem not present on this CPU model, nothing to do
-        */
-       if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
-               return 0;
-
-       w = cpumask_weight(topology_sibling_cpumask(cpu));
-       if (w > 1) {
-               pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
-               return 0;
-       }
-
-       if (lockup_detector_suspend() != 0) {
-               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-               return 0;
-       }
-
-       x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
-
-       x86_pmu.start_scheduling = NULL;
-       x86_pmu.commit_scheduling = NULL;
-       x86_pmu.stop_scheduling = NULL;
-
-       lockup_detector_resume();
-
-       get_online_cpus();
-
-       for_each_online_cpu(c) {
-               free_excl_cntrs(c);
-       }
-
-       put_online_cpus();
-       pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
-       return 0;
-}
-subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
deleted file mode 100644 (file)
index 2cad71d..0000000
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * BTS PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/coredump.h>
-
-#include <asm-generic/sizes.h>
-#include <asm/perf_event.h>
-
-#include "perf_event.h"
-
-struct bts_ctx {
-       struct perf_output_handle       handle;
-       struct debug_store              ds_back;
-       int                             started;
-};
-
-static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
-
-#define BTS_RECORD_SIZE                24
-#define BTS_SAFETY_MARGIN      4080
-
-struct bts_phys {
-       struct page     *page;
-       unsigned long   size;
-       unsigned long   offset;
-       unsigned long   displacement;
-};
-
-struct bts_buffer {
-       size_t          real_size;      /* multiple of BTS_RECORD_SIZE */
-       unsigned int    nr_pages;
-       unsigned int    nr_bufs;
-       unsigned int    cur_buf;
-       bool            snapshot;
-       local_t         data_size;
-       local_t         lost;
-       local_t         head;
-       unsigned long   end;
-       void            **data_pages;
-       struct bts_phys buf[0];
-};
-
-struct pmu bts_pmu;
-
-static size_t buf_size(struct page *page)
-{
-       return 1 << (PAGE_SHIFT + page_private(page));
-}
-
-static void *
-bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
-{
-       struct bts_buffer *buf;
-       struct page *page;
-       int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
-       unsigned long offset;
-       size_t size = nr_pages << PAGE_SHIFT;
-       int pg, nbuf, pad;
-
-       /* count all the high order buffers */
-       for (pg = 0, nbuf = 0; pg < nr_pages;) {
-               page = virt_to_page(pages[pg]);
-               if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
-                       return NULL;
-               pg += 1 << page_private(page);
-               nbuf++;
-       }
-
-       /*
-        * to avoid interrupts in overwrite mode, only allow one physical
-        */
-       if (overwrite && nbuf > 1)
-               return NULL;
-
-       buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->nr_pages = nr_pages;
-       buf->nr_bufs = nbuf;
-       buf->snapshot = overwrite;
-       buf->data_pages = pages;
-       buf->real_size = size - size % BTS_RECORD_SIZE;
-
-       for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
-               unsigned int __nr_pages;
-
-               page = virt_to_page(pages[pg]);
-               __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
-               buf->buf[nbuf].page = page;
-               buf->buf[nbuf].offset = offset;
-               buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
-               buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
-               pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
-               buf->buf[nbuf].size -= pad;
-
-               pg += __nr_pages;
-               offset += __nr_pages << PAGE_SHIFT;
-       }
-
-       return buf;
-}
-
-static void bts_buffer_free_aux(void *data)
-{
-       kfree(data);
-}
-
-static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
-{
-       return buf->buf[idx].offset + buf->buf[idx].displacement;
-}
-
-static void
-bts_config_buffer(struct bts_buffer *buf)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_phys *phys = &buf->buf[buf->cur_buf];
-       unsigned long index, thresh = 0, end = phys->size;
-       struct page *page = phys->page;
-
-       index = local_read(&buf->head);
-
-       if (!buf->snapshot) {
-               if (buf->end < phys->offset + buf_size(page))
-                       end = buf->end - phys->offset - phys->displacement;
-
-               index -= phys->offset + phys->displacement;
-
-               if (end - index > BTS_SAFETY_MARGIN)
-                       thresh = end - BTS_SAFETY_MARGIN;
-               else if (end - index > BTS_RECORD_SIZE)
-                       thresh = end - BTS_RECORD_SIZE;
-               else
-                       thresh = end;
-       }
-
-       ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
-       ds->bts_index = ds->bts_buffer_base + index;
-       ds->bts_absolute_maximum = ds->bts_buffer_base + end;
-       ds->bts_interrupt_threshold = !buf->snapshot
-               ? ds->bts_buffer_base + thresh
-               : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
-}
-
-static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
-{
-       unsigned long index = head - phys->offset;
-
-       memset(page_address(phys->page) + index, 0, phys->size - index);
-}
-
-static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= bts->handle.size ||
-           bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
-               return true;
-
-       return false;
-}
-
-static void bts_update(struct bts_ctx *bts)
-{
-       int cpu = raw_smp_processor_id();
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
-
-       if (!buf)
-               return;
-
-       head = index + bts_buffer_offset(buf, buf->cur_buf);
-       old = local_xchg(&buf->head, head);
-
-       if (!buf->snapshot) {
-               if (old == head)
-                       return;
-
-               if (ds->bts_index >= ds->bts_absolute_maximum)
-                       local_inc(&buf->lost);
-
-               /*
-                * old and head are always in the same physical buffer, so we
-                * can subtract them to get the data size.
-                */
-               local_add(head - old, &buf->data_size);
-       } else {
-               local_set(&buf->data_size, head);
-       }
-}
-
-static void __bts_event_start(struct perf_event *event)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-       u64 config = 0;
-
-       if (!buf || bts_buffer_is_full(buf, bts))
-               return;
-
-       event->hw.itrace_started = 1;
-       event->hw.state = 0;
-
-       if (!buf->snapshot)
-               config |= ARCH_PERFMON_EVENTSEL_INT;
-       if (!event->attr.exclude_kernel)
-               config |= ARCH_PERFMON_EVENTSEL_OS;
-       if (!event->attr.exclude_user)
-               config |= ARCH_PERFMON_EVENTSEL_USR;
-
-       bts_config_buffer(buf);
-
-       /*
-        * local barrier to make sure that ds configuration made it
-        * before we enable BTS
-        */
-       wmb();
-
-       intel_pmu_enable_bts(config);
-}
-
-static void bts_event_start(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       __bts_event_start(event);
-
-       /* PMI handler: this counter is running and likely generating PMIs */
-       ACCESS_ONCE(bts->started) = 1;
-}
-
-static void __bts_event_stop(struct perf_event *event)
-{
-       /*
-        * No extra synchronization is mandated by the documentation to have
-        * BTS data stores globally visible.
-        */
-       intel_pmu_disable_bts();
-
-       if (event->hw.state & PERF_HES_STOPPED)
-               return;
-
-       ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
-}
-
-static void bts_event_stop(struct perf_event *event, int flags)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       /* PMI handler: don't restart this counter */
-       ACCESS_ONCE(bts->started) = 0;
-
-       __bts_event_stop(event);
-
-       if (flags & PERF_EF_UPDATE)
-               bts_update(bts);
-}
-
-void intel_bts_enable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event && bts->started)
-               __bts_event_start(bts->handle.event);
-}
-
-void intel_bts_disable_local(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-
-       if (bts->handle.event)
-               __bts_event_stop(bts->handle.event);
-}
-
-static int
-bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
-{
-       unsigned long head, space, next_space, pad, gap, skip, wakeup;
-       unsigned int next_buf;
-       struct bts_phys *phys, *next_phys;
-       int ret;
-
-       if (buf->snapshot)
-               return 0;
-
-       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
-       if (WARN_ON_ONCE(head != local_read(&buf->head)))
-               return -EINVAL;
-
-       phys = &buf->buf[buf->cur_buf];
-       space = phys->offset + phys->displacement + phys->size - head;
-       pad = space;
-       if (space > handle->size) {
-               space = handle->size;
-               space -= space % BTS_RECORD_SIZE;
-       }
-       if (space <= BTS_SAFETY_MARGIN) {
-               /* See if next phys buffer has more space */
-               next_buf = buf->cur_buf + 1;
-               if (next_buf >= buf->nr_bufs)
-                       next_buf = 0;
-               next_phys = &buf->buf[next_buf];
-               gap = buf_size(phys->page) - phys->displacement - phys->size +
-                     next_phys->displacement;
-               skip = pad + gap;
-               if (handle->size >= skip) {
-                       next_space = next_phys->size;
-                       if (next_space + skip > handle->size) {
-                               next_space = handle->size - skip;
-                               next_space -= next_space % BTS_RECORD_SIZE;
-                       }
-                       if (next_space > space || !space) {
-                               if (pad)
-                                       bts_buffer_pad_out(phys, head);
-                               ret = perf_aux_output_skip(handle, skip);
-                               if (ret)
-                                       return ret;
-                               /* Advance to next phys buffer */
-                               phys = next_phys;
-                               space = next_space;
-                               head = phys->offset + phys->displacement;
-                               /*
-                                * After this, cur_buf and head won't match ds
-                                * anymore, so we must not be racing with
-                                * bts_update().
-                                */
-                               buf->cur_buf = next_buf;
-                               local_set(&buf->head, head);
-                       }
-               }
-       }
-
-       /* Don't go far beyond wakeup watermark */
-       wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
-                handle->head;
-       if (space > wakeup) {
-               space = wakeup;
-               space -= space % BTS_RECORD_SIZE;
-       }
-
-       buf->end = head + space;
-
-       /*
-        * If we have no space, the lost notification would have been sent when
-        * we hit absolute_maximum - see bts_update()
-        */
-       if (!space)
-               return -ENOSPC;
-
-       return 0;
-}
-
-int intel_bts_interrupt(void)
-{
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct perf_event *event = bts->handle.event;
-       struct bts_buffer *buf;
-       s64 old_head;
-       int err;
-
-       if (!event || !bts->started)
-               return 0;
-
-       buf = perf_get_aux(&bts->handle);
-       /*
-        * Skip snapshot counters: they don't use the interrupt, but
-        * there's no other way of telling, because the pointer will
-        * keep moving
-        */
-       if (!buf || buf->snapshot)
-               return 0;
-
-       old_head = local_read(&buf->head);
-       bts_update(bts);
-
-       /* no new data */
-       if (old_head == local_read(&buf->head))
-               return 0;
-
-       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                           !!local_xchg(&buf->lost, 0));
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return 1;
-
-       err = bts_buffer_reset(buf, &bts->handle);
-       if (err)
-               perf_aux_output_end(&bts->handle, 0, false);
-
-       return 1;
-}
-
-static void bts_event_del(struct perf_event *event, int mode)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct bts_buffer *buf = perf_get_aux(&bts->handle);
-
-       bts_event_stop(event, PERF_EF_UPDATE);
-
-       if (buf) {
-               if (buf->snapshot)
-                       bts->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                                   !!local_xchg(&buf->lost, 0));
-       }
-
-       cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
-       cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
-       cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
-}
-
-static int bts_event_add(struct perf_event *event, int mode)
-{
-       struct bts_buffer *buf;
-       struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
-               return -EBUSY;
-
-       if (bts->handle.event)
-               return -EBUSY;
-
-       buf = perf_aux_output_begin(&bts->handle, event);
-       if (!buf)
-               return -EINVAL;
-
-       ret = bts_buffer_reset(buf, &bts->handle);
-       if (ret) {
-               perf_aux_output_end(&bts->handle, 0, false);
-               return ret;
-       }
-
-       bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
-       bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
-       bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
-
-       if (mode & PERF_EF_START) {
-               bts_event_start(event, 0);
-               if (hwc->state & PERF_HES_STOPPED) {
-                       bts_event_del(event, 0);
-                       return -EBUSY;
-               }
-       }
-
-       return 0;
-}
-
-static void bts_event_destroy(struct perf_event *event)
-{
-       x86_release_hardware();
-       x86_del_exclusive(x86_lbr_exclusive_bts);
-}
-
-static int bts_event_init(struct perf_event *event)
-{
-       int ret;
-
-       if (event->attr.type != bts_pmu.type)
-               return -ENOENT;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_bts))
-               return -EBUSY;
-
-       /*
-        * BTS leaks kernel addresses even when CPL0 tracing is
-        * disabled, so disallow intel_bts driver for unprivileged
-        * users on paranoid systems since it provides trace data
-        * to the user in a zero-copy fashion.
-        *
-        * Note that the default paranoia setting permits unprivileged
-        * users to profile the kernel.
-        */
-       if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-           !capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       ret = x86_reserve_hardware();
-       if (ret) {
-               x86_del_exclusive(x86_lbr_exclusive_bts);
-               return ret;
-       }
-
-       event->destroy = bts_event_destroy;
-
-       return 0;
-}
-
-static void bts_event_read(struct perf_event *event)
-{
-}
-
-static __init int bts_init(void)
-{
-       if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
-               return -ENODEV;
-
-       bts_pmu.capabilities    = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
-       bts_pmu.task_ctx_nr     = perf_sw_context;
-       bts_pmu.event_init      = bts_event_init;
-       bts_pmu.add             = bts_event_add;
-       bts_pmu.del             = bts_event_del;
-       bts_pmu.start           = bts_event_start;
-       bts_pmu.stop            = bts_event_stop;
-       bts_pmu.read            = bts_event_read;
-       bts_pmu.setup_aux       = bts_buffer_setup_aux;
-       bts_pmu.free_aux        = bts_buffer_free_aux;
-
-       return perf_pmu_register(&bts_pmu, "intel_bts", -1);
-}
-arch_initcall(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
deleted file mode 100644 (file)
index a316ca9..0000000
+++ /dev/null
@@ -1,1391 +0,0 @@
-/*
- * Intel Cache Quality-of-Service Monitoring (CQM) support.
- *
- * Based very, very heavily on work by Peter Zijlstra.
- */
-
-#include <linux/perf_event.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define MSR_IA32_PQR_ASSOC     0x0c8f
-#define MSR_IA32_QM_CTR                0x0c8e
-#define MSR_IA32_QM_EVTSEL     0x0c8d
-
-static u32 cqm_max_rmid = -1;
-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:              The cached Resource Monitoring ID
- * @closid:            The cached Class Of Service ID
- * @rmid_usecnt:       The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-       u32                     rmid;
-       u32                     closid;
-       int                     rmid_usecnt;
-};
-
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
-
-/*
- * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
- * Also protects event->hw.cqm_rmid
- *
- * Hold either for stability, both for modification of ->hw.cqm_rmid.
- */
-static DEFINE_MUTEX(cache_mutex);
-static DEFINE_RAW_SPINLOCK(cache_lock);
-
-/*
- * Groups of events that have the same target(s), one RMID per group.
- */
-static LIST_HEAD(cache_groups);
-
-/*
- * Mask of CPUs for reading CQM values. We only need one per-socket.
- */
-static cpumask_t cqm_cpumask;
-
-#define RMID_VAL_ERROR         (1ULL << 63)
-#define RMID_VAL_UNAVAIL       (1ULL << 62)
-
-#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
-
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
-
-/*
- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
- *
- * This rmid is always free and is guaranteed to have an associated
- * near-zero occupancy value, i.e. no cachelines are tagged with this
- * RMID, once __intel_cqm_rmid_rotate() returns.
- */
-static u32 intel_cqm_rotation_rmid;
-
-#define INVALID_RMID           (-1)
-
-/*
- * Is @rmid valid for programming the hardware?
- *
- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
- * means that we should never come across an rmid with that value.
- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
- * assigned" and is used as part of the rotation code.
- */
-static inline bool __rmid_valid(u32 rmid)
-{
-       if (!rmid || rmid == INVALID_RMID)
-               return false;
-
-       return true;
-}
-
-static u64 __rmid_read(u32 rmid)
-{
-       u64 val;
-
-       /*
-        * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
-        * it just says that to increase confusion.
-        */
-       wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
-       rdmsrl(MSR_IA32_QM_CTR, val);
-
-       /*
-        * Aside from the ERROR and UNAVAIL bits, assume this thing returns
-        * the number of cachelines tagged with @rmid.
-        */
-       return val;
-}
-
-enum rmid_recycle_state {
-       RMID_YOUNG = 0,
-       RMID_AVAILABLE,
-       RMID_DIRTY,
-};
-
-struct cqm_rmid_entry {
-       u32 rmid;
-       enum rmid_recycle_state state;
-       struct list_head list;
-       unsigned long queue_time;
-};
-
-/*
- * cqm_rmid_free_lru - A least recently used list of RMIDs.
- *
- * Oldest entry at the head, newest (most recently used) entry at the
- * tail. This list is never traversed, it's only used to keep track of
- * the lru order. That is, we only pick entries of the head or insert
- * them on the tail.
- *
- * All entries on the list are 'free', and their RMIDs are not currently
- * in use. To mark an RMID as in use, remove its entry from the lru
- * list.
- *
- *
- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
- *
- * This list is contains RMIDs that no one is currently using but that
- * may have a non-zero occupancy value associated with them. The
- * rotation worker moves RMIDs from the limbo list to the free list once
- * the occupancy value drops below __intel_cqm_threshold.
- *
- * Both lists are protected by cache_mutex.
- */
-static LIST_HEAD(cqm_rmid_free_lru);
-static LIST_HEAD(cqm_rmid_limbo_lru);
-
-/*
- * We use a simple array of pointers so that we can lookup a struct
- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
- * and __put_rmid() from having to worry about dealing with struct
- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
- *
- * Once this array is initialized it is read-only. No locks are required
- * to access it.
- *
- * All entries for all RMIDs can be looked up in the this array at all
- * times.
- */
-static struct cqm_rmid_entry **cqm_rmid_ptrs;
-
-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       entry = cqm_rmid_ptrs[rmid];
-       WARN_ON(entry->rmid != rmid);
-
-       return entry;
-}
-
-/*
- * Returns < 0 on fail.
- *
- * We expect to be called with cache_mutex held.
- */
-static u32 __get_rmid(void)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       if (list_empty(&cqm_rmid_free_lru))
-               return INVALID_RMID;
-
-       entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
-       list_del(&entry->list);
-
-       return entry->rmid;
-}
-
-static void __put_rmid(u32 rmid)
-{
-       struct cqm_rmid_entry *entry;
-
-       lockdep_assert_held(&cache_mutex);
-
-       WARN_ON(!__rmid_valid(rmid));
-       entry = __rmid_entry(rmid);
-
-       entry->queue_time = jiffies;
-       entry->state = RMID_YOUNG;
-
-       list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
-}
-
-static int intel_cqm_setup_rmid_cache(void)
-{
-       struct cqm_rmid_entry *entry;
-       unsigned int nr_rmids;
-       int r = 0;
-
-       nr_rmids = cqm_max_rmid + 1;
-       cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-                               nr_rmids, GFP_KERNEL);
-       if (!cqm_rmid_ptrs)
-               return -ENOMEM;
-
-       for (; r <= cqm_max_rmid; r++) {
-               struct cqm_rmid_entry *entry;
-
-               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-               if (!entry)
-                       goto fail;
-
-               INIT_LIST_HEAD(&entry->list);
-               entry->rmid = r;
-               cqm_rmid_ptrs[r] = entry;
-
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-       /*
-        * RMID 0 is special and is always allocated. It's used for all
-        * tasks that are not monitored.
-        */
-       entry = __rmid_entry(0);
-       list_del(&entry->list);
-
-       mutex_lock(&cache_mutex);
-       intel_cqm_rotation_rmid = __get_rmid();
-       mutex_unlock(&cache_mutex);
-
-       return 0;
-fail:
-       while (r--)
-               kfree(cqm_rmid_ptrs[r]);
-
-       kfree(cqm_rmid_ptrs);
-       return -ENOMEM;
-}
-
-/*
- * Determine if @a and @b measure the same set of tasks.
- *
- * If @a and @b measure the same set of tasks then we want to share a
- * single RMID.
- */
-static bool __match_event(struct perf_event *a, struct perf_event *b)
-{
-       /* Per-cpu and task events don't mix */
-       if ((a->attach_state & PERF_ATTACH_TASK) !=
-           (b->attach_state & PERF_ATTACH_TASK))
-               return false;
-
-#ifdef CONFIG_CGROUP_PERF
-       if (a->cgrp != b->cgrp)
-               return false;
-#endif
-
-       /* If not task event, we're machine wide */
-       if (!(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Events that target same task are placed into the same cache group.
-        */
-       if (a->hw.target == b->hw.target)
-               return true;
-
-       /*
-        * Are we an inherited event?
-        */
-       if (b->parent == a)
-               return true;
-
-       return false;
-}
-
-#ifdef CONFIG_CGROUP_PERF
-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
-{
-       if (event->attach_state & PERF_ATTACH_TASK)
-               return perf_cgroup_from_task(event->hw.target, event->ctx);
-
-       return event->cgrp;
-}
-#endif
-
-/*
- * Determine if @a's tasks intersect with @b's tasks
- *
- * There are combinations of events that we explicitly prohibit,
- *
- *                PROHIBITS
- *     system-wide    ->       cgroup and task
- *     cgroup        ->        system-wide
- *                           ->        task in cgroup
- *     task          ->        system-wide
- *                           ->        task in cgroup
- *
- * Call this function before allocating an RMID.
- */
-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
-{
-#ifdef CONFIG_CGROUP_PERF
-       /*
-        * We can have any number of cgroups but only one system-wide
-        * event at a time.
-        */
-       if (a->cgrp && b->cgrp) {
-               struct perf_cgroup *ac = a->cgrp;
-               struct perf_cgroup *bc = b->cgrp;
-
-               /*
-                * This condition should have been caught in
-                * __match_event() and we should be sharing an RMID.
-                */
-               WARN_ON_ONCE(ac == bc);
-
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-
-       if (a->cgrp || b->cgrp) {
-               struct perf_cgroup *ac, *bc;
-
-               /*
-                * cgroup and system-wide events are mutually exclusive
-                */
-               if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
-                   (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
-                       return true;
-
-               /*
-                * Ensure neither event is part of the other's cgroup
-                */
-               ac = event_to_cgroup(a);
-               bc = event_to_cgroup(b);
-               if (ac == bc)
-                       return true;
-
-               /*
-                * Must have cgroup and non-intersecting task events.
-                */
-               if (!ac || !bc)
-                       return false;
-
-               /*
-                * We have cgroup and task events, and the task belongs
-                * to a cgroup. Check for for overlap.
-                */
-               if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
-                   cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
-                       return true;
-
-               return false;
-       }
-#endif
-       /*
-        * If one of them is not a task, same story as above with cgroups.
-        */
-       if (!(a->attach_state & PERF_ATTACH_TASK) ||
-           !(b->attach_state & PERF_ATTACH_TASK))
-               return true;
-
-       /*
-        * Must be non-overlapping.
-        */
-       return false;
-}
-
-struct rmid_read {
-       u32 rmid;
-       atomic64_t value;
-};
-
-static void __intel_cqm_event_count(void *info);
-
-/*
- * Exchange the RMID of a group of events.
- */
-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
-{
-       struct perf_event *event;
-       struct list_head *head = &group->hw.cqm_group_entry;
-       u32 old_rmid = group->hw.cqm_rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       /*
-        * If our RMID is being deallocated, perform a read now.
-        */
-       if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
-               struct rmid_read rr = {
-                       .value = ATOMIC64_INIT(0),
-                       .rmid = old_rmid,
-               };
-
-               on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
-                                &rr, 1);
-               local64_set(&group->count, atomic64_read(&rr.value));
-       }
-
-       raw_spin_lock_irq(&cache_lock);
-
-       group->hw.cqm_rmid = rmid;
-       list_for_each_entry(event, head, hw.cqm_group_entry)
-               event->hw.cqm_rmid = rmid;
-
-       raw_spin_unlock_irq(&cache_lock);
-
-       return old_rmid;
-}
-
-/*
- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
- * cachelines are still tagged with RMIDs in limbo, we progressively
- * increment the threshold until we find an RMID in limbo with <=
- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
- * problem where cachelines tagged with an RMID are not steadily being
- * evicted.
- *
- * On successful rotations we decrease the threshold back towards zero.
- *
- * __intel_cqm_max_threshold provides an upper bound on the threshold,
- * and is measured in bytes because it's exposed to userland.
- */
-static unsigned int __intel_cqm_threshold;
-static unsigned int __intel_cqm_max_threshold;
-
-/*
- * Test whether an RMID has a zero occupancy value on this cpu.
- */
-static void intel_cqm_stable(void *arg)
-{
-       struct cqm_rmid_entry *entry;
-
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               if (entry->state != RMID_AVAILABLE)
-                       break;
-
-               if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
-                       entry->state = RMID_DIRTY;
-       }
-}
-
-/*
- * If we have group events waiting for an RMID that don't conflict with
- * events already running, assign @rmid.
- */
-static bool intel_cqm_sched_in_event(u32 rmid)
-{
-       struct perf_event *leader, *event;
-
-       lockdep_assert_held(&cache_mutex);
-
-       leader = list_first_entry(&cache_groups, struct perf_event,
-                                 hw.cqm_groups_entry);
-       event = leader;
-
-       list_for_each_entry_continue(event, &cache_groups,
-                                    hw.cqm_groups_entry) {
-               if (__rmid_valid(event->hw.cqm_rmid))
-                       continue;
-
-               if (__conflict_event(event, leader))
-                       continue;
-
-               intel_cqm_xchg_rmid(event, rmid);
-               return true;
-       }
-
-       return false;
-}
-
-/*
- * Initially use this constant for both the limbo queue time and the
- * rotation timer interval, pmu::hrtimer_interval_ms.
- *
- * They don't need to be the same, but the two are related since if you
- * rotate faster than you recycle RMIDs, you may run out of available
- * RMIDs.
- */
-#define RMID_DEFAULT_QUEUE_TIME 250    /* ms */
-
-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
-
-/*
- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
- * @nr_available: number of freeable RMIDs on the limbo list
- *
- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
- * cachelines are tagged with those RMIDs. After this we can reuse them
- * and know that the current set of active RMIDs is stable.
- *
- * Return %true or %false depending on whether stabilization needs to be
- * reattempted.
- *
- * If we return %true then @nr_available is updated to indicate the
- * number of RMIDs on the limbo list that have been queued for the
- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
- * are above __intel_cqm_threshold.
- */
-static bool intel_cqm_rmid_stabilize(unsigned int *available)
-{
-       struct cqm_rmid_entry *entry, *tmp;
-
-       lockdep_assert_held(&cache_mutex);
-
-       *available = 0;
-       list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
-               unsigned long min_queue_time;
-               unsigned long now = jiffies;
-
-               /*
-                * We hold RMIDs placed into limbo for a minimum queue
-                * time. Before the minimum queue time has elapsed we do
-                * not recycle RMIDs.
-                *
-                * The reasoning is that until a sufficient time has
-                * passed since we stopped using an RMID, any RMID
-                * placed onto the limbo list will likely still have
-                * data tagged in the cache, which means we'll probably
-                * fail to recycle it anyway.
-                *
-                * We can save ourselves an expensive IPI by skipping
-                * any RMIDs that have not been queued for the minimum
-                * time.
-                */
-               min_queue_time = entry->queue_time +
-                       msecs_to_jiffies(__rmid_queue_time_ms);
-
-               if (time_after(min_queue_time, now))
-                       break;
-
-               entry->state = RMID_AVAILABLE;
-               (*available)++;
-       }
-
-       /*
-        * Fast return if none of the RMIDs on the limbo list have been
-        * sitting on the queue for the minimum queue time.
-        */
-       if (!*available)
-               return false;
-
-       /*
-        * Test whether an RMID is free for each package.
-        */
-       on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
-
-       list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
-               /*
-                * Exhausted all RMIDs that have waited min queue time.
-                */
-               if (entry->state == RMID_YOUNG)
-                       break;
-
-               if (entry->state == RMID_DIRTY)
-                       continue;
-
-               list_del(&entry->list); /* remove from limbo */
-
-               /*
-                * The rotation RMID gets priority if it's
-                * currently invalid. In which case, skip adding
-                * the RMID to the the free lru.
-                */
-               if (!__rmid_valid(intel_cqm_rotation_rmid)) {
-                       intel_cqm_rotation_rmid = entry->rmid;
-                       continue;
-               }
-
-               /*
-                * If we have groups waiting for RMIDs, hand
-                * them one now provided they don't conflict.
-                */
-               if (intel_cqm_sched_in_event(entry->rmid))
-                       continue;
-
-               /*
-                * Otherwise place it onto the free list.
-                */
-               list_add_tail(&entry->list, &cqm_rmid_free_lru);
-       }
-
-
-       return __rmid_valid(intel_cqm_rotation_rmid);
-}
-
-/*
- * Pick a victim group and move it to the tail of the group list.
- * @next: The first group without an RMID
- */
-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
-{
-       struct perf_event *rotor;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       rotor = list_first_entry(&cache_groups, struct perf_event,
-                                hw.cqm_groups_entry);
-
-       /*
-        * The group at the front of the list should always have a valid
-        * RMID. If it doesn't then no groups have RMIDs assigned and we
-        * don't need to rotate the list.
-        */
-       if (next == rotor)
-               return;
-
-       rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-       __put_rmid(rmid);
-
-       list_rotate_left(&cache_groups);
-}
-
-/*
- * Deallocate the RMIDs from any events that conflict with @event, and
- * place them on the back of the group list.
- */
-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
-{
-       struct perf_event *group, *g;
-       u32 rmid;
-
-       lockdep_assert_held(&cache_mutex);
-
-       list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
-               if (group == event)
-                       continue;
-
-               rmid = group->hw.cqm_rmid;
-
-               /*
-                * Skip events that don't have a valid RMID.
-                */
-               if (!__rmid_valid(rmid))
-                       continue;
-
-               /*
-                * No conflict? No problem! Leave the event alone.
-                */
-               if (!__conflict_event(group, event))
-                       continue;
-
-               intel_cqm_xchg_rmid(group, INVALID_RMID);
-               __put_rmid(rmid);
-       }
-}
-
-/*
- * Attempt to rotate the groups and assign new RMIDs.
- *
- * We rotate for two reasons,
- *   1. To handle the scheduling of conflicting events
- *   2. To recycle RMIDs
- *
- * Rotating RMIDs is complicated because the hardware doesn't give us
- * any clues.
- *
- * There's problems with the hardware interface; when you change the
- * task:RMID map cachelines retain their 'old' tags, giving a skewed
- * picture. In order to work around this, we must always keep one free
- * RMID - intel_cqm_rotation_rmid.
- *
- * Rotation works by taking away an RMID from a group (the old RMID),
- * and assigning the free RMID to another group (the new RMID). We must
- * then wait for the old RMID to not be used (no cachelines tagged).
- * This ensure that all cachelines are tagged with 'active' RMIDs. At
- * this point we can start reading values for the new RMID and treat the
- * old RMID as the free RMID for the next rotation.
- *
- * Return %true or %false depending on whether we did any rotating.
- */
-static bool __intel_cqm_rmid_rotate(void)
-{
-       struct perf_event *group, *start = NULL;
-       unsigned int threshold_limit;
-       unsigned int nr_needed = 0;
-       unsigned int nr_available;
-       bool rotated = false;
-
-       mutex_lock(&cache_mutex);
-
-again:
-       /*
-        * Fast path through this function if there are no groups and no
-        * RMIDs that need cleaning.
-        */
-       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
-               if (!__rmid_valid(group->hw.cqm_rmid)) {
-                       if (!start)
-                               start = group;
-                       nr_needed++;
-               }
-       }
-
-       /*
-        * We have some event groups, but they all have RMIDs assigned
-        * and no RMIDs need cleaning.
-        */
-       if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
-               goto out;
-
-       if (!nr_needed)
-               goto stabilize;
-
-       /*
-        * We have more event groups without RMIDs than available RMIDs,
-        * or we have event groups that conflict with the ones currently
-        * scheduled.
-        *
-        * We force deallocate the rmid of the group at the head of
-        * cache_groups. The first event group without an RMID then gets
-        * assigned intel_cqm_rotation_rmid. This ensures we always make
-        * forward progress.
-        *
-        * Rotate the cache_groups list so the previous head is now the
-        * tail.
-        */
-       __intel_cqm_pick_and_rotate(start);
-
-       /*
-        * If the rotation is going to succeed, reduce the threshold so
-        * that we don't needlessly reuse dirty RMIDs.
-        */
-       if (__rmid_valid(intel_cqm_rotation_rmid)) {
-               intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-               intel_cqm_rotation_rmid = __get_rmid();
-
-               intel_cqm_sched_out_conflicting_events(start);
-
-               if (__intel_cqm_threshold)
-                       __intel_cqm_threshold--;
-       }
-
-       rotated = true;
-
-stabilize:
-       /*
-        * We now need to stablize the RMID we freed above (if any) to
-        * ensure that the next time we rotate we have an RMID with zero
-        * occupancy value.
-        *
-        * Alternatively, if we didn't need to perform any rotation,
-        * we'll have a bunch of RMIDs in limbo that need stabilizing.
-        */
-       threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
-
-       while (intel_cqm_rmid_stabilize(&nr_available) &&
-              __intel_cqm_threshold < threshold_limit) {
-               unsigned int steal_limit;
-
-               /*
-                * Don't spin if nobody is actively waiting for an RMID,
-                * the rotation worker will be kicked as soon as an
-                * event needs an RMID anyway.
-                */
-               if (!nr_needed)
-                       break;
-
-               /* Allow max 25% of RMIDs to be in limbo. */
-               steal_limit = (cqm_max_rmid + 1) / 4;
-
-               /*
-                * We failed to stabilize any RMIDs so our rotation
-                * logic is now stuck. In order to make forward progress
-                * we have a few options:
-                *
-                *   1. rotate ("steal") another RMID
-                *   2. increase the threshold
-                *   3. do nothing
-                *
-                * We do both of 1. and 2. until we hit the steal limit.
-                *
-                * The steal limit prevents all RMIDs ending up on the
-                * limbo list. This can happen if every RMID has a
-                * non-zero occupancy above threshold_limit, and the
-                * occupancy values aren't dropping fast enough.
-                *
-                * Note that there is prioritisation at work here - we'd
-                * rather increase the number of RMIDs on the limbo list
-                * than increase the threshold, because increasing the
-                * threshold skews the event data (because we reuse
-                * dirty RMIDs) - threshold bumps are a last resort.
-                */
-               if (nr_available < steal_limit)
-                       goto again;
-
-               __intel_cqm_threshold++;
-       }
-
-out:
-       mutex_unlock(&cache_mutex);
-       return rotated;
-}
-
-static void intel_cqm_rmid_rotate(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
-
-static struct pmu intel_cqm_pmu;
-
-static void intel_cqm_rmid_rotate(struct work_struct *work)
-{
-       unsigned long delay;
-
-       __intel_cqm_rmid_rotate();
-
-       delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
-       schedule_delayed_work(&intel_cqm_rmid_work, delay);
-}
-
-/*
- * Find a group and setup RMID.
- *
- * If we're part of a group, we use the group's RMID.
- */
-static void intel_cqm_setup_event(struct perf_event *event,
-                                 struct perf_event **group)
-{
-       struct perf_event *iter;
-       bool conflict = false;
-       u32 rmid;
-
-       list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
-               rmid = iter->hw.cqm_rmid;
-
-               if (__match_event(iter, event)) {
-                       /* All tasks in a group share an RMID */
-                       event->hw.cqm_rmid = rmid;
-                       *group = iter;
-                       return;
-               }
-
-               /*
-                * We only care about conflicts for events that are
-                * actually scheduled in (and hence have a valid RMID).
-                */
-               if (__conflict_event(iter, event) && __rmid_valid(rmid))
-                       conflict = true;
-       }
-
-       if (conflict)
-               rmid = INVALID_RMID;
-       else
-               rmid = __get_rmid();
-
-       event->hw.cqm_rmid = rmid;
-}
-
-static void intel_cqm_event_read(struct perf_event *event)
-{
-       unsigned long flags;
-       u32 rmid;
-       u64 val;
-
-       /*
-        * Task events are handled by intel_cqm_event_count().
-        */
-       if (event->cpu == -1)
-               return;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       rmid = event->hw.cqm_rmid;
-
-       if (!__rmid_valid(rmid))
-               goto out;
-
-       val = __rmid_read(rmid);
-
-       /*
-        * Ignore this reading on error states and do not update the value.
-        */
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               goto out;
-
-       local64_set(&event->count, val);
-out:
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-}
-
-static void __intel_cqm_event_count(void *info)
-{
-       struct rmid_read *rr = info;
-       u64 val;
-
-       val = __rmid_read(rr->rmid);
-
-       if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-               return;
-
-       atomic64_add(val, &rr->value);
-}
-
-static inline bool cqm_group_leader(struct perf_event *event)
-{
-       return !list_empty(&event->hw.cqm_groups_entry);
-}
-
-static u64 intel_cqm_event_count(struct perf_event *event)
-{
-       unsigned long flags;
-       struct rmid_read rr = {
-               .value = ATOMIC64_INIT(0),
-       };
-
-       /*
-        * We only need to worry about task events. System-wide events
-        * are handled like usual, i.e. entirely with
-        * intel_cqm_event_read().
-        */
-       if (event->cpu != -1)
-               return __perf_event_count(event);
-
-       /*
-        * Only the group leader gets to report values. This stops us
-        * reporting duplicate values to userspace, and gives us a clear
-        * rule for which task gets to report the values.
-        *
-        * Note that it is impossible to attribute these values to
-        * specific packages - we forfeit that ability when we create
-        * task events.
-        */
-       if (!cqm_group_leader(event))
-               return 0;
-
-       /*
-        * Getting up-to-date values requires an SMP IPI which is not
-        * possible if we're being called in interrupt context. Return
-        * the cached values instead.
-        */
-       if (unlikely(in_interrupt()))
-               goto out;
-
-       /*
-        * Notice that we don't perform the reading of an RMID
-        * atomically, because we can't hold a spin lock across the
-        * IPIs.
-        *
-        * Speculatively perform the read, since @event might be
-        * assigned a different (possibly invalid) RMID while we're
-        * busying performing the IPI calls. It's therefore necessary to
-        * check @event's RMID afterwards, and if it has changed,
-        * discard the result of the read.
-        */
-       rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
-
-       if (!__rmid_valid(rr.rmid))
-               goto out;
-
-       on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-       if (event->hw.cqm_rmid == rr.rmid)
-               local64_set(&event->count, atomic64_read(&rr.value));
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-out:
-       return __perf_event_count(event);
-}
-
-static void intel_cqm_event_start(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-       u32 rmid = event->hw.cqm_rmid;
-
-       if (!(event->hw.cqm_state & PERF_HES_STOPPED))
-               return;
-
-       event->hw.cqm_state &= ~PERF_HES_STOPPED;
-
-       if (state->rmid_usecnt++) {
-               if (!WARN_ON_ONCE(state->rmid != rmid))
-                       return;
-       } else {
-               WARN_ON_ONCE(state->rmid);
-       }
-
-       state->rmid = rmid;
-       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
-}
-
-static void intel_cqm_event_stop(struct perf_event *event, int mode)
-{
-       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-
-       if (event->hw.cqm_state & PERF_HES_STOPPED)
-               return;
-
-       event->hw.cqm_state |= PERF_HES_STOPPED;
-
-       intel_cqm_event_read(event);
-
-       if (!--state->rmid_usecnt) {
-               state->rmid = 0;
-               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
-       } else {
-               WARN_ON_ONCE(!state->rmid);
-       }
-}
-
-static int intel_cqm_event_add(struct perf_event *event, int mode)
-{
-       unsigned long flags;
-       u32 rmid;
-
-       raw_spin_lock_irqsave(&cache_lock, flags);
-
-       event->hw.cqm_state = PERF_HES_STOPPED;
-       rmid = event->hw.cqm_rmid;
-
-       if (__rmid_valid(rmid) && (mode & PERF_EF_START))
-               intel_cqm_event_start(event, mode);
-
-       raw_spin_unlock_irqrestore(&cache_lock, flags);
-
-       return 0;
-}
-
-static void intel_cqm_event_destroy(struct perf_event *event)
-{
-       struct perf_event *group_other = NULL;
-
-       mutex_lock(&cache_mutex);
-
-       /*
-        * If there's another event in this group...
-        */
-       if (!list_empty(&event->hw.cqm_group_entry)) {
-               group_other = list_first_entry(&event->hw.cqm_group_entry,
-                                              struct perf_event,
-                                              hw.cqm_group_entry);
-               list_del(&event->hw.cqm_group_entry);
-       }
-
-       /*
-        * And we're the group leader..
-        */
-       if (cqm_group_leader(event)) {
-               /*
-                * If there was a group_other, make that leader, otherwise
-                * destroy the group and return the RMID.
-                */
-               if (group_other) {
-                       list_replace(&event->hw.cqm_groups_entry,
-                                    &group_other->hw.cqm_groups_entry);
-               } else {
-                       u32 rmid = event->hw.cqm_rmid;
-
-                       if (__rmid_valid(rmid))
-                               __put_rmid(rmid);
-                       list_del(&event->hw.cqm_groups_entry);
-               }
-       }
-
-       mutex_unlock(&cache_mutex);
-}
-
-static int intel_cqm_event_init(struct perf_event *event)
-{
-       struct perf_event *group = NULL;
-       bool rotate = false;
-
-       if (event->attr.type != intel_cqm_pmu.type)
-               return -ENOENT;
-
-       if (event->attr.config & ~QOS_EVENT_MASK)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       INIT_LIST_HEAD(&event->hw.cqm_group_entry);
-       INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
-
-       event->destroy = intel_cqm_event_destroy;
-
-       mutex_lock(&cache_mutex);
-
-       /* Will also set rmid */
-       intel_cqm_setup_event(event, &group);
-
-       if (group) {
-               list_add_tail(&event->hw.cqm_group_entry,
-                             &group->hw.cqm_group_entry);
-       } else {
-               list_add_tail(&event->hw.cqm_groups_entry,
-                             &cache_groups);
-
-               /*
-                * All RMIDs are either in use or have recently been
-                * used. Kick the rotation worker to clean/free some.
-                *
-                * We only do this for the group leader, rather than for
-                * every event in a group to save on needless work.
-                */
-               if (!__rmid_valid(event->hw.cqm_rmid))
-                       rotate = true;
-       }
-
-       mutex_unlock(&cache_mutex);
-
-       if (rotate)
-               schedule_delayed_work(&intel_cqm_rmid_work, 0);
-
-       return 0;
-}
-
-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
-
-static struct attribute *intel_cqm_events_attr[] = {
-       EVENT_PTR(intel_cqm_llc),
-       EVENT_PTR(intel_cqm_llc_pkg),
-       EVENT_PTR(intel_cqm_llc_unit),
-       EVENT_PTR(intel_cqm_llc_scale),
-       EVENT_PTR(intel_cqm_llc_snapshot),
-       NULL,
-};
-
-static struct attribute_group intel_cqm_events_group = {
-       .name = "events",
-       .attrs = intel_cqm_events_attr,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7");
-static struct attribute *intel_cqm_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group intel_cqm_format_group = {
-       .name = "format",
-       .attrs = intel_cqm_formats_attr,
-};
-
-static ssize_t
-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
-                          char *page)
-{
-       ssize_t rv;
-
-       mutex_lock(&cache_mutex);
-       rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
-       mutex_unlock(&cache_mutex);
-
-       return rv;
-}
-
-static ssize_t
-max_recycle_threshold_store(struct device *dev,
-                           struct device_attribute *attr,
-                           const char *buf, size_t count)
-{
-       unsigned int bytes, cachelines;
-       int ret;
-
-       ret = kstrtouint(buf, 0, &bytes);
-       if (ret)
-               return ret;
-
-       mutex_lock(&cache_mutex);
-
-       __intel_cqm_max_threshold = bytes;
-       cachelines = bytes / cqm_l3_scale;
-
-       /*
-        * The new maximum takes effect immediately.
-        */
-       if (__intel_cqm_threshold > cachelines)
-               __intel_cqm_threshold = cachelines;
-
-       mutex_unlock(&cache_mutex);
-
-       return count;
-}
-
-static DEVICE_ATTR_RW(max_recycle_threshold);
-
-static struct attribute *intel_cqm_attrs[] = {
-       &dev_attr_max_recycle_threshold.attr,
-       NULL,
-};
-
-static const struct attribute_group intel_cqm_group = {
-       .attrs = intel_cqm_attrs,
-};
-
-static const struct attribute_group *intel_cqm_attr_groups[] = {
-       &intel_cqm_events_group,
-       &intel_cqm_format_group,
-       &intel_cqm_group,
-       NULL,
-};
-
-static struct pmu intel_cqm_pmu = {
-       .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
-       .attr_groups         = intel_cqm_attr_groups,
-       .task_ctx_nr         = perf_sw_context,
-       .event_init          = intel_cqm_event_init,
-       .add                 = intel_cqm_event_add,
-       .del                 = intel_cqm_event_stop,
-       .start               = intel_cqm_event_start,
-       .stop                = intel_cqm_event_stop,
-       .read                = intel_cqm_event_read,
-       .count               = intel_cqm_event_count,
-};
-
-static inline void cqm_pick_event_reader(int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       for_each_cpu(i, &cqm_cpumask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return; /* already got reader for this socket */
-       }
-
-       cpumask_set_cpu(cpu, &cqm_cpumask);
-}
-
-static void intel_cqm_cpu_starting(unsigned int cpu)
-{
-       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-       state->rmid = 0;
-       state->closid = 0;
-       state->rmid_usecnt = 0;
-
-       WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
-       WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
-}
-
-static void intel_cqm_cpu_exit(unsigned int cpu)
-{
-       int phys_id = topology_physical_package_id(cpu);
-       int i;
-
-       /*
-        * Is @cpu a designated cqm reader?
-        */
-       if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
-               return;
-
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-
-               if (phys_id == topology_physical_package_id(i)) {
-                       cpumask_set_cpu(i, &cqm_cpumask);
-                       break;
-               }
-       }
-}
-
-static int intel_cqm_cpu_notifier(struct notifier_block *nb,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu  = (unsigned long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               intel_cqm_cpu_exit(cpu);
-               break;
-       case CPU_STARTING:
-               intel_cqm_cpu_starting(cpu);
-               cqm_pick_event_reader(cpu);
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static const struct x86_cpu_id intel_cqm_match[] = {
-       { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
-       {}
-};
-
-static int __init intel_cqm_init(void)
-{
-       char *str, scale[20];
-       int i, cpu, ret;
-
-       if (!x86_match_cpu(intel_cqm_match))
-               return -ENODEV;
-
-       cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
-
-       /*
-        * It's possible that not all resources support the same number
-        * of RMIDs. Instead of making scheduling much more complicated
-        * (where we have to match a task's RMID to a cpu that supports
-        * that many RMIDs) just find the minimum RMIDs supported across
-        * all cpus.
-        *
-        * Also, check that the scales match on all cpus.
-        */
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-               if (c->x86_cache_max_rmid < cqm_max_rmid)
-                       cqm_max_rmid = c->x86_cache_max_rmid;
-
-               if (c->x86_cache_occ_scale != cqm_l3_scale) {
-                       pr_err("Multiple LLC scale values, disabling\n");
-                       ret = -EINVAL;
-                       goto out;
-               }
-       }
-
-       /*
-        * A reasonable upper limit on the max threshold is the number
-        * of lines tagged per RMID if all RMIDs have the same number of
-        * lines tagged in the LLC.
-        *
-        * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
-        */
-       __intel_cqm_max_threshold =
-               boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
-
-       snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
-       str = kstrdup(scale, GFP_KERNEL);
-       if (!str) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       event_attr_intel_cqm_llc_scale.event_str = str;
-
-       ret = intel_cqm_setup_rmid_cache();
-       if (ret)
-               goto out;
-
-       for_each_online_cpu(i) {
-               intel_cqm_cpu_starting(i);
-               cqm_pick_event_reader(i);
-       }
-
-       __perf_cpu_notifier(intel_cqm_cpu_notifier);
-
-       ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-       if (ret)
-               pr_err("Intel CQM perf registration failed: %d\n", ret);
-       else
-               pr_info("Intel CQM monitoring enabled\n");
-
-out:
-       cpu_notifier_register_done();
-
-       return ret;
-}
-device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
deleted file mode 100644 (file)
index 75a38b5..0000000
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * perf_event_intel_cstate.c: support cstate residency counters
- *
- * Copyright (C) 2015, Intel Corp.
- * Author: Kan Liang (kan.liang@intel.com)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- */
-
-/*
- * This file export cstate related free running (read-only) counters
- * for perf. These counters may be use simultaneously by other tools,
- * such as turbostat. However, it still make sense to implement them
- * in perf. Because we can conveniently collect them together with
- * other events, and allow to use them from tools without special MSR
- * access code.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it is not supported by the hardware.
- *
- * According to counters' scope and category, two PMUs are registered
- * with the perf_event core subsystem.
- *  - 'cstate_core': The counter is available for each physical core.
- *    The counters include CORE_C*_RESIDENCY.
- *  - 'cstate_pkg': The counter is available for each physical package.
- *    The counters include PKG_C*_RESIDENCY.
- *
- * All of these counters are specified in the Intel® 64 and IA-32
- * Architectures Software Developer.s Manual Vol3b.
- *
- * Model specific counters:
- *     MSR_CORE_C1_RES: CORE C1 Residency Counter
- *                      perf code: 0x00
- *                      Available model: SLM,AMT
- *                      Scope: Core (each processor core has a MSR)
- *     MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
- *                            perf code: 0x03
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Core
- *     MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
- *                            perf code: 0x00
- *                            Available model: SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
- *                            perf code: 0x01
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
- *                            perf code: 0x02
- *                            Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
- *                            perf code: 0x03
- *                            Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
- *                            Scope: Package (physical package)
- *     MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
- *                            perf code: 0x04
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
- *                            perf code: 0x05
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *     MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
- *                            perf code: 0x06
- *                            Available model: HSW ULT only
- *                            Scope: Package (physical package)
- *
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format)                \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj,    \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf);
-
-struct perf_cstate_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-
-/* cstate_core PMU */
-
-static struct pmu cstate_core_pmu;
-static bool has_cstate_core;
-
-enum perf_cstate_core_id {
-       /*
-        * cstate_core events
-        */
-       PERF_CSTATE_CORE_C1_RES = 0,
-       PERF_CSTATE_CORE_C3_RES,
-       PERF_CSTATE_CORE_C6_RES,
-       PERF_CSTATE_CORE_C7_RES,
-
-       PERF_CSTATE_CORE_EVENT_MAX,
-};
-
-bool test_core(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES ||
-                   idx == PERF_CSTATE_CORE_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_CORE_C1_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
-
-static struct perf_cstate_msr core_msr[] = {
-       [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES,          &evattr_cstate_core_c1, test_core, },
-       [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY,    &evattr_cstate_core_c3, test_core, },
-       [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY,    &evattr_cstate_core_c6, test_core, },
-       [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY,    &evattr_cstate_core_c7, test_core, },
-};
-
-static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group core_events_attr_group = {
-       .name = "events",
-       .attrs = core_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
-static struct attribute *core_format_attrs[] = {
-       &format_attr_core_event.attr,
-       NULL,
-};
-
-static struct attribute_group core_format_attr_group = {
-       .name = "format",
-       .attrs = core_format_attrs,
-};
-
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
-       .attrs = cstate_cpumask_attrs,
-};
-
-static const struct attribute_group *core_attr_groups[] = {
-       &core_events_attr_group,
-       &core_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_core PMU end */
-
-
-/* cstate_pkg PMU */
-
-static struct pmu cstate_pkg_pmu;
-static bool has_cstate_pkg;
-
-enum perf_cstate_pkg_id {
-       /*
-        * cstate_pkg events
-        */
-       PERF_CSTATE_PKG_C2_RES = 0,
-       PERF_CSTATE_PKG_C3_RES,
-       PERF_CSTATE_PKG_C6_RES,
-       PERF_CSTATE_PKG_C7_RES,
-       PERF_CSTATE_PKG_C8_RES,
-       PERF_CSTATE_PKG_C9_RES,
-       PERF_CSTATE_PKG_C10_RES,
-
-       PERF_CSTATE_PKG_EVENT_MAX,
-};
-
-bool test_pkg(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-               if (idx == PERF_CSTATE_CORE_C3_RES ||
-                   idx == PERF_CSTATE_CORE_C6_RES ||
-                   idx == PERF_CSTATE_CORE_C7_RES)
-                       return true;
-               break;
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES)
-                       return true;
-               break;
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_CSTATE_CORE_C6_RES)
-                       return true;
-               break;
-       case 69: /* 22nm Haswell ULT */
-               if (idx == PERF_CSTATE_PKG_C2_RES ||
-                   idx == PERF_CSTATE_PKG_C3_RES ||
-                   idx == PERF_CSTATE_PKG_C6_RES ||
-                   idx == PERF_CSTATE_PKG_C7_RES ||
-                   idx == PERF_CSTATE_PKG_C8_RES ||
-                   idx == PERF_CSTATE_PKG_C9_RES ||
-                   idx == PERF_CSTATE_PKG_C10_RES)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
-PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
-PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
-PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
-PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
-PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
-PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
-
-static struct perf_cstate_msr pkg_msr[] = {
-       [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY,      &evattr_cstate_pkg_c2,  test_pkg, },
-       [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY,      &evattr_cstate_pkg_c3,  test_pkg, },
-       [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY,      &evattr_cstate_pkg_c6,  test_pkg, },
-       [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY,      &evattr_cstate_pkg_c7,  test_pkg, },
-       [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY,      &evattr_cstate_pkg_c8,  test_pkg, },
-       [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY,      &evattr_cstate_pkg_c9,  test_pkg, },
-       [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY,    &evattr_cstate_pkg_c10, test_pkg, },
-};
-
-static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group pkg_events_attr_group = {
-       .name = "events",
-       .attrs = pkg_events_attrs,
-};
-
-DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
-static struct attribute *pkg_format_attrs[] = {
-       &format_attr_pkg_event.attr,
-       NULL,
-};
-static struct attribute_group pkg_format_attr_group = {
-       .name = "format",
-       .attrs = pkg_format_attrs,
-};
-
-static cpumask_t cstate_pkg_cpu_mask;
-
-static const struct attribute_group *pkg_attr_groups[] = {
-       &pkg_events_attr_group,
-       &pkg_format_attr_group,
-       &cpumask_attr_group,
-       NULL,
-};
-
-/* cstate_pkg PMU end*/
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
-                                      struct device_attribute *attr,
-                                      char *buf)
-{
-       struct pmu *pmu = dev_get_drvdata(dev);
-
-       if (pmu == &cstate_core_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
-       else if (pmu == &cstate_pkg_pmu)
-               return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
-       else
-               return 0;
-}
-
-static int cstate_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-       int ret = 0;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (event->pmu == &cstate_core_pmu) {
-               if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
-                       return -EINVAL;
-               if (!core_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = core_msr[cfg].msr;
-       } else if (event->pmu == &cstate_pkg_pmu) {
-               if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
-                       return -EINVAL;
-               if (!pkg_msr[cfg].attr)
-                       return -EINVAL;
-               event->hw.event_base = pkg_msr[cfg].msr;
-       } else
-               return -ENOENT;
-
-       /* must be done before validate_group */
-       event->hw.config = cfg;
-       event->hw.idx = -1;
-
-       return ret;
-}
-
-static inline u64 cstate_pmu_read_counter(struct perf_event *event)
-{
-       u64 val;
-
-       rdmsrl(event->hw.event_base, val);
-       return val;
-}
-
-static void cstate_pmu_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       new_raw_count = cstate_pmu_read_counter(event);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count)
-               goto again;
-
-       local64_add(new_raw_count - prev_raw_count, &event->count);
-}
-
-static void cstate_pmu_event_start(struct perf_event *event, int mode)
-{
-       local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
-}
-
-static void cstate_pmu_event_stop(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_update(event);
-}
-
-static void cstate_pmu_event_del(struct perf_event *event, int mode)
-{
-       cstate_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int cstate_pmu_event_add(struct perf_event *event, int mode)
-{
-       if (mode & PERF_EF_START)
-               cstate_pmu_event_start(event, mode);
-
-       return 0;
-}
-
-static void cstate_cpu_exit(int cpu)
-{
-       int i, id, target;
-
-       /* cpu exit for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_core_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_core_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
-       }
-
-       /* cpu exit for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               target = -1;
-
-               for_each_online_cpu(i) {
-                       if (i == cpu)
-                               continue;
-                       if (id == topology_physical_package_id(i)) {
-                               target = i;
-                               break;
-                       }
-               }
-               if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
-                       cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
-               WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
-               if (target >= 0)
-                       perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
-       }
-}
-
-static void cstate_cpu_init(int cpu)
-{
-       int i, id;
-
-       /* cpu init for cstate core */
-       if (has_cstate_core) {
-               id = topology_core_id(cpu);
-               for_each_cpu(i, &cstate_core_cpu_mask) {
-                       if (id == topology_core_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-       }
-
-       /* cpu init for cstate pkg */
-       if (has_cstate_pkg) {
-               id = topology_physical_package_id(cpu);
-               for_each_cpu(i, &cstate_pkg_cpu_mask) {
-                       if (id == topology_physical_package_id(i))
-                               break;
-               }
-               if (i >= nr_cpu_ids)
-                       cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-       }
-}
-
-static int cstate_cpu_notifier(struct notifier_block *self,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               break;
-       case CPU_STARTING:
-               cstate_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               break;
-       case CPU_DOWN_PREPARE:
-               cstate_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-/*
- * Probe the cstate events and insert the available one into sysfs attrs
- * Return false if there is no available events.
- */
-static bool cstate_probe_msr(struct perf_cstate_msr *msr,
-                            struct attribute   **events_attrs,
-                            int max_event_nr)
-{
-       int i, j = 0;
-       u64 val;
-
-       /* Probe the cstate events. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining events in the sysfs attrs. */
-       for (i = 0; i < max_event_nr; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       return (j > 0) ? true : false;
-}
-
-static int __init cstate_init(void)
-{
-       /* SLM has different MSR for PKG C6 */
-       switch (boot_cpu_data.x86_model) {
-       case 55:
-       case 76:
-       case 77:
-               pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
-       }
-
-       if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
-               has_cstate_core = true;
-
-       if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
-               has_cstate_pkg = true;
-
-       return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
-}
-
-static void __init cstate_cpumask_init(void)
-{
-       int cpu;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu)
-               cstate_cpu_init(cpu);
-
-       __perf_cpu_notifier(cstate_cpu_notifier);
-
-       cpu_notifier_register_done();
-}
-
-static struct pmu cstate_core_pmu = {
-       .attr_groups    = core_attr_groups,
-       .name           = "cstate_core",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static struct pmu cstate_pkg_pmu = {
-       .attr_groups    = pkg_attr_groups,
-       .name           = "cstate_pkg",
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = cstate_pmu_event_init,
-       .add            = cstate_pmu_event_add, /* must have */
-       .del            = cstate_pmu_event_del, /* must have */
-       .start          = cstate_pmu_event_start,
-       .stop           = cstate_pmu_event_stop,
-       .read           = cstate_pmu_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static void __init cstate_pmus_register(void)
-{
-       int err;
-
-       if (has_cstate_core) {
-               err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_core_pmu.name, err);
-       }
-
-       if (has_cstate_pkg) {
-               err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
-               if (WARN_ON(err))
-                       pr_info("Failed to register PMU %s error %d\n",
-                               cstate_pkg_pmu.name, err);
-       }
-}
-
-static int __init cstate_pmu_init(void)
-{
-       int err;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       err = cstate_init();
-       if (err)
-               return err;
-
-       cstate_cpumask_init();
-
-       cstate_pmus_register();
-
-       return 0;
-}
-
-device_initcall(cstate_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
deleted file mode 100644 (file)
index 10602f0..0000000
+++ /dev/null
@@ -1,1368 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE                24
-
-#define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
-#define PEBS_FIXUP_SIZE                PAGE_SIZE
-
-/*
- * pebs_record_32 for p4 and core not supported
-
-struct pebs_record_32 {
-       u32 flags, ip;
-       u32 ax, bc, cx, dx;
-       u32 si, di, bp, sp;
-};
-
- */
-
-union intel_x86_pebs_dse {
-       u64 val;
-       struct {
-               unsigned int ld_dse:4;
-               unsigned int ld_stlb_miss:1;
-               unsigned int ld_locked:1;
-               unsigned int ld_reserved:26;
-       };
-       struct {
-               unsigned int st_l1d_hit:1;
-               unsigned int st_reserved1:3;
-               unsigned int st_stlb_miss:1;
-               unsigned int st_locked:1;
-               unsigned int st_reserved2:26;
-       };
-};
-
-
-/*
- * Map PEBS Load Latency Data Source encodings to generic
- * memory data source information
- */
-#define P(a, b) PERF_MEM_S(a, b)
-#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
-#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-
-static const u64 pebs_data_source[] = {
-       P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-       OP_LH | P(LVL, L1)  | P(SNOOP, NONE),   /* 0x01: L1 local */
-       OP_LH | P(LVL, LFB) | P(SNOOP, NONE),   /* 0x02: LFB hit */
-       OP_LH | P(LVL, L2)  | P(SNOOP, NONE),   /* 0x03: L2 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, NONE),   /* 0x04: L3 hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, MISS),   /* 0x05: L3 hit, snoop miss */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HIT),    /* 0x06: L3 hit, snoop hit */
-       OP_LH | P(LVL, L3)  | P(SNOOP, HITM),   /* 0x07: L3 hit, snoop hitm */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-       OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-       OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-       OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-       OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-       OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-       OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-       OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
-};
-
-static u64 precise_store_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
-
-       dse.val = status;
-
-       /*
-        * bit 4: TLB access
-        * 1 = stored missed 2nd level TLB
-        *
-        * so it either hit the walker or the OS
-        * otherwise hit 2nd level TLB
-        */
-       if (dse.st_stlb_miss)
-               val |= P(TLB, MISS);
-       else
-               val |= P(TLB, HIT);
-
-       /*
-        * bit 0: hit L1 data cache
-        * if not set, then all we know is that
-        * it missed L1D
-        */
-       if (dse.st_l1d_hit)
-               val |= P(LVL, HIT);
-       else
-               val |= P(LVL, MISS);
-
-       /*
-        * bit 5: Locked prefix
-        */
-       if (dse.st_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-static u64 precise_datala_hsw(struct perf_event *event, u64 status)
-{
-       union perf_mem_data_src dse;
-
-       dse.val = PERF_MEM_NA;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
-               dse.mem_op = PERF_MEM_OP_STORE;
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
-               dse.mem_op = PERF_MEM_OP_LOAD;
-
-       /*
-        * L1 info only valid for following events:
-        *
-        * MEM_UOPS_RETIRED.STLB_MISS_STORES
-        * MEM_UOPS_RETIRED.LOCK_STORES
-        * MEM_UOPS_RETIRED.SPLIT_STORES
-        * MEM_UOPS_RETIRED.ALL_STORES
-        */
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
-               if (status & 1)
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
-               else
-                       dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-       }
-       return dse.val;
-}
-
-static u64 load_latency_data(u64 status)
-{
-       union intel_x86_pebs_dse dse;
-       u64 val;
-       int model = boot_cpu_data.x86_model;
-       int fam = boot_cpu_data.x86;
-
-       dse.val = status;
-
-       /*
-        * use the mapping table for bit 0-3
-        */
-       val = pebs_data_source[dse.ld_dse];
-
-       /*
-        * Nehalem models do not support TLB, Lock infos
-        */
-       if (fam == 0x6 && (model == 26 || model == 30
-           || model == 31 || model == 46)) {
-               val |= P(TLB, NA) | P(LOCK, NA);
-               return val;
-       }
-       /*
-        * bit 4: TLB access
-        * 0 = did not miss 2nd level TLB
-        * 1 = missed 2nd level TLB
-        */
-       if (dse.ld_stlb_miss)
-               val |= P(TLB, MISS) | P(TLB, L2);
-       else
-               val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
-
-       /*
-        * bit 5: locked prefix
-        */
-       if (dse.ld_locked)
-               val |= P(LOCK, LOCKED);
-
-       return val;
-}
-
-struct pebs_record_core {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-};
-
-struct pebs_record_nhm {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-};
-
-/*
- * Same as pebs_record_nhm, with two additional fields.
- */
-struct pebs_record_hsw {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-};
-
-union hsw_tsx_tuning {
-       struct {
-               u32 cycles_last_block     : 32,
-                   hle_abort             : 1,
-                   rtm_abort             : 1,
-                   instruction_abort     : 1,
-                   non_instruction_abort : 1,
-                   retry                 : 1,
-                   data_conflict         : 1,
-                   capacity_writes       : 1,
-                   capacity_reads        : 1;
-       };
-       u64         value;
-};
-
-#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
-
-/* Same as HSW, plus TSC */
-
-struct pebs_record_skl {
-       u64 flags, ip;
-       u64 ax, bx, cx, dx;
-       u64 si, di, bp, sp;
-       u64 r8,  r9,  r10, r11;
-       u64 r12, r13, r14, r15;
-       u64 status, dla, dse, lat;
-       u64 real_ip, tsx_tuning;
-       u64 tsc;
-};
-
-void init_debug_store_on_cpu(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-                    (u32)((u64)(unsigned long)ds),
-                    (u32)((u64)(unsigned long)ds >> 32));
-}
-
-void fini_debug_store_on_cpu(int cpu)
-{
-       if (!per_cpu(cpu_hw_events, cpu).ds)
-               return;
-
-       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static DEFINE_PER_CPU(void *, insn_buffer);
-
-static int alloc_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max;
-       void *buffer, *ibuffer;
-
-       if (!x86_pmu.pebs)
-               return 0;
-
-       buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
-       if (unlikely(!buffer))
-               return -ENOMEM;
-
-       /*
-        * HSW+ already provides us the eventing ip; no need to allocate this
-        * buffer then.
-        */
-       if (x86_pmu.intel_cap.pebs_format < 2) {
-               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
-               if (!ibuffer) {
-                       kfree(buffer);
-                       return -ENOMEM;
-               }
-               per_cpu(insn_buffer, cpu) = ibuffer;
-       }
-
-       max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
-       ds->pebs_buffer_base = (u64)(unsigned long)buffer;
-       ds->pebs_index = ds->pebs_buffer_base;
-       ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-               max * x86_pmu.pebs_record_size;
-
-       return 0;
-}
-
-static void release_pebs_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.pebs)
-               return;
-
-       kfree(per_cpu(insn_buffer, cpu));
-       per_cpu(insn_buffer, cpu) = NULL;
-
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
-       ds->pebs_buffer_base = 0;
-}
-
-static int alloc_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-       int node = cpu_to_node(cpu);
-       int max, thresh;
-       void *buffer;
-
-       if (!x86_pmu.bts)
-               return 0;
-
-       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
-       if (unlikely(!buffer)) {
-               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
-               return -ENOMEM;
-       }
-
-       max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-       thresh = max / 16;
-
-       ds->bts_buffer_base = (u64)(unsigned long)buffer;
-       ds->bts_index = ds->bts_buffer_base;
-       ds->bts_absolute_maximum = ds->bts_buffer_base +
-               max * BTS_RECORD_SIZE;
-       ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-               thresh * BTS_RECORD_SIZE;
-
-       return 0;
-}
-
-static void release_bts_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds || !x86_pmu.bts)
-               return;
-
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
-       ds->bts_buffer_base = 0;
-}
-
-static int alloc_ds_buffer(int cpu)
-{
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
-
-       per_cpu(cpu_hw_events, cpu).ds = ds;
-
-       return 0;
-}
-
-static void release_ds_buffer(int cpu)
-{
-       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-       if (!ds)
-               return;
-
-       per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
-}
-
-void release_ds_buffers(void)
-{
-       int cpu;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu)
-               fini_debug_store_on_cpu(cpu);
-
-       for_each_possible_cpu(cpu) {
-               release_pebs_buffer(cpu);
-               release_bts_buffer(cpu);
-               release_ds_buffer(cpu);
-       }
-       put_online_cpus();
-}
-
-void reserve_ds_buffers(void)
-{
-       int bts_err = 0, pebs_err = 0;
-       int cpu;
-
-       x86_pmu.bts_active = 0;
-       x86_pmu.pebs_active = 0;
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       if (!x86_pmu.bts)
-               bts_err = 1;
-
-       if (!x86_pmu.pebs)
-               pebs_err = 1;
-
-       get_online_cpus();
-
-       for_each_possible_cpu(cpu) {
-               if (alloc_ds_buffer(cpu)) {
-                       bts_err = 1;
-                       pebs_err = 1;
-               }
-
-               if (!bts_err && alloc_bts_buffer(cpu))
-                       bts_err = 1;
-
-               if (!pebs_err && alloc_pebs_buffer(cpu))
-                       pebs_err = 1;
-
-               if (bts_err && pebs_err)
-                       break;
-       }
-
-       if (bts_err) {
-               for_each_possible_cpu(cpu)
-                       release_bts_buffer(cpu);
-       }
-
-       if (pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_pebs_buffer(cpu);
-       }
-
-       if (bts_err && pebs_err) {
-               for_each_possible_cpu(cpu)
-                       release_ds_buffer(cpu);
-       } else {
-               if (x86_pmu.bts && !bts_err)
-                       x86_pmu.bts_active = 1;
-
-               if (x86_pmu.pebs && !pebs_err)
-                       x86_pmu.pebs_active = 1;
-
-               for_each_online_cpu(cpu)
-                       init_debug_store_on_cpu(cpu);
-       }
-
-       put_online_cpus();
-}
-
-/*
- * BTS
- */
-
-struct event_constraint bts_constraint =
-       EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
-
-void intel_pmu_enable_bts(u64 config)
-{
-       unsigned long debugctlmsr;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr |= DEBUGCTLMSR_TR;
-       debugctlmsr |= DEBUGCTLMSR_BTS;
-       if (config & ARCH_PERFMON_EVENTSEL_INT)
-               debugctlmsr |= DEBUGCTLMSR_BTINT;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-
-       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-               debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-void intel_pmu_disable_bts(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long debugctlmsr;
-
-       if (!cpuc->ds)
-               return;
-
-       debugctlmsr = get_debugctlmsr();
-
-       debugctlmsr &=
-               ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
-                 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
-
-       update_debugctlmsr(debugctlmsr);
-}
-
-int intel_pmu_drain_bts_buffer(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct bts_record {
-               u64     from;
-               u64     to;
-               u64     flags;
-       };
-       struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-       struct bts_record *at, *base, *top;
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-       struct perf_sample_data data;
-       unsigned long skip = 0;
-       struct pt_regs regs;
-
-       if (!event)
-               return 0;
-
-       if (!x86_pmu.bts_active)
-               return 0;
-
-       base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-       top  = (struct bts_record *)(unsigned long)ds->bts_index;
-
-       if (top <= base)
-               return 0;
-
-       memset(&regs, 0, sizeof(regs));
-
-       ds->bts_index = ds->bts_buffer_base;
-
-       perf_sample_data_init(&data, 0, event->hw.last_period);
-
-       /*
-        * BTS leaks kernel addresses in branches across the cpl boundary,
-        * such as traps or system calls, so unless the user is asking for
-        * kernel tracing (and right now it's not possible), we'd need to
-        * filter them out. But first we need to count how many of those we
-        * have in the current batch. This is an extra O(n) pass, however,
-        * it's much faster than the other one especially considering that
-        * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
-        * alloc_bts_buffer()).
-        */
-       for (at = base; at < top; at++) {
-               /*
-                * Note that right now *this* BTS code only works if
-                * attr::exclude_kernel is set, but let's keep this extra
-                * check here in case that changes.
-                */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       skip++;
-       }
-
-       /*
-        * Prepare a generic sample, i.e. fill in the invariant fields.
-        * We will overwrite the from and to address before we output
-        * the sample.
-        */
-       perf_prepare_sample(&header, &data, event, &regs);
-
-       if (perf_output_begin(&handle, event, header.size *
-                             (top - base - skip)))
-               return 1;
-
-       for (at = base; at < top; at++) {
-               /* Filter out any records that contain kernel addresses. */
-               if (event->attr.exclude_kernel &&
-                   (kernel_ip(at->from) || kernel_ip(at->to)))
-                       continue;
-
-               data.ip         = at->from;
-               data.addr       = at->to;
-
-               perf_output_sample(&handle, &header, &data, event);
-       }
-
-       perf_output_end(&handle);
-
-       /* There's new data available. */
-       event->hw.interrupts++;
-       event->pending_kill = POLL_IN;
-       return 1;
-}
-
-static inline void intel_pmu_drain_pebs_buffer(void)
-{
-       struct pt_regs regs;
-
-       x86_pmu.drain_pebs(&regs);
-}
-
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       if (!sched_in)
-               intel_pmu_drain_pebs_buffer();
-}
-
-/*
- * PEBS
- */
-struct event_constraint intel_core2_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_atom_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_slm_pebs_event_constraints[] = {
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_nehalem_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INST_RETIRED.ANY */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_westmere_pebs_event_constraints[] = {
-       INTEL_PLD_CONSTRAINT(0x100b, 0xf),      /* MEM_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf),    /* INSTR_RETIRED.* */
-       INTEL_EVENT_CONSTRAINT(0xc2, 0xf),    /* UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf),    /* SSEX_UOPS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
-       /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_snb_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_ivb_pebs_event_constraints[] = {
-        INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-        INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
-       INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-       INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-        EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_hsw_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-       INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
-       /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_skl_pebs_event_constraints[] = {
-       INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),      /* INST_RETIRED.PREC_DIST */
-       /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
-       /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
-       INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
-       INTEL_PLD_CONSTRAINT(0x1cd, 0xf),                     /* MEM_TRANS_RETIRED.* */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
-       INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
-       INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
-       /* Allow all events as PEBS with no flags */
-       INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
-       EVENT_CONSTRAINT_END
-};
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event)
-{
-       struct event_constraint *c;
-
-       if (!event->attr.precise_ip)
-               return NULL;
-
-       if (x86_pmu.pebs_constraints) {
-               for_each_event_constraint(c, x86_pmu.pebs_constraints) {
-                       if ((event->hw.config & c->cmask) == c->code) {
-                               event->hw.flags |= c->flags;
-                               return c;
-                       }
-               }
-       }
-
-       return &emptyconstraint;
-}
-
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
-{
-       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
-}
-
-void intel_pmu_pebs_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool first_pebs;
-       u64 threshold;
-
-       hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
-       first_pebs = !pebs_is_enabled(cpuc);
-       cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled |= 1ULL << 63;
-
-       /*
-        * When the event is constrained enough we can use a larger
-        * threshold and run the event with less frequent PMI.
-        */
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-               threshold = ds->pebs_absolute_maximum -
-                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-               if (first_pebs)
-                       perf_sched_cb_inc(event->ctx->pmu);
-       } else {
-               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-               /*
-                * If not all events can use larger buffer,
-                * roll back to threshold = 1
-                */
-               if (!first_pebs &&
-                   (ds->pebs_interrupt_threshold > threshold))
-                       perf_sched_cb_dec(event->ctx->pmu);
-       }
-
-       /* Use auto-reload if possible to save a MSR write in the PMI */
-       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
-               ds->pebs_event_reset[hwc->idx] =
-                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
-       }
-
-       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-               ds->pebs_interrupt_threshold = threshold;
-}
-
-void intel_pmu_pebs_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
-       struct debug_store *ds = cpuc->ds;
-       bool large_pebs = ds->pebs_interrupt_threshold >
-               ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-       if (large_pebs)
-               intel_pmu_drain_pebs_buffer();
-
-       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
-
-       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
-               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
-               cpuc->pebs_enabled &= ~(1ULL << 63);
-
-       if (large_pebs && !pebs_is_enabled(cpuc))
-               perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
-       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-}
-
-void intel_pmu_pebs_enable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-}
-
-void intel_pmu_pebs_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->pebs_enabled)
-               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       unsigned long from = cpuc->lbr_entries[0].from;
-       unsigned long old_to, to = cpuc->lbr_entries[0].to;
-       unsigned long ip = regs->ip;
-       int is_64bit = 0;
-       void *kaddr;
-       int size;
-
-       /*
-        * We don't need to fixup if the PEBS assist is fault like
-        */
-       if (!x86_pmu.intel_cap.pebs_trap)
-               return 1;
-
-       /*
-        * No LBR entry, no basic block, no rewinding
-        */
-       if (!cpuc->lbr_stack.nr || !from || !to)
-               return 0;
-
-       /*
-        * Basic blocks should never cross user/kernel boundaries
-        */
-       if (kernel_ip(ip) != kernel_ip(to))
-               return 0;
-
-       /*
-        * unsigned math, either ip is before the start (impossible) or
-        * the basic block is larger than 1 page (sanity)
-        */
-       if ((ip - to) > PEBS_FIXUP_SIZE)
-               return 0;
-
-       /*
-        * We sampled a branch insn, rewind using the LBR stack
-        */
-       if (ip == to) {
-               set_linear_ip(regs, from);
-               return 1;
-       }
-
-       size = ip - to;
-       if (!kernel_ip(ip)) {
-               int bytes;
-               u8 *buf = this_cpu_read(insn_buffer);
-
-               /* 'size' must fit our buffer, see above */
-               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-               if (bytes != 0)
-                       return 0;
-
-               kaddr = buf;
-       } else {
-               kaddr = (void *)to;
-       }
-
-       do {
-               struct insn insn;
-
-               old_to = to;
-
-#ifdef CONFIG_X86_64
-               is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
-#endif
-               insn_init(&insn, kaddr, size, is_64bit);
-               insn_get_length(&insn);
-               /*
-                * Make sure there was not a problem decoding the
-                * instruction and getting the length.  This is
-                * doubly important because we have an infinite
-                * loop if insn.length=0.
-                */
-               if (!insn.length)
-                       break;
-
-               to += insn.length;
-               kaddr += insn.length;
-               size -= insn.length;
-       } while (to < ip);
-
-       if (to == ip) {
-               set_linear_ip(regs, old_to);
-               return 1;
-       }
-
-       /*
-        * Even though we decoded the basic block, the instruction stream
-        * never matched the given IP, either the TO or the IP got corrupted.
-        */
-       return 0;
-}
-
-static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
-{
-       if (pebs->tsx_tuning) {
-               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
-               return tsx.cycles_last_block;
-       }
-       return 0;
-}
-
-static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
-{
-       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
-
-       /* For RTM XABORTs also log the abort code from AX */
-       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
-               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
-       return txn;
-}
-
-static void setup_pebs_sample_data(struct perf_event *event,
-                                  struct pt_regs *iregs, void *__pebs,
-                                  struct perf_sample_data *data,
-                                  struct pt_regs *regs)
-{
-#define PERF_X86_EVENT_PEBS_HSW_PREC \
-               (PERF_X86_EVENT_PEBS_ST_HSW | \
-                PERF_X86_EVENT_PEBS_LD_HSW | \
-                PERF_X86_EVENT_PEBS_NA_HSW)
-       /*
-        * We cast to the biggest pebs_record but are careful not to
-        * unconditionally access the 'extra' entries.
-        */
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct pebs_record_skl *pebs = __pebs;
-       u64 sample_type;
-       int fll, fst, dsrc;
-       int fl = event->hw.flags;
-
-       if (pebs == NULL)
-               return;
-
-       sample_type = event->attr.sample_type;
-       dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
-
-       fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
-       fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
-
-       perf_sample_data_init(data, 0, event->hw.last_period);
-
-       data->period = event->hw.last_period;
-
-       /*
-        * Use latency for weight (only avail with PEBS-LL)
-        */
-       if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-               data->weight = pebs->lat;
-
-       /*
-        * data.data_src encodes the data source
-        */
-       if (dsrc) {
-               u64 val = PERF_MEM_NA;
-               if (fll)
-                       val = load_latency_data(pebs->dse);
-               else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
-                       val = precise_datala_hsw(event, pebs->dse);
-               else if (fst)
-                       val = precise_store_data(pebs->dse);
-               data->data_src.val = val;
-       }
-
-       /*
-        * We use the interrupt regs as a base because the PEBS record
-        * does not contain a full regs set, specifically it seems to
-        * lack segment descriptors, which get used by things like
-        * user_mode().
-        *
-        * In the simple case fix up only the IP and BP,SP regs, for
-        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-        */
-       *regs = *iregs;
-       regs->flags = pebs->flags;
-       set_linear_ip(regs, pebs->ip);
-       regs->bp = pebs->bp;
-       regs->sp = pebs->sp;
-
-       if (sample_type & PERF_SAMPLE_REGS_INTR) {
-               regs->ax = pebs->ax;
-               regs->bx = pebs->bx;
-               regs->cx = pebs->cx;
-               regs->dx = pebs->dx;
-               regs->si = pebs->si;
-               regs->di = pebs->di;
-               regs->bp = pebs->bp;
-               regs->sp = pebs->sp;
-
-               regs->flags = pebs->flags;
-#ifndef CONFIG_X86_32
-               regs->r8 = pebs->r8;
-               regs->r9 = pebs->r9;
-               regs->r10 = pebs->r10;
-               regs->r11 = pebs->r11;
-               regs->r12 = pebs->r12;
-               regs->r13 = pebs->r13;
-               regs->r14 = pebs->r14;
-               regs->r15 = pebs->r15;
-#endif
-       }
-
-       if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-               regs->ip = pebs->real_ip;
-               regs->flags |= PERF_EFLAGS_EXACT;
-       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
-               regs->flags |= PERF_EFLAGS_EXACT;
-       else
-               regs->flags &= ~PERF_EFLAGS_EXACT;
-
-       if ((sample_type & PERF_SAMPLE_ADDR) &&
-           x86_pmu.intel_cap.pebs_format >= 1)
-               data->addr = pebs->dla;
-
-       if (x86_pmu.intel_cap.pebs_format >= 2) {
-               /* Only set the TSX weight when no memory weight. */
-               if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-                       data->weight = intel_hsw_weight(pebs);
-
-               if (sample_type & PERF_SAMPLE_TRANSACTION)
-                       data->txn = intel_hsw_transaction(pebs);
-       }
-
-       /*
-        * v3 supplies an accurate time stamp, so we use that
-        * for the time stamp.
-        *
-        * We can only do this for the default trace clock.
-        */
-       if (x86_pmu.intel_cap.pebs_format >= 3 &&
-               event->attr.use_clockid == 0)
-               data->time = native_sched_clock_from_tsc(pebs->tsc);
-
-       if (has_branch_stack(event))
-               data->br_stack = &cpuc->lbr_stack;
-}
-
-static inline void *
-get_next_pebs_record_by_bit(void *base, void *top, int bit)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       void *at;
-       u64 pebs_status;
-
-       /*
-        * fmt0 does not have a status bitfield (does not use
-        * perf_record_nhm format)
-        */
-       if (x86_pmu.intel_cap.pebs_format < 1)
-               return base;
-
-       if (base == NULL)
-               return NULL;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-
-               if (test_bit(bit, (unsigned long *)&p->status)) {
-                       /* PEBS v3 has accurate status bits */
-                       if (x86_pmu.intel_cap.pebs_format >= 3)
-                               return at;
-
-                       if (p->status == (1 << bit))
-                               return at;
-
-                       /* clear non-PEBS bit and re-check */
-                       pebs_status = p->status & cpuc->pebs_enabled;
-                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-                       if (pebs_status == (1 << bit))
-                               return at;
-               }
-       }
-       return NULL;
-}
-
-static void __intel_pmu_pebs_event(struct perf_event *event,
-                                  struct pt_regs *iregs,
-                                  void *base, void *top,
-                                  int bit, int count)
-{
-       struct perf_sample_data data;
-       struct pt_regs regs;
-       void *at = get_next_pebs_record_by_bit(base, top, bit);
-
-       if (!intel_pmu_save_and_restart(event) &&
-           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
-               return;
-
-       while (count > 1) {
-               setup_pebs_sample_data(event, iregs, at, &data, &regs);
-               perf_event_output(event, &data, &regs);
-               at += x86_pmu.pebs_record_size;
-               at = get_next_pebs_record_by_bit(at, top, bit);
-               count--;
-       }
-
-       setup_pebs_sample_data(event, iregs, at, &data, &regs);
-
-       /*
-        * All but the last records are processed.
-        * The last one is left to be able to call the overflow handler.
-        */
-       if (perf_event_overflow(event, &data, &regs)) {
-               x86_pmu_stop(event, 0);
-               return;
-       }
-
-}
-
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event = cpuc->events[0]; /* PMC0 only */
-       struct pebs_record_core *at, *top;
-       int n;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
-
-       /*
-        * Whatever else happens, drain the thing
-        */
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (!test_bit(0, cpuc->active_mask))
-               return;
-
-       WARN_ON_ONCE(!event);
-
-       if (!event->attr.precise_ip)
-               return;
-
-       n = top - at;
-       if (n <= 0)
-               return;
-
-       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
-}
-
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct perf_event *event;
-       void *base, *at, *top;
-       short counts[MAX_PEBS_EVENTS] = {};
-       short error[MAX_PEBS_EVENTS] = {};
-       int bit, i;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       if (unlikely(base >= top))
-               return;
-
-       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
-               struct pebs_record_nhm *p = at;
-               u64 pebs_status;
-
-               /* PEBS v3 has accurate status bits */
-               if (x86_pmu.intel_cap.pebs_format >= 3) {
-                       for_each_set_bit(bit, (unsigned long *)&p->status,
-                                        MAX_PEBS_EVENTS)
-                               counts[bit]++;
-
-                       continue;
-               }
-
-               pebs_status = p->status & cpuc->pebs_enabled;
-               pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
-
-               /*
-                * On some CPUs the PEBS status can be zero when PEBS is
-                * racing with clearing of GLOBAL_STATUS.
-                *
-                * Normally we would drop that record, but in the
-                * case when there is only a single active PEBS event
-                * we can assume it's for that event.
-                */
-               if (!pebs_status && cpuc->pebs_enabled &&
-                       !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
-                       pebs_status = cpuc->pebs_enabled;
-
-               bit = find_first_bit((unsigned long *)&pebs_status,
-                                       x86_pmu.max_pebs_events);
-               if (bit >= x86_pmu.max_pebs_events)
-                       continue;
-
-               /*
-                * The PEBS hardware does not deal well with the situation
-                * when events happen near to each other and multiple bits
-                * are set. But it should happen rarely.
-                *
-                * If these events include one PEBS and multiple non-PEBS
-                * events, it doesn't impact PEBS record. The record will
-                * be handled normally. (slow path)
-                *
-                * If these events include two or more PEBS events, the
-                * records for the events can be collapsed into a single
-                * one, and it's not possible to reconstruct all events
-                * that caused the PEBS record. It's called collision.
-                * If collision happened, the record will be dropped.
-                */
-               if (p->status != (1ULL << bit)) {
-                       for_each_set_bit(i, (unsigned long *)&pebs_status,
-                                        x86_pmu.max_pebs_events)
-                               error[i]++;
-                       continue;
-               }
-
-               counts[bit]++;
-       }
-
-       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
-               if ((counts[bit] == 0) && (error[bit] == 0))
-                       continue;
-
-               event = cpuc->events[bit];
-               WARN_ON_ONCE(!event);
-               WARN_ON_ONCE(!event->attr.precise_ip);
-
-               /* log dropped samples number */
-               if (error[bit])
-                       perf_log_lost_samples(event, error[bit]);
-
-               if (counts[bit]) {
-                       __intel_pmu_pebs_event(event, iregs, base,
-                                              top, bit, counts[bit]);
-               }
-       }
-}
-
-/*
- * BTS, PEBS probe and setup
- */
-
-void __init intel_ds_init(void)
-{
-       /*
-        * No support for 32bit formats
-        */
-       if (!boot_cpu_has(X86_FEATURE_DTES64))
-               return;
-
-       x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
-       x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
-       if (x86_pmu.pebs) {
-               char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
-               int format = x86_pmu.intel_cap.pebs_format;
-
-               switch (format) {
-               case 0:
-                       printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
-                       break;
-
-               case 1:
-                       printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 2:
-                       pr_cont("PEBS fmt2%c, ", pebs_type);
-                       x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       break;
-
-               case 3:
-                       pr_cont("PEBS fmt3%c, ", pebs_type);
-                       x86_pmu.pebs_record_size =
-                                               sizeof(struct pebs_record_skl);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
-                       break;
-
-               default:
-                       printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
-                       x86_pmu.pebs = 0;
-               }
-       }
-}
-
-void perf_restore_debug_store(void)
-{
-       struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-
-       if (!x86_pmu.bts && !x86_pmu.pebs)
-               return;
-
-       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
deleted file mode 100644 (file)
index 653f88d..0000000
+++ /dev/null
@@ -1,1062 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-enum {
-       LBR_FORMAT_32           = 0x00,
-       LBR_FORMAT_LIP          = 0x01,
-       LBR_FORMAT_EIP          = 0x02,
-       LBR_FORMAT_EIP_FLAGS    = 0x03,
-       LBR_FORMAT_EIP_FLAGS2   = 0x04,
-       LBR_FORMAT_INFO         = 0x05,
-       LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
-};
-
-static enum {
-       LBR_EIP_FLAGS           = 1,
-       LBR_TSX                 = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
-       [LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
-       [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
-/*
- * Intel LBR_SELECT bits
- * Intel Vol3a, April 2011, Section 16.7 Table 16-10
- *
- * Hardware branch filter (not available on all CPUs)
- */
-#define LBR_KERNEL_BIT         0 /* do not capture at ring0 */
-#define LBR_USER_BIT           1 /* do not capture at ring > 0 */
-#define LBR_JCC_BIT            2 /* do not capture conditional branches */
-#define LBR_REL_CALL_BIT       3 /* do not capture relative calls */
-#define LBR_IND_CALL_BIT       4 /* do not capture indirect calls */
-#define LBR_RETURN_BIT         5 /* do not capture near returns */
-#define LBR_IND_JMP_BIT                6 /* do not capture indirect jumps */
-#define LBR_REL_JMP_BIT                7 /* do not capture relative jumps */
-#define LBR_FAR_BIT            8 /* do not capture far branches */
-#define LBR_CALL_STACK_BIT     9 /* enable call stack */
-
-/*
- * Following bit only exists in Linux; we mask it out before writing it to
- * the actual MSR. But it helps the constraint perf code to understand
- * that this is a separate configuration.
- */
-#define LBR_NO_INFO_BIT               63 /* don't read LBR_INFO. */
-
-#define LBR_KERNEL     (1 << LBR_KERNEL_BIT)
-#define LBR_USER       (1 << LBR_USER_BIT)
-#define LBR_JCC                (1 << LBR_JCC_BIT)
-#define LBR_REL_CALL   (1 << LBR_REL_CALL_BIT)
-#define LBR_IND_CALL   (1 << LBR_IND_CALL_BIT)
-#define LBR_RETURN     (1 << LBR_RETURN_BIT)
-#define LBR_REL_JMP    (1 << LBR_REL_JMP_BIT)
-#define LBR_IND_JMP    (1 << LBR_IND_JMP_BIT)
-#define LBR_FAR                (1 << LBR_FAR_BIT)
-#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
-#define LBR_NO_INFO    (1ULL << LBR_NO_INFO_BIT)
-
-#define LBR_PLM (LBR_KERNEL | LBR_USER)
-
-#define LBR_SEL_MASK   0x1ff   /* valid bits in LBR_SELECT */
-#define LBR_NOT_SUPP   -1      /* LBR filter not supported */
-#define LBR_IGN                0       /* ignored */
-
-#define LBR_ANY                 \
-       (LBR_JCC        |\
-        LBR_REL_CALL   |\
-        LBR_IND_CALL   |\
-        LBR_RETURN     |\
-        LBR_REL_JMP    |\
-        LBR_IND_JMP    |\
-        LBR_FAR)
-
-#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
-#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
-#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
-
-/*
- * x86control flow change classification
- * x86control flow changes include branches, interrupts, traps, faults
- */
-enum {
-       X86_BR_NONE             = 0,      /* unknown */
-
-       X86_BR_USER             = 1 << 0, /* branch target is user */
-       X86_BR_KERNEL           = 1 << 1, /* branch target is kernel */
-
-       X86_BR_CALL             = 1 << 2, /* call */
-       X86_BR_RET              = 1 << 3, /* return */
-       X86_BR_SYSCALL          = 1 << 4, /* syscall */
-       X86_BR_SYSRET           = 1 << 5, /* syscall return */
-       X86_BR_INT              = 1 << 6, /* sw interrupt */
-       X86_BR_IRET             = 1 << 7, /* return from interrupt */
-       X86_BR_JCC              = 1 << 8, /* conditional */
-       X86_BR_JMP              = 1 << 9, /* jump */
-       X86_BR_IRQ              = 1 << 10,/* hw interrupt or trap or fault */
-       X86_BR_IND_CALL         = 1 << 11,/* indirect calls */
-       X86_BR_ABORT            = 1 << 12,/* transaction abort */
-       X86_BR_IN_TX            = 1 << 13,/* in transaction */
-       X86_BR_NO_TX            = 1 << 14,/* not in transaction */
-       X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
-       X86_BR_CALL_STACK       = 1 << 16,/* call stack */
-       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
-};
-
-#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
-#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
-
-#define X86_BR_ANY       \
-       (X86_BR_CALL    |\
-        X86_BR_RET     |\
-        X86_BR_SYSCALL |\
-        X86_BR_SYSRET  |\
-        X86_BR_INT     |\
-        X86_BR_IRET    |\
-        X86_BR_JCC     |\
-        X86_BR_JMP      |\
-        X86_BR_IRQ      |\
-        X86_BR_ABORT    |\
-        X86_BR_IND_CALL |\
-        X86_BR_IND_JMP  |\
-        X86_BR_ZERO_CALL)
-
-#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
-
-#define X86_BR_ANY_CALL                 \
-       (X86_BR_CALL            |\
-        X86_BR_IND_CALL        |\
-        X86_BR_ZERO_CALL       |\
-        X86_BR_SYSCALL         |\
-        X86_BR_IRQ             |\
-        X86_BR_INT)
-
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
-
-/*
- * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
- * otherwise it becomes near impossible to get a reliable stack.
- */
-
-static void __intel_pmu_lbr_enable(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       u64 debugctl, lbr_select = 0, orig_debugctl;
-
-       /*
-        * No need to unfreeze manually, as v4 can do that as part
-        * of the GLOBAL_STATUS ack.
-        */
-       if (pmi && x86_pmu.version >= 4)
-               return;
-
-       /*
-        * No need to reprogram LBR_SELECT in a PMI, as it
-        * did not change.
-        */
-       if (cpuc->lbr_sel)
-               lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
-       if (!pmi && cpuc->lbr_sel)
-               wrmsrl(MSR_LBR_SELECT, lbr_select);
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       orig_debugctl = debugctl;
-       debugctl |= DEBUGCTLMSR_LBR;
-       /*
-        * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
-        * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
-        * may cause superfluous increase/decrease of LBR_TOS.
-        */
-       if (!(lbr_select & LBR_CALL_STACK))
-               debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
-       if (orig_debugctl != debugctl)
-               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void __intel_pmu_lbr_disable(void)
-{
-       u64 debugctl;
-
-       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-       debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
-       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void intel_pmu_lbr_reset_32(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++)
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-}
-
-static void intel_pmu_lbr_reset_64(void)
-{
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               wrmsrl(x86_pmu.lbr_from + i, 0);
-               wrmsrl(x86_pmu.lbr_to   + i, 0);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + i, 0);
-       }
-}
-
-void intel_pmu_lbr_reset(void)
-{
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_reset_32();
-       else
-               intel_pmu_lbr_reset_64();
-}
-
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-       u64 tos;
-
-       rdmsrl(x86_pmu.lbr_tos, tos);
-       return tos;
-}
-
-enum {
-       LBR_NONE,
-       LBR_VALID,
-};
-
-static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0 ||
-           task_ctx->lbr_stack_state == LBR_NONE) {
-               intel_pmu_lbr_reset();
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = task_ctx->tos;
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       wrmsrl(x86_pmu.lbr_tos, tos);
-       task_ctx->lbr_stack_state = LBR_NONE;
-}
-
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
-{
-       int i;
-       unsigned lbr_idx, mask;
-       u64 tos;
-
-       if (task_ctx->lbr_callstack_users == 0) {
-               task_ctx->lbr_stack_state = LBR_NONE;
-               return;
-       }
-
-       mask = x86_pmu.lbr_nr - 1;
-       tos = intel_pmu_lbr_tos();
-       for (i = 0; i < tos; i++) {
-               lbr_idx = (tos - i) & mask;
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
-               rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
-               if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-       }
-       task_ctx->tos = tos;
-       task_ctx->lbr_stack_state = LBR_VALID;
-}
-
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       /*
-        * If LBR callstack feature is enabled and the stack was saved when
-        * the task was scheduled out, restore the stack. Otherwise flush
-        * the LBR stack.
-        */
-       task_ctx = ctx ? ctx->task_ctx_data : NULL;
-       if (task_ctx) {
-               if (sched_in) {
-                       __intel_pmu_lbr_restore(task_ctx);
-                       cpuc->lbr_context = ctx;
-               } else {
-                       __intel_pmu_lbr_save(task_ctx);
-               }
-               return;
-       }
-
-       /*
-        * When sampling the branck stack in system-wide, it may be
-        * necessary to flush the stack on context switch. This happens
-        * when the branch stack does not tag its entries with the pid
-        * of the current task. Otherwise it becomes impossible to
-        * associate a branch entry with a task. This ambiguity is more
-        * likely to appear when the branch stack supports priv level
-        * filtering and the user sets it to monitor only at the user
-        * level (which could be a useful measurement in system-wide
-        * mode). In that case, the risk is high of having a branch
-        * stack with branch from multiple tasks.
-        */
-       if (sched_in) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = ctx;
-       }
-}
-
-static inline bool branch_user_callstack(unsigned br_sel)
-{
-       return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
-}
-
-void intel_pmu_lbr_enable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       /*
-        * Reset the LBR stack if we changed task context to
-        * avoid data leaks.
-        */
-       if (event->ctx->task && cpuc->lbr_context != event->ctx) {
-               intel_pmu_lbr_reset();
-               cpuc->lbr_context = event->ctx;
-       }
-       cpuc->br_sel = event->hw.branch_reg.reg;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users++;
-       }
-
-       cpuc->lbr_users++;
-       perf_sched_cb_inc(event->ctx->pmu);
-}
-
-void intel_pmu_lbr_disable(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       struct x86_perf_task_context *task_ctx;
-
-       if (!x86_pmu.lbr_nr)
-               return;
-
-       if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
-                                       event->ctx->task_ctx_data) {
-               task_ctx = event->ctx->task_ctx_data;
-               task_ctx->lbr_callstack_users--;
-       }
-
-       cpuc->lbr_users--;
-       WARN_ON_ONCE(cpuc->lbr_users < 0);
-       perf_sched_cb_dec(event->ctx->pmu);
-
-       if (cpuc->enabled && !cpuc->lbr_users) {
-               __intel_pmu_lbr_disable();
-               /* avoid stale pointer */
-               cpuc->lbr_context = NULL;
-       }
-}
-
-void intel_pmu_lbr_enable_all(bool pmi)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_enable(pmi);
-}
-
-void intel_pmu_lbr_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (cpuc->lbr_users)
-               __intel_pmu_lbr_disable();
-}
-
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
-{
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-
-       for (i = 0; i < x86_pmu.lbr_nr; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               union {
-                       struct {
-                               u32 from;
-                               u32 to;
-                       };
-                       u64     lbr;
-               } msr_lastbranch;
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
-
-               cpuc->lbr_entries[i].from       = msr_lastbranch.from;
-               cpuc->lbr_entries[i].to         = msr_lastbranch.to;
-               cpuc->lbr_entries[i].mispred    = 0;
-               cpuc->lbr_entries[i].predicted  = 0;
-               cpuc->lbr_entries[i].reserved   = 0;
-       }
-       cpuc->lbr_stack.nr = i;
-}
-
-/*
- * Due to lack of segmentation in Linux the effective address (offset)
- * is the same as the linear address, allowing us to merge the LIP and EIP
- * LBR formats.
- */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
-{
-       bool need_info = false;
-       unsigned long mask = x86_pmu.lbr_nr - 1;
-       int lbr_format = x86_pmu.intel_cap.lbr_format;
-       u64 tos = intel_pmu_lbr_tos();
-       int i;
-       int out = 0;
-       int num = x86_pmu.lbr_nr;
-
-       if (cpuc->lbr_sel) {
-               need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
-               if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-                       num = tos;
-       }
-
-       for (i = 0; i < num; i++) {
-               unsigned long lbr_idx = (tos - i) & mask;
-               u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
-               int skip = 0;
-               u16 cycles = 0;
-               int lbr_flags = lbr_desc[lbr_format];
-
-               rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
-               rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
-
-               if (lbr_format == LBR_FORMAT_INFO && need_info) {
-                       u64 info;
-
-                       rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
-                       mis = !!(info & LBR_INFO_MISPRED);
-                       pred = !mis;
-                       in_tx = !!(info & LBR_INFO_IN_TX);
-                       abort = !!(info & LBR_INFO_ABORT);
-                       cycles = (info & LBR_INFO_CYCLES);
-               }
-               if (lbr_flags & LBR_EIP_FLAGS) {
-                       mis = !!(from & LBR_FROM_FLAG_MISPRED);
-                       pred = !mis;
-                       skip = 1;
-               }
-               if (lbr_flags & LBR_TSX) {
-                       in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
-                       abort = !!(from & LBR_FROM_FLAG_ABORT);
-                       skip = 3;
-               }
-               from = (u64)((((s64)from) << skip) >> skip);
-
-               /*
-                * Some CPUs report duplicated abort records,
-                * with the second entry not having an abort bit set.
-                * Skip them here. This loop runs backwards,
-                * so we need to undo the previous record.
-                * If the abort just happened outside the window
-                * the extra entry cannot be removed.
-                */
-               if (abort && x86_pmu.lbr_double_abort && out > 0)
-                       out--;
-
-               cpuc->lbr_entries[out].from      = from;
-               cpuc->lbr_entries[out].to        = to;
-               cpuc->lbr_entries[out].mispred   = mis;
-               cpuc->lbr_entries[out].predicted = pred;
-               cpuc->lbr_entries[out].in_tx     = in_tx;
-               cpuc->lbr_entries[out].abort     = abort;
-               cpuc->lbr_entries[out].cycles    = cycles;
-               cpuc->lbr_entries[out].reserved  = 0;
-               out++;
-       }
-       cpuc->lbr_stack.nr = out;
-}
-
-void intel_pmu_lbr_read(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       if (!cpuc->lbr_users)
-               return;
-
-       if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-               intel_pmu_lbr_read_32(cpuc);
-       else
-               intel_pmu_lbr_read_64(cpuc);
-
-       intel_pmu_lbr_filter(cpuc);
-}
-
-/*
- * SW filter is used:
- * - in case there is no HW filter
- * - in case the HW filter has errata or limitations
- */
-static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
-{
-       u64 br_type = event->attr.branch_sample_type;
-       int mask = 0;
-
-       if (br_type & PERF_SAMPLE_BRANCH_USER)
-               mask |= X86_BR_USER;
-
-       if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
-               mask |= X86_BR_KERNEL;
-
-       /* we ignore BRANCH_HV here */
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY)
-               mask |= X86_BR_ANY;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
-               mask |= X86_BR_ANY_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
-               mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
-               mask |= X86_BR_IND_CALL;
-
-       if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
-               mask |= X86_BR_ABORT;
-
-       if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
-               mask |= X86_BR_IN_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
-               mask |= X86_BR_NO_TX;
-
-       if (br_type & PERF_SAMPLE_BRANCH_COND)
-               mask |= X86_BR_JCC;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
-               if (!x86_pmu_has_lbr_callstack())
-                       return -EOPNOTSUPP;
-               if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
-                       return -EINVAL;
-               mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
-                       X86_BR_CALL_STACK;
-       }
-
-       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
-               mask |= X86_BR_IND_JMP;
-
-       if (br_type & PERF_SAMPLE_BRANCH_CALL)
-               mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
-       /*
-        * stash actual user request into reg, it may
-        * be used by fixup code for some CPU
-        */
-       event->hw.branch_reg.reg = mask;
-       return 0;
-}
-
-/*
- * setup the HW LBR filter
- * Used only when available, may not be enough to disambiguate
- * all branches, may need the help of the SW filter
- */
-static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg;
-       u64 br_type = event->attr.branch_sample_type;
-       u64 mask = 0, v;
-       int i;
-
-       for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
-               if (!(br_type & (1ULL << i)))
-                       continue;
-
-               v = x86_pmu.lbr_sel_map[i];
-               if (v == LBR_NOT_SUPP)
-                       return -EOPNOTSUPP;
-
-               if (v != LBR_IGN)
-                       mask |= v;
-       }
-
-       reg = &event->hw.branch_reg;
-       reg->idx = EXTRA_REG_LBR;
-
-       /*
-        * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
-        * in suppress mode. So LBR_SELECT should be set to
-        * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
-        */
-       reg->config = mask ^ x86_pmu.lbr_sel_mask;
-
-       if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
-           (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
-           (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
-               reg->config |= LBR_NO_INFO;
-
-       return 0;
-}
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event)
-{
-       int ret = 0;
-
-       /*
-        * no LBR on this PMU
-        */
-       if (!x86_pmu.lbr_nr)
-               return -EOPNOTSUPP;
-
-       /*
-        * setup SW LBR filter
-        */
-       ret = intel_pmu_setup_sw_lbr_filter(event);
-       if (ret)
-               return ret;
-
-       /*
-        * setup HW LBR filter, if any
-        */
-       if (x86_pmu.lbr_sel_map)
-               ret = intel_pmu_setup_hw_lbr_filter(event);
-
-       return ret;
-}
-
-/*
- * return the type of control flow change at address "from"
- * intruction is not necessarily a branch (in case of interrupt).
- *
- * The branch type returned also includes the priv level of the
- * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
- *
- * If a branch type is unknown OR the instruction cannot be
- * decoded (e.g., text page not present), then X86_BR_NONE is
- * returned.
- */
-static int branch_type(unsigned long from, unsigned long to, int abort)
-{
-       struct insn insn;
-       void *addr;
-       int bytes_read, bytes_left;
-       int ret = X86_BR_NONE;
-       int ext, to_plm, from_plm;
-       u8 buf[MAX_INSN_SIZE];
-       int is64 = 0;
-
-       to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
-       from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
-
-       /*
-        * maybe zero if lbr did not fill up after a reset by the time
-        * we get a PMU interrupt
-        */
-       if (from == 0 || to == 0)
-               return X86_BR_NONE;
-
-       if (abort)
-               return X86_BR_ABORT | to_plm;
-
-       if (from_plm == X86_BR_USER) {
-               /*
-                * can happen if measuring at the user level only
-                * and we interrupt in a kernel thread, e.g., idle.
-                */
-               if (!current->mm)
-                       return X86_BR_NONE;
-
-               /* may fail if text not present */
-               bytes_left = copy_from_user_nmi(buf, (void __user *)from,
-                                               MAX_INSN_SIZE);
-               bytes_read = MAX_INSN_SIZE - bytes_left;
-               if (!bytes_read)
-                       return X86_BR_NONE;
-
-               addr = buf;
-       } else {
-               /*
-                * The LBR logs any address in the IP, even if the IP just
-                * faulted. This means userspace can control the from address.
-                * Ensure we don't blindy read any address by validating it is
-                * a known text address.
-                */
-               if (kernel_text_address(from)) {
-                       addr = (void *)from;
-                       /*
-                        * Assume we can get the maximum possible size
-                        * when grabbing kernel data.  This is not
-                        * _strictly_ true since we could possibly be
-                        * executing up next to a memory hole, but
-                        * it is very unlikely to be a problem.
-                        */
-                       bytes_read = MAX_INSN_SIZE;
-               } else {
-                       return X86_BR_NONE;
-               }
-       }
-
-       /*
-        * decoder needs to know the ABI especially
-        * on 64-bit systems running 32-bit apps
-        */
-#ifdef CONFIG_X86_64
-       is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
-#endif
-       insn_init(&insn, addr, bytes_read, is64);
-       insn_get_opcode(&insn);
-       if (!insn.opcode.got)
-               return X86_BR_ABORT;
-
-       switch (insn.opcode.bytes[0]) {
-       case 0xf:
-               switch (insn.opcode.bytes[1]) {
-               case 0x05: /* syscall */
-               case 0x34: /* sysenter */
-                       ret = X86_BR_SYSCALL;
-                       break;
-               case 0x07: /* sysret */
-               case 0x35: /* sysexit */
-                       ret = X86_BR_SYSRET;
-                       break;
-               case 0x80 ... 0x8f: /* conditional */
-                       ret = X86_BR_JCC;
-                       break;
-               default:
-                       ret = X86_BR_NONE;
-               }
-               break;
-       case 0x70 ... 0x7f: /* conditional */
-               ret = X86_BR_JCC;
-               break;
-       case 0xc2: /* near ret */
-       case 0xc3: /* near ret */
-       case 0xca: /* far ret */
-       case 0xcb: /* far ret */
-               ret = X86_BR_RET;
-               break;
-       case 0xcf: /* iret */
-               ret = X86_BR_IRET;
-               break;
-       case 0xcc ... 0xce: /* int */
-               ret = X86_BR_INT;
-               break;
-       case 0xe8: /* call near rel */
-               insn_get_immediate(&insn);
-               if (insn.immediate1.value == 0) {
-                       /* zero length call */
-                       ret = X86_BR_ZERO_CALL;
-                       break;
-               }
-       case 0x9a: /* call far absolute */
-               ret = X86_BR_CALL;
-               break;
-       case 0xe0 ... 0xe3: /* loop jmp */
-               ret = X86_BR_JCC;
-               break;
-       case 0xe9 ... 0xeb: /* jmp */
-               ret = X86_BR_JMP;
-               break;
-       case 0xff: /* call near absolute, call far absolute ind */
-               insn_get_modrm(&insn);
-               ext = (insn.modrm.bytes[0] >> 3) & 0x7;
-               switch (ext) {
-               case 2: /* near ind call */
-               case 3: /* far ind call */
-                       ret = X86_BR_IND_CALL;
-                       break;
-               case 4:
-               case 5:
-                       ret = X86_BR_IND_JMP;
-                       break;
-               }
-               break;
-       default:
-               ret = X86_BR_NONE;
-       }
-       /*
-        * interrupts, traps, faults (and thus ring transition) may
-        * occur on any instructions. Thus, to classify them correctly,
-        * we need to first look at the from and to priv levels. If they
-        * are different and to is in the kernel, then it indicates
-        * a ring transition. If the from instruction is not a ring
-        * transition instr (syscall, systenter, int), then it means
-        * it was a irq, trap or fault.
-        *
-        * we have no way of detecting kernel to kernel faults.
-        */
-       if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
-           && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
-               ret = X86_BR_IRQ;
-
-       /*
-        * branch priv level determined by target as
-        * is done by HW when LBR_SELECT is implemented
-        */
-       if (ret != X86_BR_NONE)
-               ret |= to_plm;
-
-       return ret;
-}
-
-/*
- * implement actual branch filter based on user demand.
- * Hardware may not exactly satisfy that request, thus
- * we need to inspect opcodes. Mismatched branches are
- * discarded. Therefore, the number of branches returned
- * in PERF_SAMPLE_BRANCH_STACK sample may vary.
- */
-static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
-{
-       u64 from, to;
-       int br_sel = cpuc->br_sel;
-       int i, j, type;
-       bool compress = false;
-
-       /* if sampling all branches, then nothing to filter */
-       if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
-               return;
-
-       for (i = 0; i < cpuc->lbr_stack.nr; i++) {
-
-               from = cpuc->lbr_entries[i].from;
-               to = cpuc->lbr_entries[i].to;
-
-               type = branch_type(from, to, cpuc->lbr_entries[i].abort);
-               if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
-                       if (cpuc->lbr_entries[i].in_tx)
-                               type |= X86_BR_IN_TX;
-                       else
-                               type |= X86_BR_NO_TX;
-               }
-
-               /* if type does not correspond, then discard */
-               if (type == X86_BR_NONE || (br_sel & type) != type) {
-                       cpuc->lbr_entries[i].from = 0;
-                       compress = true;
-               }
-       }
-
-       if (!compress)
-               return;
-
-       /* remove all entries with from=0 */
-       for (i = 0; i < cpuc->lbr_stack.nr; ) {
-               if (!cpuc->lbr_entries[i].from) {
-                       j = i;
-                       while (++j < cpuc->lbr_stack.nr)
-                               cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
-                       cpuc->lbr_stack.nr--;
-                       if (!cpuc->lbr_entries[i].from)
-                               continue;
-               }
-               i++;
-       }
-}
-
-/*
- * Map interface branch filters onto LBR filters
- */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_REL_JMP
-                                               | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
-        */
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
-        LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
-       /*
-        * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
-        */
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
-};
-
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
-
-static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
-       [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = LBR_ANY,
-       [PERF_SAMPLE_BRANCH_USER_SHIFT]         = LBR_USER,
-       [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = LBR_KERNEL,
-       [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
-       [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = LBR_RETURN | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_FAR,
-       [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
-       [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
-       [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
-                                               | LBR_RETURN | LBR_CALL_STACK,
-       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
-       [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = LBR_REL_CALL,
-};
-
-/* core */
-void __init intel_pmu_lbr_init_core(void)
-{
-       x86_pmu.lbr_nr     = 4;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("4-deep LBR, ");
-}
-
-/* nehalem/westmere */
-void __init intel_pmu_lbr_init_nhm(void)
-{
-       x86_pmu.lbr_nr     = 16;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - workaround LBR_SEL errata (see above)
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* sandy bridge */
-void __init intel_pmu_lbr_init_snb(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("16-deep LBR, ");
-}
-
-/* haswell */
-void intel_pmu_lbr_init_hsw(void)
-{
-       x86_pmu.lbr_nr   = 16;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       pr_cont("16-deep LBR, ");
-}
-
-/* skylake */
-__init void intel_pmu_lbr_init_skl(void)
-{
-       x86_pmu.lbr_nr   = 32;
-       x86_pmu.lbr_tos  = MSR_LBR_TOS;
-       x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
-
-       /*
-        * SW branch filter usage:
-        * - support syscall, sysret capture.
-        *   That requires LBR_FAR but that means far
-        *   jmp need to be filtered out
-        */
-       pr_cont("32-deep LBR, ");
-}
-
-/* atom */
-void __init intel_pmu_lbr_init_atom(void)
-{
-       /*
-        * only models starting at stepping 10 seems
-        * to have an operational LBR which can freeze
-        * on PMU interrupt
-        */
-       if (boot_cpu_data.x86_model == 28
-           && boot_cpu_data.x86_mask < 10) {
-               pr_cont("LBR disabled due to erratum");
-               return;
-       }
-
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_CORE_TO;
-
-       /*
-        * SW branch filter usage:
-        * - compensate for lack of HW filter
-        */
-       pr_cont("8-deep LBR, ");
-}
-
-/* Knights Landing */
-void intel_pmu_lbr_init_knl(void)
-{
-       x86_pmu.lbr_nr     = 8;
-       x86_pmu.lbr_tos    = MSR_LBR_TOS;
-       x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
-       x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
-
-       x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
-       x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
-
-       pr_cont("8-deep LBR, ");
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
deleted file mode 100644 (file)
index c0bbd10..0000000
+++ /dev/null
@@ -1,1188 +0,0 @@
-/*
- * Intel(R) Processor Trace PMU driver for perf
- * Copyright (c) 2013-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * Intel PT is specified in the Intel Architecture Instruction Set Extensions
- * Programming Reference:
- * http://software.intel.com/en-us/intel-isa-extensions
- */
-
-#undef DEBUG
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/intel_pt.h>
-
-#include "perf_event.h"
-#include "intel_pt.h"
-
-static DEFINE_PER_CPU(struct pt, pt_ctx);
-
-static struct pt_pmu pt_pmu;
-
-enum cpuid_regs {
-       CR_EAX = 0,
-       CR_ECX,
-       CR_EDX,
-       CR_EBX
-};
-
-/*
- * Capabilities of Intel PT hardware, such as number of address bits or
- * supported output schemes, are cached and exported to userspace as "caps"
- * attribute group of pt pmu device
- * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
- * relevant bits together with intel_pt traces.
- *
- * These are necessary for both trace decoding (payloads_lip, contains address
- * width encoded in IP-related packets), and event configuration (bitmasks with
- * permitted values for certain bit fields).
- */
-#define PT_CAP(_n, _l, _r, _m)                                         \
-       [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,        \
-                           .reg = _r, .mask = _m }
-
-static struct pt_cap_desc {
-       const char      *name;
-       u32             leaf;
-       u8              reg;
-       u32             mask;
-} pt_caps[] = {
-       PT_CAP(max_subleaf,             0, CR_EAX, 0xffffffff),
-       PT_CAP(cr3_filtering,           0, CR_EBX, BIT(0)),
-       PT_CAP(psb_cyc,                 0, CR_EBX, BIT(1)),
-       PT_CAP(mtc,                     0, CR_EBX, BIT(3)),
-       PT_CAP(topa_output,             0, CR_ECX, BIT(0)),
-       PT_CAP(topa_multiple_entries,   0, CR_ECX, BIT(1)),
-       PT_CAP(single_range_output,     0, CR_ECX, BIT(2)),
-       PT_CAP(payloads_lip,            0, CR_ECX, BIT(31)),
-       PT_CAP(mtc_periods,             1, CR_EAX, 0xffff0000),
-       PT_CAP(cycle_thresholds,        1, CR_EBX, 0xffff),
-       PT_CAP(psb_periods,             1, CR_EBX, 0xffff0000),
-};
-
-static u32 pt_cap_get(enum pt_capabilities cap)
-{
-       struct pt_cap_desc *cd = &pt_caps[cap];
-       u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
-       unsigned int shift = __ffs(cd->mask);
-
-       return (c & cd->mask) >> shift;
-}
-
-static ssize_t pt_cap_show(struct device *cdev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct dev_ext_attribute *ea =
-               container_of(attr, struct dev_ext_attribute, attr);
-       enum pt_capabilities cap = (long)ea->var;
-
-       return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
-}
-
-static struct attribute_group pt_cap_group = {
-       .name   = "caps",
-};
-
-PMU_FORMAT_ATTR(cyc,           "config:1"      );
-PMU_FORMAT_ATTR(mtc,           "config:9"      );
-PMU_FORMAT_ATTR(tsc,           "config:10"     );
-PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
-PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
-PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
-PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
-
-static struct attribute *pt_formats_attr[] = {
-       &format_attr_cyc.attr,
-       &format_attr_mtc.attr,
-       &format_attr_tsc.attr,
-       &format_attr_noretcomp.attr,
-       &format_attr_mtc_period.attr,
-       &format_attr_cyc_thresh.attr,
-       &format_attr_psb_period.attr,
-       NULL,
-};
-
-static struct attribute_group pt_format_group = {
-       .name   = "format",
-       .attrs  = pt_formats_attr,
-};
-
-static const struct attribute_group *pt_attr_groups[] = {
-       &pt_cap_group,
-       &pt_format_group,
-       NULL,
-};
-
-static int __init pt_pmu_hw_init(void)
-{
-       struct dev_ext_attribute *de_attrs;
-       struct attribute **attrs;
-       size_t size;
-       int ret;
-       long i;
-
-       attrs = NULL;
-
-       for (i = 0; i < PT_CPUID_LEAVES; i++) {
-               cpuid_count(20, i,
-                           &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
-                           &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
-       }
-
-       ret = -ENOMEM;
-       size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
-       attrs = kzalloc(size, GFP_KERNEL);
-       if (!attrs)
-               goto fail;
-
-       size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
-       de_attrs = kzalloc(size, GFP_KERNEL);
-       if (!de_attrs)
-               goto fail;
-
-       for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
-               struct dev_ext_attribute *de_attr = de_attrs + i;
-
-               de_attr->attr.attr.name = pt_caps[i].name;
-
-               sysfs_attr_init(&de_attr->attr.attr);
-
-               de_attr->attr.attr.mode         = S_IRUGO;
-               de_attr->attr.show              = pt_cap_show;
-               de_attr->var                    = (void *)i;
-
-               attrs[i] = &de_attr->attr.attr;
-       }
-
-       pt_cap_group.attrs = attrs;
-
-       return 0;
-
-fail:
-       kfree(attrs);
-
-       return ret;
-}
-
-#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC    | \
-                         RTIT_CTL_CYC_THRESH   | \
-                         RTIT_CTL_PSB_FREQ)
-
-#define RTIT_CTL_MTC   (RTIT_CTL_MTC_EN        | \
-                        RTIT_CTL_MTC_RANGE)
-
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
-                       RTIT_CTL_DISRETC        | \
-                       RTIT_CTL_CYC_PSB        | \
-                       RTIT_CTL_MTC)
-
-static bool pt_event_valid(struct perf_event *event)
-{
-       u64 config = event->attr.config;
-       u64 allowed, requested;
-
-       if ((config & PT_CONFIG_MASK) != config)
-               return false;
-
-       if (config & RTIT_CTL_CYC_PSB) {
-               if (!pt_cap_get(PT_CAP_psb_cyc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_psb_periods);
-               requested = (config & RTIT_CTL_PSB_FREQ) >>
-                       RTIT_CTL_PSB_FREQ_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_cycle_thresholds);
-               requested = (config & RTIT_CTL_CYC_THRESH) >>
-                       RTIT_CTL_CYC_THRESH_OFFSET;
-               if (requested && (!(allowed & BIT(requested))))
-                       return false;
-       }
-
-       if (config & RTIT_CTL_MTC) {
-               /*
-                * In the unlikely case that CPUID lists valid mtc periods,
-                * but not the mtc capability, drop out here.
-                *
-                * Spec says that setting mtc period bits while mtc bit in
-                * CPUID is 0 will #GP, so better safe than sorry.
-                */
-               if (!pt_cap_get(PT_CAP_mtc))
-                       return false;
-
-               allowed = pt_cap_get(PT_CAP_mtc_periods);
-               if (!allowed)
-                       return false;
-
-               requested = (config & RTIT_CTL_MTC_RANGE) >>
-                       RTIT_CTL_MTC_RANGE_OFFSET;
-
-               if (!(allowed & BIT(requested)))
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * PT configuration helpers
- * These all are cpu affine and operate on a local PT
- */
-
-static void pt_config(struct perf_event *event)
-{
-       u64 reg;
-
-       if (!event->hw.itrace_started) {
-               event->hw.itrace_started = 1;
-               wrmsrl(MSR_IA32_RTIT_STATUS, 0);
-       }
-
-       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
-
-       if (!event->attr.exclude_kernel)
-               reg |= RTIT_CTL_OS;
-       if (!event->attr.exclude_user)
-               reg |= RTIT_CTL_USR;
-
-       reg |= (event->attr.config & PT_CONFIG_MASK);
-
-       wrmsrl(MSR_IA32_RTIT_CTL, reg);
-}
-
-static void pt_config_start(bool start)
-{
-       u64 ctl;
-
-       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-       if (start)
-               ctl |= RTIT_CTL_TRACEEN;
-       else
-               ctl &= ~RTIT_CTL_TRACEEN;
-       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-       /*
-        * A wrmsr that disables trace generation serializes other PT
-        * registers and causes all data packets to be written to memory,
-        * but a fence is required for the data to become globally visible.
-        *
-        * The below WMB, separating data store and aux_head store matches
-        * the consumer's RMB that separates aux_head load and data load.
-        */
-       if (!start)
-               wmb();
-}
-
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
-                            unsigned int output_off)
-{
-       u64 reg;
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
-       reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
-       wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
-/*
- * Keep ToPA table-related metadata on the same page as the actual table,
- * taking up a few words from the top
- */
-
-#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
-
-/**
- * struct topa - page-sized ToPA table with metadata at the top
- * @table:     actual ToPA table entries, as understood by PT hardware
- * @list:      linkage to struct pt_buffer's list of tables
- * @phys:      physical address of this page
- * @offset:    offset of the first entry in this table in the buffer
- * @size:      total size of all entries in this table
- * @last:      index of the last initialized entry in this table
- */
-struct topa {
-       struct topa_entry       table[TENTS_PER_PAGE];
-       struct list_head        list;
-       u64                     phys;
-       u64                     offset;
-       size_t                  size;
-       int                     last;
-};
-
-/* make -1 stand for the last table entry */
-#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
-
-/**
- * topa_alloc() - allocate page-sized ToPA table
- * @cpu:       CPU on which to allocate.
- * @gfp:       Allocation flags.
- *
- * Return:     On success, return the pointer to ToPA table page.
- */
-static struct topa *topa_alloc(int cpu, gfp_t gfp)
-{
-       int node = cpu_to_node(cpu);
-       struct topa *topa;
-       struct page *p;
-
-       p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
-       if (!p)
-               return NULL;
-
-       topa = page_address(p);
-       topa->last = 0;
-       topa->phys = page_to_phys(p);
-
-       /*
-        * In case of singe-entry ToPA, always put the self-referencing END
-        * link as the 2nd entry in the table
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(topa, 1)->end = 1;
-       }
-
-       return topa;
-}
-
-/**
- * topa_free() - free a page-sized ToPA table
- * @topa:      Table to deallocate.
- */
-static void topa_free(struct topa *topa)
-{
-       free_page((unsigned long)topa);
-}
-
-/**
- * topa_insert_table() - insert a ToPA table into a buffer
- * @buf:        PT buffer that's being extended.
- * @topa:       New topa table to be inserted.
- *
- * If it's the first table in this buffer, set up buffer's pointers
- * accordingly; otherwise, add a END=1 link entry to @topa to the current
- * "last" table and adjust the last table pointer to @topa.
- */
-static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
-{
-       struct topa *last = buf->last;
-
-       list_add_tail(&topa->list, &buf->tables);
-
-       if (!buf->first) {
-               buf->first = buf->last = buf->cur = topa;
-               return;
-       }
-
-       topa->offset = last->offset + last->size;
-       buf->last = topa;
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return;
-
-       BUG_ON(last->last != TENTS_PER_PAGE - 1);
-
-       TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
-       TOPA_ENTRY(last, -1)->end = 1;
-}
-
-/**
- * topa_table_full() - check if a ToPA table is filled up
- * @topa:      ToPA table.
- */
-static bool topa_table_full(struct topa *topa)
-{
-       /* single-entry ToPA is a special case */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return !!topa->last;
-
-       return topa->last == TENTS_PER_PAGE - 1;
-}
-
-/**
- * topa_insert_pages() - create a list of ToPA tables
- * @buf:       PT buffer being initialized.
- * @gfp:       Allocation flags.
- *
- * This initializes a list of ToPA tables with entries from
- * the data_pages provided by rb_alloc_aux().
- *
- * Return:     0 on success or error code.
- */
-static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
-{
-       struct topa *topa = buf->last;
-       int order = 0;
-       struct page *p;
-
-       p = virt_to_page(buf->data_pages[buf->nr_pages]);
-       if (PagePrivate(p))
-               order = page_private(p);
-
-       if (topa_table_full(topa)) {
-               topa = topa_alloc(buf->cpu, gfp);
-               if (!topa)
-                       return -ENOMEM;
-
-               topa_insert_table(buf, topa);
-       }
-
-       TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
-       TOPA_ENTRY(topa, -1)->size = order;
-       if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(topa, -1)->intr = 1;
-               TOPA_ENTRY(topa, -1)->stop = 1;
-       }
-
-       topa->last++;
-       topa->size += sizes(order);
-
-       buf->nr_pages += 1ul << order;
-
-       return 0;
-}
-
-/**
- * pt_topa_dump() - print ToPA tables and their entries
- * @buf:       PT buffer.
- */
-static void pt_topa_dump(struct pt_buffer *buf)
-{
-       struct topa *topa;
-
-       list_for_each_entry(topa, &buf->tables, list) {
-               int i;
-
-               pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
-                        topa->phys, topa->offset, topa->size);
-               for (i = 0; i < TENTS_PER_PAGE; i++) {
-                       pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
-                                &topa->table[i],
-                                (unsigned long)topa->table[i].base << TOPA_SHIFT,
-                                sizes(topa->table[i].size),
-                                topa->table[i].end ?  'E' : ' ',
-                                topa->table[i].intr ? 'I' : ' ',
-                                topa->table[i].stop ? 'S' : ' ',
-                                *(u64 *)&topa->table[i]);
-                       if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
-                            topa->table[i].stop) ||
-                           topa->table[i].end)
-                               break;
-               }
-       }
-}
-
-/**
- * pt_buffer_advance() - advance to the next output region
- * @buf:       PT buffer.
- *
- * Advance the current pointers in the buffer to the next ToPA entry.
- */
-static void pt_buffer_advance(struct pt_buffer *buf)
-{
-       buf->output_off = 0;
-       buf->cur_idx++;
-
-       if (buf->cur_idx == buf->cur->last) {
-               if (buf->cur == buf->last)
-                       buf->cur = buf->first;
-               else
-                       buf->cur = list_entry(buf->cur->list.next, struct topa,
-                                             list);
-               buf->cur_idx = 0;
-       }
-}
-
-/**
- * pt_update_head() - calculate current offsets and sizes
- * @pt:                Per-cpu pt context.
- *
- * Update buffer's current write pointer position and data size.
- */
-static void pt_update_head(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       u64 topa_idx, base, old;
-
-       /* offset of the first region in this table from the beginning of buf */
-       base = buf->cur->offset + buf->output_off;
-
-       /* offset of the current output region within this table */
-       for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
-               base += sizes(buf->cur->table[topa_idx].size);
-
-       if (buf->snapshot) {
-               local_set(&buf->data_size, base);
-       } else {
-               old = (local64_xchg(&buf->head, base) &
-                      ((buf->nr_pages << PAGE_SHIFT) - 1));
-               if (base < old)
-                       base += buf->nr_pages << PAGE_SHIFT;
-
-               local_add(base - old, &buf->data_size);
-       }
-}
-
-/**
- * pt_buffer_region() - obtain current output region's address
- * @buf:       PT buffer.
- */
-static void *pt_buffer_region(struct pt_buffer *buf)
-{
-       return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
-}
-
-/**
- * pt_buffer_region_size() - obtain current output region's size
- * @buf:       PT buffer.
- */
-static size_t pt_buffer_region_size(struct pt_buffer *buf)
-{
-       return sizes(buf->cur->table[buf->cur_idx].size);
-}
-
-/**
- * pt_handle_status() - take care of possible status conditions
- * @pt:                Per-cpu pt context.
- */
-static void pt_handle_status(struct pt *pt)
-{
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-       int advance = 0;
-       u64 status;
-
-       rdmsrl(MSR_IA32_RTIT_STATUS, status);
-
-       if (status & RTIT_STATUS_ERROR) {
-               pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
-               pt_topa_dump(buf);
-               status &= ~RTIT_STATUS_ERROR;
-       }
-
-       if (status & RTIT_STATUS_STOPPED) {
-               status &= ~RTIT_STATUS_STOPPED;
-
-               /*
-                * On systems that only do single-entry ToPA, hitting STOP
-                * means we are already losing data; need to let the decoder
-                * know.
-                */
-               if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
-                   buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-                       local_inc(&buf->lost);
-                       advance++;
-               }
-       }
-
-       /*
-        * Also on single-entry ToPA implementations, interrupt will come
-        * before the output reaches its output region's boundary.
-        */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
-           pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
-               void *head = pt_buffer_region(buf);
-
-               /* everything within this margin needs to be zeroed out */
-               memset(head + buf->output_off, 0,
-                      pt_buffer_region_size(buf) -
-                      buf->output_off);
-               advance++;
-       }
-
-       if (advance)
-               pt_buffer_advance(buf);
-
-       wrmsrl(MSR_IA32_RTIT_STATUS, status);
-}
-
-/**
- * pt_read_offset() - translate registers into buffer pointers
- * @buf:       PT buffer.
- *
- * Set buffer's output pointers from MSR values.
- */
-static void pt_read_offset(struct pt_buffer *buf)
-{
-       u64 offset, base_topa;
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
-       buf->cur = phys_to_virt(base_topa);
-
-       rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
-       /* offset within current output region */
-       buf->output_off = offset >> 32;
-       /* index of current output region within this table */
-       buf->cur_idx = (offset & 0xffffff80) >> 7;
-}
-
-/**
- * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
- * @buf:       PT buffer.
- * @pg:                Page offset in the buffer.
- *
- * When advancing to the next output region (ToPA entry), given a page offset
- * into the buffer, we need to find the offset of the first page in the next
- * region.
- */
-static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
-{
-       struct topa_entry *te = buf->topa_index[pg];
-
-       /* one region */
-       if (buf->first == buf->last && buf->first->last == 1)
-               return pg;
-
-       do {
-               pg++;
-               pg &= buf->nr_pages - 1;
-       } while (buf->topa_index[pg] == te);
-
-       return pg;
-}
-
-/**
- * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
- * @buf:       PT buffer.
- * @handle:    Current output handle.
- *
- * Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected and waking up the consumer after a certain fraction of
- * the buffer has filled up. Only needed and sensible for non-snapshot counters.
- *
- * This obviously relies on buf::head to figure out buffer markers, so it has
- * to be called after pt_buffer_reset_offsets() and before the hardware tracing
- * is enabled.
- */
-static int pt_buffer_reset_markers(struct pt_buffer *buf,
-                                  struct perf_output_handle *handle)
-
-{
-       unsigned long head = local64_read(&buf->head);
-       unsigned long idx, npages, wakeup;
-
-       /* can't stop in the middle of an output region */
-       if (buf->output_off + handle->size + 1 <
-           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
-               return -EINVAL;
-
-
-       /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               return 0;
-
-       /* clear STOP and INT from current entry */
-       buf->topa_index[buf->stop_pos]->stop = 0;
-       buf->topa_index[buf->intr_pos]->intr = 0;
-
-       /* how many pages till the STOP marker */
-       npages = handle->size >> PAGE_SHIFT;
-
-       /* if it's on a page boundary, fill up one more page */
-       if (!offset_in_page(head + handle->size + 1))
-               npages++;
-
-       idx = (head >> PAGE_SHIFT) + npages;
-       idx &= buf->nr_pages - 1;
-       buf->stop_pos = idx;
-
-       wakeup = handle->wakeup >> PAGE_SHIFT;
-
-       /* in the worst case, wake up the consumer one page before hard stop */
-       idx = (head >> PAGE_SHIFT) + npages - 1;
-       if (idx > wakeup)
-               idx = wakeup;
-
-       idx &= buf->nr_pages - 1;
-       buf->intr_pos = idx;
-
-       buf->topa_index[buf->stop_pos]->stop = 1;
-       buf->topa_index[buf->intr_pos]->intr = 1;
-
-       return 0;
-}
-
-/**
- * pt_buffer_setup_topa_index() - build topa_index[] table of regions
- * @buf:       PT buffer.
- *
- * topa_index[] references output regions indexed by offset into the
- * buffer for purposes of quick reverse lookup.
- */
-static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
-{
-       struct topa *cur = buf->first, *prev = buf->last;
-       struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
-               *te_prev = TOPA_ENTRY(prev, prev->last - 1);
-       int pg = 0, idx = 0;
-
-       while (pg < buf->nr_pages) {
-               int tidx;
-
-               /* pages within one topa entry */
-               for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
-                       buf->topa_index[pg] = te_prev;
-
-               te_prev = te_cur;
-
-               if (idx == cur->last - 1) {
-                       /* advance to next topa table */
-                       idx = 0;
-                       cur = list_entry(cur->list.next, struct topa, list);
-               } else {
-                       idx++;
-               }
-               te_cur = TOPA_ENTRY(cur, idx);
-       }
-
-}
-
-/**
- * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
- * @buf:       PT buffer.
- * @head:      Write pointer (aux_head) from AUX buffer.
- *
- * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly. This is done after we have obtained the
- * current aux_head position from a successful call to perf_aux_output_begin()
- * to make sure the hardware is writing to the right place.
- *
- * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
- * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
- * which are used to determine INT and STOP markers' locations by a subsequent
- * call to pt_buffer_reset_markers().
- */
-static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
-{
-       int pg;
-
-       if (buf->snapshot)
-               head &= (buf->nr_pages << PAGE_SHIFT) - 1;
-
-       pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
-       pg = pt_topa_next_entry(buf, pg);
-
-       buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
-       buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
-                       (unsigned long)buf->cur) / sizeof(struct topa_entry);
-       buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
-
-       local64_set(&buf->head, head);
-       local_set(&buf->data_size, 0);
-}
-
-/**
- * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
- * @buf:       PT buffer.
- */
-static void pt_buffer_fini_topa(struct pt_buffer *buf)
-{
-       struct topa *topa, *iter;
-
-       list_for_each_entry_safe(topa, iter, &buf->tables, list) {
-               /*
-                * right now, this is in free_aux() path only, so
-                * no need to unlink this table from the list
-                */
-               topa_free(topa);
-       }
-}
-
-/**
- * pt_buffer_init_topa() - initialize ToPA table for pt buffer
- * @buf:       PT buffer.
- * @size:      Total size of all regions within this ToPA.
- * @gfp:       Allocation flags.
- */
-static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
-                              gfp_t gfp)
-{
-       struct topa *topa;
-       int err;
-
-       topa = topa_alloc(buf->cpu, gfp);
-       if (!topa)
-               return -ENOMEM;
-
-       topa_insert_table(buf, topa);
-
-       while (buf->nr_pages < nr_pages) {
-               err = topa_insert_pages(buf, gfp);
-               if (err) {
-                       pt_buffer_fini_topa(buf);
-                       return -ENOMEM;
-               }
-       }
-
-       pt_buffer_setup_topa_index(buf);
-
-       /* link last table to the first one, unless we're double buffering */
-       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
-               TOPA_ENTRY(buf->last, -1)->end = 1;
-       }
-
-       pt_topa_dump(buf);
-       return 0;
-}
-
-/**
- * pt_buffer_setup_aux() - set up topa tables for a PT buffer
- * @cpu:       Cpu on which to allocate, -1 means current.
- * @pages:     Array of pointers to buffer pages passed from perf core.
- * @nr_pages:  Number of pages in the buffer.
- * @snapshot:  If this is a snapshot/overwrite counter.
- *
- * This is a pmu::setup_aux callback that sets up ToPA tables and all the
- * bookkeeping for an AUX buffer.
- *
- * Return:     Our private PT buffer structure.
- */
-static void *
-pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
-{
-       struct pt_buffer *buf;
-       int node, ret;
-
-       if (!nr_pages)
-               return NULL;
-
-       if (cpu == -1)
-               cpu = raw_smp_processor_id();
-       node = cpu_to_node(cpu);
-
-       buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
-                          GFP_KERNEL, node);
-       if (!buf)
-               return NULL;
-
-       buf->cpu = cpu;
-       buf->snapshot = snapshot;
-       buf->data_pages = pages;
-
-       INIT_LIST_HEAD(&buf->tables);
-
-       ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
-       if (ret) {
-               kfree(buf);
-               return NULL;
-       }
-
-       return buf;
-}
-
-/**
- * pt_buffer_free_aux() - perf AUX deallocation path callback
- * @data:      PT buffer.
- */
-static void pt_buffer_free_aux(void *data)
-{
-       struct pt_buffer *buf = data;
-
-       pt_buffer_fini_topa(buf);
-       kfree(buf);
-}
-
-/**
- * pt_buffer_is_full() - check if the buffer is full
- * @buf:       PT buffer.
- * @pt:                Per-cpu pt handle.
- *
- * If the user hasn't read data from the output region that aux_head
- * points to, the buffer is considered full: the user needs to read at
- * least this region and update aux_tail to point past it.
- */
-static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
-{
-       if (buf->snapshot)
-               return false;
-
-       if (local_read(&buf->data_size) >= pt->handle.size)
-               return true;
-
-       return false;
-}
-
-/**
- * intel_pt_interrupt() - PT PMI handler
- */
-void intel_pt_interrupt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-       struct perf_event *event = pt->handle.event;
-
-       /*
-        * There may be a dangling PT bit in the interrupt status register
-        * after PT has been disabled by pt_event_stop(). Make sure we don't
-        * do anything (particularly, re-enable) for this event here.
-        */
-       if (!ACCESS_ONCE(pt->handle_nmi))
-               return;
-
-       pt_config_start(false);
-
-       if (!event)
-               return;
-
-       buf = perf_get_aux(&pt->handle);
-       if (!buf)
-               return;
-
-       pt_read_offset(buf);
-
-       pt_handle_status(pt);
-
-       pt_update_head(pt);
-
-       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                           local_xchg(&buf->lost, 0));
-
-       if (!event->hw.state) {
-               int ret;
-
-               buf = perf_aux_output_begin(&pt->handle, event);
-               if (!buf) {
-                       event->hw.state = PERF_HES_STOPPED;
-                       return;
-               }
-
-               pt_buffer_reset_offsets(buf, pt->handle.head);
-               /* snapshot counters don't use PMI, so it's safe */
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret) {
-                       perf_aux_output_end(&pt->handle, 0, true);
-                       return;
-               }
-
-               pt_config_buffer(buf->cur->table, buf->cur_idx,
-                                buf->output_off);
-               pt_config(event);
-       }
-}
-
-/*
- * PMU callbacks
- */
-
-static void pt_event_start(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-       if (!buf || pt_buffer_is_full(buf, pt)) {
-               event->hw.state = PERF_HES_STOPPED;
-               return;
-       }
-
-       ACCESS_ONCE(pt->handle_nmi) = 1;
-       event->hw.state = 0;
-
-       pt_config_buffer(buf->cur->table, buf->cur_idx,
-                        buf->output_off);
-       pt_config(event);
-}
-
-static void pt_event_stop(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       /*
-        * Protect against the PMI racing with disabling wrmsr,
-        * see comment in intel_pt_interrupt().
-        */
-       ACCESS_ONCE(pt->handle_nmi) = 0;
-       pt_config_start(false);
-
-       if (event->hw.state == PERF_HES_STOPPED)
-               return;
-
-       event->hw.state = PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_UPDATE) {
-               struct pt_buffer *buf = perf_get_aux(&pt->handle);
-
-               if (!buf)
-                       return;
-
-               if (WARN_ON_ONCE(pt->handle.event != event))
-                       return;
-
-               pt_read_offset(buf);
-
-               pt_handle_status(pt);
-
-               pt_update_head(pt);
-       }
-}
-
-static void pt_event_del(struct perf_event *event, int mode)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct pt_buffer *buf;
-
-       pt_event_stop(event, PERF_EF_UPDATE);
-
-       buf = perf_get_aux(&pt->handle);
-
-       if (buf) {
-               if (buf->snapshot)
-                       pt->handle.head =
-                               local_xchg(&buf->data_size,
-                                          buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                                   local_xchg(&buf->lost, 0));
-       }
-}
-
-static int pt_event_add(struct perf_event *event, int mode)
-{
-       struct pt_buffer *buf;
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-       struct hw_perf_event *hwc = &event->hw;
-       int ret = -EBUSY;
-
-       if (pt->handle.event)
-               goto fail;
-
-       buf = perf_aux_output_begin(&pt->handle, event);
-       ret = -EINVAL;
-       if (!buf)
-               goto fail_stop;
-
-       pt_buffer_reset_offsets(buf, pt->handle.head);
-       if (!buf->snapshot) {
-               ret = pt_buffer_reset_markers(buf, &pt->handle);
-               if (ret)
-                       goto fail_end_stop;
-       }
-
-       if (mode & PERF_EF_START) {
-               pt_event_start(event, 0);
-               ret = -EBUSY;
-               if (hwc->state == PERF_HES_STOPPED)
-                       goto fail_end_stop;
-       } else {
-               hwc->state = PERF_HES_STOPPED;
-       }
-
-       return 0;
-
-fail_end_stop:
-       perf_aux_output_end(&pt->handle, 0, true);
-fail_stop:
-       hwc->state = PERF_HES_STOPPED;
-fail:
-       return ret;
-}
-
-static void pt_event_read(struct perf_event *event)
-{
-}
-
-static void pt_event_destroy(struct perf_event *event)
-{
-       x86_del_exclusive(x86_lbr_exclusive_pt);
-}
-
-static int pt_event_init(struct perf_event *event)
-{
-       if (event->attr.type != pt_pmu.pmu.type)
-               return -ENOENT;
-
-       if (!pt_event_valid(event))
-               return -EINVAL;
-
-       if (x86_add_exclusive(x86_lbr_exclusive_pt))
-               return -EBUSY;
-
-       event->destroy = pt_event_destroy;
-
-       return 0;
-}
-
-void cpu_emergency_stop_pt(void)
-{
-       struct pt *pt = this_cpu_ptr(&pt_ctx);
-
-       if (pt->handle.event)
-               pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
-}
-
-static __init int pt_init(void)
-{
-       int ret, cpu, prior_warn = 0;
-
-       BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
-
-       if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
-               return -ENODEV;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu) {
-               u64 ctl;
-
-               ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
-               if (!ret && (ctl & RTIT_CTL_TRACEEN))
-                       prior_warn++;
-       }
-       put_online_cpus();
-
-       if (prior_warn) {
-               x86_add_exclusive(x86_lbr_exclusive_pt);
-               pr_warn("PT is enabled at boot time, doing nothing\n");
-
-               return -EBUSY;
-       }
-
-       ret = pt_pmu_hw_init();
-       if (ret)
-               return ret;
-
-       if (!pt_cap_get(PT_CAP_topa_output)) {
-               pr_warn("ToPA output is not supported on this CPU\n");
-               return -ENODEV;
-       }
-
-       if (!pt_cap_get(PT_CAP_topa_multiple_entries))
-               pt_pmu.pmu.capabilities =
-                       PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
-
-       pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
-       pt_pmu.pmu.attr_groups  = pt_attr_groups;
-       pt_pmu.pmu.task_ctx_nr  = perf_sw_context;
-       pt_pmu.pmu.event_init   = pt_event_init;
-       pt_pmu.pmu.add          = pt_event_add;
-       pt_pmu.pmu.del          = pt_event_del;
-       pt_pmu.pmu.start        = pt_event_start;
-       pt_pmu.pmu.stop         = pt_event_stop;
-       pt_pmu.pmu.read         = pt_event_read;
-       pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
-       pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
-       ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
-
-       return ret;
-}
-arch_initcall(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
deleted file mode 100644 (file)
index 24a351a..0000000
+++ /dev/null
@@ -1,783 +0,0 @@
-/*
- * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
- * Copyright (C) 2013 Google, Inc., Stephane Eranian
- *
- * Intel RAPL interface is specified in the IA-32 Manual Vol3b
- * section 14.7.1 (September 2013)
- *
- * RAPL provides more controls than just reporting energy consumption
- * however here we only expose the 3 energy consumption free running
- * counters (pp0, pkg, dram).
- *
- * Each of those counters increments in a power unit defined by the
- * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
- * but it can vary.
- *
- * Counter to rapl events mappings:
- *
- *  pp0 counter: consumption of all physical cores (power plane 0)
- *       event: rapl_energy_cores
- *    perf code: 0x1
- *
- *  pkg counter: consumption of the whole processor package
- *       event: rapl_energy_pkg
- *    perf code: 0x2
- *
- * dram counter: consumption of the dram domain (servers only)
- *       event: rapl_energy_dram
- *    perf code: 0x3
- *
- * dram counter: consumption of the builtin-gpu domain (client only)
- *       event: rapl_energy_gpu
- *    perf code: 0x4
- *
- * We manage those counters as free running (read-only). They may be
- * use simultaneously by other tools, such as turbostat.
- *
- * The events only support system-wide mode counting. There is no
- * sampling support because it does not make sense and is not
- * supported by the RAPL hardware.
- *
- * Because we want to avoid floating-point operations in the kernel,
- * the events are all reported in fixed point arithmetic (32.32).
- * Tools must adjust the counts to convert them to Watts using
- * the duration of the measurement. Tools may use a function such as
- * ldexp(raw_count, -32);
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/perf_event.h>
-#include <asm/cpu_device_id.h>
-#include "perf_event.h"
-
-/*
- * RAPL energy status counters
- */
-#define RAPL_IDX_PP0_NRG_STAT  0       /* all cores */
-#define INTEL_RAPL_PP0         0x1     /* pseudo-encoding */
-#define RAPL_IDX_PKG_NRG_STAT  1       /* entire package */
-#define INTEL_RAPL_PKG         0x2     /* pseudo-encoding */
-#define RAPL_IDX_RAM_NRG_STAT  2       /* DRAM */
-#define INTEL_RAPL_RAM         0x3     /* pseudo-encoding */
-#define RAPL_IDX_PP1_NRG_STAT  3       /* gpu */
-#define INTEL_RAPL_PP1         0x4     /* pseudo-encoding */
-
-#define NR_RAPL_DOMAINS         0x4
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
-       "pp0-core",
-       "package",
-       "dram",
-       "pp1-gpu",
-};
-
-/* Clients have PP0, PKG */
-#define RAPL_IDX_CLN   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM */
-#define RAPL_IDX_SRV   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/* Servers have PP0, PKG, RAM, PP1 */
-#define RAPL_IDX_HSW   (1<<RAPL_IDX_PP0_NRG_STAT|\
-                        1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT|\
-                        1<<RAPL_IDX_PP1_NRG_STAT)
-
-/* Knights Landing has PKG, RAM */
-#define RAPL_IDX_KNL   (1<<RAPL_IDX_PKG_NRG_STAT|\
-                        1<<RAPL_IDX_RAM_NRG_STAT)
-
-/*
- * event code: LSB 8 bits, passed in attr->config
- * any other bit is reserved
- */
-#define RAPL_EVENT_MASK        0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)          \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj,      \
-                               struct kobj_attribute *attr,    \
-                               char *page)                     \
-{                                                              \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
-       return sprintf(page, _format "\n");                     \
-}                                                              \
-static struct kobj_attribute format_attr_##_var =              \
-       __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
-#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-
-#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
-static struct perf_pmu_events_attr event_attr_##v = {                          \
-       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
-       .id             = 0,                                                    \
-       .event_str      = str,                                                  \
-};
-
-struct rapl_pmu {
-       spinlock_t       lock;
-       int              n_active; /* number of active events */
-       struct list_head active_list;
-       struct pmu       *pmu; /* pointer to rapl_pmu_class */
-       ktime_t          timer_interval; /* in ktime_t unit */
-       struct hrtimer   hrtimer;
-};
-
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;  /* 1/2^hw_unit Joule */
-static struct pmu rapl_pmu_class;
-static cpumask_t rapl_cpu_mask;
-static int rapl_cntr_mask;
-
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
-
-static struct x86_pmu_quirk *rapl_quirks;
-static inline u64 rapl_read_counter(struct perf_event *event)
-{
-       u64 raw;
-       rdmsrl(event->hw.event_base, raw);
-       return raw;
-}
-
-#define rapl_add_quirk(func_)                                          \
-do {                                                                   \
-       static struct x86_pmu_quirk __quirk __initdata = {              \
-               .func = func_,                                          \
-       };                                                              \
-       __quirk.next = rapl_quirks;                                     \
-       rapl_quirks = &__quirk;                                         \
-} while (0)
-
-static inline u64 rapl_scale(u64 v, int cfg)
-{
-       if (cfg > NR_RAPL_DOMAINS) {
-               pr_warn("invalid domain %d, failed to scale data\n", cfg);
-               return v;
-       }
-       /*
-        * scale delta to smallest unit (1/2^32)
-        * users must then scale back: count * 1/(1e9*2^32) to get Joules
-        * or use ldexp(count, -32).
-        * Watts = Joules/Time delta
-        */
-       return v << (32 - rapl_hw_unit[cfg - 1]);
-}
-
-static u64 rapl_event_update(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 prev_raw_count, new_raw_count;
-       s64 delta, sdelta;
-       int shift = RAPL_CNTR_WIDTH;
-
-again:
-       prev_raw_count = local64_read(&hwc->prev_count);
-       rdmsrl(event->hw.event_base, new_raw_count);
-
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                           new_raw_count) != prev_raw_count) {
-               cpu_relax();
-               goto again;
-       }
-
-       /*
-        * Now we have the new raw value and have updated the prev
-        * timestamp already. We can now calculate the elapsed delta
-        * (event-)time and add that to the generic event.
-        *
-        * Careful, not all hw sign-extends above the physical width
-        * of the count.
-        */
-       delta = (new_raw_count << shift) - (prev_raw_count << shift);
-       delta >>= shift;
-
-       sdelta = rapl_scale(delta, event->hw.config);
-
-       local64_add(sdelta, &event->count);
-
-       return new_raw_count;
-}
-
-static void rapl_start_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
-                    HRTIMER_MODE_REL_PINNED);
-}
-
-static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
-{
-       hrtimer_cancel(&pmu->hrtimer);
-}
-
-static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct perf_event *event;
-       unsigned long flags;
-
-       if (!pmu->n_active)
-               return HRTIMER_NORESTART;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       list_for_each_entry(event, &pmu->active_list, active_entry) {
-               rapl_event_update(event);
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       hrtimer_forward_now(hrtimer, pmu->timer_interval);
-
-       return HRTIMER_RESTART;
-}
-
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
-{
-       struct hrtimer *hr = &pmu->hrtimer;
-
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hr->function = rapl_hrtimer_handle;
-}
-
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
-                                  struct perf_event *event)
-{
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-
-       list_add_tail(&event->active_entry, &pmu->active_list);
-
-       local64_set(&event->hw.prev_count, rapl_read_counter(event));
-
-       pmu->n_active++;
-       if (pmu->n_active == 1)
-               rapl_start_hrtimer(pmu);
-}
-
-static void rapl_pmu_event_start(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-       __rapl_pmu_event_start(pmu, event);
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static void rapl_pmu_event_stop(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       /* mark event as deactivated and stopped */
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               WARN_ON_ONCE(pmu->n_active <= 0);
-               pmu->n_active--;
-               if (pmu->n_active == 0)
-                       rapl_stop_hrtimer(pmu);
-
-               list_del(&event->active_entry);
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-       }
-
-       /* check if update of sw counter is necessary */
-       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               rapl_event_update(event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-}
-
-static int rapl_pmu_event_add(struct perf_event *event, int mode)
-{
-       struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
-       struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
-
-       spin_lock_irqsave(&pmu->lock, flags);
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-       if (mode & PERF_EF_START)
-               __rapl_pmu_event_start(pmu, event);
-
-       spin_unlock_irqrestore(&pmu->lock, flags);
-
-       return 0;
-}
-
-static void rapl_pmu_event_del(struct perf_event *event, int flags)
-{
-       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int rapl_pmu_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
-       int bit, msr, ret = 0;
-
-       /* only look at RAPL events */
-       if (event->attr.type != rapl_pmu_class.type)
-               return -ENOENT;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~RAPL_EVENT_MASK)
-               return -EINVAL;
-
-       /*
-        * check event is known (determines counter)
-        */
-       switch (cfg) {
-       case INTEL_RAPL_PP0:
-               bit = RAPL_IDX_PP0_NRG_STAT;
-               msr = MSR_PP0_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PKG:
-               bit = RAPL_IDX_PKG_NRG_STAT;
-               msr = MSR_PKG_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_RAM:
-               bit = RAPL_IDX_RAM_NRG_STAT;
-               msr = MSR_DRAM_ENERGY_STATUS;
-               break;
-       case INTEL_RAPL_PP1:
-               bit = RAPL_IDX_PP1_NRG_STAT;
-               msr = MSR_PP1_ENERGY_STATUS;
-               break;
-       default:
-               return -EINVAL;
-       }
-       /* check event supported */
-       if (!(rapl_cntr_mask & (1 << bit)))
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /* must be done before validate_group */
-       event->hw.event_base = msr;
-       event->hw.config = cfg;
-       event->hw.idx = bit;
-
-       return ret;
-}
-
-static void rapl_pmu_event_read(struct perf_event *event)
-{
-       rapl_event_update(event);
-}
-
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
-       .attrs = rapl_pmu_attrs,
-};
-
-RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
-RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
-RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
-
-RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
-RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
-
-/*
- * we compute in 0.23 nJ increments regardless of MSR
- */
-RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
-RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
-
-static struct attribute *rapl_events_srv_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_cln_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_hsw_attr[] = {
-       EVENT_PTR(rapl_cores),
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_gpu),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_cores_unit),
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_gpu_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_cores_scale),
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_gpu_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute *rapl_events_knl_attr[] = {
-       EVENT_PTR(rapl_pkg),
-       EVENT_PTR(rapl_ram),
-
-       EVENT_PTR(rapl_pkg_unit),
-       EVENT_PTR(rapl_ram_unit),
-
-       EVENT_PTR(rapl_pkg_scale),
-       EVENT_PTR(rapl_ram_scale),
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_events_group = {
-       .name = "events",
-       .attrs = NULL, /* patched at runtime */
-};
-
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
-static struct attribute *rapl_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group rapl_pmu_format_group = {
-       .name = "format",
-       .attrs = rapl_formats_attr,
-};
-
-const struct attribute_group *rapl_attr_groups[] = {
-       &rapl_pmu_attr_group,
-       &rapl_pmu_format_group,
-       &rapl_pmu_events_group,
-       NULL,
-};
-
-static struct pmu rapl_pmu_class = {
-       .attr_groups    = rapl_attr_groups,
-       .task_ctx_nr    = perf_invalid_context, /* system-wide only */
-       .event_init     = rapl_pmu_event_init,
-       .add            = rapl_pmu_event_add, /* must have */
-       .del            = rapl_pmu_event_del, /* must have */
-       .start          = rapl_pmu_event_start,
-       .stop           = rapl_pmu_event_stop,
-       .read           = rapl_pmu_event_read,
-};
-
-static void rapl_cpu_exit(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int i, phys_id = topology_physical_package_id(cpu);
-       int target = -1;
-
-       /* find a new cpu on same package */
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-       /*
-        * clear cpu from cpumask
-        * if was set in cpumask and still some cpu on package,
-        * then move to new cpu
-        */
-       if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
-               cpumask_set_cpu(target, &rapl_cpu_mask);
-
-       WARN_ON(cpumask_empty(&rapl_cpu_mask));
-       /*
-        * migrate events and context to new cpu
-        */
-       if (target >= 0)
-               perf_pmu_migrate_context(pmu->pmu, cpu, target);
-
-       /* cancel overflow polling timer for CPU */
-       rapl_stop_hrtimer(pmu);
-}
-
-static void rapl_cpu_init(int cpu)
-{
-       int i, phys_id = topology_physical_package_id(cpu);
-
-       /* check if phys_is is already covered */
-       for_each_cpu(i, &rapl_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-       /* was not found, so add it */
-       cpumask_set_cpu(cpu, &rapl_cpu_mask);
-}
-
-static __init void rapl_hsw_server_quirk(void)
-{
-       /*
-        * DRAM domain on HSW server has fixed energy unit which can be
-        * different than the unit from power unit MSR.
-        * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
-        * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
-        */
-       rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
-}
-
-static int rapl_cpu_prepare(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-       int phys_id = topology_physical_package_id(cpu);
-       u64 ms;
-
-       if (pmu)
-               return 0;
-
-       if (phys_id < 0)
-               return -1;
-
-       pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
-       if (!pmu)
-               return -1;
-       spin_lock_init(&pmu->lock);
-
-       INIT_LIST_HEAD(&pmu->active_list);
-
-       pmu->pmu = &rapl_pmu_class;
-
-       /*
-        * use reference of 200W for scaling the timeout
-        * to avoid missing counter overflows.
-        * 200W = 200 Joules/sec
-        * divide interval by 2 to avoid lockstep (2 * 100)
-        * if hw unit is 32, then we use 2 ms 1/200/2
-        */
-       if (rapl_hw_unit[0] < 32)
-               ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
-       else
-               ms = 2;
-
-       pmu->timer_interval = ms_to_ktime(ms);
-
-       rapl_hrtimer_init(pmu);
-
-       /* set RAPL pmu for this cpu for now */
-       per_cpu(rapl_pmu, cpu) = pmu;
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-
-       return 0;
-}
-
-static void rapl_cpu_kfree(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
-
-       kfree(pmu);
-
-       per_cpu(rapl_pmu_to_free, cpu) = NULL;
-}
-
-static int rapl_cpu_dying(int cpu)
-{
-       struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-
-       if (!pmu)
-               return 0;
-
-       per_cpu(rapl_pmu, cpu) = NULL;
-
-       per_cpu(rapl_pmu_to_free, cpu) = pmu;
-
-       return 0;
-}
-
-static int rapl_cpu_notifier(struct notifier_block *self,
-                            unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               rapl_cpu_prepare(cpu);
-               break;
-       case CPU_STARTING:
-               rapl_cpu_init(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               rapl_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               rapl_cpu_kfree(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               rapl_cpu_exit(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static int rapl_check_hw_unit(void)
-{
-       u64 msr_rapl_power_unit_bits;
-       int i;
-
-       /* protect rdmsrl() to handle virtualization */
-       if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
-               return -1;
-       for (i = 0; i < NR_RAPL_DOMAINS; i++)
-               rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
-
-       return 0;
-}
-
-static const struct x86_cpu_id rapl_cpu_match[] = {
-       [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
-       [1] = {},
-};
-
-static int __init rapl_pmu_init(void)
-{
-       struct rapl_pmu *pmu;
-       int cpu, ret;
-       struct x86_pmu_quirk *quirk;
-       int i;
-
-       /*
-        * check for Intel processor family 6
-        */
-       if (!x86_match_cpu(rapl_cpu_match))
-               return 0;
-
-       /* check supported CPU */
-       switch (boot_cpu_data.x86_model) {
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-               rapl_cntr_mask = RAPL_IDX_CLN;
-               rapl_pmu_events_group.attrs = rapl_events_cln_attr;
-               break;
-       case 63: /* Haswell-Server */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell-Celeron */
-       case 61: /* Broadwell */
-               rapl_cntr_mask = RAPL_IDX_HSW;
-               rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
-               break;
-       case 45: /* Sandy Bridge-EP */
-       case 62: /* IvyTown */
-               rapl_cntr_mask = RAPL_IDX_SRV;
-               rapl_pmu_events_group.attrs = rapl_events_srv_attr;
-               break;
-       case 87: /* Knights Landing */
-               rapl_add_quirk(rapl_hsw_server_quirk);
-               rapl_cntr_mask = RAPL_IDX_KNL;
-               rapl_pmu_events_group.attrs = rapl_events_knl_attr;
-
-       default:
-               /* unsupported */
-               return 0;
-       }
-       ret = rapl_check_hw_unit();
-       if (ret)
-               return ret;
-
-       /* run cpu model quirks */
-       for (quirk = rapl_quirks; quirk; quirk = quirk->next)
-               quirk->func();
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               ret = rapl_cpu_prepare(cpu);
-               if (ret)
-                       goto out;
-               rapl_cpu_init(cpu);
-       }
-
-       __perf_cpu_notifier(rapl_cpu_notifier);
-
-       ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
-       if (WARN_ON(ret)) {
-               pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
-               cpu_notifier_register_done();
-               return -1;
-       }
-
-       pmu = __this_cpu_read(rapl_pmu);
-
-       pr_info("RAPL PMU detected,"
-               " API unit is 2^-32 Joules,"
-               " %d fixed counters"
-               " %llu ms ovfl timer\n",
-               hweight32(rapl_cntr_mask),
-               ktime_to_ms(pmu->timer_interval));
-       for (i = 0; i < NR_RAPL_DOMAINS; i++) {
-               if (rapl_cntr_mask & (1 << i)) {
-                       pr_info("hw unit of domain %s 2^-%d Joules\n",
-                               rapl_domain_names[i], rapl_hw_unit[i]);
-               }
-       }
-out:
-       cpu_notifier_register_done();
-
-       return 0;
-}
-device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
deleted file mode 100644 (file)
index f97f807..0000000
+++ /dev/null
@@ -1,1398 +0,0 @@
-#include "perf_event_intel_uncore.h"
-
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
-struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
-struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
-
-static bool pcidrv_registered;
-struct pci_driver *uncore_pci_driver;
-/* pci bus to socket mapping */
-DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
-struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
-struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-
-static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-/* mask of cpus that collect uncore events */
-static cpumask_t uncore_cpu_mask;
-
-/* constraint for the fixed counter */
-static struct event_constraint uncore_constraint_fixed =
-       EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-struct event_constraint uncore_constraint_empty =
-       EVENT_CONSTRAINT(0, 0, 0);
-
-int uncore_pcibus_to_physid(struct pci_bus *bus)
-{
-       struct pci2phy_map *map;
-       int phys_id = -1;
-
-       raw_spin_lock(&pci2phy_map_lock);
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == pci_domain_nr(bus)) {
-                       phys_id = map->pbus_to_physid[bus->number];
-                       break;
-               }
-       }
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       return phys_id;
-}
-
-struct pci2phy_map *__find_pci2phy_map(int segment)
-{
-       struct pci2phy_map *map, *alloc = NULL;
-       int i;
-
-       lockdep_assert_held(&pci2phy_map_lock);
-
-lookup:
-       list_for_each_entry(map, &pci2phy_map_head, list) {
-               if (map->segment == segment)
-                       goto end;
-       }
-
-       if (!alloc) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
-               raw_spin_lock(&pci2phy_map_lock);
-
-               if (!alloc)
-                       return NULL;
-
-               goto lookup;
-       }
-
-       map = alloc;
-       alloc = NULL;
-       map->segment = segment;
-       for (i = 0; i < 256; i++)
-               map->pbus_to_physid[i] = -1;
-       list_add_tail(&map->list, &pci2phy_map_head);
-
-end:
-       kfree(alloc);
-       return map;
-}
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf)
-{
-       struct uncore_event_desc *event =
-               container_of(attr, struct uncore_event_desc, attr);
-       return sprintf(buf, "%s", event->config);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
-       return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
-       struct intel_uncore_box *box;
-
-       box = *per_cpu_ptr(pmu->box, cpu);
-       if (box)
-               return box;
-
-       raw_spin_lock(&uncore_box_lock);
-       /* Recheck in lock to handle races. */
-       if (*per_cpu_ptr(pmu->box, cpu))
-               goto out;
-       list_for_each_entry(box, &pmu->box_list, list) {
-               if (box->phys_id == topology_physical_package_id(cpu)) {
-                       atomic_inc(&box->refcnt);
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-                       break;
-               }
-       }
-out:
-       raw_spin_unlock(&uncore_box_lock);
-
-       return *per_cpu_ptr(pmu->box, cpu);
-}
-
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
-       /*
-        * perf core schedules event on the basis of cpu, uncore events are
-        * collected by one of the cpus inside a physical package.
-        */
-       return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
-}
-
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 count;
-
-       rdmsrl(event->hw.event_base, count);
-
-       return count;
-}
-
-/*
- * generic get constraint function for shared match/mask registers.
- */
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       unsigned long flags;
-       bool ok = false;
-
-       /*
-        * reg->alloc can be set due to existing state, so for fake box we
-        * need to ignore this, otherwise we might fail to allocate proper
-        * fake state for this extra reg constraint.
-        */
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-
-       er = &box->shared_regs[reg1->idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!atomic_read(&er->ref) ||
-           (er->config1 == reg1->config && er->config2 == reg2->config)) {
-               atomic_inc(&er->ref);
-               er->config1 = reg1->config;
-               er->config2 = reg2->config;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (ok) {
-               if (!uncore_box_is_fake(box))
-                       reg1->alloc = 1;
-               return NULL;
-       }
-
-       return &uncore_constraint_empty;
-}
-
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-
-       /*
-        * Only put constraint if extra reg was actually allocated. Also
-        * takes care of event which do not use an extra shared reg.
-        *
-        * Also, if this is a fake box we shouldn't touch any event state
-        * (reg->alloc) and we don't care about leaving inconsistent box
-        * state either since it will be thrown out.
-        */
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       er = &box->shared_regs[reg1->idx];
-       atomic_dec(&er->ref);
-       reg1->alloc = 0;
-}
-
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       er = &box->shared_regs[idx];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return config;
-}
-
-static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hwc->idx = idx;
-       hwc->last_tag = ++box->tags[idx];
-
-       if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
-               hwc->event_base = uncore_fixed_ctr(box);
-               hwc->config_base = uncore_fixed_ctl(box);
-               return;
-       }
-
-       hwc->config_base = uncore_event_ctl(box, hwc->idx);
-       hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
-}
-
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
-{
-       u64 prev_count, new_count, delta;
-       int shift;
-
-       if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
-               shift = 64 - uncore_fixed_ctr_bits(box);
-       else
-               shift = 64 - uncore_perf_ctr_bits(box);
-
-       /* the hrtimer might modify the previous event value */
-again:
-       prev_count = local64_read(&event->hw.prev_count);
-       new_count = uncore_read_counter(box, event);
-       if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
-               goto again;
-
-       delta = (new_count << shift) - (prev_count << shift);
-       delta >>= shift;
-
-       local64_add(delta, &event->count);
-}
-
-/*
- * The overflow interrupt is unavailable for SandyBridge-EP, is broken
- * for SandyBridge. So we use hrtimer to periodically poll the counter
- * to avoid overflow.
- */
-static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
-{
-       struct intel_uncore_box *box;
-       struct perf_event *event;
-       unsigned long flags;
-       int bit;
-
-       box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
-       if (!box->n_active || box->cpu != smp_processor_id())
-               return HRTIMER_NORESTART;
-       /*
-        * disable local interrupt to prevent uncore_pmu_event_start/stop
-        * to interrupt the update process
-        */
-       local_irq_save(flags);
-
-       /*
-        * handle boxes with an active event list as opposed to active
-        * counters
-        */
-       list_for_each_entry(event, &box->active_list, active_entry) {
-               uncore_perf_event_update(box, event);
-       }
-
-       for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
-               uncore_perf_event_update(box, box->events[bit]);
-
-       local_irq_restore(flags);
-
-       hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
-       return HRTIMER_RESTART;
-}
-
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
-                     HRTIMER_MODE_REL_PINNED);
-}
-
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_cancel(&box->hrtimer);
-}
-
-static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
-{
-       hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       box->hrtimer.function = uncore_pmu_hrtimer;
-}
-
-static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
-{
-       struct intel_uncore_box *box;
-       int i, size;
-
-       size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
-
-       box = kzalloc_node(size, GFP_KERNEL, node);
-       if (!box)
-               return NULL;
-
-       for (i = 0; i < type->num_shared_regs; i++)
-               raw_spin_lock_init(&box->shared_regs[i].lock);
-
-       uncore_pmu_init_hrtimer(box);
-       atomic_set(&box->refcnt, 1);
-       box->cpu = -1;
-       box->phys_id = -1;
-
-       /* set default hrtimer timeout */
-       box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
-
-       INIT_LIST_HEAD(&box->active_list);
-
-       return box;
-}
-
-/*
- * Using uncore_pmu_event_init pmu event_init callback
- * as a detection point for uncore events.
- */
-static int uncore_pmu_event_init(struct perf_event *event);
-
-static bool is_uncore_event(struct perf_event *event)
-{
-       return event->pmu->event_init == uncore_pmu_event_init;
-}
-
-static int
-uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
-{
-       struct perf_event *event;
-       int n, max_count;
-
-       max_count = box->pmu->type->num_counters;
-       if (box->pmu->type->fixed_ctl)
-               max_count++;
-
-       if (box->n_events >= max_count)
-               return -EINVAL;
-
-       n = box->n_events;
-
-       if (is_uncore_event(leader)) {
-               box->event_list[n] = leader;
-               n++;
-       }
-
-       if (!dogrp)
-               return n;
-
-       list_for_each_entry(event, &leader->sibling_list, group_entry) {
-               if (!is_uncore_event(event) ||
-                   event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-
-               if (n >= max_count)
-                       return -EINVAL;
-
-               box->event_list[n] = event;
-               n++;
-       }
-       return n;
-}
-
-static struct event_constraint *
-uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct event_constraint *c;
-
-       if (type->ops->get_constraint) {
-               c = type->ops->get_constraint(box, event);
-               if (c)
-                       return c;
-       }
-
-       if (event->attr.config == UNCORE_FIXED_EVENT)
-               return &uncore_constraint_fixed;
-
-       if (type->constraints) {
-               for_each_event_constraint(c, type->constraints) {
-                       if ((event->hw.config & c->cmask) == c->code)
-                               return c;
-               }
-       }
-
-       return &type->unconstrainted;
-}
-
-static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       if (box->pmu->type->ops->put_constraint)
-               box->pmu->type->ops->put_constraint(box, event);
-}
-
-static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
-{
-       unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       struct event_constraint *c;
-       int i, wmin, wmax, ret = 0;
-       struct hw_perf_event *hwc;
-
-       bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
-
-       for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               c = uncore_get_event_constraint(box, box->event_list[i]);
-               box->event_constraint[i] = c;
-               wmin = min(wmin, c->weight);
-               wmax = max(wmax, c->weight);
-       }
-
-       /* fastpath, try to reuse previous register */
-       for (i = 0; i < n; i++) {
-               hwc = &box->event_list[i]->hw;
-               c = box->event_constraint[i];
-
-               /* never assigned */
-               if (hwc->idx == -1)
-                       break;
-
-               /* constraint still honored */
-               if (!test_bit(hwc->idx, c->idxmsk))
-                       break;
-
-               /* not already used */
-               if (test_bit(hwc->idx, used_mask))
-                       break;
-
-               __set_bit(hwc->idx, used_mask);
-               if (assign)
-                       assign[i] = hwc->idx;
-       }
-       /* slow path */
-       if (i != n)
-               ret = perf_assign_events(box->event_constraint, n,
-                                        wmin, wmax, n, assign);
-
-       if (!assign || ret) {
-               for (i = 0; i < n; i++)
-                       uncore_put_event_constraint(box, box->event_list[i]);
-       }
-       return ret ? -EINVAL : 0;
-}
-
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int idx = event->hw.idx;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
-               return;
-
-       event->hw.state = 0;
-       box->events[idx] = event;
-       box->n_active++;
-       __set_bit(idx, box->active_mask);
-
-       local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
-       uncore_enable_event(box, event);
-
-       if (box->n_active == 1) {
-               uncore_enable_box(box);
-               uncore_pmu_start_hrtimer(box);
-       }
-}
-
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
-               uncore_disable_event(box, event);
-               box->n_active--;
-               box->events[hwc->idx] = NULL;
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               if (box->n_active == 0) {
-                       uncore_disable_box(box);
-                       uncore_pmu_cancel_hrtimer(box);
-               }
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-       int assign[UNCORE_PMC_IDX_MAX];
-       int i, n, ret;
-
-       if (!box)
-               return -ENODEV;
-
-       ret = n = uncore_collect_events(box, event, false);
-       if (ret < 0)
-               return ret;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       ret = uncore_assign_events(box, assign, n);
-       if (ret)
-               return ret;
-
-       /* save events moving to new counters */
-       for (i = 0; i < box->n_events; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx == assign[i] &&
-                       hwc->last_tag == box->tags[assign[i]])
-                       continue;
-               /*
-                * Ensure we don't accidentally enable a stopped
-                * counter simply because we rescheduled.
-                */
-               if (hwc->state & PERF_HES_STOPPED)
-                       hwc->state |= PERF_HES_ARCH;
-
-               uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-       }
-
-       /* reprogram moved events into new counters */
-       for (i = 0; i < n; i++) {
-               event = box->event_list[i];
-               hwc = &event->hw;
-
-               if (hwc->idx != assign[i] ||
-                       hwc->last_tag != box->tags[assign[i]])
-                       uncore_assign_hw_event(box, event, assign[i]);
-               else if (i < box->n_events)
-                       continue;
-
-               if (hwc->state & PERF_HES_ARCH)
-                       continue;
-
-               uncore_pmu_event_start(event, 0);
-       }
-       box->n_events = n;
-
-       return 0;
-}
-
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       uncore_put_event_constraint(box, event);
-
-                       while (++i < box->n_events)
-                               box->event_list[i - 1] = box->event_list[i];
-
-                       --box->n_events;
-                       break;
-               }
-       }
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-}
-
-void uncore_pmu_event_read(struct perf_event *event)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       uncore_perf_event_update(box, event);
-}
-
-/*
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int uncore_validate_group(struct intel_uncore_pmu *pmu,
-                               struct perf_event *event)
-{
-       struct perf_event *leader = event->group_leader;
-       struct intel_uncore_box *fake_box;
-       int ret = -EINVAL, n;
-
-       fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
-       if (!fake_box)
-               return -ENOMEM;
-
-       fake_box->pmu = pmu;
-       /*
-        * the event is not yet connected with its
-        * siblings therefore we must first collect
-        * existing siblings, then add the new event
-        * before we can simulate the scheduling
-        */
-       n = uncore_collect_events(fake_box, leader, true);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-       n = uncore_collect_events(fake_box, event, false);
-       if (n < 0)
-               goto out;
-
-       fake_box->n_events = n;
-
-       ret = uncore_assign_events(fake_box, NULL, n);
-out:
-       kfree(fake_box);
-       return ret;
-}
-
-static int uncore_pmu_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       int ret;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /*
-        * Uncore PMU does measure at all privilege level all the time.
-        * So it doesn't make sense to specify any exclude bits.
-        */
-       if (event->attr.exclude_user || event->attr.exclude_kernel ||
-                       event->attr.exclude_hv || event->attr.exclude_idle)
-               return -EINVAL;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
-       if (event->attr.config == UNCORE_FIXED_EVENT) {
-               /* no fixed counter */
-               if (!pmu->type->fixed_ctl)
-                       return -EINVAL;
-               /*
-                * if there is only one fixed counter, only the first pmu
-                * can access the fixed counter
-                */
-               if (pmu->type->single_fixed && pmu->pmu_idx > 0)
-                       return -EINVAL;
-
-               /* fixed counters have event field hardcoded to zero */
-               hwc->config = 0ULL;
-       } else {
-               hwc->config = event->attr.config & pmu->type->event_mask;
-               if (pmu->type->ops->hw_config) {
-                       ret = pmu->type->ops->hw_config(box, event);
-                       if (ret)
-                               return ret;
-               }
-       }
-
-       if (event->group_leader != event)
-               ret = uncore_validate_group(pmu, event);
-       else
-               ret = 0;
-
-       return ret;
-}
-
-static ssize_t uncore_get_attr_cpumask(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
-
-static struct attribute *uncore_pmu_attrs[] = {
-       &dev_attr_cpumask.attr,
-       NULL,
-};
-
-static struct attribute_group uncore_pmu_attr_group = {
-       .attrs = uncore_pmu_attrs,
-};
-
-static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
-{
-       int ret;
-
-       if (!pmu->type->pmu) {
-               pmu->pmu = (struct pmu) {
-                       .attr_groups    = pmu->type->attr_groups,
-                       .task_ctx_nr    = perf_invalid_context,
-                       .event_init     = uncore_pmu_event_init,
-                       .add            = uncore_pmu_event_add,
-                       .del            = uncore_pmu_event_del,
-                       .start          = uncore_pmu_event_start,
-                       .stop           = uncore_pmu_event_stop,
-                       .read           = uncore_pmu_event_read,
-               };
-       } else {
-               pmu->pmu = *pmu->type->pmu;
-               pmu->pmu.attr_groups = pmu->type->attr_groups;
-       }
-
-       if (pmu->type->num_boxes == 1) {
-               if (strlen(pmu->type->name) > 0)
-                       sprintf(pmu->name, "uncore_%s", pmu->type->name);
-               else
-                       sprintf(pmu->name, "uncore");
-       } else {
-               sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
-                       pmu->pmu_idx);
-       }
-
-       ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
-       return ret;
-}
-
-static void __init uncore_type_exit(struct intel_uncore_type *type)
-{
-       int i;
-
-       for (i = 0; i < type->num_boxes; i++)
-               free_percpu(type->pmus[i].box);
-       kfree(type->pmus);
-       type->pmus = NULL;
-       kfree(type->events_group);
-       type->events_group = NULL;
-}
-
-static void __init uncore_types_exit(struct intel_uncore_type **types)
-{
-       int i;
-       for (i = 0; types[i]; i++)
-               uncore_type_exit(types[i]);
-}
-
-static int __init uncore_type_init(struct intel_uncore_type *type)
-{
-       struct intel_uncore_pmu *pmus;
-       struct attribute_group *attr_group;
-       struct attribute **attrs;
-       int i, j;
-
-       pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
-       if (!pmus)
-               return -ENOMEM;
-
-       type->pmus = pmus;
-
-       type->unconstrainted = (struct event_constraint)
-               __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
-                               0, type->num_counters, 0, 0);
-
-       for (i = 0; i < type->num_boxes; i++) {
-               pmus[i].func_id = -1;
-               pmus[i].pmu_idx = i;
-               pmus[i].type = type;
-               INIT_LIST_HEAD(&pmus[i].box_list);
-               pmus[i].box = alloc_percpu(struct intel_uncore_box *);
-               if (!pmus[i].box)
-                       goto fail;
-       }
-
-       if (type->event_descs) {
-               i = 0;
-               while (type->event_descs[i].attr.attr.name)
-                       i++;
-
-               attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
-                                       sizeof(*attr_group), GFP_KERNEL);
-               if (!attr_group)
-                       goto fail;
-
-               attrs = (struct attribute **)(attr_group + 1);
-               attr_group->name = "events";
-               attr_group->attrs = attrs;
-
-               for (j = 0; j < i; j++)
-                       attrs[j] = &type->event_descs[j].attr.attr;
-
-               type->events_group = attr_group;
-       }
-
-       type->pmu_group = &uncore_pmu_attr_group;
-       return 0;
-fail:
-       uncore_type_exit(type);
-       return -ENOMEM;
-}
-
-static int __init uncore_types_init(struct intel_uncore_type **types)
-{
-       int i, ret;
-
-       for (i = 0; types[i]; i++) {
-               ret = uncore_type_init(types[i]);
-               if (ret)
-                       goto fail;
-       }
-       return 0;
-fail:
-       while (--i >= 0)
-               uncore_type_exit(types[i]);
-       return ret;
-}
-
-/*
- * add a pci uncore device
- */
-static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct intel_uncore_type *type;
-       int phys_id;
-       bool first_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       if (phys_id < 0)
-               return -ENODEV;
-
-       if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
-               int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
-               uncore_extra_pci_dev[phys_id][idx] = pdev;
-               pci_set_drvdata(pdev, NULL);
-               return 0;
-       }
-
-       type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
-       box = uncore_alloc_box(type, NUMA_NO_NODE);
-       if (!box)
-               return -ENOMEM;
-
-       /*
-        * for performance monitoring unit with multiple boxes,
-        * each box has a different function id.
-        */
-       pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
-       /* Knights Landing uses a common PCI device ID for multiple instances of
-        * an uncore PMU device type. There is only one entry per device type in
-        * the knl_uncore_pci_ids table inspite of multiple devices present for
-        * some device types. Hence PCI device idx would be 0 for all devices.
-        * So increment pmu pointer to point to an unused array element.
-        */
-       if (boot_cpu_data.x86_model == 87)
-               while (pmu->func_id >= 0)
-                       pmu++;
-       if (pmu->func_id < 0)
-               pmu->func_id = pdev->devfn;
-       else
-               WARN_ON_ONCE(pmu->func_id != pdev->devfn);
-
-       box->phys_id = phys_id;
-       box->pci_dev = pdev;
-       box->pmu = pmu;
-       uncore_box_init(box);
-       pci_set_drvdata(pdev, box);
-
-       raw_spin_lock(&uncore_box_lock);
-       if (list_empty(&pmu->box_list))
-               first_box = true;
-       list_add_tail(&box->list, &pmu->box_list);
-       raw_spin_unlock(&uncore_box_lock);
-
-       if (first_box)
-               uncore_pmu_register(pmu);
-       return 0;
-}
-
-static void uncore_pci_remove(struct pci_dev *pdev)
-{
-       struct intel_uncore_box *box = pci_get_drvdata(pdev);
-       struct intel_uncore_pmu *pmu;
-       int i, cpu, phys_id;
-       bool last_box = false;
-
-       phys_id = uncore_pcibus_to_physid(pdev->bus);
-       box = pci_get_drvdata(pdev);
-       if (!box) {
-               for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
-                       if (uncore_extra_pci_dev[phys_id][i] == pdev) {
-                               uncore_extra_pci_dev[phys_id][i] = NULL;
-                               break;
-                       }
-               }
-               WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
-               return;
-       }
-
-       pmu = box->pmu;
-       if (WARN_ON_ONCE(phys_id != box->phys_id))
-               return;
-
-       pci_set_drvdata(pdev, NULL);
-
-       raw_spin_lock(&uncore_box_lock);
-       list_del(&box->list);
-       if (list_empty(&pmu->box_list))
-               last_box = true;
-       raw_spin_unlock(&uncore_box_lock);
-
-       for_each_possible_cpu(cpu) {
-               if (*per_cpu_ptr(pmu->box, cpu) == box) {
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       atomic_dec(&box->refcnt);
-               }
-       }
-
-       WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
-       kfree(box);
-
-       if (last_box)
-               perf_pmu_unregister(&pmu->pmu);
-}
-
-static int __init uncore_pci_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 45: /* Sandy Bridge-EP */
-               ret = snbep_uncore_pci_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ret = ivbep_uncore_pci_init();
-               break;
-       case 63: /* Haswell-EP */
-               ret = hswep_uncore_pci_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               ret = bdx_uncore_pci_init();
-               break;
-       case 42: /* Sandy Bridge */
-               ret = snb_uncore_pci_init();
-               break;
-       case 58: /* Ivy Bridge */
-               ret = ivb_uncore_pci_init();
-               break;
-       case 60: /* Haswell */
-       case 69: /* Haswell Celeron */
-               ret = hsw_uncore_pci_init();
-               break;
-       case 61: /* Broadwell */
-               ret = bdw_uncore_pci_init();
-               break;
-       case 87: /* Knights Landing */
-               ret = knl_uncore_pci_init();
-               break;
-       default:
-               return 0;
-       }
-
-       if (ret)
-               return ret;
-
-       ret = uncore_types_init(uncore_pci_uncores);
-       if (ret)
-               return ret;
-
-       uncore_pci_driver->probe = uncore_pci_probe;
-       uncore_pci_driver->remove = uncore_pci_remove;
-
-       ret = pci_register_driver(uncore_pci_driver);
-       if (ret == 0)
-               pcidrv_registered = true;
-       else
-               uncore_types_exit(uncore_pci_uncores);
-
-       return ret;
-}
-
-static void __init uncore_pci_exit(void)
-{
-       if (pcidrv_registered) {
-               pcidrv_registered = false;
-               pci_unregister_driver(uncore_pci_driver);
-               uncore_types_exit(uncore_pci_uncores);
-       }
-}
-
-/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
-static LIST_HEAD(boxes_to_free);
-
-static void uncore_kfree_boxes(void)
-{
-       struct intel_uncore_box *box;
-
-       while (!list_empty(&boxes_to_free)) {
-               box = list_entry(boxes_to_free.next,
-                                struct intel_uncore_box, list);
-               list_del(&box->list);
-               kfree(box);
-       }
-}
-
-static void uncore_cpu_dying(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       *per_cpu_ptr(pmu->box, cpu) = NULL;
-                       if (box && atomic_dec_and_test(&box->refcnt))
-                               list_add(&box->list, &boxes_to_free);
-               }
-       }
-}
-
-static int uncore_cpu_starting(int cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box, *exist;
-       int i, j, k, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       box = *per_cpu_ptr(pmu->box, cpu);
-                       /* called by uncore_cpu_init? */
-                       if (box && box->phys_id >= 0) {
-                               uncore_box_init(box);
-                               continue;
-                       }
-
-                       for_each_online_cpu(k) {
-                               exist = *per_cpu_ptr(pmu->box, k);
-                               if (exist && exist->phys_id == phys_id) {
-                                       atomic_inc(&exist->refcnt);
-                                       *per_cpu_ptr(pmu->box, cpu) = exist;
-                                       if (box) {
-                                               list_add(&box->list,
-                                                        &boxes_to_free);
-                                               box = NULL;
-                                       }
-                                       break;
-                               }
-                       }
-
-                       if (box) {
-                               box->phys_id = phys_id;
-                               uncore_box_init(box);
-                       }
-               }
-       }
-       return 0;
-}
-
-static int uncore_cpu_prepare(int cpu, int phys_id)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (pmu->func_id < 0)
-                               pmu->func_id = j;
-
-                       box = uncore_alloc_box(type, cpu_to_node(cpu));
-                       if (!box)
-                               return -ENOMEM;
-
-                       box->pmu = pmu;
-                       box->phys_id = phys_id;
-                       *per_cpu_ptr(pmu->box, cpu) = box;
-               }
-       }
-       return 0;
-}
-
-static void
-uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
-{
-       struct intel_uncore_type *type;
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       int i, j;
-
-       for (i = 0; uncores[i]; i++) {
-               type = uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       if (old_cpu < 0)
-                               box = uncore_pmu_to_box(pmu, new_cpu);
-                       else
-                               box = uncore_pmu_to_box(pmu, old_cpu);
-                       if (!box)
-                               continue;
-
-                       if (old_cpu < 0) {
-                               WARN_ON_ONCE(box->cpu != -1);
-                               box->cpu = new_cpu;
-                               continue;
-                       }
-
-                       WARN_ON_ONCE(box->cpu != old_cpu);
-                       if (new_cpu >= 0) {
-                               uncore_pmu_cancel_hrtimer(box);
-                               perf_pmu_migrate_context(&pmu->pmu,
-                                               old_cpu, new_cpu);
-                               box->cpu = new_cpu;
-                       } else {
-                               box->cpu = -1;
-                       }
-               }
-       }
-}
-
-static void uncore_event_exit_cpu(int cpu)
-{
-       int i, phys_id, target;
-
-       /* if exiting cpu is used for collecting uncore events */
-       if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
-               return;
-
-       /* find a new cpu to collect uncore events */
-       phys_id = topology_physical_package_id(cpu);
-       target = -1;
-       for_each_online_cpu(i) {
-               if (i == cpu)
-                       continue;
-               if (phys_id == topology_physical_package_id(i)) {
-                       target = i;
-                       break;
-               }
-       }
-
-       /* migrate uncore events to the new cpu */
-       if (target >= 0)
-               cpumask_set_cpu(target, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, cpu, target);
-       uncore_change_context(uncore_pci_uncores, cpu, target);
-}
-
-static void uncore_event_init_cpu(int cpu)
-{
-       int i, phys_id;
-
-       phys_id = topology_physical_package_id(cpu);
-       for_each_cpu(i, &uncore_cpu_mask) {
-               if (phys_id == topology_physical_package_id(i))
-                       return;
-       }
-
-       cpumask_set_cpu(cpu, &uncore_cpu_mask);
-
-       uncore_change_context(uncore_msr_uncores, -1, cpu);
-       uncore_change_context(uncore_pci_uncores, -1, cpu);
-}
-
-static int uncore_cpu_notifier(struct notifier_block *self,
-                              unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       /* allocate/free data structure for uncore box */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_UP_PREPARE:
-               uncore_cpu_prepare(cpu, -1);
-               break;
-       case CPU_STARTING:
-               uncore_cpu_starting(cpu);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_DYING:
-               uncore_cpu_dying(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DEAD:
-               uncore_kfree_boxes();
-               break;
-       default:
-               break;
-       }
-
-       /* select the cpu that collects uncore events */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_FAILED:
-       case CPU_STARTING:
-               uncore_event_init_cpu(cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               uncore_event_exit_cpu(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block uncore_cpu_nb = {
-       .notifier_call  = uncore_cpu_notifier,
-       /*
-        * to migrate uncore events, our notifier should be executed
-        * before perf core's notifier.
-        */
-       .priority       = CPU_PRI_PERF + 1,
-};
-
-static void __init uncore_cpu_setup(void *dummy)
-{
-       uncore_cpu_starting(smp_processor_id());
-}
-
-static int __init uncore_cpu_init(void)
-{
-       int ret;
-
-       switch (boot_cpu_data.x86_model) {
-       case 26: /* Nehalem */
-       case 30:
-       case 37: /* Westmere */
-       case 44:
-               nhm_uncore_cpu_init();
-               break;
-       case 42: /* Sandy Bridge */
-       case 58: /* Ivy Bridge */
-       case 60: /* Haswell */
-       case 69: /* Haswell */
-       case 70: /* Haswell */
-       case 61: /* Broadwell */
-       case 71: /* Broadwell */
-               snb_uncore_cpu_init();
-               break;
-       case 45: /* Sandy Bridge-EP */
-               snbep_uncore_cpu_init();
-               break;
-       case 46: /* Nehalem-EX */
-       case 47: /* Westmere-EX aka. Xeon E7 */
-               nhmex_uncore_cpu_init();
-               break;
-       case 62: /* Ivy Bridge-EP */
-               ivbep_uncore_cpu_init();
-               break;
-       case 63: /* Haswell-EP */
-               hswep_uncore_cpu_init();
-               break;
-       case 79: /* BDX-EP */
-       case 86: /* BDX-DE */
-               bdx_uncore_cpu_init();
-               break;
-       case 87: /* Knights Landing */
-               knl_uncore_cpu_init();
-               break;
-       default:
-               return 0;
-       }
-
-       ret = uncore_types_init(uncore_msr_uncores);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
-static int __init uncore_pmus_register(void)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_type *type;
-       int i, j;
-
-       for (i = 0; uncore_msr_uncores[i]; i++) {
-               type = uncore_msr_uncores[i];
-               for (j = 0; j < type->num_boxes; j++) {
-                       pmu = &type->pmus[j];
-                       uncore_pmu_register(pmu);
-               }
-       }
-
-       return 0;
-}
-
-static void __init uncore_cpumask_init(void)
-{
-       int cpu;
-
-       /*
-        * ony invoke once from msr or pci init code
-        */
-       if (!cpumask_empty(&uncore_cpu_mask))
-               return;
-
-       cpu_notifier_register_begin();
-
-       for_each_online_cpu(cpu) {
-               int i, phys_id = topology_physical_package_id(cpu);
-
-               for_each_cpu(i, &uncore_cpu_mask) {
-                       if (phys_id == topology_physical_package_id(i)) {
-                               phys_id = -1;
-                               break;
-                       }
-               }
-               if (phys_id < 0)
-                       continue;
-
-               uncore_cpu_prepare(cpu, phys_id);
-               uncore_event_init_cpu(cpu);
-       }
-       on_each_cpu(uncore_cpu_setup, NULL, 1);
-
-       __register_cpu_notifier(&uncore_cpu_nb);
-
-       cpu_notifier_register_done();
-}
-
-
-static int __init intel_uncore_init(void)
-{
-       int ret;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return -ENODEV;
-
-       if (cpu_has_hypervisor)
-               return -ENODEV;
-
-       ret = uncore_pci_init();
-       if (ret)
-               goto fail;
-       ret = uncore_cpu_init();
-       if (ret) {
-               uncore_pci_exit();
-               goto fail;
-       }
-       uncore_cpumask_init();
-
-       uncore_pmus_register();
-       return 0;
-fail:
-       return ret;
-}
-device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
deleted file mode 100644 (file)
index 07aa2d6..0000000
+++ /dev/null
@@ -1,356 +0,0 @@
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/perf_event.h>
-#include "perf_event.h"
-
-#define UNCORE_PMU_NAME_LEN            32
-#define UNCORE_PMU_HRTIMER_INTERVAL    (60LL * NSEC_PER_SEC)
-#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
-
-#define UNCORE_FIXED_EVENT             0xff
-#define UNCORE_PMC_IDX_MAX_GENERIC     8
-#define UNCORE_PMC_IDX_FIXED           UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX             (UNCORE_PMC_IDX_FIXED + 1)
-
-#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
-#define UNCORE_PCI_DEV_TYPE(data)      ((data >> 8) & 0xff)
-#define UNCORE_PCI_DEV_IDX(data)       (data & 0xff)
-#define UNCORE_EXTRA_PCI_DEV           0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX       3
-
-/* support up to 8 sockets */
-#define UNCORE_SOCKET_MAX              8
-
-#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-
-struct intel_uncore_ops;
-struct intel_uncore_pmu;
-struct intel_uncore_box;
-struct uncore_event_desc;
-
-struct intel_uncore_type {
-       const char *name;
-       int num_counters;
-       int num_boxes;
-       int perf_ctr_bits;
-       int fixed_ctr_bits;
-       unsigned perf_ctr;
-       unsigned event_ctl;
-       unsigned event_mask;
-       unsigned fixed_ctr;
-       unsigned fixed_ctl;
-       unsigned box_ctl;
-       unsigned msr_offset;
-       unsigned num_shared_regs:8;
-       unsigned single_fixed:1;
-       unsigned pair_ctr_ctl:1;
-       unsigned *msr_offsets;
-       struct event_constraint unconstrainted;
-       struct event_constraint *constraints;
-       struct intel_uncore_pmu *pmus;
-       struct intel_uncore_ops *ops;
-       struct uncore_event_desc *event_descs;
-       const struct attribute_group *attr_groups[4];
-       struct pmu *pmu; /* for custom pmu ops */
-};
-
-#define pmu_group attr_groups[0]
-#define format_group attr_groups[1]
-#define events_group attr_groups[2]
-
-struct intel_uncore_ops {
-       void (*init_box)(struct intel_uncore_box *);
-       void (*disable_box)(struct intel_uncore_box *);
-       void (*enable_box)(struct intel_uncore_box *);
-       void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
-       void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
-       u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
-       int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
-       struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
-                                                  struct perf_event *);
-       void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
-};
-
-struct intel_uncore_pmu {
-       struct pmu pmu;
-       char name[UNCORE_PMU_NAME_LEN];
-       int pmu_idx;
-       int func_id;
-       struct intel_uncore_type *type;
-       struct intel_uncore_box ** __percpu box;
-       struct list_head box_list;
-};
-
-struct intel_uncore_extra_reg {
-       raw_spinlock_t lock;
-       u64 config, config1, config2;
-       atomic_t ref;
-};
-
-struct intel_uncore_box {
-       int phys_id;
-       int n_active;   /* number of active events */
-       int n_events;
-       int cpu;        /* cpu to collect events */
-       unsigned long flags;
-       atomic_t refcnt;
-       struct perf_event *events[UNCORE_PMC_IDX_MAX];
-       struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
-       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
-       unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-       u64 tags[UNCORE_PMC_IDX_MAX];
-       struct pci_dev *pci_dev;
-       struct intel_uncore_pmu *pmu;
-       u64 hrtimer_duration; /* hrtimer timeout for this box */
-       struct hrtimer hrtimer;
-       struct list_head list;
-       struct list_head active_list;
-       void *io_addr;
-       struct intel_uncore_extra_reg shared_regs[0];
-};
-
-#define UNCORE_BOX_FLAG_INITIATED      0
-
-struct uncore_event_desc {
-       struct kobj_attribute attr;
-       const char *config;
-};
-
-struct pci2phy_map {
-       struct list_head list;
-       int segment;
-       int pbus_to_physid[256];
-};
-
-int uncore_pcibus_to_physid(struct pci_bus *bus);
-struct pci2phy_map *__find_pci2phy_map(int segment);
-
-ssize_t uncore_event_show(struct kobject *kobj,
-                         struct kobj_attribute *attr, char *buf);
-
-#define INTEL_UNCORE_EVENT_DESC(_name, _config)                        \
-{                                                              \
-       .attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
-       .config = _config,                                      \
-}
-
-#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)                        \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj,            \
-                               struct kobj_attribute *attr,            \
-                               char *page)                             \
-{                                                                      \
-       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
-       return sprintf(page, _format "\n");                             \
-}                                                                      \
-static struct kobj_attribute format_attr_##_var =                      \
-       __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
-
-static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->box_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr;
-}
-
-static inline
-unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return idx * 4 + box->pmu->type->event_ctl;
-}
-
-static inline
-unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return idx * 8 + box->pmu->type->perf_ctr;
-}
-
-static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
-{
-       struct intel_uncore_pmu *pmu = box->pmu;
-       return pmu->type->msr_offsets ?
-               pmu->type->msr_offsets[pmu->pmu_idx] :
-               pmu->type->msr_offset * pmu->pmu_idx;
-}
-
-static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->box_ctl)
-               return 0;
-       return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (!box->pmu->type->fixed_ctl)
-               return 0;
-       return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->event_ctl +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       return box->pmu->type->perf_ctr +
-               (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
-               uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctl(box);
-       else
-               return uncore_msr_fixed_ctl(box);
-}
-
-static inline
-unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
-{
-       if (box->pci_dev)
-               return uncore_pci_fixed_ctr(box);
-       else
-               return uncore_msr_fixed_ctr(box);
-}
-
-static inline
-unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_event_ctl(box, idx);
-       else
-               return uncore_msr_event_ctl(box, idx);
-}
-
-static inline
-unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
-{
-       if (box->pci_dev)
-               return uncore_pci_perf_ctr(box, idx);
-       else
-               return uncore_msr_perf_ctr(box, idx);
-}
-
-static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->perf_ctr_bits;
-}
-
-static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
-{
-       return box->pmu->type->fixed_ctr_bits;
-}
-
-static inline int uncore_num_counters(struct intel_uncore_box *box)
-{
-       return box->pmu->type->num_counters;
-}
-
-static inline void uncore_disable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->disable_box)
-               box->pmu->type->ops->disable_box(box);
-}
-
-static inline void uncore_enable_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->type->ops->enable_box)
-               box->pmu->type->ops->enable_box(box);
-}
-
-static inline void uncore_disable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->disable_event(box, event);
-}
-
-static inline void uncore_enable_event(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       box->pmu->type->ops->enable_event(box, event);
-}
-
-static inline u64 uncore_read_counter(struct intel_uncore_box *box,
-                               struct perf_event *event)
-{
-       return box->pmu->type->ops->read_counter(box, event);
-}
-
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
-       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
-               if (box->pmu->type->ops->init_box)
-                       box->pmu->type->ops->init_box(box);
-       }
-}
-
-static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
-{
-       return (box->phys_id < 0);
-}
-
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
-struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
-u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-void uncore_pmu_event_read(struct perf_event *event);
-void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
-void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
-u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
-
-extern struct intel_uncore_type **uncore_msr_uncores;
-extern struct intel_uncore_type **uncore_pci_uncores;
-extern struct pci_driver *uncore_pci_driver;
-extern raw_spinlock_t pci2phy_map_lock;
-extern struct list_head pci2phy_map_head;
-extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
-extern struct event_constraint uncore_constraint_empty;
-
-/* perf_event_intel_uncore_snb.c */
-int snb_uncore_pci_init(void);
-int ivb_uncore_pci_init(void);
-int hsw_uncore_pci_init(void);
-int bdw_uncore_pci_init(void);
-void snb_uncore_cpu_init(void);
-void nhm_uncore_cpu_init(void);
-int snb_pci2phy_map_init(int devid);
-
-/* perf_event_intel_uncore_snbep.c */
-int snbep_uncore_pci_init(void);
-void snbep_uncore_cpu_init(void);
-int ivbep_uncore_pci_init(void);
-void ivbep_uncore_cpu_init(void);
-int hswep_uncore_pci_init(void);
-void hswep_uncore_cpu_init(void);
-int bdx_uncore_pci_init(void);
-void bdx_uncore_cpu_init(void);
-int knl_uncore_pci_init(void);
-void knl_uncore_cpu_init(void);
-
-/* perf_event_intel_uncore_nhmex.c */
-void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
deleted file mode 100644 (file)
index 2749965..0000000
+++ /dev/null
@@ -1,1221 +0,0 @@
-/* Nehalem-EX/Westmere-EX uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK      0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0         (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET                (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN          (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22                (1 << 22)
-#define NHMEX_PMON_CTL_INVERT          (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK      0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK      (NHMEX_PMON_CTL_EV_SEL_MASK | \
-                                        NHMEX_PMON_CTL_UMASK_MASK | \
-                                        NHMEX_PMON_CTL_EDGE_DET | \
-                                        NHMEX_PMON_CTL_INVERT | \
-                                        NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define NHMEX_U_MSR_PMON_CTR                   0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL                        0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN                 (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL       0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL             (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL            (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK            \
-               (NHMEX_PMON_CTL_EV_SEL_MASK |   \
-                NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL           0xd00
-#define NHMEX_C0_MSR_PMON_CTR0                 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0              0xd10
-#define NHMEX_C_MSR_OFFSET                     0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL           0xc20
-#define NHMEX_B0_MSR_PMON_CTR0                 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0                 0xc30
-#define NHMEX_B_MSR_OFFSET                     0x40
-#define NHMEX_B0_MSR_MATCH                     0xe45
-#define NHMEX_B0_MSR_MASK                      0xe46
-#define NHMEX_B1_MSR_MATCH                     0xe4d
-#define NHMEX_B1_MSR_MASK                      0xe4e
-
-#define NHMEX_B_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT         6
-#define NHMEX_B_PMON_CTR_MASK          \
-               (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK            \
-               (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
-                NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL           0xc40
-#define NHMEX_S0_MSR_PMON_CTR0                 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0                 0xc50
-#define NHMEX_S_MSR_OFFSET                     0x80
-#define NHMEX_S0_MSR_MM_CFG                    0xe48
-#define NHMEX_S0_MSR_MATCH                     0xe49
-#define NHMEX_S0_MSR_MASK                      0xe4a
-#define NHMEX_S1_MSR_MM_CFG                    0xe58
-#define NHMEX_S1_MSR_MATCH                     0xe59
-#define NHMEX_S1_MSR_MASK                      0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN                 (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV             0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL                        0xca0
-#define NHMEX_M0_MSR_PMU_DSP                   0xca5
-#define NHMEX_M0_MSR_PMU_ISS                   0xca6
-#define NHMEX_M0_MSR_PMU_MAP                   0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR               0xca8
-#define NHMEX_M0_MSR_PMU_PGT                   0xca9
-#define NHMEX_M0_MSR_PMU_PLD                   0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC           0xcab
-#define NHMEX_M0_MSR_PMU_CTL0                  0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0                  0xcb1
-#define NHMEX_M_MSR_OFFSET                     0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG                        0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG                        0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN                 (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK           0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK            0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT           34
-
-#define NHMEX_M_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN                        (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT      2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK       \
-       (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT    4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK     \
-       (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE             (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE             (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT         9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK          \
-       (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT    19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK     \
-       (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK                    \
-               (NHMEX_M_PMON_CTL_COUNT_MODE_MASK |     \
-                NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |   \
-                NHMEX_M_PMON_CTL_WRAP_MODE |           \
-                NHMEX_M_PMON_CTL_FLAG_MODE |           \
-                NHMEX_M_PMON_CTL_INC_SEL_MASK |        \
-                NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK          (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
-                          NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
-                               NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
-               EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-                               MBOX_SET_FLAG_SEL_MASK, \
-                               (u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL                 0xe00
-#define NHMEX_R_MSR_PMON_CTL0                  0xe10
-#define NHMEX_R_MSR_PMON_CNT0                  0xe11
-#define NHMEX_R_MSR_OFFSET                     0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n)           \
-               ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)                (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)                (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)                \
-               (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)   \
-               (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)   \
-               (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)    \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)     \
-               (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN                    (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT          1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK           \
-               (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN                        (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK            NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL                 0xc80
-#define NHMEX_W_MSR_PMON_CNT0                  0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0              0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR             0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL             0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN           (1ULL << 31)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~((1ULL << uncore_num_counters(box)) - 1);
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       u64 config;
-
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= (1ULL << uncore_num_counters(box)) - 1;
-               /* WBox has a fixed counter */
-               if (uncore_msr_fixed_ctl(box))
-                       config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
-               wrmsrl(msr, config);
-       }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
-       else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-       else
-               wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT()                         \
-       .init_box       = nhmex_uncore_msr_init_box,            \
-       .disable_box    = nhmex_uncore_msr_disable_box,         \
-       .enable_box     = nhmex_uncore_msr_enable_box,          \
-       .disable_event  = nhmex_uncore_msr_disable_event,       \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_edge.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 1,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .event_ctl      = NHMEX_U_MSR_PMON_EV_SEL,
-       .perf_ctr       = NHMEX_U_MSR_PMON_CTR,
-       .event_mask     = NHMEX_U_PMON_RAW_EVENT_MASK,
-       .box_ctl        = NHMEX_U_MSR_PMON_GLOBAL_CTL,
-       .ops            = &nhmex_uncore_ops,
-       .format_group   = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
-       0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 6,
-       .num_boxes              = 10,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_C0_MSR_PMON_EV_SEL0,
-       .perf_ctr               = NHMEX_C0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
-       .msr_offsets            = nhmex_cbox_msr_offsets,
-       .pair_ctr_ctl           = 1,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
-       .name                   = "wbox",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_W_MSR_PMON_CNT0,
-       .perf_ctr               = NHMEX_W_MSR_PMON_EVT_SEL0,
-       .fixed_ctr              = NHMEX_W_MSR_PMON_FIXED_CTR,
-       .fixed_ctl              = NHMEX_W_MSR_PMON_FIXED_CTL,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_W_MSR_GLOBAL_CTL,
-       .pair_ctr_ctl           = 1,
-       .event_descs            = nhmex_uncore_wbox_events,
-       .ops                    = &nhmex_uncore_ops,
-       .format_group           = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int ctr, ev_sel;
-
-       ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
-               NHMEX_B_PMON_CTR_SHIFT;
-       ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
-                 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
-       /* events that do not use the match/mask registers */
-       if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
-           (ctr == 2 && ev_sel != 0x4) || ctr == 3)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_B0_MSR_MATCH;
-       else
-               reg1->reg = NHMEX_B1_MSR_MATCH;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, reg1->config);
-               wrmsrl(reg1->reg + 1, reg2->config);
-       }
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
-       EVENT_CONSTRAINT(0 , 1, 0xc0),
-       EVENT_CONSTRAINT(0x40, 2, 0xc0),
-       EVENT_CONSTRAINT(0x80, 4, 0xc0),
-       EVENT_CONSTRAINT(0xc0, 8, 0xc0),
-       EVENT_CONSTRAINT_END,
-};
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_counter.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_bbox_msr_enable_event,
-       .hw_config              = nhmex_bbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
-       .name                   = "bbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_B0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_B0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_B_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_B_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .constraints            = nhmex_uncore_bbox_constraints,
-       .ops                    = &nhmex_uncore_bbox_ops,
-       .format_group           = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       /* only TO_R_PROG_EV event uses the match/mask register */
-       if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
-           NHMEX_S_EVENT_TO_R_PROG_EV)
-               return 0;
-
-       if (box->pmu->pmu_idx == 0)
-               reg1->reg = NHMEX_S0_MSR_MM_CFG;
-       else
-               reg1->reg = NHMEX_S1_MSR_MM_CFG;
-       reg1->idx = 0;
-       reg1->config = event->attr.config1;
-       reg2->config = event->attr.config2;
-       return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg1->reg, 0);
-               wrmsrl(reg1->reg + 1, reg1->config);
-               wrmsrl(reg1->reg + 2, reg2->config);
-               wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
-       }
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match.attr,
-       &format_attr_mask.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
-       .name                   = "format",
-       .attrs                  = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_sbox_msr_enable_event,
-       .hw_config              = nhmex_sbox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_S0_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_S0_MSR_PMON_CTR0,
-       .event_mask             = NHMEX_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
-       .msr_offset             = NHMEX_S_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 1,
-       .ops                    = &nhmex_uncore_sbox_ops,
-       .format_group           = &nhmex_uncore_sbox_format_group
-};
-
-enum {
-       EXTRA_REG_NHMEX_M_FILTER,
-       EXTRA_REG_NHMEX_M_DSP,
-       EXTRA_REG_NHMEX_M_ISS,
-       EXTRA_REG_NHMEX_M_MAP,
-       EXTRA_REG_NHMEX_M_MSC_THR,
-       EXTRA_REG_NHMEX_M_PGT,
-       EXTRA_REG_NHMEX_M_PLD,
-       EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
-       MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
-       MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
-       MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
-       /* event 0xa uses two extra registers */
-       MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
-       MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
-       MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
-       /* events 0xd ~ 0x10 use the same extra register */
-       MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
-       MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
-       MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
-       EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX ? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       bool ret = false;
-       u64 mask;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               raw_spin_lock_irqsave(&er->lock, flags);
-               if (!atomic_read(&er->ref) || er->config == config) {
-                       atomic_inc(&er->ref);
-                       er->config = config;
-                       ret = true;
-               }
-               raw_spin_unlock_irqrestore(&er->lock, flags);
-
-               return ret;
-       }
-       /*
-        * The ZDP_CTL_FVC MSR has 4 fields which are used to control
-        * events 0xd ~ 0x10. Besides these 4 fields, there are additional
-        * fields which are shared.
-        */
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (WARN_ON_ONCE(idx >= 4))
-               return false;
-
-       /* mask of the shared fields */
-       if (uncore_nhmex)
-               mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       else
-               mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       /* add mask of the non-shared field if it's in use */
-       if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
-               if (uncore_nhmex)
-                       mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       }
-
-       if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               if (uncore_nhmex)
-                       mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               else
-                       mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
-                               WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-               er->config &= ~mask;
-               er->config |= (config & mask);
-               ret = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       return ret;
-}
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               er = &box->shared_regs[idx];
-               atomic_dec(&er->ref);
-               return;
-       }
-
-       idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
-       u64 config = reg1->config;
-
-       /* get the non-shared control bits and shift them */
-       idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-       if (uncore_nhmex)
-               config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       else
-               config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-       if (new_idx > orig_idx) {
-               idx = new_idx - orig_idx;
-               config <<= 3 * idx;
-       } else {
-               idx = orig_idx - new_idx;
-               config >>= 3 * idx;
-       }
-
-       /* add the shared control bits back */
-       if (uncore_nhmex)
-               config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       else
-               config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-       if (modify) {
-               /* adjust the main event selector */
-               if (new_idx > orig_idx)
-                       hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               else
-                       hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-               reg1->config = config;
-               reg1->idx = ~0xff | new_idx;
-       }
-       return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int i, idx[2], alloc = 0;
-       u64 config1 = reg1->config;
-
-       idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
-       idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
-       for (i = 0; i < 2; i++) {
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       idx[i] = 0xff;
-
-               if (idx[i] == 0xff)
-                       continue;
-
-               if (!nhmex_mbox_get_shared_reg(box, idx[i],
-                               __BITS_VALUE(config1, i, 32)))
-                       goto fail;
-               alloc |= (0x1 << i);
-       }
-
-       /* for the match/mask registers */
-       if (reg2->idx != EXTRA_REG_NONE &&
-           (uncore_box_is_fake(box) || !reg2->alloc) &&
-           !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
-               goto fail;
-
-       /*
-        * If it's a fake box -- as per validate_{group,event}() we
-        * shouldn't touch event state and we can avoid doing so
-        * since both will only call get_event_constraints() once
-        * on each event, this avoids the need for reg->alloc.
-        */
-       if (!uncore_box_is_fake(box)) {
-               if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
-                       nhmex_mbox_alter_er(event, idx[0], true);
-               reg1->alloc |= alloc;
-               if (reg2->idx != EXTRA_REG_NONE)
-                       reg2->alloc = 1;
-       }
-       return NULL;
-fail:
-       if (idx[0] != 0xff && !(alloc & 0x1) &&
-           idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-               /*
-                * events 0xd ~ 0x10 are functional identical, but are
-                * controlled by different fields in the ZDP_CTL_FVC
-                * register. If we failed to take one field, try the
-                * rest 3 choices.
-                */
-               BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
-               idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               idx[0] = (idx[0] + 1) % 4;
-               idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-               if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
-                       config1 = nhmex_mbox_alter_er(event, idx[0], false);
-                       goto again;
-               }
-       }
-
-       if (alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, idx[0]);
-       if (alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, idx[1]);
-       return &uncore_constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       if (reg1->alloc & 0x1)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
-       if (reg1->alloc & 0x2)
-               nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
-       reg1->alloc = 0;
-
-       if (reg2->alloc) {
-               nhmex_mbox_put_shared_reg(box, reg2->idx);
-               reg2->alloc = 0;
-       }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
-       if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return er->idx;
-       return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_type *type = box->pmu->type;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       struct extra_reg *er;
-       unsigned msr;
-       int reg_idx = 0;
-       /*
-        * The mbox events may require 2 extra MSRs at the most. But only
-        * the lower 32 bits in these MSRs are significant, so we can use
-        * config1 to pass two MSRs' config.
-        */
-       for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               if (event->attr.config1 & ~er->valid_mask)
-                       return -EINVAL;
-
-               msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
-               if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
-                       return -EINVAL;
-
-               /* always use the 32~63 bits to pass the PLD config */
-               if (er->idx == EXTRA_REG_NHMEX_M_PLD)
-                       reg_idx = 1;
-               else if (WARN_ON_ONCE(reg_idx > 0))
-                       return -EINVAL;
-
-               reg1->idx &= ~(0xff << (reg_idx * 8));
-               reg1->reg &= ~(0xffff << (reg_idx * 16));
-               reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
-               reg1->reg |= msr << (reg_idx * 16);
-               reg1->config = event->attr.config1;
-               reg_idx++;
-       }
-       /*
-        * The mbox only provides ability to perform address matching
-        * for the PLD events.
-        */
-       if (reg_idx == 2) {
-               reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
-               if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
-                       reg2->config = event->attr.config2;
-               else
-                       reg2->config = ~0ULL;
-               if (box->pmu->pmu_idx == 0)
-                       reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
-               else
-                       reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
-       }
-       return 0;
-}
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       u64 config;
-
-       if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-               return box->shared_regs[idx].config;
-
-       er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       config = er->config;
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx;
-
-       idx = __BITS_VALUE(reg1->idx, 0, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-       idx = __BITS_VALUE(reg1->idx, 1, 8);
-       if (idx != 0xff)
-               wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
-                       nhmex_mbox_shared_reg_config(box, idx));
-
-       if (reg2->idx != EXTRA_REG_NONE) {
-               wrmsrl(reg2->reg, 0);
-               if (reg2->config != ~0ULL) {
-                       wrmsrl(reg2->reg + 1,
-                               reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
-                       wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
-                               (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
-                       wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
-               }
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode,          count_mode,     "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode,                storage_mode,   "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,           wrap_mode,      "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode,           flag_mode,      "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel,             inc_sel,        "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,                set_flag_sel,   "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,       filter_cfg_en,  "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match,                filter_match,   "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask,         filter_mask,    "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp,                 dsp,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr,                 thr,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc,                 fvc,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt,                 pgt,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map,                 map,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss,                 iss,            "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld,                 pld,            "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
-       &format_attr_count_mode.attr,
-       &format_attr_storage_mode.attr,
-       &format_attr_wrap_mode.attr,
-       &format_attr_flag_mode.attr,
-       &format_attr_inc_sel.attr,
-       &format_attr_set_flag_sel.attr,
-       &format_attr_filter_cfg_en.attr,
-       &format_attr_filter_match.attr,
-       &format_attr_filter_mask.attr,
-       &format_attr_dsp.attr,
-       &format_attr_thr.attr,
-       &format_attr_fvc.attr,
-       &format_attr_pgt.attr,
-       &format_attr_map.attr,
-       &format_attr_iss.attr,
-       &format_attr_pld.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
-       .name           = "format",
-       .attrs          = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
-       INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event   = nhmex_mbox_msr_enable_event,
-       .hw_config      = nhmex_mbox_hw_config,
-       .get_constraint = nhmex_mbox_get_constraint,
-       .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
-       .name                   = "mbox",
-       .num_counters           = 6,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_M0_MSR_PMU_CTL0,
-       .perf_ctr               = NHMEX_M0_MSR_PMU_CNT0,
-       .event_mask             = NHMEX_M_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_M0_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_M_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 8,
-       .event_descs            = nhmex_uncore_mbox_events,
-       .ops                    = &nhmex_uncore_mbox_ops,
-       .format_group           = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       /* adjust the main event selector and extra register index */
-       if (reg1->idx % 2) {
-               reg1->idx--;
-               hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       } else {
-               reg1->idx++;
-               hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       }
-
-       /* adjust extra register config */
-       switch (reg1->idx % 6) {
-       case 2:
-               /* shift the 8~15 bits to the 0~7 bits */
-               reg1->config >>= 8;
-               break;
-       case 3:
-               /* shift the 0~7 bits to the 8~15 bits */
-               reg1->config <<= 8;
-               break;
-       }
-}
-
-/*
- * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
- * An event set consists of 6 events, the 3rd and 4th events in
- * an event set use the same extra register. So an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       struct intel_uncore_extra_reg *er;
-       unsigned long flags;
-       int idx, er_idx;
-       u64 config1;
-       bool ok = false;
-
-       if (!uncore_box_is_fake(box) && reg1->alloc)
-               return NULL;
-
-       idx = reg1->idx % 6;
-       config1 = reg1->config;
-again:
-       er_idx = idx;
-       /* the 3rd and 4th events use the same extra register */
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (idx < 2) {
-               if (!atomic_read(&er->ref) || er->config == reg1->config) {
-                       atomic_inc(&er->ref);
-                       er->config = reg1->config;
-                       ok = true;
-               }
-       } else if (idx == 2 || idx == 3) {
-               /*
-                * these two events use different fields in a extra register,
-                * the 0~7 bits and the 8~15 bits respectively.
-                */
-               u64 mask = 0xff << ((idx - 2) * 8);
-               if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
-                               !((er->config ^ config1) & mask)) {
-                       atomic_add(1 << ((idx - 2) * 8), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= config1 & mask;
-                       ok = true;
-               }
-       } else {
-               if (!atomic_read(&er->ref) ||
-                               (er->config == (hwc->config >> 32) &&
-                                er->config1 == reg1->config &&
-                                er->config2 == reg2->config)) {
-                       atomic_inc(&er->ref);
-                       er->config = (hwc->config >> 32);
-                       er->config1 = reg1->config;
-                       er->config2 = reg2->config;
-                       ok = true;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               /*
-                * The Rbox events are always in pairs. The paired
-                * events are functional identical, but use different
-                * extra registers. If we failed to take an extra
-                * register, try the alternative.
-                */
-               idx ^= 1;
-               if (idx != reg1->idx % 6) {
-                       if (idx == 2)
-                               config1 >>= 8;
-                       else if (idx == 3)
-                               config1 <<= 8;
-                       goto again;
-               }
-       } else {
-               if (!uncore_box_is_fake(box)) {
-                       if (idx != reg1->idx % 6)
-                               nhmex_rbox_alter_er(box, event);
-                       reg1->alloc = 1;
-               }
-               return NULL;
-       }
-       return &uncore_constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct intel_uncore_extra_reg *er;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       int idx, er_idx;
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       idx = reg1->idx % 6;
-       er_idx = idx;
-       if (er_idx > 2)
-               er_idx--;
-       er_idx += (reg1->idx / 6) * 5;
-
-       er = &box->shared_regs[er_idx];
-       if (idx == 2 || idx == 3)
-               atomic_sub(1 << ((idx - 2) * 8), &er->ref);
-       else
-               atomic_dec(&er->ref);
-
-       reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-       int idx;
-
-       idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
-               NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-       if (idx >= 0x18)
-               return -EINVAL;
-
-       reg1->idx = idx;
-       reg1->config = event->attr.config1;
-
-       switch (idx % 6) {
-       case 4:
-       case 5:
-               hwc->config |= event->attr.config & (~0ULL << 32);
-               reg2->config = event->attr.config2;
-               break;
-       }
-       return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-       int idx, port;
-
-       idx = reg1->idx;
-       port = idx / 6 + box->pmu->pmu_idx * 4;
-
-       switch (idx % 6) {
-       case 0:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
-               break;
-       case 1:
-               wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
-               break;
-       case 2:
-       case 3:
-               wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
-                       uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
-               break;
-       case 4:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
-               break;
-       case 5:
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
-                       hwc->config >> 32);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
-               wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
-               break;
-       }
-
-       wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-               (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
-       &format_attr_event5.attr,
-       &format_attr_xbr_mm_cfg.attr,
-       &format_attr_xbr_match.attr,
-       &format_attr_xbr_mask.attr,
-       &format_attr_qlx_cfg.attr,
-       &format_attr_iperf_cfg.attr,
-       NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
-       .name = "format",
-       .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
-       INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,         "event=0x0,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,         "event=0x6,iperf_cfg=0x80000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,         "event=0x0,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,         "event=0x6,iperf_cfg=0x40000000"),
-       INTEL_UNCORE_EVENT_DESC(qpi0_date_response,     "event=0x0,iperf_cfg=0xc4"),
-       INTEL_UNCORE_EVENT_DESC(qpi1_date_response,     "event=0x6,iperf_cfg=0xc4"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
-       NHMEX_UNCORE_OPS_COMMON_INIT(),
-       .enable_event           = nhmex_rbox_msr_enable_event,
-       .hw_config              = nhmex_rbox_hw_config,
-       .get_constraint         = nhmex_rbox_get_constraint,
-       .put_constraint         = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
-       .name                   = "rbox",
-       .num_counters           = 8,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = NHMEX_R_MSR_PMON_CTL0,
-       .perf_ctr               = NHMEX_R_MSR_PMON_CNT0,
-       .event_mask             = NHMEX_R_PMON_RAW_EVENT_MASK,
-       .box_ctl                = NHMEX_R_MSR_GLOBAL_CTL,
-       .msr_offset             = NHMEX_R_MSR_OFFSET,
-       .pair_ctr_ctl           = 1,
-       .num_shared_regs        = 20,
-       .event_descs            = nhmex_uncore_rbox_events,
-       .ops                    = &nhmex_uncore_rbox_ops,
-       .format_group           = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
-       &nhmex_uncore_ubox,
-       &nhmex_uncore_cbox,
-       &nhmex_uncore_bbox,
-       &nhmex_uncore_sbox,
-       &nhmex_uncore_mbox,
-       &nhmex_uncore_rbox,
-       &nhmex_uncore_wbox,
-       NULL,
-};
-
-void nhmex_uncore_cpu_init(void)
-{
-       if (boot_cpu_data.x86_model == 46)
-               uncore_nhmex = true;
-       else
-               nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
-       if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = nhmex_msr_uncores;
-}
-/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
deleted file mode 100644 (file)
index 0b93482..0000000
+++ /dev/null
@@ -1,697 +0,0 @@
-/* Nehalem/SandBridge/Haswell uncore support */
-#include "perf_event_intel_uncore.h"
-
-/* Uncore IMC PCI IDs */
-#define PCI_DEVICE_ID_INTEL_SNB_IMC    0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC    0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
-#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
-#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
-
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK                 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET                   (1 << 18)
-#define SNB_UNC_CTL_EN                         (1 << 22)
-#define SNB_UNC_CTL_INVERT                     (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK                 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK                 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN               (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK                 (SNB_UNC_CTL_EV_SEL_MASK | \
-                                                SNB_UNC_CTL_UMASK_MASK | \
-                                                SNB_UNC_CTL_EDGE_DET | \
-                                                SNB_UNC_CTL_INVERT | \
-                                                NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
-#define SNB_UNC_FIXED_CTR_CTRL                  0x394
-#define SNB_UNC_FIXED_CTR                       0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
-#define SNB_UNC_CBO_0_PER_CTR0                  0x706
-#define SNB_UNC_CBO_MSR_OFFSET                  0x10
-
-/* SNB ARB register */
-#define SNB_UNC_ARB_PER_CTR0                   0x3b0
-#define SNB_UNC_ARB_PERFEVTSEL0                        0x3b2
-#define SNB_UNC_ARB_MSR_OFFSET                 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
-#define NHM_UNC_FIXED_CTR                       0x394
-#define NHM_UNC_FIXED_CTR_CTRL                  0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0                     0x3c0
-#define NHM_UNC_UNCORE_PMC0                     0x3b0
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       if (box->pmu->pmu_idx == 0) {
-               wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
-                       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
-       }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask5.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
-       .name           = "format",
-       .attrs          = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
-       .init_box       = snb_uncore_msr_init_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = snb_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_arb_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
-       .name           = "cbox",
-       .num_counters   = 2,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNB_UNC_CBO_0_PER_CTR0,
-       .event_ctl      = SNB_UNC_CBO_0_PERFEVTSEL0,
-       .fixed_ctr      = SNB_UNC_FIXED_CTR,
-       .fixed_ctl      = SNB_UNC_FIXED_CTR_CTRL,
-       .single_fixed   = 1,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_CBO_MSR_OFFSET,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-       .event_descs    = snb_uncore_events,
-};
-
-static struct intel_uncore_type snb_uncore_arb = {
-       .name           = "arb",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .perf_ctr       = SNB_UNC_ARB_PER_CTR0,
-       .event_ctl      = SNB_UNC_ARB_PERFEVTSEL0,
-       .event_mask     = SNB_UNC_RAW_EVENT_MASK,
-       .msr_offset     = SNB_UNC_ARB_MSR_OFFSET,
-       .constraints    = snb_uncore_arb_constraints,
-       .ops            = &snb_uncore_msr_ops,
-       .format_group   = &snb_uncore_format_group,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
-       &snb_uncore_cbox,
-       &snb_uncore_arb,
-       NULL,
-};
-
-void snb_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = snb_msr_uncores;
-       if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-}
-
-enum {
-       SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
-       INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
-       { /* end: all zeroes */ },
-};
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK          0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET          0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE            0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS          0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE     0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES         0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE    0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE            SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
-       .name = "format",
-       .attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
-       resource_size_t addr;
-       u32 pci_dword;
-
-       pci_read_config_dword(pdev, where, &pci_dword);
-       addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-       pci_read_config_dword(pdev, where + 4, &pci_dword);
-       addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
-       addr &= ~(PAGE_SIZE - 1);
-
-       box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
-       box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
-
-/*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
-       struct intel_uncore_pmu *pmu;
-       struct intel_uncore_box *box;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
-       int idx, base;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       pmu = uncore_event_to_pmu(event);
-       /* no device found for this pmu */
-       if (pmu->func_id < 0)
-               return -ENOENT;
-
-       /* Sampling not supported yet */
-       if (hwc->sample_period)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       /*
-        * Place all uncore events for a particular physical package
-        * onto a single cpu
-        */
-       if (event->cpu < 0)
-               return -EINVAL;
-
-       /* check only supported bits are set */
-       if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
-               return -EINVAL;
-
-       box = uncore_pmu_to_box(pmu, event->cpu);
-       if (!box || box->cpu < 0)
-               return -EINVAL;
-
-       event->cpu = box->cpu;
-
-       event->hw.idx = -1;
-       event->hw.last_tag = ~0ULL;
-       event->hw.extra_reg.idx = EXTRA_REG_NONE;
-       event->hw.branch_reg.idx = EXTRA_REG_NONE;
-       /*
-        * check event is known (whitelist, determines counter)
-        */
-       switch (cfg) {
-       case SNB_UNCORE_PCI_IMC_DATA_READS:
-               base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
-               idx = UNCORE_PMC_IDX_FIXED;
-               break;
-       case SNB_UNCORE_PCI_IMC_DATA_WRITES:
-               base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
-               idx = UNCORE_PMC_IDX_FIXED + 1;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       /* must be done before validate_group */
-       event->hw.event_base = base;
-       event->hw.config = cfg;
-       event->hw.idx = idx;
-
-       /* no group validation needed, we have free running counters */
-
-       return 0;
-}
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       u64 count;
-
-       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-               return;
-
-       event->hw.state = 0;
-       box->n_active++;
-
-       list_add_tail(&event->active_entry, &box->active_list);
-
-       count = snb_uncore_imc_read_counter(box, event);
-       local64_set(&event->hw.prev_count, count);
-
-       if (box->n_active == 1)
-               uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!(hwc->state & PERF_HES_STOPPED)) {
-               box->n_active--;
-
-               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-               hwc->state |= PERF_HES_STOPPED;
-
-               list_del(&event->active_entry);
-
-               if (box->n_active == 0)
-                       uncore_pmu_cancel_hrtimer(box);
-       }
-
-       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-               /*
-                * Drain the remaining delta count out of a event
-                * that we are disabling:
-                */
-               uncore_perf_event_update(box, event);
-               hwc->state |= PERF_HES_UPTODATE;
-       }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (!box)
-               return -ENODEV;
-
-       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-       if (!(flags & PERF_EF_START))
-               hwc->state |= PERF_HES_ARCH;
-
-       snb_uncore_imc_event_start(event, 0);
-
-       box->n_events++;
-
-       return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
-       struct intel_uncore_box *box = uncore_event_to_box(event);
-       int i;
-
-       snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
-       for (i = 0; i < box->n_events; i++) {
-               if (event == box->event_list[i]) {
-                       --box->n_events;
-                       break;
-               }
-       }
-}
-
-int snb_pci2phy_map_init(int devid)
-{
-       struct pci_dev *dev = NULL;
-       struct pci2phy_map *map;
-       int bus, segment;
-
-       dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
-       if (!dev)
-               return -ENOTTY;
-
-       bus = dev->bus->number;
-       segment = pci_domain_nr(dev->bus);
-
-       raw_spin_lock(&pci2phy_map_lock);
-       map = __find_pci2phy_map(segment);
-       if (!map) {
-               raw_spin_unlock(&pci2phy_map_lock);
-               pci_dev_put(dev);
-               return -ENOMEM;
-       }
-       map->pbus_to_physid[bus] = 0;
-       raw_spin_unlock(&pci2phy_map_lock);
-
-       pci_dev_put(dev);
-
-       return 0;
-}
-
-static struct pmu snb_uncore_imc_pmu = {
-       .task_ctx_nr    = perf_invalid_context,
-       .event_init     = snb_uncore_imc_event_init,
-       .add            = snb_uncore_imc_event_add,
-       .del            = snb_uncore_imc_event_del,
-       .start          = snb_uncore_imc_event_start,
-       .stop           = snb_uncore_imc_event_stop,
-       .read           = uncore_pmu_event_read,
-};
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
-       .init_box       = snb_uncore_imc_init_box,
-       .enable_box     = snb_uncore_imc_enable_box,
-       .disable_box    = snb_uncore_imc_disable_box,
-       .disable_event  = snb_uncore_imc_disable_event,
-       .enable_event   = snb_uncore_imc_enable_event,
-       .hw_config      = snb_uncore_imc_hw_config,
-       .read_counter   = snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .fixed_ctr_bits = 32,
-       .fixed_ctr      = SNB_UNCORE_PCI_IMC_CTR_BASE,
-       .event_descs    = snb_uncore_imc_events,
-       .format_group   = &snb_uncore_imc_format_group,
-       .perf_ctr       = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
-       .event_mask     = SNB_UNCORE_PCI_IMC_EVENT_MASK,
-       .ops            = &snb_uncore_imc_ops,
-       .pmu            = &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
-       [SNB_PCI_UNCORE_IMC]    = &snb_uncore_imc,
-       NULL,
-};
-
-static const struct pci_device_id snb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id ivb_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id hsw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id bdw_uncore_pci_ids[] = {
-       { /* IMC */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-       },
-       { /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
-       .name           = "snb_uncore",
-       .id_table       = snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
-       .name           = "ivb_uncore",
-       .id_table       = ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
-       .name           = "hsw_uncore",
-       .id_table       = hsw_uncore_pci_ids,
-};
-
-static struct pci_driver bdw_uncore_pci_driver = {
-       .name           = "bdw_uncore",
-       .id_table       = bdw_uncore_pci_ids,
-};
-
-struct imc_uncore_pci_dev {
-       __u32 pci_id;
-       struct pci_driver *driver;
-};
-#define IMC_DEV(a, d) \
-       { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
-
-static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
-       IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
-       IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
-       IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
-       IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
-       IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
-       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
-       {  /* end marker */ }
-};
-
-
-#define for_each_imc_pci_id(x, t) \
-       for (x = (t); (x)->pci_id; x++)
-
-static struct pci_driver *imc_uncore_find_dev(void)
-{
-       const struct imc_uncore_pci_dev *p;
-       int ret;
-
-       for_each_imc_pci_id(p, desktop_imc_pci_ids) {
-               ret = snb_pci2phy_map_init(p->pci_id);
-               if (ret == 0)
-                       return p->driver;
-       }
-       return NULL;
-}
-
-static int imc_uncore_pci_init(void)
-{
-       struct pci_driver *imc_drv = imc_uncore_find_dev();
-
-       if (!imc_drv)
-               return -ENODEV;
-
-       uncore_pci_uncores = snb_pci_uncores;
-       uncore_pci_driver = imc_drv;
-
-       return 0;
-}
-
-int snb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int ivb_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-int hsw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-int bdw_uncore_pci_init(void)
-{
-       return imc_uncore_pci_init();
-}
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-               wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-       else
-               wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask8.attr,
-       NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
-       .name = "format",
-       .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
-       INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
-       .disable_box    = nhm_uncore_msr_disable_box,
-       .enable_box     = nhm_uncore_msr_enable_box,
-       .disable_event  = snb_uncore_msr_disable_event,
-       .enable_event   = nhm_uncore_msr_enable_event,
-       .read_counter   = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
-       .name           = "",
-       .num_counters   = 8,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .event_ctl      = NHM_UNC_PERFEVTSEL0,
-       .perf_ctr       = NHM_UNC_UNCORE_PMC0,
-       .fixed_ctr      = NHM_UNC_FIXED_CTR,
-       .fixed_ctl      = NHM_UNC_FIXED_CTR_CTRL,
-       .event_mask     = NHM_UNC_RAW_EVENT_MASK,
-       .event_descs    = nhm_uncore_events,
-       .ops            = &nhm_uncore_msr_ops,
-       .format_group   = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
-       &nhm_uncore,
-       NULL,
-};
-
-void nhm_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = nhm_msr_uncores;
-}
-
-/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
deleted file mode 100644 (file)
index 33acb88..0000000
+++ /dev/null
@@ -1,3126 +0,0 @@
-/* SandyBridge-EP/IvyTown uncore support */
-#include "perf_event_intel_uncore.h"
-
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL    (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS    (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ         (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN      (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS | \
-                                        SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK     0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK      0x0000ff00
-#define SNBEP_PMON_CTL_RST             (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET                (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT      (1 << 21)
-#define SNBEP_PMON_CTL_EN              (1 << 22)
-#define SNBEP_PMON_CTL_INVERT          (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK      0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK      (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_INVERT | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK                0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK                \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN              (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK      (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK    0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK      0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT      (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET    (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL                 0xf4
-#define SNBEP_PCI_PMON_CTL0                    0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0                    0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0       0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1       0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH      0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL                0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR                0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0         0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1         0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0          0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1          0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0                  0xc16
-#define SNBEP_U_MSR_PMON_CTL0                  0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL                0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR                0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0                 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0                 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL              0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER           0xd14
-#define SNBEP_CBO_MSR_OFFSET                   0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID      0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID      0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE    0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC      0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {   \
-       .event = (e),                           \
-       .msr = SNBEP_C0_MSR_PMON_BOX_FILTER,    \
-       .config_mask = (m),                     \
-       .idx = (i)                              \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0                        0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0                        0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL             0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER          0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK     0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR              0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR              0x3fd
-
-/* IVBEP event control */
-#define IVBEP_PMON_BOX_CTL_INT         (SNBEP_PMON_BOX_CTL_RST_CTRL | \
-                                        SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVBEP_PMON_RAW_EVENT_MASK              (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                        SNBEP_PMON_CTL_UMASK_MASK | \
-                                        SNBEP_PMON_CTL_EDGE_DET | \
-                                        SNBEP_PMON_CTL_TRESH_MASK)
-/* IVBEP Ubox */
-#define IVBEP_U_MSR_PMON_GLOBAL_CTL            0xc00
-#define IVBEP_U_PMON_GLOBAL_FRZ_ALL            (1 << 31)
-#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL          (1 << 29)
-
-#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK        \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_UMASK_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVBEP Cbo */
-#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK              (IVBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x1fULL << 0)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 5)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x3fULL << 17)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-/* IVBEP home agent */
-#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST                (1 << 16)
-#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK               \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVBEP PCU */
-#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK      \
-                               (SNBEP_PMON_CTL_EV_SEL_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVBEP QPI */
-#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK      \
-                               (IVBEP_PMON_RAW_EVENT_MASK | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT)
-
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-                               ((1ULL << (n)) - 1)))
-
-/* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0                  0x709
-#define HSWEP_U_MSR_PMON_CTL0                  0x705
-#define HSWEP_U_MSR_PMON_FILTER                        0x707
-
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
-#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR                0x704
-
-#define HSWEP_U_MSR_PMON_BOX_FILTER_TID                (0x1 << 0)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_CID                (0x1fULL << 1)
-#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
-                                       (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
-                                        HSWEP_U_MSR_PMON_BOX_FILTER_CID)
-
-/* Haswell-EP CBo */
-#define HSWEP_C0_MSR_PMON_CTR0                 0xe08
-#define HSWEP_C0_MSR_PMON_CTL0                 0xe01
-#define HSWEP_C0_MSR_PMON_BOX_CTL                      0xe00
-#define HSWEP_C0_MSR_PMON_BOX_FILTER0          0xe05
-#define HSWEP_CBO_MSR_OFFSET                   0x10
-
-
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID              (0x3fULL << 0)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK     (0xfULL << 6)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE    (0x7fULL << 17)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID              (0xffffULL << 32)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC              (0x1ffULL << 52)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6               (0x1ULL << 61)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC               (0x1ULL << 62)
-#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC     (0x1ULL << 63)
-
-
-/* Haswell-EP Sbox */
-#define HSWEP_S0_MSR_PMON_CTR0                 0x726
-#define HSWEP_S0_MSR_PMON_CTL0                 0x721
-#define HSWEP_S0_MSR_PMON_BOX_CTL                      0x720
-#define HSWEP_SBOX_MSR_OFFSET                  0xa
-#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* Haswell-EP PCU */
-#define HSWEP_PCU_MSR_PMON_CTR0                        0x717
-#define HSWEP_PCU_MSR_PMON_CTL0                        0x711
-#define HSWEP_PCU_MSR_PMON_BOX_CTL             0x710
-#define HSWEP_PCU_MSR_PMON_BOX_FILTER          0x715
-
-/* KNL Ubox */
-#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
-                                               SNBEP_CBO_PMON_CTL_TID_EN)
-/* KNL CHA */
-#define KNL_CHA_MSR_OFFSET                     0xc
-#define KNL_CHA_MSR_PMON_CTL_QOR               (1 << 16)
-#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
-                                       (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
-                                        KNL_CHA_MSR_PMON_CTL_QOR)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_TID                0x1ff
-#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE      (7 << 18)
-#define KNL_CHA_MSR_PMON_BOX_FILTER_OP         (0xfffffe2aULL << 32)
-
-/* KNL EDC/MC UCLK */
-#define KNL_UCLK_MSR_PMON_CTR0_LOW             0x400
-#define KNL_UCLK_MSR_PMON_CTL0                 0x420
-#define KNL_UCLK_MSR_PMON_BOX_CTL              0x430
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW       0x44c
-#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL       0x454
-#define KNL_PMON_FIXED_CTL_EN                  0x1
-
-/* KNL EDC */
-#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW                0xa00
-#define KNL_EDC0_ECLK_MSR_PMON_CTL0            0xa20
-#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL         0xa30
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW  0xa3c
-#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL  0xa44
-
-/* KNL MC */
-#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW          0xb00
-#define KNL_MC0_CH0_MSR_PMON_CTL0              0xb20
-#define KNL_MC0_CH0_MSR_PMON_BOX_CTL           0xb30
-#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW         0xb3c
-#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL         0xb44
-
-/* KNL IRP */
-#define KNL_IRP_PCI_PMON_BOX_CTL               0xf0
-#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK                (SNBEP_PMON_RAW_EVENT_MASK | \
-                                                KNL_CHA_MSR_PMON_CTL_QOR)
-/* KNL PCU */
-#define KNL_PCU_PMON_CTL_EV_SEL_MASK           0x0000007f
-#define KNL_PCU_PMON_CTL_USE_OCC_CTR           (1 << 7)
-#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK                0x3f000000
-#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK        \
-                               (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
-                                KNL_PCU_PMON_CTL_USE_OCC_CTR | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-                                SNBEP_PMON_CTL_EDGE_DET | \
-                                SNBEP_CBO_PMON_CTL_TID_EN | \
-                                SNBEP_PMON_CTL_EV_SEL_EXT | \
-                                SNBEP_PMON_CTL_INVERT | \
-                                KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-                                SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
-DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
-DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
-DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-       u32 config = 0;
-
-       if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               pci_write_config_dword(pdev, box_ctl, config);
-       }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
-       pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config |= SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-       u64 config;
-       unsigned msr;
-
-       msr = uncore_msr_box_ctl(box);
-       if (msr) {
-               rdmsrl(msr, config);
-               config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-               wrmsrl(msr, config);
-       }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE)
-               wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr)
-               wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_nid.attr,
-       &format_attr_filter_state.attr,
-       &format_attr_filter_opc.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
-       INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
-       INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
-       INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
-       { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                   \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),                   \
-       .init_box       = snbep_uncore_msr_init_box             \
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()                     \
-       .init_box       = snbep_uncore_pci_init_box,            \
-       .disable_box    = snbep_uncore_pci_disable_box,         \
-       .enable_box     = snbep_uncore_pci_enable_box,          \
-       .disable_event  = snbep_uncore_pci_disable_event,       \
-       .read_counter   = snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event   = snbep_uncore_pci_enable_event,        \
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
-       UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
-       EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &snbep_uncore_msr_ops,
-       .format_group   = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
-       EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i;
-
-       if (uncore_box_is_fake(box))
-               return;
-
-       for (i = 0; i < 5; i++) {
-               if (reg1->alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
-                           u64 (*cbox_filter_mask)(int fields))
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       int i, alloc = 0;
-       unsigned long flags;
-       u64 mask;
-
-       if (reg1->idx == EXTRA_REG_NONE)
-               return NULL;
-
-       raw_spin_lock_irqsave(&er->lock, flags);
-       for (i = 0; i < 5; i++) {
-               if (!(reg1->idx & (0x1 << i)))
-                       continue;
-               if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-                       continue;
-
-               mask = cbox_filter_mask(0x1 << i);
-               if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
-                   !((reg1->config ^ er->config) & mask)) {
-                       atomic_add(1 << (i * 6), &er->ref);
-                       er->config &= ~mask;
-                       er->config |= reg1->config & mask;
-                       alloc |= (0x1 << i);
-               } else {
-                       break;
-               }
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-       if (i < 5)
-               goto fail;
-
-       if (!uncore_box_is_fake(box))
-               reg1->alloc |= alloc;
-
-       return NULL;
-fail:
-       for (; i >= 0; i--) {
-               if (alloc & (0x1 << i))
-                       atomic_sub(1 << (i * 6), &er->ref);
-       }
-       return &uncore_constraint_empty;
-}
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x4)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
-       return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_cbox_hw_config,
-       .get_constraint         = snbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &snbep_uncore_cbox_ops,
-       .format_group           = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       u64 config = reg1->config;
-
-       if (new_idx > reg1->idx)
-               config <<= 8 * (new_idx - reg1->idx);
-       else
-               config >>= 8 * (reg1->idx - new_idx);
-
-       if (modify) {
-               hwc->config += new_idx - reg1->idx;
-               reg1->config = config;
-               reg1->idx = new_idx;
-       }
-       return config;
-}
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-       unsigned long flags;
-       int idx = reg1->idx;
-       u64 mask, config1 = reg1->config;
-       bool ok = false;
-
-       if (reg1->idx == EXTRA_REG_NONE ||
-           (!uncore_box_is_fake(box) && reg1->alloc))
-               return NULL;
-again:
-       mask = 0xffULL << (idx * 8);
-       raw_spin_lock_irqsave(&er->lock, flags);
-       if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
-           !((config1 ^ er->config) & mask)) {
-               atomic_add(1 << (idx * 8), &er->ref);
-               er->config &= ~mask;
-               er->config |= config1 & mask;
-               ok = true;
-       }
-       raw_spin_unlock_irqrestore(&er->lock, flags);
-
-       if (!ok) {
-               idx = (idx + 1) % 4;
-               if (idx != reg1->idx) {
-                       config1 = snbep_pcu_alter_er(event, idx, false);
-                       goto again;
-               }
-               return &uncore_constraint_empty;
-       }
-
-       if (!uncore_box_is_fake(box)) {
-               if (idx != reg1->idx)
-                       snbep_pcu_alter_er(event, idx, true);
-               reg1->alloc = 1;
-       }
-       return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
-       if (uncore_box_is_fake(box) || !reg1->alloc)
-               return;
-
-       atomic_sub(1 << (reg1->idx * 8), &er->ref);
-       reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
-       &snbep_uncore_ubox,
-       &snbep_uncore_cbox,
-       &snbep_uncore_pcu,
-       NULL,
-};
-
-void snbep_uncore_cpu_init(void)
-{
-       if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = snbep_msr_uncores;
-}
-
-enum {
-       SNBEP_PCI_QPI_PORT0_FILTER,
-       SNBEP_PCI_QPI_PORT1_FILTER,
-       HSWEP_PCI_PCU_3,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
-               reg1->idx = 0;
-               reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
-               reg1->config = event->attr.config1;
-               reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
-               reg2->config = event->attr.config2;
-       }
-       return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
-               struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
-               if (filter_pdev) {
-                       pci_write_config_dword(filter_pdev, reg1->reg,
-                                               (u32)reg1->config);
-                       pci_write_config_dword(filter_pdev, reg1->reg + 4,
-                                               (u32)(reg1->config >> 32));
-                       pci_write_config_dword(filter_pdev, reg2->reg,
-                                               (u32)reg2->config);
-                       pci_write_config_dword(filter_pdev, reg2->reg + 4,
-                                               (u32)(reg2->config >> 32));
-               }
-       }
-
-       pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
-       SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-       .enable_event           = snbep_qpi_enable_event,
-       .hw_config              = snbep_qpi_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = SNBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &snbep_uncore_pci_ops,                \
-       .format_group   = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 4,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .event_descs            = snbep_uncore_qpi_events,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       SNBEP_PCI_UNCORE_HA,
-       SNBEP_PCI_UNCORE_IMC,
-       SNBEP_PCI_UNCORE_QPI,
-       SNBEP_PCI_UNCORE_R2PCIE,
-       SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
-       [SNBEP_PCI_UNCORE_HA]           = &snbep_uncore_ha,
-       [SNBEP_PCI_UNCORE_IMC]          = &snbep_uncore_imc,
-       [SNBEP_PCI_UNCORE_QPI]          = &snbep_uncore_qpi,
-       [SNBEP_PCI_UNCORE_R2PCIE]       = &snbep_uncore_r2pcie,
-       [SNBEP_PCI_UNCORE_R3QPI]        = &snbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id snbep_uncore_pci_ids[] = {
-       { /* Home Agent */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* MC Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* QPI Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
-               .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
-       .name           = "snbep_uncore",
-       .id_table       = snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
-       struct pci_dev *ubox_dev = NULL;
-       int i, bus, nodeid, segment;
-       struct pci2phy_map *map;
-       int err = 0;
-       u32 config = 0;
-
-       while (1) {
-               /* find the UBOX device */
-               ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
-               if (!ubox_dev)
-                       break;
-               bus = ubox_dev->bus->number;
-               /* get the Node ID of the local register */
-               err = pci_read_config_dword(ubox_dev, 0x40, &config);
-               if (err)
-                       break;
-               nodeid = config;
-               /* get the Node ID mapping */
-               err = pci_read_config_dword(ubox_dev, 0x54, &config);
-               if (err)
-                       break;
-
-               segment = pci_domain_nr(ubox_dev->bus);
-               raw_spin_lock(&pci2phy_map_lock);
-               map = __find_pci2phy_map(segment);
-               if (!map) {
-                       raw_spin_unlock(&pci2phy_map_lock);
-                       err = -ENOMEM;
-                       break;
-               }
-
-               /*
-                * every three bits in the Node ID mapping register maps
-                * to a particular node.
-                */
-               for (i = 0; i < 8; i++) {
-                       if (nodeid == ((config >> (3 * i)) & 0x7)) {
-                               map->pbus_to_physid[bus] = i;
-                               break;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       if (!err) {
-               /*
-                * For PCI bus with no UBOX device, find the next bus
-                * that has UBOX device and use its mapping.
-                */
-               raw_spin_lock(&pci2phy_map_lock);
-               list_for_each_entry(map, &pci2phy_map_head, list) {
-                       i = -1;
-                       for (bus = 255; bus >= 0; bus--) {
-                               if (map->pbus_to_physid[bus] >= 0)
-                                       i = map->pbus_to_physid[bus];
-                               else
-                                       map->pbus_to_physid[bus] = i;
-                       }
-               }
-               raw_spin_unlock(&pci2phy_map_lock);
-       }
-
-       pci_dev_put(ubox_dev);
-
-       return err ? pcibios_err_to_errno(err) : 0;
-}
-
-int snbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x3ce0);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = snbep_pci_uncores;
-       uncore_pci_driver = &snbep_uncore_pci_driver;
-       return 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-       if (msr)
-               wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
-}
-
-static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-
-       pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
-}
-
-#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()                     \
-       .init_box       = ivbep_uncore_msr_init_box,            \
-       .disable_box    = snbep_uncore_msr_disable_box,         \
-       .enable_box     = snbep_uncore_msr_enable_box,          \
-       .disable_event  = snbep_uncore_msr_disable_event,       \
-       .enable_event   = snbep_uncore_msr_enable_event,        \
-       .read_counter   = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivbep_uncore_msr_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivbep_uncore_pci_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_uncore_pci_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-};
-
-#define IVBEP_UNCORE_PCI_COMMON_INIT()                         \
-       .perf_ctr       = SNBEP_PCI_PMON_CTR0,                  \
-       .event_ctl      = SNBEP_PCI_PMON_CTL0,                  \
-       .event_mask     = IVBEP_PMON_RAW_EVENT_MASK,            \
-       .box_ctl        = SNBEP_PCI_PMON_BOX_CTL,               \
-       .ops            = &ivbep_uncore_pci_ops,                        \
-       .format_group   = &ivbep_uncore_format_group
-
-static struct attribute *ivbep_uncore_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid.attr,
-       &format_attr_filter_link.attr,
-       &format_attr_filter_state2.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge.attr,
-       &format_attr_filter_band0.attr,
-       &format_attr_filter_band1.attr,
-       &format_attr_filter_band2.attr,
-       &format_attr_filter_band3.attr,
-       NULL,
-};
-
-static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
-       &format_attr_event_ext.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_match_rds.attr,
-       &format_attr_match_rnid30.attr,
-       &format_attr_match_rnid4.attr,
-       &format_attr_match_dnid.attr,
-       &format_attr_match_mc.attr,
-       &format_attr_match_opc.attr,
-       &format_attr_match_vnw.attr,
-       &format_attr_match0.attr,
-       &format_attr_match1.attr,
-       &format_attr_mask_rds.attr,
-       &format_attr_mask_rnid30.attr,
-       &format_attr_mask_rnid4.attr,
-       &format_attr_mask_dnid.attr,
-       &format_attr_mask_mc.attr,
-       &format_attr_mask_opc.attr,
-       &format_attr_mask_vnw.attr,
-       &format_attr_mask0.attr,
-       &format_attr_mask1.attr,
-       NULL,
-};
-
-static struct attribute_group ivbep_uncore_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivbep_uncore_qpi_format_group = {
-       .name = "format",
-       .attrs = ivbep_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivbep_uncore_ubox = {
-       .name           = "ubox",
-       .num_counters   = 2,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .fixed_ctr_bits = 48,
-       .perf_ctr       = SNBEP_U_MSR_PMON_CTR0,
-       .event_ctl      = SNBEP_U_MSR_PMON_CTL0,
-       .event_mask     = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl      = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops            = &ivbep_uncore_msr_ops,
-       .format_group   = &ivbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 ivbep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-
-       return mask;
-}
-
-static struct event_constraint *
-ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
-}
-
-static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-                       SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 6, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
-       .init_box               = ivbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = ivbep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = ivbep_cbox_hw_config,
-       .get_constraint         = ivbep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 15,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = SNBEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = SNBEP_C0_MSR_PMON_CTR0,
-       .event_mask             = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = SNBEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = snbep_uncore_cbox_constraints,
-       .ops                    = &ivbep_uncore_cbox_ops,
-       .format_group           = &ivbep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
-       IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = snbep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = SNBEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_pcu_ops,
-       .format_group           = &ivbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivbep_msr_uncores[] = {
-       &ivbep_uncore_ubox,
-       &ivbep_uncore_cbox,
-       &ivbep_uncore_pcu,
-       NULL,
-};
-
-void ivbep_uncore_cpu_init(void)
-{
-       if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = ivbep_msr_uncores;
-}
-
-static struct intel_uncore_type ivbep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 4,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = snbep_uncore_imc_events,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
-                              hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static struct intel_uncore_ops ivbep_uncore_irp_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = ivbep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivbep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = IVBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &ivbep_uncore_irp_ops,
-       .format_group           = &ivbep_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
-       .init_box       = ivbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = snbep_uncore_pci_disable_event,
-       .enable_event   = snbep_qpi_enable_event,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .hw_config      = snbep_qpi_hw_config,
-       .get_constraint = uncore_get_constraint,
-       .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivbep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_qpi_ops,
-       .format_group           = &ivbep_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivbep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r2pcie_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivbep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 44,
-       .constraints    = snbep_uncore_r3qpi_constraints,
-       IVBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       IVBEP_PCI_UNCORE_HA,
-       IVBEP_PCI_UNCORE_IMC,
-       IVBEP_PCI_UNCORE_IRP,
-       IVBEP_PCI_UNCORE_QPI,
-       IVBEP_PCI_UNCORE_R2PCIE,
-       IVBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivbep_pci_uncores[] = {
-       [IVBEP_PCI_UNCORE_HA]   = &ivbep_uncore_ha,
-       [IVBEP_PCI_UNCORE_IMC]  = &ivbep_uncore_imc,
-       [IVBEP_PCI_UNCORE_IRP]  = &ivbep_uncore_irp,
-       [IVBEP_PCI_UNCORE_QPI]  = &ivbep_uncore_qpi,
-       [IVBEP_PCI_UNCORE_R2PCIE]       = &ivbep_uncore_r2pcie,
-       [IVBEP_PCI_UNCORE_R3QPI]        = &ivbep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id ivbep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 4 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivbep_uncore_pci_driver = {
-       .name           = "ivbep_uncore",
-       .id_table       = ivbep_uncore_pci_ids,
-};
-
-int ivbep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x0e1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = ivbep_pci_uncores;
-       uncore_pci_driver = &ivbep_uncore_pci_driver;
-       return 0;
-}
-/* end of IvyTown uncore support */
-
-/* KNL uncore support */
-static struct attribute *knl_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = KNL_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_ubox_format_group,
-};
-
-static struct attribute *knl_uncore_cha_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid4.attr,
-       &format_attr_filter_link3.attr,
-       &format_attr_filter_state4.attr,
-       &format_attr_filter_local.attr,
-       &format_attr_filter_all_op.attr,
-       &format_attr_filter_nnm.attr,
-       &format_attr_filter_opc3.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_cha_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_cha_formats_attr,
-};
-
-static struct event_constraint knl_uncore_cha_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg knl_uncore_cha_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
-       EVENT_EXTRA_END
-};
-
-static u64 knl_cha_filter_mask(int fields)
-{
-       u64 mask = 0;
-
-       if (fields & 0x1)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x4)
-               mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
-       return mask;
-}
-
-static struct event_constraint *
-knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
-}
-
-static int knl_cha_hw_config(struct intel_uncore_box *box,
-                            struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                   struct perf_event *event);
-
-static struct intel_uncore_ops knl_uncore_cha_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = knl_cha_hw_config,
-       .get_constraint         = knl_cha_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type knl_uncore_cha = {
-       .name                   = "cha",
-       .num_counters           = 4,
-       .num_boxes              = 38,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = KNL_CHA_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = knl_uncore_cha_constraints,
-       .ops                    = &knl_uncore_cha_ops,
-       .format_group           = &knl_uncore_cha_format_group,
-};
-
-static struct attribute *knl_uncore_pcu_formats_attr[] = {
-       &format_attr_event2.attr,
-       &format_attr_use_occ_ctr.attr,
-       &format_attr_occ_sel.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh6.attr,
-       &format_attr_occ_invert.attr,
-       &format_attr_occ_edge_det.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_pcu_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_pcu_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_msr_ops,
-       .format_group           = &knl_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *knl_msr_uncores[] = {
-       &knl_uncore_ubox,
-       &knl_uncore_cha,
-       &knl_uncore_pcu,
-       NULL,
-};
-
-void knl_uncore_cpu_init(void)
-{
-       uncore_msr_uncores = knl_msr_uncores;
-}
-
-static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       int box_ctl = uncore_pci_box_ctl(box);
-
-       pci_write_config_dword(pdev, box_ctl, 0);
-}
-
-static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
-                                       struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-
-       if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
-                                                       == UNCORE_FIXED_EVENT)
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | KNL_PMON_FIXED_CTL_EN);
-       else
-               pci_write_config_dword(pdev, hwc->config_base,
-                                      hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops knl_uncore_imc_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = knl_uncore_imc_enable_box,
-       .read_counter   = snbep_uncore_pci_read_counter,
-       .enable_event   = knl_uncore_imc_enable_event,
-       .disable_event  = snbep_uncore_pci_disable_event,
-};
-
-static struct intel_uncore_type knl_uncore_imc_uclk = {
-       .name                   = "imc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 2,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_imc_dclk = {
-       .name                   = "imc",
-       .num_counters           = 4,
-       .num_boxes              = 6,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_MC0_CH0_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
-       .fixed_ctl              = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
-       .box_ctl                = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_uclk = {
-       .name                   = "edc_uclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_UCLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_UCLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-       .fixed_ctl              = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-       .box_ctl                = KNL_UCLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_eclk = {
-       .name                   = "edc_eclk",
-       .num_counters           = 4,
-       .num_boxes              = 8,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
-       .event_ctl              = KNL_EDC0_ECLK_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
-       .fixed_ctl              = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
-       .box_ctl                = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
-       .ops                    = &knl_uncore_imc_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct event_constraint knl_uncore_m2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type knl_uncore_m2pcie = {
-       .name           = "m2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = knl_uncore_m2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct attribute *knl_uncore_irp_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_qor.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group knl_uncore_irp_format_group = {
-       .name = "format",
-       .attrs = knl_uncore_irp_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = KNL_IRP_PCI_PMON_BOX_CTL,
-       .ops                    = &snbep_uncore_pci_ops,
-       .format_group           = &knl_uncore_irp_format_group,
-};
-
-enum {
-       KNL_PCI_UNCORE_MC_UCLK,
-       KNL_PCI_UNCORE_MC_DCLK,
-       KNL_PCI_UNCORE_EDC_UCLK,
-       KNL_PCI_UNCORE_EDC_ECLK,
-       KNL_PCI_UNCORE_M2PCIE,
-       KNL_PCI_UNCORE_IRP,
-};
-
-static struct intel_uncore_type *knl_pci_uncores[] = {
-       [KNL_PCI_UNCORE_MC_UCLK]        = &knl_uncore_imc_uclk,
-       [KNL_PCI_UNCORE_MC_DCLK]        = &knl_uncore_imc_dclk,
-       [KNL_PCI_UNCORE_EDC_UCLK]       = &knl_uncore_edc_uclk,
-       [KNL_PCI_UNCORE_EDC_ECLK]       = &knl_uncore_edc_eclk,
-       [KNL_PCI_UNCORE_M2PCIE]         = &knl_uncore_m2pcie,
-       [KNL_PCI_UNCORE_IRP]            = &knl_uncore_irp,
-       NULL,
-};
-
-/*
- * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
- * device type. prior to KNL, each instance of a PMU device type had a unique
- * device ID.
- *
- *     PCI Device ID   Uncore PMU Devices
- *     ----------------------------------
- *     0x7841          MC0 UClk, MC1 UClk
- *     0x7843          MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
- *                     MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
- *     0x7833          EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
- *                     EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
- *     0x7835          EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
- *                     EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
- *     0x7817          M2PCIe
- *     0x7814          IRP
-*/
-
-static const struct pci_device_id knl_uncore_pci_ids[] = {
-       { /* MC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
-       },
-       { /* MC DClk Channel */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
-       },
-       { /* EDC UClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
-       },
-       { /* EDC EClk */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
-       },
-       { /* M2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
-               .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver knl_uncore_pci_driver = {
-       .name           = "knl_uncore",
-       .id_table       = knl_uncore_pci_ids,
-};
-
-int knl_uncore_pci_init(void)
-{
-       int ret;
-
-       /* All KNL PCI based PMON units are on the same PCI bus except IRP */
-       ret = snb_pci2phy_map_init(0x7814); /* IRP */
-       if (ret)
-               return ret;
-       ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
-       if (ret)
-               return ret;
-       uncore_pci_uncores = knl_pci_uncores;
-       uncore_pci_driver = &knl_uncore_pci_driver;
-       return 0;
-}
-
-/* end of KNL uncore support */
-
-/* Haswell-EP uncore support */
-static struct attribute *hswep_uncore_ubox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh5.attr,
-       &format_attr_filter_tid2.attr,
-       &format_attr_filter_cid.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_ubox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_ubox_formats_attr,
-};
-
-static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       reg1->reg = HSWEP_U_MSR_PMON_FILTER;
-       reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
-       reg1->idx = 0;
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_ubox_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_ubox_hw_config,
-       .get_constraint         = uncore_get_constraint,
-       .put_constraint         = uncore_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 44,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_ubox_ops,
-       .format_group           = &hswep_uncore_ubox_format_group,
-};
-
-static struct attribute *hswep_uncore_cbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_thresh8.attr,
-       &format_attr_filter_tid3.attr,
-       &format_attr_filter_link2.attr,
-       &format_attr_filter_state3.attr,
-       &format_attr_filter_nid2.attr,
-       &format_attr_filter_opc2.attr,
-       &format_attr_filter_nc.attr,
-       &format_attr_filter_c6.attr,
-       &format_attr_filter_isoc.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_cbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_cbox_formats_attr,
-};
-
-static struct event_constraint hswep_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
-       SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-                                 SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-       SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-       EVENT_EXTRA_END
-};
-
-static u64 hswep_cbox_filter_mask(int fields)
-{
-       u64 mask = 0;
-       if (fields & 0x1)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
-       if (fields & 0x2)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
-       if (fields & 0x4)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-       if (fields & 0x8)
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
-       if (fields & 0x10) {
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
-               mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
-       }
-       return mask;
-}
-
-static struct event_constraint *
-hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-       return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
-}
-
-static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-       struct extra_reg *er;
-       int idx = 0;
-
-       for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
-               if (er->event != (event->hw.config & er->config_mask))
-                       continue;
-               idx |= er->idx;
-       }
-
-       if (idx) {
-               reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
-                           HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-               reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
-               reg1->idx = idx;
-       }
-       return 0;
-}
-
-static void hswep_cbox_enable_event(struct intel_uncore_box *box,
-                                 struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-       if (reg1->idx != EXTRA_REG_NONE) {
-               u64 filter = uncore_shared_reg_config(box, 0);
-               wrmsrl(reg1->reg, filter & 0xffffffff);
-               wrmsrl(reg1->reg + 1, filter >> 32);
-       }
-
-       wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops hswep_uncore_cbox_ops = {
-       .init_box               = snbep_uncore_msr_init_box,
-       .disable_box            = snbep_uncore_msr_disable_box,
-       .enable_box             = snbep_uncore_msr_enable_box,
-       .disable_event          = snbep_uncore_msr_disable_event,
-       .enable_event           = hswep_cbox_enable_event,
-       .read_counter           = uncore_msr_read_counter,
-       .hw_config              = hswep_cbox_hw_config,
-       .get_constraint         = hswep_cbox_get_constraint,
-       .put_constraint         = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 18,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = hswep_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-/*
- * Write SBOX Initialization register bit by bit to avoid spurious #GPs
- */
-static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
-{
-       unsigned msr = uncore_msr_box_ctl(box);
-
-       if (msr) {
-               u64 init = SNBEP_PMON_BOX_CTL_INT;
-               u64 flags = 0;
-               int i;
-
-               for_each_set_bit(i, (unsigned long *)&init, 64) {
-                       flags |= (1ULL << i);
-                       wrmsrl(msr, flags);
-               }
-       }
-}
-
-static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
-       __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .init_box               = hswep_uncore_sbox_msr_init_box
-};
-
-static struct attribute *hswep_uncore_sbox_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_tid_en.attr,
-       &format_attr_inv.attr,
-       &format_attr_thresh8.attr,
-       NULL,
-};
-
-static struct attribute_group hswep_uncore_sbox_format_group = {
-       .name = "format",
-       .attrs = hswep_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_type hswep_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 44,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-       int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-       if (ev_sel >= 0xb && ev_sel <= 0xe) {
-               reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
-               reg1->idx = ev_sel - 0xb;
-               reg1->config = event->attr.config1 & (0xff << reg1->idx);
-       }
-       return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_pcu_ops = {
-       SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-       .hw_config              = hswep_pcu_hw_config,
-       .get_constraint         = snbep_pcu_get_constraint,
-       .put_constraint         = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_pcu = {
-       .name                   = "pcu",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = HSWEP_PCU_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_PCU_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_PCU_MSR_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &hswep_uncore_pcu_ops,
-       .format_group           = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *hswep_msr_uncores[] = {
-       &hswep_uncore_ubox,
-       &hswep_uncore_cbox,
-       &hswep_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void hswep_uncore_cpu_init(void)
-{
-       if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-
-       /* Detect 6-8 core systems with only two SBOXes */
-       if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
-               u32 capid4;
-
-               pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
-                                     0x94, &capid4);
-               if (((capid4 >> 6) & 0x3) == 0)
-                       hswep_uncore_sbox.num_boxes = 2;
-       }
-
-       uncore_msr_uncores = hswep_msr_uncores;
-}
-
-static struct intel_uncore_type hswep_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 5,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct uncore_event_desc hswep_uncore_imc_events[] = {
-       INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0x00,umask=0x00"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
-       INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
-       { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type hswep_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
-
-static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-       struct pci_dev *pdev = box->pci_dev;
-       struct hw_perf_event *hwc = &event->hw;
-       u64 count = 0;
-
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-       pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-       return count;
-}
-
-static struct intel_uncore_ops hswep_uncore_irp_ops = {
-       .init_box       = snbep_uncore_pci_init_box,
-       .disable_box    = snbep_uncore_pci_disable_box,
-       .enable_box     = snbep_uncore_pci_enable_box,
-       .disable_event  = ivbep_uncore_irp_disable_event,
-       .enable_event   = ivbep_uncore_irp_enable_event,
-       .read_counter   = hswep_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type hswep_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type hswep_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 5,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = hswep_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type hswep_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 4,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 44,
-       .constraints    = hswep_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       HSWEP_PCI_UNCORE_HA,
-       HSWEP_PCI_UNCORE_IMC,
-       HSWEP_PCI_UNCORE_IRP,
-       HSWEP_PCI_UNCORE_QPI,
-       HSWEP_PCI_UNCORE_R2PCIE,
-       HSWEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *hswep_pci_uncores[] = {
-       [HSWEP_PCI_UNCORE_HA]   = &hswep_uncore_ha,
-       [HSWEP_PCI_UNCORE_IMC]  = &hswep_uncore_imc,
-       [HSWEP_PCI_UNCORE_IRP]  = &hswep_uncore_irp,
-       [HSWEP_PCI_UNCORE_QPI]  = &hswep_uncore_qpi,
-       [HSWEP_PCI_UNCORE_R2PCIE]       = &hswep_uncore_r2pcie,
-       [HSWEP_PCI_UNCORE_R3QPI]        = &hswep_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id hswep_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT0_FILTER),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  SNBEP_PCI_QPI_PORT1_FILTER),
-       },
-       { /* PCU.3 (for Capability registers) */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-                                                  HSWEP_PCI_PCU_3),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver hswep_uncore_pci_driver = {
-       .name           = "hswep_uncore",
-       .id_table       = hswep_uncore_pci_ids,
-};
-
-int hswep_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x2f1e);
-       if (ret)
-               return ret;
-       uncore_pci_uncores = hswep_pci_uncores;
-       uncore_pci_driver = &hswep_uncore_pci_driver;
-       return 0;
-}
-/* end of Haswell-EP uncore support */
-
-/* BDX uncore support */
-
-static struct intel_uncore_type bdx_uncore_ubox = {
-       .name                   = "ubox",
-       .num_counters           = 2,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .fixed_ctr_bits         = 48,
-       .perf_ctr               = HSWEP_U_MSR_PMON_CTR0,
-       .event_ctl              = HSWEP_U_MSR_PMON_CTL0,
-       .event_mask             = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-       .fixed_ctr              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-       .fixed_ctl              = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &ivbep_uncore_msr_ops,
-       .format_group           = &ivbep_uncore_ubox_format_group,
-};
-
-static struct event_constraint bdx_uncore_cbox_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_cbox = {
-       .name                   = "cbox",
-       .num_counters           = 4,
-       .num_boxes              = 24,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
-       .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_C0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_CBO_MSR_OFFSET,
-       .num_shared_regs        = 1,
-       .constraints            = bdx_uncore_cbox_constraints,
-       .ops                    = &hswep_uncore_cbox_ops,
-       .format_group           = &hswep_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_sbox = {
-       .name                   = "sbox",
-       .num_counters           = 4,
-       .num_boxes              = 4,
-       .perf_ctr_bits          = 48,
-       .event_ctl              = HSWEP_S0_MSR_PMON_CTL0,
-       .perf_ctr               = HSWEP_S0_MSR_PMON_CTR0,
-       .event_mask             = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
-       .box_ctl                = HSWEP_S0_MSR_PMON_BOX_CTL,
-       .msr_offset             = HSWEP_SBOX_MSR_OFFSET,
-       .ops                    = &hswep_uncore_sbox_msr_ops,
-       .format_group           = &hswep_uncore_sbox_format_group,
-};
-
-static struct intel_uncore_type *bdx_msr_uncores[] = {
-       &bdx_uncore_ubox,
-       &bdx_uncore_cbox,
-       &bdx_uncore_sbox,
-       &hswep_uncore_pcu,
-       NULL,
-};
-
-void bdx_uncore_cpu_init(void)
-{
-       if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
-               bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
-       uncore_msr_uncores = bdx_msr_uncores;
-}
-
-static struct intel_uncore_type bdx_uncore_ha = {
-       .name           = "ha",
-       .num_counters   = 4,
-       .num_boxes      = 2,
-       .perf_ctr_bits  = 48,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_imc = {
-       .name           = "imc",
-       .num_counters   = 5,
-       .num_boxes      = 8,
-       .perf_ctr_bits  = 48,
-       .fixed_ctr_bits = 48,
-       .fixed_ctr      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-       .fixed_ctl      = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-       .event_descs    = hswep_uncore_imc_events,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type bdx_uncore_irp = {
-       .name                   = "irp",
-       .num_counters           = 4,
-       .num_boxes              = 1,
-       .perf_ctr_bits          = 48,
-       .event_mask             = SNBEP_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .ops                    = &hswep_uncore_irp_ops,
-       .format_group           = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type bdx_uncore_qpi = {
-       .name                   = "qpi",
-       .num_counters           = 4,
-       .num_boxes              = 3,
-       .perf_ctr_bits          = 48,
-       .perf_ctr               = SNBEP_PCI_PMON_CTR0,
-       .event_ctl              = SNBEP_PCI_PMON_CTL0,
-       .event_mask             = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-       .box_ctl                = SNBEP_PCI_PMON_BOX_CTL,
-       .num_shared_regs        = 1,
-       .ops                    = &snbep_uncore_qpi_ops,
-       .format_group           = &snbep_uncore_qpi_format_group,
-};
-
-static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r2pcie = {
-       .name           = "r2pcie",
-       .num_counters   = 4,
-       .num_boxes      = 1,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r2pcie_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
-       UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
-       UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-       UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-       UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-       EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type bdx_uncore_r3qpi = {
-       .name           = "r3qpi",
-       .num_counters   = 3,
-       .num_boxes      = 3,
-       .perf_ctr_bits  = 48,
-       .constraints    = bdx_uncore_r3qpi_constraints,
-       SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-       BDX_PCI_UNCORE_HA,
-       BDX_PCI_UNCORE_IMC,
-       BDX_PCI_UNCORE_IRP,
-       BDX_PCI_UNCORE_QPI,
-       BDX_PCI_UNCORE_R2PCIE,
-       BDX_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *bdx_pci_uncores[] = {
-       [BDX_PCI_UNCORE_HA]     = &bdx_uncore_ha,
-       [BDX_PCI_UNCORE_IMC]    = &bdx_uncore_imc,
-       [BDX_PCI_UNCORE_IRP]    = &bdx_uncore_irp,
-       [BDX_PCI_UNCORE_QPI]    = &bdx_uncore_qpi,
-       [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
-       [BDX_PCI_UNCORE_R3QPI]  = &bdx_uncore_r3qpi,
-       NULL,
-};
-
-static const struct pci_device_id bdx_uncore_pci_ids[] = {
-       { /* Home Agent 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
-       },
-       { /* Home Agent 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
-       },
-       { /* MC0 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
-       },
-       { /* MC0 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
-       },
-       { /* MC0 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
-       },
-       { /* MC0 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
-       },
-       { /* MC1 Channel 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
-       },
-       { /* MC1 Channel 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
-       },
-       { /* MC1 Channel 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
-       },
-       { /* MC1 Channel 3 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
-       },
-       { /* IRP */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
-       },
-       { /* QPI0 Port 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
-       },
-       { /* QPI0 Port 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
-       },
-       { /* QPI1 Port 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
-       },
-       { /* R2PCIe */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
-       },
-       { /* R3QPI0 Link 0 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
-       },
-       { /* R3QPI0 Link 1 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
-       },
-       { /* R3QPI1 Link 2 */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
-               .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
-       },
-       { /* QPI Port 0 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
-       },
-       { /* QPI Port 1 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
-       },
-       { /* QPI Port 2 filter  */
-               PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
-               .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
-       },
-       { /* end: all zeroes */ }
-};
-
-static struct pci_driver bdx_uncore_pci_driver = {
-       .name           = "bdx_uncore",
-       .id_table       = bdx_uncore_pci_ids,
-};
-
-int bdx_uncore_pci_init(void)
-{
-       int ret = snbep_pci2phy_map_init(0x6f1e);
-
-       if (ret)
-               return ret;
-       uncore_pci_uncores = bdx_pci_uncores;
-       uncore_pci_driver = &bdx_uncore_pci_driver;
-       return 0;
-}
-
-/* end of BDX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
deleted file mode 100644 (file)
index 5b0c232..0000000
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Driver for Intel Xeon Phi "Knights Corner" PMU */
-
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/hardirq.h>
-
-#include "perf_event.h"
-
-static const u64 knc_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x002a,
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x0016,
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0028,
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x0029,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x0012,
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x002b,
-};
-
-static const u64 __initconst knc_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               /* On Xeon Phi event "0" is a valid DATA_READ          */
-               /*   (L1 Data Cache Reads) Instruction.                */
-               /* We code this as ARCH_PERFMON_EVENTSEL_INT as this   */
-               /* bit will always be set in x86_pmu_hw_config().      */
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ           */
-               [ C(RESULT_MISS)   ] = 0x0003,  /* DATA_READ_MISS      */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE          */
-               [ C(RESULT_MISS)   ] = 0x0004,  /* DATA_WRITE_MISS     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0011,  /* L1_DATA_PF1         */
-               [ C(RESULT_MISS)   ] = 0x001c,  /* L1_DATA_PF1_MISS    */
-       },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ          */
-               [ C(RESULT_MISS)   ] = 0x000e,  /* CODE_CACHE_MISS    */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x10cb,  /* L2_READ_MISS */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10cc,  /* L2_WRITE_HIT */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x10fc,  /* L2_DATA_PF2      */
-               [ C(RESULT_MISS)   ] = 0x10fe,  /* L2_DATA_PF2_MISS */
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
-                                               /* DATA_READ */
-                                               /* see note on L1 OP_READ */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0001,  /* DATA_WRITE */
-               [ C(RESULT_MISS)   ] = 0x0002,  /* DATA_PAGE_WALK */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = 0x0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x000c,  /* CODE_READ */
-               [ C(RESULT_MISS)   ] = 0x000d,  /* CODE_PAGE_WALK */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0012,  /* BRANCHES */
-               [ C(RESULT_MISS)   ] = 0x002b,  /* BRANCHES_MISPREDICTED */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-
-static u64 knc_pmu_event_map(int hw_event)
-{
-       return knc_perfmon_event_map[hw_event];
-}
-
-static struct event_constraint knc_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc3, 0x1),      /* HWP_L2HIT */
-       INTEL_EVENT_CONSTRAINT(0xc4, 0x1),      /* HWP_L2MISS */
-       INTEL_EVENT_CONSTRAINT(0xc8, 0x1),      /* L2_READ_HIT_E */
-       INTEL_EVENT_CONSTRAINT(0xc9, 0x1),      /* L2_READ_HIT_M */
-       INTEL_EVENT_CONSTRAINT(0xca, 0x1),      /* L2_READ_HIT_S */
-       INTEL_EVENT_CONSTRAINT(0xcb, 0x1),      /* L2_READ_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcc, 0x1),      /* L2_WRITE_HIT */
-       INTEL_EVENT_CONSTRAINT(0xce, 0x1),      /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
-       INTEL_EVENT_CONSTRAINT(0xcf, 0x1),      /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
-       INTEL_EVENT_CONSTRAINT(0xd7, 0x1),      /* L2_VICTIM_REQ_WITH_DATA */
-       INTEL_EVENT_CONSTRAINT(0xe3, 0x1),      /* SNP_HITM_BUNIT */
-       INTEL_EVENT_CONSTRAINT(0xe6, 0x1),      /* SNP_HIT_L2 */
-       INTEL_EVENT_CONSTRAINT(0xe7, 0x1),      /* SNP_HITM_L2 */
-       INTEL_EVENT_CONSTRAINT(0xf1, 0x1),      /* L2_DATA_READ_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf2, 0x1),      /* L2_DATA_WRITE_MISS_CACHE_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf6, 0x1),      /* L2_DATA_READ_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xf7, 0x1),      /* L2_DATA_WRITE_MISS_MEM_FILL */
-       INTEL_EVENT_CONSTRAINT(0xfc, 0x1),      /* L2_DATA_PF2 */
-       INTEL_EVENT_CONSTRAINT(0xfd, 0x1),      /* L2_DATA_PF2_DROP */
-       INTEL_EVENT_CONSTRAINT(0xfe, 0x1),      /* L2_DATA_PF2_MISS */
-       INTEL_EVENT_CONSTRAINT(0xff, 0x1),      /* L2_DATA_HIT_INFLIGHT_PF2 */
-       EVENT_CONSTRAINT_END
-};
-
-#define MSR_KNC_IA32_PERF_GLOBAL_STATUS                0x0000002d
-#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL   0x0000002e
-#define MSR_KNC_IA32_PERF_GLOBAL_CTRL          0x0000002f
-
-#define KNC_ENABLE_COUNTER0                    0x00000001
-#define KNC_ENABLE_COUNTER1                    0x00000002
-
-static void knc_pmu_disable_all(void)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static void knc_pmu_enable_all(int added)
-{
-       u64 val;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-       val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static inline void
-knc_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static void knc_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
-       (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static inline u64 knc_pmu_get_status(void)
-{
-       u64 status;
-
-       rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
-
-       return status;
-}
-
-static inline void knc_pmu_ack_status(u64 ack)
-{
-       wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
-}
-
-static int knc_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       int handled = 0;
-       int bit, loops;
-       u64 status;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       knc_pmu_disable_all();
-
-       status = knc_pmu_get_status();
-       if (!status) {
-               knc_pmu_enable_all(0);
-               return handled;
-       }
-
-       loops = 0;
-again:
-       knc_pmu_ack_status(status);
-       if (++loops > 100) {
-               WARN_ONCE(1, "perf: irq loop stuck!\n");
-               perf_event_print_debug();
-               goto done;
-       }
-
-       inc_irq_stat(apic_perf_irqs);
-
-       for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
-               struct perf_event *event = cpuc->events[bit];
-
-               handled++;
-
-               if (!test_bit(bit, cpuc->active_mask))
-                       continue;
-
-               if (!intel_pmu_save_and_restart(event))
-                       continue;
-
-               perf_sample_data_init(&data, 0, event->hw.last_period);
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       /*
-        * Repeat if there is more work to be done:
-        */
-       status = knc_pmu_get_status();
-       if (status)
-               goto again;
-
-done:
-       knc_pmu_enable_all(0);
-
-       return handled;
-}
-
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_knc_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static const struct x86_pmu knc_pmu __initconst = {
-       .name                   = "knc",
-       .handle_irq             = knc_pmu_handle_irq,
-       .disable_all            = knc_pmu_disable_all,
-       .enable_all             = knc_pmu_enable_all,
-       .enable                 = knc_pmu_enable_event,
-       .disable                = knc_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_KNC_EVNTSEL0,
-       .perfctr                = MSR_KNC_PERFCTR0,
-       .event_map              = knc_pmu_event_map,
-       .max_events             = ARRAY_SIZE(knc_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 39) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       .cntval_bits            = 40,
-       .cntval_mask            = (1ULL << 40) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = knc_event_constraints,
-       .format_attrs           = intel_knc_formats_attr,
-};
-
-__init int knc_pmu_init(void)
-{
-       x86_pmu = knc_pmu;
-
-       memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, 
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
deleted file mode 100644 (file)
index ec863b9..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-#include <linux/perf_event.h>
-
-enum perf_msr_id {
-       PERF_MSR_TSC                    = 0,
-       PERF_MSR_APERF                  = 1,
-       PERF_MSR_MPERF                  = 2,
-       PERF_MSR_PPERF                  = 3,
-       PERF_MSR_SMI                    = 4,
-
-       PERF_MSR_EVENT_MAX,
-};
-
-static bool test_aperfmperf(int idx)
-{
-       return boot_cpu_has(X86_FEATURE_APERFMPERF);
-}
-
-static bool test_intel(int idx)
-{
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-           boot_cpu_data.x86 != 6)
-               return false;
-
-       switch (boot_cpu_data.x86_model) {
-       case 30: /* 45nm Nehalem    */
-       case 26: /* 45nm Nehalem-EP */
-       case 46: /* 45nm Nehalem-EX */
-
-       case 37: /* 32nm Westmere    */
-       case 44: /* 32nm Westmere-EP */
-       case 47: /* 32nm Westmere-EX */
-
-       case 42: /* 32nm SandyBridge         */
-       case 45: /* 32nm SandyBridge-E/EN/EP */
-
-       case 58: /* 22nm IvyBridge       */
-       case 62: /* 22nm IvyBridge-EP/EX */
-
-       case 60: /* 22nm Haswell Core */
-       case 63: /* 22nm Haswell Server */
-       case 69: /* 22nm Haswell ULT */
-       case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
-       case 61: /* 14nm Broadwell Core-M */
-       case 86: /* 14nm Broadwell Xeon D */
-       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-       case 79: /* 14nm Broadwell Server */
-
-       case 55: /* 22nm Atom "Silvermont"                */
-       case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-       case 76: /* 14nm Atom "Airmont"                   */
-               if (idx == PERF_MSR_SMI)
-                       return true;
-               break;
-
-       case 78: /* 14nm Skylake Mobile */
-       case 94: /* 14nm Skylake Desktop */
-               if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
-                       return true;
-               break;
-       }
-
-       return false;
-}
-
-struct perf_msr {
-       u64     msr;
-       struct  perf_pmu_events_attr *attr;
-       bool    (*test)(int idx);
-};
-
-PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
-PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
-PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
-PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
-PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
-
-static struct perf_msr msr[] = {
-       [PERF_MSR_TSC]   = { 0,                 &evattr_tsc,    NULL,            },
-       [PERF_MSR_APERF] = { MSR_IA32_APERF,    &evattr_aperf,  test_aperfmperf, },
-       [PERF_MSR_MPERF] = { MSR_IA32_MPERF,    &evattr_mperf,  test_aperfmperf, },
-       [PERF_MSR_PPERF] = { MSR_PPERF,         &evattr_pperf,  test_intel,      },
-       [PERF_MSR_SMI]   = { MSR_SMI_COUNT,     &evattr_smi,    test_intel,      },
-};
-
-static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
-       NULL,
-};
-
-static struct attribute_group events_attr_group = {
-       .name = "events",
-       .attrs = events_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-static struct attribute *format_attrs[] = {
-       &format_attr_event.attr,
-       NULL,
-};
-static struct attribute_group format_attr_group = {
-       .name = "format",
-       .attrs = format_attrs,
-};
-
-static const struct attribute_group *attr_groups[] = {
-       &events_attr_group,
-       &format_attr_group,
-       NULL,
-};
-
-static int msr_event_init(struct perf_event *event)
-{
-       u64 cfg = event->attr.config;
-
-       if (event->attr.type != event->pmu->type)
-               return -ENOENT;
-
-       if (cfg >= PERF_MSR_EVENT_MAX)
-               return -EINVAL;
-
-       /* unsupported modes and filters */
-       if (event->attr.exclude_user   ||
-           event->attr.exclude_kernel ||
-           event->attr.exclude_hv     ||
-           event->attr.exclude_idle   ||
-           event->attr.exclude_host   ||
-           event->attr.exclude_guest  ||
-           event->attr.sample_period) /* no sampling */
-               return -EINVAL;
-
-       if (!msr[cfg].attr)
-               return -EINVAL;
-
-       event->hw.idx = -1;
-       event->hw.event_base = msr[cfg].msr;
-       event->hw.config = cfg;
-
-       return 0;
-}
-
-static inline u64 msr_read_counter(struct perf_event *event)
-{
-       u64 now;
-
-       if (event->hw.event_base)
-               rdmsrl(event->hw.event_base, now);
-       else
-               rdtscll(now);
-
-       return now;
-}
-static void msr_event_update(struct perf_event *event)
-{
-       u64 prev, now;
-       s64 delta;
-
-       /* Careful, an NMI might modify the previous event value. */
-again:
-       prev = local64_read(&event->hw.prev_count);
-       now = msr_read_counter(event);
-
-       if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
-               goto again;
-
-       delta = now - prev;
-       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
-               delta = sign_extend64(delta, 31);
-
-       local64_add(now - prev, &event->count);
-}
-
-static void msr_event_start(struct perf_event *event, int flags)
-{
-       u64 now;
-
-       now = msr_read_counter(event);
-       local64_set(&event->hw.prev_count, now);
-}
-
-static void msr_event_stop(struct perf_event *event, int flags)
-{
-       msr_event_update(event);
-}
-
-static void msr_event_del(struct perf_event *event, int flags)
-{
-       msr_event_stop(event, PERF_EF_UPDATE);
-}
-
-static int msr_event_add(struct perf_event *event, int flags)
-{
-       if (flags & PERF_EF_START)
-               msr_event_start(event, flags);
-
-       return 0;
-}
-
-static struct pmu pmu_msr = {
-       .task_ctx_nr    = perf_sw_context,
-       .attr_groups    = attr_groups,
-       .event_init     = msr_event_init,
-       .add            = msr_event_add,
-       .del            = msr_event_del,
-       .start          = msr_event_start,
-       .stop           = msr_event_stop,
-       .read           = msr_event_update,
-       .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
-};
-
-static int __init msr_init(void)
-{
-       int i, j = 0;
-
-       if (!boot_cpu_has(X86_FEATURE_TSC)) {
-               pr_cont("no MSR PMU driver.\n");
-               return 0;
-       }
-
-       /* Probe the MSRs. */
-       for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
-               u64 val;
-
-               /*
-                * Virt sucks arse; you cannot tell if a R/O MSR is present :/
-                */
-               if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
-                       msr[i].attr = NULL;
-       }
-
-       /* List remaining MSRs in the sysfs attrs. */
-       for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
-               if (msr[i].attr)
-                       events_attrs[j++] = &msr[i].attr->attr.attr;
-       }
-       events_attrs[j] = NULL;
-
-       perf_pmu_register(&pmu_msr, "msr", -1);
-
-       return 0;
-}
-device_initcall(msr_init);
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
deleted file mode 100644 (file)
index f2e5678..0000000
+++ /dev/null
@@ -1,1376 +0,0 @@
-/*
- * Netburst Performance Events (P4, old Xeon)
- *
- *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
- *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
- *
- *  For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-#include <asm/perf_event_p4.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-#define P4_CNTR_LIMIT 3
-/*
- * array indices: 0,1 - HT threads, used with HT enabled cpu
- */
-struct p4_event_bind {
-       unsigned int opcode;                    /* Event code and ESCR selector */
-       unsigned int escr_msr[2];               /* ESCR MSR for this event */
-       unsigned int escr_emask;                /* valid ESCR EventMask bits */
-       unsigned int shared;                    /* event is shared across threads */
-       char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
-};
-
-struct p4_pebs_bind {
-       unsigned int metric_pebs;
-       unsigned int metric_vert;
-};
-
-/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
-#define P4_GEN_PEBS_BIND(name, pebs, vert)                     \
-       [P4_PEBS_METRIC__##name] = {                            \
-               .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG,   \
-               .metric_vert = vert,                            \
-       }
-
-/*
- * note we have P4_PEBS_ENABLE_UOP_TAG always set here
- *
- * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
- * event configuration to find out which values are to be
- * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * resgisters
- */
-static struct p4_pebs_bind p4_pebs_bind_map[] = {
-       P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired,  0x0000001, 0x0000001),
-       P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired,  0x0000002, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_load_miss_retired,        0x0000004, 0x0000001),
-       P4_GEN_PEBS_BIND(dtlb_store_miss_retired,       0x0000004, 0x0000002),
-       P4_GEN_PEBS_BIND(dtlb_all_miss_retired,         0x0000004, 0x0000003),
-       P4_GEN_PEBS_BIND(tagged_mispred_branch,         0x0018000, 0x0000010),
-       P4_GEN_PEBS_BIND(mob_load_replay_retired,       0x0000200, 0x0000001),
-       P4_GEN_PEBS_BIND(split_load_retired,            0x0000400, 0x0000001),
-       P4_GEN_PEBS_BIND(split_store_retired,           0x0000400, 0x0000002),
-};
-
-/*
- * Note that we don't use CCCR1 here, there is an
- * exception for P4_BSQ_ALLOCATION but we just have
- * no workaround
- *
- * consider this binding as resources which particular
- * event may borrow, it doesn't contain EventMask,
- * Tags and friends -- they are left to a caller
- */
-static struct p4_event_bind p4_event_bind_map[] = {
-       [P4_EVENT_TC_DELIVER_MODE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
-               .shared         = 1,
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_BPU_FETCH_REQUEST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
-               .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_ITLB_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
-               .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_MEMORY_CANCEL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MEMORY_COMPLETE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_LOAD_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_STORE_PORT_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
-               .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_MOB_LOAD_REPLAY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
-               .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_PAGE_WALK_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
-               .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_CACHE_REFERENCE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ALLOCATION] = {
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_FSB_DATA_ACTIVITY] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
-               .shared         = 1,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
-               .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
-               .cntr           = { {0, -1, -1}, {1, -1, -1} },
-       },
-       [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
-               .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
-               .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
-               .cntr           = { {2, -1, -1}, {3, -1, -1} },
-       },
-       [P4_EVENT_SSE_INPUT_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_PACKED_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_SP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_SCALAR_DP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_64BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_128BIT_MMX_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_X87_FP_UOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
-               .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_TC_MISC] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
-               .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_GLOBAL_POWER_EVENTS] = {
-               .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_TC_MS_XFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_UOP_QUEUE_WRITES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
-               .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RETIRED_BRANCH_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
-               .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
-               .cntr           = { {4, 5, -1}, {6, 7, -1} },
-       },
-       [P4_EVENT_RESOURCE_STALL] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
-               .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_WC_BUFFER] = {
-               .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
-               .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
-               .shared         = 1,
-               .cntr           = { {8, 9, -1}, {10, 11, -1} },
-       },
-       [P4_EVENT_B2B_CYCLES] = {
-               .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_BNR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BNR),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_SNOOP] = {
-               .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_RESPONSE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
-               .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-               .escr_emask     = 0,
-               .cntr           = { {0, -1, -1}, {2, -1, -1} },
-       },
-       [P4_EVENT_FRONT_END_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_EXECUTION_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_REPLAY_EVENT] = {
-               .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOPS_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_UOP_TYPE] = {
-               .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
-               .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_X87_ASSIST] = {
-               .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_MACHINE_CLEAR] = {
-               .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
-               .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-       [P4_EVENT_INSTR_COMPLETED] = {
-               .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
-               .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-               .escr_emask     =
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
-                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
-               .cntr           = { {12, 13, 16}, {14, 15, 17} },
-       },
-};
-
-#define P4_GEN_CACHE_EVENT(event, bit, metric)                           \
-       p4_config_pack_escr(P4_ESCR_EVENT(event)                        | \
-                           P4_ESCR_EMASK_BIT(event, bit))              | \
-       p4_config_pack_cccr(metric                                      | \
-                           P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
-
-static __initconst const u64 p4_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__1stl_cache_load_miss_retired),
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
-       },
-},
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_load_miss_retired),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0,
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
-                                               P4_PEBS_METRIC__dtlb_store_miss_retired),
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
-                                               P4_PEBS_METRIC__none),
-               [ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
-                                               P4_PEBS_METRIC__none),
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(NODE) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-/*
- * Because of Netburst being quite restricted in how many
- * identical events may run simultaneously, we introduce event aliases,
- * ie the different events which have the same functionality but
- * utilize non-intersected resources (ESCR/CCCR/counter registers).
- *
- * This allow us to relax restrictions a bit and run two or more
- * identical events together.
- *
- * Never set any custom internal bits such as P4_CONFIG_HT,
- * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
- * either up to date automatically or not applicable at all.
- */
-struct p4_event_alias {
-       u64 original;
-       u64 alternative;
-} p4_event_aliases[] = {
-       {
-               /*
-                * Non-halted cycles can be substituted with non-sleeping cycles (see
-                * Intel SDM Vol3b for details). We need this alias to be able
-                * to run nmi-watchdog and 'perf top' (or any other user space tool
-                * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
-                * simultaneously.
-                */
-       .original       =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
-       .alternative    =
-               p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)             |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
-                                   P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
-               p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT          |
-                                   P4_CCCR_COMPARE),
-       },
-};
-
-static u64 p4_get_alias_event(u64 config)
-{
-       u64 config_match;
-       int i;
-
-       /*
-        * Only event with special mark is allowed,
-        * we're to be sure it didn't come as malformed
-        * RAW event.
-        */
-       if (!(config & P4_CONFIG_ALIASABLE))
-               return 0;
-
-       config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
-
-       for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
-               if (config_match == p4_event_aliases[i].original) {
-                       config_match = p4_event_aliases[i].alternative;
-                       break;
-               } else if (config_match == p4_event_aliases[i].alternative) {
-                       config_match = p4_event_aliases[i].original;
-                       break;
-               }
-       }
-
-       if (i >= ARRAY_SIZE(p4_event_aliases))
-               return 0;
-
-       return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
-}
-
-static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
-  /* non-halted CPU clocks */
-  [PERF_COUNT_HW_CPU_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))       |
-               P4_CONFIG_ALIASABLE,
-
-  /*
-   * retired instructions
-   * in a sake of simplicity we don't use the FSB tagging
-   */
-  [PERF_COUNT_HW_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)               |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
-
-  /* cache hits */
-  [PERF_COUNT_HW_CACHE_REFERENCES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
-
-  /* cache misses */
-  [PERF_COUNT_HW_CACHE_MISSES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
-               P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
-
-  /* branch instructions retired */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
-
-  /* mispredicted branches retired */
-  [PERF_COUNT_HW_BRANCH_MISSES]        =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)      |
-               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
-
-  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-  [PERF_COUNT_HW_BUS_CYCLES] =
-       p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)           |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
-               P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))        |
-       p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-};
-
-static struct p4_event_bind *p4_config_get_bind(u64 config)
-{
-       unsigned int evnt = p4_config_unpack_event(config);
-       struct p4_event_bind *bind = NULL;
-
-       if (evnt < ARRAY_SIZE(p4_event_bind_map))
-               bind = &p4_event_bind_map[evnt];
-
-       return bind;
-}
-
-static u64 p4_pmu_event_map(int hw_event)
-{
-       struct p4_event_bind *bind;
-       unsigned int esel;
-       u64 config;
-
-       config = p4_general_events[hw_event];
-       bind = p4_config_get_bind(config);
-       esel = P4_OPCODE_ESEL(bind->opcode);
-       config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-
-       return config;
-}
-
-/* check cpu model specifics */
-static bool p4_event_match_cpu_model(unsigned int event_idx)
-{
-       /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
-       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
-               if (boot_cpu_data.x86_model != 3 &&
-                       boot_cpu_data.x86_model != 4 &&
-                       boot_cpu_data.x86_model != 6)
-                       return false;
-       }
-
-       /*
-        * For info
-        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
-        */
-
-       return true;
-}
-
-static int p4_validate_raw_event(struct perf_event *event)
-{
-       unsigned int v, emask;
-
-       /* User data may have out-of-bound event index */
-       v = p4_config_unpack_event(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_event_bind_map))
-               return -EINVAL;
-
-       /* It may be unsupported: */
-       if (!p4_event_match_cpu_model(v))
-               return -EINVAL;
-
-       /*
-        * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
-        * in Architectural Performance Monitoring, it means not
-        * on _which_ logical cpu to count but rather _when_, ie it
-        * depends on logical cpu state -- count event if one cpu active,
-        * none, both or any, so we just allow user to pass any value
-        * desired.
-        *
-        * In turn we always set Tx_OS/Tx_USR bits bound to logical
-        * cpu without their propagation to another cpu
-        */
-
-       /*
-        * if an event is shared across the logical threads
-        * the user needs special permissions to be able to use it
-        */
-       if (p4_ht_active() && p4_event_bind_map[v].shared) {
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-       }
-
-       /* ESCR EventMask bits may be invalid */
-       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
-       if (emask & ~p4_event_bind_map[v].escr_emask)
-               return -EINVAL;
-
-       /*
-        * it may have some invalid PEBS bits
-        */
-       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
-               return -EINVAL;
-
-       v = p4_config_unpack_metric(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
-               return -EINVAL;
-
-       return 0;
-}
-
-static int p4_hw_config(struct perf_event *event)
-{
-       int cpu = get_cpu();
-       int rc = 0;
-       u32 escr, cccr;
-
-       /*
-        * the reason we use cpu that early is that: if we get scheduled
-        * first time on the same cpu -- we will not need swap thread
-        * specific flags in config (and will save some cpu cycles)
-        */
-
-       cccr = p4_default_cccr_conf(cpu);
-       escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
-                                        event->attr.exclude_user);
-       event->hw.config = p4_config_pack_escr(escr) |
-                          p4_config_pack_cccr(cccr);
-
-       if (p4_ht_active() && p4_ht_thread(cpu))
-               event->hw.config = p4_set_ht_bit(event->hw.config);
-
-       if (event->attr.type == PERF_TYPE_RAW) {
-               struct p4_event_bind *bind;
-               unsigned int esel;
-               /*
-                * Clear bits we reserve to be managed by kernel itself
-                * and never allowed from a user space
-                */
-                event->attr.config &= P4_CONFIG_MASK;
-
-               rc = p4_validate_raw_event(event);
-               if (rc)
-                       goto out;
-
-               /*
-                * Note that for RAW events we allow user to use P4_CCCR_RESERVED
-                * bits since we keep additional info here (for cache events and etc)
-                */
-               event->hw.config |= event->attr.config;
-               bind = p4_config_get_bind(event->attr.config);
-               if (!bind) {
-                       rc = -EINVAL;
-                       goto out;
-               }
-               esel = P4_OPCODE_ESEL(bind->opcode);
-               event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-       }
-
-       rc = x86_setup_perfctr(event);
-out:
-       put_cpu();
-       return rc;
-}
-
-static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-{
-       u64 v;
-
-       /* an official way for overflow indication */
-       rdmsrl(hwc->config_base, v);
-       if (v & P4_CCCR_OVF) {
-               wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
-               return 1;
-       }
-
-       /*
-        * In some circumstances the overflow might issue an NMI but did
-        * not set P4_CCCR_OVF bit. Because a counter holds a negative value
-        * we simply check for high bit being set, if it's cleared it means
-        * the counter has reached zero value and continued counting before
-        * real NMI signal was received:
-        */
-       rdmsrl(hwc->event_base, v);
-       if (!(v & ARCH_P4_UNFLAGGED_BIT))
-               return 1;
-
-       return 0;
-}
-
-static void p4_pmu_disable_pebs(void)
-{
-       /*
-        * FIXME
-        *
-        * It's still allowed that two threads setup same cache
-        * events so we can't simply clear metrics until we knew
-        * no one is depending on us, so we need kind of counter
-        * for "ReplayEvent" users.
-        *
-        * What is more complex -- RAW events, if user (for some
-        * reason) will pass some cache event metric with improper
-        * event opcode -- it's fine from hardware point of view
-        * but completely nonsense from "meaning" of such action.
-        *
-        * So at moment let leave metrics turned on forever -- it's
-        * ok for now but need to be revisited!
-        *
-        * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
-        * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
-        */
-}
-
-static inline void p4_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       /*
-        * If event gets disabled while counter is in overflowed
-        * state we need to clear P4_CCCR_OVF, otherwise interrupt get
-        * asserted again and again
-        */
-       (void)wrmsrl_safe(hwc->config_base,
-               p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
-}
-
-static void p4_pmu_disable_all(void)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_disable_event(event);
-       }
-
-       p4_pmu_disable_pebs();
-}
-
-/* configuration must be valid */
-static void p4_pmu_enable_pebs(u64 config)
-{
-       struct p4_pebs_bind *bind;
-       unsigned int idx;
-
-       BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
-
-       idx = p4_config_unpack_metric(config);
-       if (idx == P4_PEBS_METRIC__none)
-               return;
-
-       bind = &p4_pebs_bind_map[idx];
-
-       (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
-       (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,      (u64)bind->metric_vert);
-}
-
-static void p4_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int thread = p4_ht_config_thread(hwc->config);
-       u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-       unsigned int idx = p4_config_unpack_event(hwc->config);
-       struct p4_event_bind *bind;
-       u64 escr_addr, cccr;
-
-       bind = &p4_event_bind_map[idx];
-       escr_addr = bind->escr_msr[thread];
-
-       /*
-        * - we dont support cascaded counters yet
-        * - and counter 1 is broken (erratum)
-        */
-       WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
-       WARN_ON_ONCE(hwc->idx == 1);
-
-       /* we need a real Event value */
-       escr_conf &= ~P4_ESCR_EVENT_MASK;
-       escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
-
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       /*
-        * it could be Cache event so we need to write metrics
-        * into additional MSRs
-        */
-       p4_pmu_enable_pebs(hwc->config);
-
-       (void)wrmsrl_safe(escr_addr, escr_conf);
-       (void)wrmsrl_safe(hwc->config_base,
-                               (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
-}
-
-static void p4_pmu_enable_all(int added)
-{
-       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-       int idx;
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               struct perf_event *event = cpuc->events[idx];
-               if (!test_bit(idx, cpuc->active_mask))
-                       continue;
-               p4_pmu_enable_event(event);
-       }
-}
-
-static int p4_pmu_handle_irq(struct pt_regs *regs)
-{
-       struct perf_sample_data data;
-       struct cpu_hw_events *cpuc;
-       struct perf_event *event;
-       struct hw_perf_event *hwc;
-       int idx, handled = 0;
-       u64 val;
-
-       cpuc = this_cpu_ptr(&cpu_hw_events);
-
-       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               int overflow;
-
-               if (!test_bit(idx, cpuc->active_mask)) {
-                       /* catch in-flight IRQs */
-                       if (__test_and_clear_bit(idx, cpuc->running))
-                               handled++;
-                       continue;
-               }
-
-               event = cpuc->events[idx];
-               hwc = &event->hw;
-
-               WARN_ON_ONCE(hwc->idx != idx);
-
-               /* it might be unflagged overflow */
-               overflow = p4_pmu_clear_cccr_ovf(hwc);
-
-               val = x86_perf_event_update(event);
-               if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
-                       continue;
-
-               handled += overflow;
-
-               /* event overflow for sure */
-               perf_sample_data_init(&data, 0, hwc->last_period);
-
-               if (!x86_perf_event_set_period(event))
-                       continue;
-
-
-               if (perf_event_overflow(event, &data, regs))
-                       x86_pmu_stop(event, 0);
-       }
-
-       if (handled)
-               inc_irq_stat(apic_perf_irqs);
-
-       /*
-        * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
-        * been observed that the OVF bit flag has to be cleared first _before_
-        * the LVTPC can be unmasked.
-        *
-        * The reason is the NMI line will continue to be asserted while the OVF
-        * bit is set.  This causes a second NMI to generate if the LVTPC is
-        * unmasked before the OVF bit is cleared, leading to unknown NMI
-        * messages.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       return handled;
-}
-
-/*
- * swap thread specific fields according to a thread
- * we are going to run on
- */
-static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
-{
-       u32 escr, cccr;
-
-       /*
-        * we either lucky and continue on same cpu or no HT support
-        */
-       if (!p4_should_swap_ts(hwc->config, cpu))
-               return;
-
-       /*
-        * the event is migrated from an another logical
-        * cpu, so we need to swap thread specific flags
-        */
-
-       escr = p4_config_unpack_escr(hwc->config);
-       cccr = p4_config_unpack_cccr(hwc->config);
-
-       if (p4_ht_thread(cpu)) {
-               cccr &= ~P4_CCCR_OVF_PMI_T0;
-               cccr |= P4_CCCR_OVF_PMI_T1;
-               if (escr & P4_ESCR_T0_OS) {
-                       escr &= ~P4_ESCR_T0_OS;
-                       escr |= P4_ESCR_T1_OS;
-               }
-               if (escr & P4_ESCR_T0_USR) {
-                       escr &= ~P4_ESCR_T0_USR;
-                       escr |= P4_ESCR_T1_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config |= P4_CONFIG_HT;
-       } else {
-               cccr &= ~P4_CCCR_OVF_PMI_T1;
-               cccr |= P4_CCCR_OVF_PMI_T0;
-               if (escr & P4_ESCR_T1_OS) {
-                       escr &= ~P4_ESCR_T1_OS;
-                       escr |= P4_ESCR_T0_OS;
-               }
-               if (escr & P4_ESCR_T1_USR) {
-                       escr &= ~P4_ESCR_T1_USR;
-                       escr |= P4_ESCR_T0_USR;
-               }
-               hwc->config  = p4_config_pack_escr(escr);
-               hwc->config |= p4_config_pack_cccr(cccr);
-               hwc->config &= ~P4_CONFIG_HT;
-       }
-}
-
-/*
- * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
- * the metric between any ESCRs is laid in range [0xa0,0xe1]
- *
- * so we make ~70% filled hashtable
- */
-
-#define P4_ESCR_MSR_BASE               0x000003a0
-#define P4_ESCR_MSR_MAX                        0x000003e1
-#define P4_ESCR_MSR_TABLE_SIZE         (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
-#define P4_ESCR_MSR_IDX(msr)           (msr - P4_ESCR_MSR_BASE)
-#define P4_ESCR_MSR_TABLE_ENTRY(msr)   [P4_ESCR_MSR_IDX(msr)] = msr
-
-static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
-       P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
-};
-
-static int p4_get_escr_idx(unsigned int addr)
-{
-       unsigned int idx = P4_ESCR_MSR_IDX(addr);
-
-       if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE      ||
-                       !p4_escr_table[idx]             ||
-                       p4_escr_table[idx] != addr)) {
-               WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
-               return -1;
-       }
-
-       return idx;
-}
-
-static int p4_next_cntr(int thread, unsigned long *used_mask,
-                       struct p4_event_bind *bind)
-{
-       int i, j;
-
-       for (i = 0; i < P4_CNTR_LIMIT; i++) {
-               j = bind->cntr[thread][i];
-               if (j != -1 && !test_bit(j, used_mask))
-                       return j;
-       }
-
-       return -1;
-}
-
-static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
-       unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-       int cpu = smp_processor_id();
-       struct hw_perf_event *hwc;
-       struct p4_event_bind *bind;
-       unsigned int i, thread, num;
-       int cntr_idx, escr_idx;
-       u64 config_alias;
-       int pass;
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-       bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
-
-       for (i = 0, num = n; i < n; i++, num--) {
-
-               hwc = &cpuc->event_list[i]->hw;
-               thread = p4_ht_thread(cpu);
-               pass = 0;
-
-again:
-               /*
-                * It's possible to hit a circular lock
-                * between original and alternative events
-                * if both are scheduled already.
-                */
-               if (pass > 2)
-                       goto done;
-
-               bind = p4_config_get_bind(hwc->config);
-               escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
-               if (unlikely(escr_idx == -1))
-                       goto done;
-
-               if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
-                       cntr_idx = hwc->idx;
-                       if (assign)
-                               assign[i] = hwc->idx;
-                       goto reserve;
-               }
-
-               cntr_idx = p4_next_cntr(thread, used_mask, bind);
-               if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
-                       /*
-                        * Check whether an event alias is still available.
-                        */
-                       config_alias = p4_get_alias_event(hwc->config);
-                       if (!config_alias)
-                               goto done;
-                       hwc->config = config_alias;
-                       pass++;
-                       goto again;
-               }
-               /*
-                * Perf does test runs to see if a whole group can be assigned
-                * together succesfully.  There can be multiple rounds of this.
-                * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
-                * bits, such that the next round of group assignments will
-                * cause the above p4_should_swap_ts to pass instead of fail.
-                * This leads to counters exclusive to thread0 being used by
-                * thread1.
-                *
-                * Solve this with a cheap hack, reset the idx back to -1 to
-                * force a new lookup (p4_next_cntr) to get the right counter
-                * for the right thread.
-                *
-                * This probably doesn't comply with the general spirit of how
-                * perf wants to work, but P4 is special. :-(
-                */
-               if (p4_should_swap_ts(hwc->config, cpu))
-                       hwc->idx = -1;
-               p4_pmu_swap_config_ts(hwc, cpu);
-               if (assign)
-                       assign[i] = cntr_idx;
-reserve:
-               set_bit(cntr_idx, used_mask);
-               set_bit(escr_idx, escr_mask);
-       }
-
-done:
-       return num ? -EINVAL : 0;
-}
-
-PMU_FORMAT_ATTR(cccr, "config:0-31" );
-PMU_FORMAT_ATTR(escr, "config:32-62");
-PMU_FORMAT_ATTR(ht,   "config:63"   );
-
-static struct attribute *intel_p4_formats_attr[] = {
-       &format_attr_cccr.attr,
-       &format_attr_escr.attr,
-       &format_attr_ht.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p4_pmu = {
-       .name                   = "Netburst P4/Xeon",
-       .handle_irq             = p4_pmu_handle_irq,
-       .disable_all            = p4_pmu_disable_all,
-       .enable_all             = p4_pmu_enable_all,
-       .enable                 = p4_pmu_enable_event,
-       .disable                = p4_pmu_disable_event,
-       .eventsel               = MSR_P4_BPU_CCCR0,
-       .perfctr                = MSR_P4_BPU_PERFCTR0,
-       .event_map              = p4_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p4_general_events),
-       .get_event_constraints  = x86_get_event_constraints,
-       /*
-        * IF HT disabled we may need to use all
-        * ARCH_P4_MAX_CCCR counters simulaneously
-        * though leave it restricted at moment assuming
-        * HT is on
-        */
-       .num_counters           = ARCH_P4_MAX_CCCR,
-       .apic                   = 1,
-       .cntval_bits            = ARCH_P4_CNTRVAL_BITS,
-       .cntval_mask            = ARCH_P4_CNTRVAL_MASK,
-       .max_period             = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
-       .hw_config              = p4_hw_config,
-       .schedule_events        = p4_pmu_schedule_events,
-       /*
-        * This handles erratum N15 in intel doc 249199-029,
-        * the counter may not be updated correctly on write
-        * so we need a second write operation to do the trick
-        * (the official workaround didn't work)
-        *
-        * the former idea is taken from OProfile code
-        */
-       .perfctr_second_write   = 1,
-
-       .format_attrs           = intel_p4_formats_attr,
-};
-
-__init int p4_pmu_init(void)
-{
-       unsigned int low, high;
-       int i, reg;
-
-       /* If we get stripped -- indexing fails */
-       BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, low, high);
-       if (!(low & (1 << 7))) {
-               pr_cont("unsupported Netburst CPU model %d ",
-                       boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       pr_cont("Netburst events, ");
-
-       x86_pmu = p4_pmu;
-
-       /*
-        * Even though the counters are configured to interrupt a particular
-        * logical processor when an overflow happens, testing has shown that
-        * on kdump kernels (which uses a single cpu), thread1's counter
-        * continues to run and will report an NMI on thread0.  Due to the
-        * overflow bug, this leads to a stream of unknown NMIs.
-        *
-        * Solve this by zero'ing out the registers to mimic a reset.
-        */
-       for (i = 0; i < x86_pmu.num_counters; i++) {
-               reg = x86_pmu_config_addr(i);
-               wrmsrl_safe(reg, 0ULL);
-       }
-
-       return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
deleted file mode 100644 (file)
index 7c1a0c0..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include "perf_event.h"
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
-  [PERF_COUNT_HW_CPU_CYCLES]           = 0x0079,       /* CPU_CLK_UNHALTED */
-  [PERF_COUNT_HW_INSTRUCTIONS]         = 0x00c0,       /* INST_RETIRED     */
-  [PERF_COUNT_HW_CACHE_REFERENCES]     = 0x0f2e,       /* L2_RQSTS:M:E:S:I */
-  [PERF_COUNT_HW_CACHE_MISSES]         = 0x012e,       /* L2_RQSTS:I       */
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,       /* BR_INST_RETIRED  */
-  [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,       /* BR_MISS_PRED_RETIRED */
-  [PERF_COUNT_HW_BUS_CYCLES]           = 0x0062,       /* BUS_DRDY_CLOCKS  */
-  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2,    /* RESOURCE_STALLS  */
-
-};
-
-static const u64 __initconst p6_hw_cache_event_ids
-                               [PERF_COUNT_HW_CACHE_MAX]
-                               [PERF_COUNT_HW_CACHE_OP_MAX]
-                               [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS       */
-                [ C(RESULT_MISS)   ] = 0x0045, /* DCU_LINES_IN        */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0f29,  /* L2_LD:M:E:S:I       */
-       },
-        [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-        },
- },
- [ C(L1I ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0f28,  /* L2_IFETCH:M:E:S:I  */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(LL  ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0x0025,  /* L2_M_LINES_INM     */
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(DTLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0043,  /* DATA_MEM_REFS      */
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = 0,
-               [ C(RESULT_MISS)   ] = 0,
-       },
- },
- [ C(ITLB) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x0080,  /* IFU_IFETCH         */
-               [ C(RESULT_MISS)   ] = 0x0085,  /* ITLB_MISS          */
-       },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
- [ C(BPU ) ] = {
-       [ C(OP_READ) ] = {
-               [ C(RESULT_ACCESS) ] = 0x00c4,  /* BR_INST_RETIRED      */
-               [ C(RESULT_MISS)   ] = 0x00c5,  /* BR_MISS_PRED_RETIRED */
-        },
-       [ C(OP_WRITE) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
-       [ C(OP_PREFETCH) ] = {
-               [ C(RESULT_ACCESS) ] = -1,
-               [ C(RESULT_MISS)   ] = -1,
-       },
- },
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
-       return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT                   0x0000002EULL
-
-static struct event_constraint p6_event_constraints[] =
-{
-       INTEL_EVENT_CONSTRAINT(0xc1, 0x1),      /* FLOPS */
-       INTEL_EVENT_CONSTRAINT(0x10, 0x1),      /* FP_COMP_OPS_EXE */
-       INTEL_EVENT_CONSTRAINT(0x11, 0x2),      /* FP_ASSIST */
-       INTEL_EVENT_CONSTRAINT(0x12, 0x2),      /* MUL */
-       INTEL_EVENT_CONSTRAINT(0x13, 0x2),      /* DIV */
-       INTEL_EVENT_CONSTRAINT(0x14, 0x1),      /* CYCLES_DIV_BUSY */
-       EVENT_CONSTRAINT_END
-};
-
-static void p6_pmu_disable_all(void)
-{
-       u64 val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void p6_pmu_enable_all(int added)
-{
-       unsigned long val;
-
-       /* p6 only has one enable register */
-       rdmsrl(MSR_P6_EVNTSEL0, val);
-       val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static inline void
-p6_pmu_disable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val = P6_NOP_EVENT;
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-static void p6_pmu_enable_event(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 val;
-
-       val = hwc->config;
-
-       /*
-        * p6 only has a global event enable, set on PerfEvtSel0
-        * We "disable" events by programming P6_NOP_EVENT
-        * and we rely on p6_pmu_enable_all() being called
-        * to actually enable the events.
-        */
-
-       (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7"    );
-PMU_FORMAT_ATTR(umask, "config:8-15"   );
-PMU_FORMAT_ATTR(edge,  "config:18"     );
-PMU_FORMAT_ATTR(pc,    "config:19"     );
-PMU_FORMAT_ATTR(inv,   "config:23"     );
-PMU_FORMAT_ATTR(cmask, "config:24-31"  );
-
-static struct attribute *intel_p6_formats_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_umask.attr,
-       &format_attr_edge.attr,
-       &format_attr_pc.attr,
-       &format_attr_inv.attr,
-       &format_attr_cmask.attr,
-       NULL,
-};
-
-static __initconst const struct x86_pmu p6_pmu = {
-       .name                   = "p6",
-       .handle_irq             = x86_pmu_handle_irq,
-       .disable_all            = p6_pmu_disable_all,
-       .enable_all             = p6_pmu_enable_all,
-       .enable                 = p6_pmu_enable_event,
-       .disable                = p6_pmu_disable_event,
-       .hw_config              = x86_pmu_hw_config,
-       .schedule_events        = x86_schedule_events,
-       .eventsel               = MSR_P6_EVNTSEL0,
-       .perfctr                = MSR_P6_PERFCTR0,
-       .event_map              = p6_pmu_event_map,
-       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
-       .apic                   = 1,
-       .max_period             = (1ULL << 31) - 1,
-       .version                = 0,
-       .num_counters           = 2,
-       /*
-        * Events have 40 bits implemented. However they are designed such
-        * that bits [32-39] are sign extensions of bit 31. As such the
-        * effective width of a event for P6-like PMU is 32 bits only.
-        *
-        * See IA-32 Intel Architecture Software developer manual Vol 3B
-        */
-       .cntval_bits            = 32,
-       .cntval_mask            = (1ULL << 32) - 1,
-       .get_event_constraints  = x86_get_event_constraints,
-       .event_constraints      = p6_event_constraints,
-
-       .format_attrs           = intel_p6_formats_attr,
-       .events_sysfs_show      = intel_event_sysfs_show,
-
-};
-
-static __init void p6_pmu_rdpmc_quirk(void)
-{
-       if (boot_cpu_data.x86_mask < 9) {
-               /*
-                * PPro erratum 26; fixed in stepping 9 and above.
-                */
-               pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
-               x86_pmu.attr_rdpmc_broken = 1;
-               x86_pmu.attr_rdpmc = 0;
-       }
-}
-
-__init int p6_pmu_init(void)
-{
-       x86_pmu = p6_pmu;
-
-       switch (boot_cpu_data.x86_model) {
-       case  1: /* Pentium Pro */
-               x86_add_quirk(p6_pmu_rdpmc_quirk);
-               break;
-
-       case  3: /* Pentium II - Klamath */
-       case  5: /* Pentium II - Deschutes */
-       case  6: /* Pentium II - Mendocino */
-               break;
-
-       case  7: /* Pentium III - Katmai */
-       case  8: /* Pentium III - Coppermine */
-       case 10: /* Pentium III Xeon */
-       case 11: /* Pentium III - Tualatin */
-               break;
-
-       case  9: /* Pentium M - Banias */
-       case 13: /* Pentium M - Dothan */
-               break;
-
-       default:
-               pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
-               return -ENODEV;
-       }
-
-       memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
-               sizeof(hw_cache_event_ids));
-
-       return 0;
-}
index 819d949..f6f50c4 100644 (file)
@@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
        for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
                if (!rdrand_long(&tmp)) {
                        clear_cpu_cap(c, X86_FEATURE_RDRAND);
-                       printk_once(KERN_WARNING "rdrand: disabled\n");
+                       pr_warn_once("rdrand: disabled\n");
                        return;
                }
        }
index 4c60eaf..cd53135 100644 (file)
@@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
        c->x86_max_cores = (core_level_siblings / smp_num_siblings);
 
        if (!printed) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+               pr_info("CPU: Physical Processor ID: %d\n",
                       c->phys_proc_id);
                if (c->x86_max_cores > 1)
-                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                       pr_info("CPU: Processor Core ID: %d\n",
                               c->cpu_core_id);
                printed = 1;
        }
index 252da7a..3417856 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/msr.h>
 #include "cpu.h"
 
@@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
        if (max >= 0x80860001) {
                cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
                if (cpu_rev != 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+                       pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
                                (cpu_rev >> 24) & 0xff,
                                (cpu_rev >> 16) & 0xff,
                                (cpu_rev >> 8) & 0xff,
@@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c)
        if (max >= 0x80860002) {
                cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
                if (cpu_rev == 0x02000000) {
-                       printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+                       pr_info("CPU: Processor revision %08X, %u MHz\n",
                                new_cpu_rev, cpu_freq);
                }
-               printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+               pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
                       (cms_rev1 >> 24) & 0xff,
                       (cms_rev1 >> 16) & 0xff,
                       (cms_rev1 >> 8) & 0xff,
@@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
                      (void *)&cpu_info[56],
                      (void *)&cpu_info[60]);
                cpu_info[64] = '\0';
-               printk(KERN_INFO "CPU: %s\n", cpu_info);
+               pr_info("CPU: %s\n", cpu_info);
        }
 
        /* Unhide possibly hidden capability flags */
index 628a059..364e583 100644 (file)
@@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void)
        tsc_hz = eax | (((uint64_t)ebx) << 32);
        do_div(tsc_hz, 1000);
        BUG_ON(tsc_hz >> 32);
-       printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
+       pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
                         (unsigned long) tsc_hz / 1000,
                         (unsigned long) tsc_hz % 1000);
 
@@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void)
        if (ebx != UINT_MAX)
                x86_platform.calibrate_tsc = vmware_get_tsc_khz;
        else
-               printk(KERN_WARNING
-                      "Failed to get TSC freq from the hypervisor\n");
+               pr_warn("Failed to get TSC freq from the hypervisor\n");
 }
 
 /*
index 58f3431..9ef978d 100644 (file)
@@ -57,10 +57,9 @@ struct crash_elf_data {
        struct kimage *image;
        /*
         * Total number of ram ranges we have after various adjustments for
-        * GART, crash reserved region etc.
+        * crash reserved region, etc.
         */
        unsigned int max_nr_ranges;
-       unsigned long gart_start, gart_end;
 
        /* Pointer to elf header */
        void *ehdr;
@@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
        return 0;
 }
 
-static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
-{
-       struct crash_elf_data *ced = arg;
-
-       ced->gart_start = start;
-       ced->gart_end = end;
-
-       /* Not expecting more than 1 gart aperture */
-       return 1;
-}
-
 
 /* Gather all the required information to prepare elf headers for ram regions */
 static void fill_up_crash_elf_data(struct crash_elf_data *ced,
@@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
 
        ced->max_nr_ranges = nr_ranges;
 
-       /*
-        * We don't create ELF headers for GART aperture as an attempt
-        * to dump this memory in second kernel leads to hang/crash.
-        * If gart aperture is present, one needs to exclude that region
-        * and that could lead to need of extra phdr.
-        */
-       walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
-                               ced, get_gart_ranges_callback);
-
-       /*
-        * If we have gart region, excluding that could potentially split
-        * a memory range, resulting in extra header. Account for  that.
-        */
-       if (ced->gart_end)
-               ced->max_nr_ranges++;
-
        /* Exclusion of crash region could split memory ranges */
        ced->max_nr_ranges++;
 
@@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
                        return ret;
        }
 
-       /* Exclude GART region */
-       if (ced->gart_end) {
-               ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
-               if (ret)
-                       return ret;
-       }
-
        return ret;
 }
 
@@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
        /* Add ACPI tables */
        cmd.type = E820_ACPI;
        flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-       walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
                       memmap_entry_callback);
 
        /* Add ACPI Non-volatile Storage */
        cmd.type = E820_NVS;
-       walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+       walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
                        memmap_entry_callback);
 
        /* Add crashk_low_res region */
index 569c1e4..621b501 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
+#include <asm/cpufeature.h>
 
 /*
  * The e820 map is the map that gets modified e.g. with command line parameters
@@ -925,6 +926,41 @@ static const char *e820_type_to_string(int e820_type)
        }
 }
 
+static unsigned long e820_type_to_iomem_type(int e820_type)
+{
+       switch (e820_type) {
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+               return IORESOURCE_SYSTEM_RAM;
+       case E820_ACPI:
+       case E820_NVS:
+       case E820_UNUSABLE:
+       case E820_PRAM:
+       case E820_PMEM:
+       default:
+               return IORESOURCE_MEM;
+       }
+}
+
+static unsigned long e820_type_to_iores_desc(int e820_type)
+{
+       switch (e820_type) {
+       case E820_ACPI:
+               return IORES_DESC_ACPI_TABLES;
+       case E820_NVS:
+               return IORES_DESC_ACPI_NV_STORAGE;
+       case E820_PMEM:
+               return IORES_DESC_PERSISTENT_MEMORY;
+       case E820_PRAM:
+               return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+       case E820_RESERVED_KERN:
+       case E820_RAM:
+       case E820_UNUSABLE:
+       default:
+               return IORES_DESC_NONE;
+       }
+}
+
 static bool do_mark_busy(u32 type, struct resource *res)
 {
        /* this is the legacy bios/dos rom-shadow + mmio region */
@@ -967,7 +1003,8 @@ void __init e820_reserve_resources(void)
                res->start = e820.map[i].addr;
                res->end = end;
 
-               res->flags = IORESOURCE_MEM;
+               res->flags = e820_type_to_iomem_type(e820.map[i].type);
+               res->desc = e820_type_to_iores_desc(e820.map[i].type);
 
                /*
                 * don't register the region that could be conflicted with
index d25097c..0b1b9ab 100644 (file)
@@ -114,6 +114,10 @@ void __kernel_fpu_begin(void)
        kernel_fpu_disable();
 
        if (fpu->fpregs_active) {
+               /*
+                * Ignore return value -- we don't care if reg state
+                * is clobbered.
+                */
                copy_fpregs_to_fpstate(fpu);
        } else {
                this_cpu_write(fpu_fpregs_owner_ctx, NULL);
@@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu)
 
        preempt_disable();
        if (fpu->fpregs_active) {
-               if (!copy_fpregs_to_fpstate(fpu))
-                       fpregs_deactivate(fpu);
+               if (!copy_fpregs_to_fpstate(fpu)) {
+                       if (use_eager_fpu())
+                               copy_kernel_to_fpregs(&fpu->state);
+                       else
+                               fpregs_deactivate(fpu);
+               }
        }
        preempt_enable();
 }
@@ -223,14 +231,15 @@ void fpstate_init(union fpregs_state *state)
 }
 EXPORT_SYMBOL_GPL(fpstate_init);
 
-/*
- * Copy the current task's FPU state to a new task's FPU context.
- *
- * In both the 'eager' and the 'lazy' case we save hardware registers
- * directly to the destination buffer.
- */
-static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
+       dst_fpu->counter = 0;
+       dst_fpu->fpregs_active = 0;
+       dst_fpu->last_cpu = -1;
+
+       if (!src_fpu->fpstate_active || !cpu_has_fpu)
+               return 0;
+
        WARN_ON_FPU(src_fpu != &current->thread.fpu);
 
        /*
@@ -243,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
        /*
         * Save current FPU registers directly into the child
         * FPU context, without any memory-to-memory copying.
-        *
-        * If the FPU context got destroyed in the process (FNSAVE
-        * done on old CPUs) then copy it back into the source
-        * context and mark the current task for lazy restore.
+        * In lazy mode, if the FPU context isn't loaded into
+        * fpregs, CR0.TS will be set and do_device_not_available
+        * will load the FPU context.
         *
         * We have to do all this with preemption disabled,
         * mostly because of the FNSAVE case, because in that
@@ -259,19 +267,13 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
        preempt_disable();
        if (!copy_fpregs_to_fpstate(dst_fpu)) {
                memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
-               fpregs_deactivate(src_fpu);
+
+               if (use_eager_fpu())
+                       copy_kernel_to_fpregs(&src_fpu->state);
+               else
+                       fpregs_deactivate(src_fpu);
        }
        preempt_enable();
-}
-
-int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
-{
-       dst_fpu->counter = 0;
-       dst_fpu->fpregs_active = 0;
-       dst_fpu->last_cpu = -1;
-
-       if (src_fpu->fpstate_active && cpu_has_fpu)
-               fpu_copy(dst_fpu, src_fpu);
 
        return 0;
 }
@@ -409,8 +411,10 @@ static inline void copy_init_fpstate_to_fpregs(void)
 {
        if (use_xsave())
                copy_kernel_to_xregs(&init_fpstate.xsave, -1);
-       else
+       else if (static_cpu_has(X86_FEATURE_FXSR))
                copy_kernel_to_fxregs(&init_fpstate.fxsave);
+       else
+               copy_kernel_to_fregs(&init_fpstate.fsave);
 }
 
 /*
@@ -423,7 +427,7 @@ void fpu__clear(struct fpu *fpu)
 {
        WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
 
-       if (!use_eager_fpu()) {
+       if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) {
                /* FPU state will be reallocated lazily at the first use. */
                fpu__drop(fpu);
        } else {
index 6d9f0a7..54c86ff 100644 (file)
@@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
        cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
        write_cr0(cr0);
 
-       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-                    : "+m" (fsw), "+m" (fcw));
+       if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+               asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+                            : "+m" (fsw), "+m" (fcw));
 
-       if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-               set_cpu_cap(c, X86_FEATURE_FPU);
-       else
-               clear_cpu_cap(c, X86_FEATURE_FPU);
+               if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+                       set_cpu_cap(c, X86_FEATURE_FPU);
+               else
+                       clear_cpu_cap(c, X86_FEATURE_FPU);
+       }
 
 #ifndef CONFIG_MATH_EMULATION
        if (!cpu_has_fpu) {
@@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void)
         * Set up the legacy init FPU context. (xstate init might overwrite this
         * with a more modern format, if the CPU supports it.)
         */
-       fpstate_init_fxstate(&init_fpstate.fxsave);
+       fpstate_init(&init_fpstate);
 
        fpu__init_system_mxcsr();
 }
@@ -260,7 +262,10 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  * not only saved the restores along the way, but we also have the
  * FPU ready to be used for the original task.
  *
- * 'eager' switching is used on modern CPUs, there we switch the FPU
+ * 'lazy' is deprecated because it's almost never a performance win
+ * and it's much more complicated than 'eager'.
+ *
+ * 'eager' switching is by default on all CPUs, there we switch the FPU
  * state during every context switch, regardless of whether the task
  * has used FPU instructions in that time slice or not. This is done
  * because modern FPU context saving instructions are able to optimize
@@ -271,7 +276,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  *   to use 'eager' restores, if we detect that a task is using the FPU
  *   frequently. See the fpu->counter logic in fpu/internal.h for that. ]
  */
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
 
 /*
  * Find supported xfeatures based on cpu features and command-line input.
@@ -300,12 +305,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void)
 static void __init fpu__clear_eager_fpu_features(void)
 {
        setup_clear_cpu_cap(X86_FEATURE_MPX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX2);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-       setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 }
 
 /*
@@ -348,15 +347,9 @@ static void __init fpu__init_system_ctx_switch(void)
  */
 static void __init fpu__init_parse_early_param(void)
 {
-       /*
-        * No need to check "eagerfpu=auto" again, since it is the
-        * initial default.
-        */
        if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
                eagerfpu = DISABLE;
                fpu__clear_eager_fpu_features();
-       } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
-               eagerfpu = ENABLE;
        }
 
        if (cmdline_find_option_bool(boot_command_line, "no387"))
index d425cda..6e8354f 100644 (file)
@@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void)
        setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
        setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
        setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
+       setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
        setup_clear_cpu_cap(X86_FEATURE_MPX);
        setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
 }
index 29408d6..702547c 100644 (file)
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
 static unsigned long text_ip_addr(unsigned long ip)
 {
        /*
-        * On x86_64, kernel text mappings are mapped read-only with
-        * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
-        * of the kernel text mapping to modify the kernel text.
+        * On x86_64, kernel text mappings are mapped read-only, so we use
+        * the kernel identity mapping instead of the kernel text mapping
+        * to modify the kernel text.
         *
         * For 32bit kernels, these mappings are same and we can use
         * kernel identity mapping to modify code.
@@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { }
 #endif
 
 /* Defined as markers to the end of the ftrace default trampolines */
-extern void ftrace_caller_end(void);
 extern void ftrace_regs_caller_end(void);
-extern void ftrace_return(void);
+extern void ftrace_epilogue(void);
 extern void ftrace_caller_op_ptr(void);
 extern void ftrace_regs_caller_op_ptr(void);
 
@@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
                op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
        } else {
                start_offset = (unsigned long)ftrace_caller;
-               end_offset = (unsigned long)ftrace_caller_end;
+               end_offset = (unsigned long)ftrace_epilogue;
                op_offset = (unsigned long)ftrace_caller_op_ptr;
        }
 
@@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 
        /*
         * Allocate enough size to store the ftrace_caller code,
-        * the jmp to ftrace_return, as well as the address of
+        * the jmp to ftrace_epilogue, as well as the address of
         * the ftrace_ops this trampoline is used for.
         */
        trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
@@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 
        ip = (unsigned long)trampoline + size;
 
-       /* The trampoline ends with a jmp to ftrace_return */
-       jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return);
+       /* The trampoline ends with a jmp to ftrace_epilogue */
+       jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
        memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE);
 
        /*
index f129a9a..1f4422d 100644 (file)
@@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
 {
-       unsigned long i;
-
-       for (i = 0; i < PTRS_PER_PGD-1; i++)
-               early_level4_pgt[i].pgd = 0;
-
+       memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
        next_early_pgt = 0;
-
        write_cr3(__pa_nodebug(early_level4_pgt));
 }
 
@@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void)
 int __init early_make_pgtable(unsigned long address)
 {
        unsigned long physaddr = address - __PAGE_OFFSET;
-       unsigned long i;
        pgdval_t pgd, *pgd_p;
        pudval_t pud, *pud_p;
        pmdval_t pmd, *pmd_p;
@@ -81,8 +75,7 @@ again:
                }
 
                pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
-               for (i = 0; i < PTRS_PER_PUD; i++)
-                       pud_p[i] = 0;
+               memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
                *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
        }
        pud_p += pud_index(address);
@@ -97,8 +90,7 @@ again:
                }
 
                pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
-               for (i = 0; i < PTRS_PER_PMD; i++)
-                       pmd_p[i] = 0;
+               memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
                *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
        }
        pmd = (physaddr & PMD_MASK) + early_pmd_flags;
@@ -192,5 +184,13 @@ void __init x86_64_start_reservations(char *real_mode_data)
 
        reserve_ebda_region();
 
+       switch (boot_params.hdr.hardware_subarch) {
+       case X86_SUBARCH_INTEL_MID:
+               x86_intel_mid_early_setup();
+               break;
+       default:
+               break;
+       }
+
        start_kernel();
 }
index 6bc9ae2..54cdbd2 100644 (file)
@@ -19,7 +19,7 @@
 #include <asm/setup.h>
 #include <asm/processor-flags.h>
 #include <asm/msr-index.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/percpu.h>
 #include <asm/nops.h>
 #include <asm/bootparam.h>
@@ -389,6 +389,12 @@ default_entry:
        /* Make changes effective */
        wrmsr
 
+       /*
+        * And make sure that all the mappings we set up have NX set from
+        * the beginning.
+        */
+       orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4)
+
 enable_paging:
 
 /*
index ffdc0e8..22fbf9d 100644 (file)
@@ -38,7 +38,6 @@
 #define pud_index(x)   (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 
 L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
-L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
 L4_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
@@ -76,9 +75,7 @@ startup_64:
        subq    $_text - __START_KERNEL_map, %rbp
 
        /* Is the address not 2M aligned? */
-       movq    %rbp, %rax
-       andl    $~PMD_PAGE_MASK, %eax
-       testl   %eax, %eax
+       testl   $~PMD_PAGE_MASK, %ebp
        jnz     bad_address
 
        /*
index b8e6ff5..be0ebbb 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/pm.h>
 #include <linux/io.h>
 
+#include <asm/cpufeature.h>
 #include <asm/irqdomain.h>
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
index f8062aa..61521dc 100644 (file)
@@ -462,7 +462,7 @@ void fixup_irqs(void)
                 * non intr-remapping case, we can't wait till this interrupt
                 * arrives at this cpu before completing the irq move.
                 */
-               irq_force_complete_move(irq);
+               irq_force_complete_move(desc);
 
                if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        break_affinity = 1;
@@ -470,6 +470,15 @@ void fixup_irqs(void)
                }
 
                chip = irq_data_get_irq_chip(data);
+               /*
+                * The interrupt descriptor might have been cleaned up
+                * already, but it is not yet removed from the radix tree
+                */
+               if (!chip) {
+                       raw_spin_unlock(&desc->lock);
+                       continue;
+               }
+
                if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
                        chip->irq_mask(data);
 
index 44256a6..ed15cd4 100644 (file)
@@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
        int err;
-#ifdef CONFIG_DEBUG_RODATA
        char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
 
        bpt->type = BP_BREAKPOINT;
        err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
                return err;
        err = probe_kernel_write((char *)bpt->bpt_addr,
                                 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
        if (!err)
                return err;
        /*
@@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
        if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
                return -EINVAL;
        bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
        return err;
 }
 
 int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-#ifdef CONFIG_DEBUG_RODATA
        int err;
        char opc[BREAK_INSTR_SIZE];
 
@@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
        if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
                goto knl_write;
        return err;
+
 knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
        return probe_kernel_write((char *)bpt->bpt_addr,
                                  (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
 }
index 1deffe6..0f05dee 100644 (file)
@@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
                 * In case the user-specified fault handler returned
                 * zero, try to fix up.
                 */
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, trapnr))
                        return 1;
 
                /*
index 87e1762..ed48a9f 100644 (file)
@@ -168,12 +168,14 @@ GLOBAL(ftrace_call)
        restore_mcount_regs
 
        /*
-        * The copied trampoline must call ftrace_return as it
+        * The copied trampoline must call ftrace_epilogue as it
         * still may need to call the function graph tracer.
+        *
+        * The code up to this label is copied into trampolines so
+        * think twice before adding any new code or changing the
+        * layout here.
         */
-GLOBAL(ftrace_caller_end)
-
-GLOBAL(ftrace_return)
+GLOBAL(ftrace_epilogue)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 GLOBAL(ftrace_graph_call)
@@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call)
        popfq
 
        /*
-        * As this jmp to ftrace_return can be a short jump
+        * As this jmp to ftrace_epilogue can be a short jump
         * it must not be copied into the trampoline.
         * The trampoline will add the code to jump
         * to the return.
         */
 GLOBAL(ftrace_regs_caller_end)
 
-       jmp ftrace_return
+       jmp ftrace_epilogue
 
 END(ftrace_regs_caller)
 
index 30ca760..97340f2 100644 (file)
@@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
        processor.cpuflag = CPU_ENABLED;
        processor.cpufeature = (boot_cpu_data.x86 << 8) |
            (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-       processor.featureflag = boot_cpu_data.x86_capability[0];
+       processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
        processor.reserved[0] = 0;
        processor.reserved[1] = 0;
        for (i = 0; i < 2; i++) {
index 64f9616..7f3550a 100644 (file)
@@ -40,7 +40,7 @@
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
 
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
 #include <asm/msr.h>
 
 static struct class *msr_class;
index 8a2cdd7..04b132a 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
+#include <asm/cache.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -69,7 +70,7 @@ struct nmi_stats {
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
 
-static int ignore_nmis;
+static int ignore_nmis __read_mostly;
 
 int unknown_nmi_panic;
 /*
index 14415af..92f7014 100644 (file)
@@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data)
 
 static __init int register_e820_pmem(void)
 {
-       char *pmem = "Persistent Memory (legacy)";
        struct platform_device *pdev;
        int rc;
 
-       rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+       rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+                                IORESOURCE_MEM, 0, -1, NULL, found);
        if (rc <= 0)
                return 0;
 
index 9f7c21c..2915d54 100644 (file)
@@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
          */
        .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
+#ifdef CONFIG_X86_32
+       .SYSENTER_stack_canary  = STACK_END_MAGIC,
+#endif
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
@@ -418,9 +421,9 @@ static void mwait_idle(void)
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
-                       smp_mb(); /* quirk */
+                       mb(); /* quirk */
                }
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
index d3d80e6..aa52c10 100644 (file)
@@ -152,21 +152,21 @@ static struct resource data_resource = {
        .name   = "Kernel data",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource code_resource = {
        .name   = "Kernel code",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 static struct resource bss_resource = {
        .name   = "Kernel bss",
        .start  = 0,
        .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags  = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
 };
 
 
index cb6282c..548ddf7 100644 (file)
        regs->seg = GET_SEG(seg) | 3;                   \
 } while (0)
 
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
+#ifdef CONFIG_X86_64
+/*
+ * If regs->ss will cause an IRET fault, change it.  Otherwise leave it
+ * alone.  Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+       u32 ar;
+       asm volatile ("lar %[old_ss], %[ar]\n\t"
+                     "jz 1f\n\t"               /* If invalid: */
+                     "xorl %[ar], %[ar]\n\t"   /* set ar = 0 */
+                     "1:"
+                     : [ar] "=r" (ar)
+                     : [old_ss] "rm" ((u16)regs->ss));
+
+       /*
+        * For a valid 64-bit user context, we need DPL 3, type
+        * read-write data or read-write exp-down data, and S and P
+        * set.  We can't use VERW because VERW doesn't check the
+        * P bit.
+        */
+       ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+       if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+           ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+               regs->ss = __USER_DS;
+}
+#endif
+
+static int restore_sigcontext(struct pt_regs *regs,
+                             struct sigcontext __user *sc,
+                             unsigned long uc_flags)
 {
        unsigned long buf_val;
        void __user *buf;
@@ -94,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
                COPY(r15);
 #endif /* CONFIG_X86_64 */
 
-#ifdef CONFIG_X86_32
                COPY_SEG_CPL3(cs);
                COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
-               /* Kernel saves and restores only the CS segment register on signals,
-                * which is the bare minimum needed to allow mixed 32/64-bit code.
-                * App's signal handler can save/restore other segments if needed. */
-               COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+               /*
+                * Fix up SS if needed for the benefit of old DOSEMU and
+                * CRIU.
+                */
+               if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) &&
+                            user_64bit_mode(regs)))
+                       force_valid_ss(regs);
+#endif
 
                get_user_ex(tmpflags, &sc->flags);
                regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -165,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                put_user_ex(regs->cs, &sc->cs);
                put_user_ex(0, &sc->gs);
                put_user_ex(0, &sc->fs);
+               put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
                put_user_ex(fpstate, &sc->fpstate);
@@ -403,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
        return 0;
 }
 #else /* !CONFIG_X86_32 */
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+       unsigned long flags;
+
+       if (cpu_has_xsave)
+               flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+       else
+               flags = UC_SIGCONTEXT_SS;
+
+       if (likely(user_64bit_mode(regs)))
+               flags |= UC_STRICT_RESTORE_SS;
+
+       return flags;
+}
+
 static int __setup_rt_frame(int sig, struct ksignal *ksig,
                            sigset_t *set, struct pt_regs *regs)
 {
@@ -422,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
        put_user_try {
                /* Create the ucontext.  */
-               if (cpu_has_xsave)
-                       put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
-               else
-                       put_user_ex(0, &frame->uc.uc_flags);
+               put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
                put_user_ex(0, &frame->uc.uc_link);
                save_altstack_ex(&frame->uc.uc_stack, regs->sp);
 
@@ -459,10 +506,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
        regs->sp = (unsigned long)frame;
 
-       /* Set up the CS register to run signal handlers in 64-bit mode,
-          even if the handler happens to be interrupting 32-bit code. */
+       /*
+        * Set up the CS and SS registers to run signal handlers in
+        * 64-bit mode, even if the handler happens to be interrupting
+        * 32-bit or 16-bit code.
+        *
+        * SS is subtle.  In 64-bit mode, we don't need any particular
+        * SS descriptor, but we do need SS to be valid.  It's possible
+        * that the old SS is entirely bogus -- this can happen if the
+        * signal we're trying to deliver is #GP or #SS caused by a bad
+        * SS value.  We also have a compatbility issue here: DOSEMU
+        * relies on the contents of the SS register indicating the
+        * SS value at the time of the signal, even though that code in
+        * DOSEMU predates sigreturn's ability to restore SS.  (DOSEMU
+        * avoids relying on sigreturn to restore SS; instead it uses
+        * a trampoline.)  So we do our best: if the old SS was valid,
+        * we keep it.  Otherwise we replace it.
+        */
        regs->cs = __USER_CS;
 
+       if (unlikely(regs->ss != __USER_DS))
+               force_valid_ss(regs);
+
        return 0;
 }
 #endif /* CONFIG_X86_32 */
@@ -489,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
 
        put_user_try {
                /* Create the ucontext.  */
-               if (cpu_has_xsave)
-                       put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
-               else
-                       put_user_ex(0, &frame->uc.uc_flags);
+               put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
                put_user_ex(0, &frame->uc.uc_link);
                compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
                put_user_ex(0, &frame->uc.uc__pad0);
@@ -554,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->sc))
+       /*
+        * x86_32 has no uc_flags bits relevant to restore_sigcontext.
+        * Save a few cycles by skipping the __get_user.
+        */
+       if (restore_sigcontext(regs, &frame->sc, 0))
                goto badframe;
        return regs->ax;
 
@@ -570,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void)
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe __user *frame;
        sigset_t set;
+       unsigned long uc_flags;
 
        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
        if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
                goto badframe;
+       if (__get_user(uc_flags, &frame->uc.uc_flags))
+               goto badframe;
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                goto badframe;
 
        if (restore_altstack(&frame->uc.uc_stack))
@@ -692,12 +761,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 
 static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 {
-#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64)
+#ifdef CONFIG_X86_64
+       if (is_ia32_task())
+               return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+       return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#else
        return __NR_restart_syscall;
-#else /* !CONFIG_X86_32 && CONFIG_X86_64 */
-       return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
-               __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
-#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
+#endif
 }
 
 /*
@@ -763,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_x32 __user *frame;
        sigset_t set;
+       unsigned long uc_flags;
 
        frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
@@ -770,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
                goto badframe;
        if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
                goto badframe;
+       if (__get_user(uc_flags, &frame->uc.uc_flags))
+               goto badframe;
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                goto badframe;
 
        if (compat_restore_altstack(&frame->uc.uc_stack))
index 24d57f7..643dbdc 100644 (file)
@@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+/* Logical package management. We might want to allocate that dynamically */
+static int *physical_to_logical_pkg __read_mostly;
+static unsigned long *physical_package_map __read_mostly;;
+static unsigned long *logical_package_map  __read_mostly;
+static unsigned int max_physical_pkg_id __read_mostly;
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
        unsigned long flags;
@@ -248,7 +256,98 @@ static void notrace start_secondary(void *unused)
        x86_cpuinit.setup_percpu_clockev();
 
        wmb();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+
+int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+{
+       unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+
+       /* Called from early boot ? */
+       if (!physical_package_map)
+               return 0;
+
+       if (pkg >= max_physical_pkg_id)
+               return -EINVAL;
+
+       /* Set the logical package id */
+       if (test_and_set_bit(pkg, physical_package_map))
+               goto found;
+
+       if (pkg < __max_logical_packages) {
+               set_bit(pkg, logical_package_map);
+               physical_to_logical_pkg[pkg] = pkg;
+               goto found;
+       }
+       new = find_first_zero_bit(logical_package_map, __max_logical_packages);
+       if (new >= __max_logical_packages) {
+               physical_to_logical_pkg[pkg] = -1;
+               pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+                       apicid, pkg);
+               return -ENOSPC;
+       }
+       set_bit(new, logical_package_map);
+       pr_info("APIC(%x) Converting physical %u to logical package %u\n",
+               apicid, pkg, new);
+       physical_to_logical_pkg[pkg] = new;
+
+found:
+       cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
+       return 0;
+}
+
+/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+       if (phys_pkg >= max_physical_pkg_id)
+               return -1;
+       return physical_to_logical_pkg[phys_pkg];
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+static void __init smp_init_package_map(void)
+{
+       unsigned int ncpus, cpu;
+       size_t size;
+
+       /*
+        * Today neither Intel nor AMD support heterogenous systems. That
+        * might change in the future....
+        */
+       ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
+       __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+
+       /*
+        * Possibly larger than what we need as the number of apic ids per
+        * package can be smaller than the actual used apic ids.
+        */
+       max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
+       size = max_physical_pkg_id * sizeof(unsigned int);
+       physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
+       memset(physical_to_logical_pkg, 0xff, size);
+       size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
+       physical_package_map = kzalloc(size, GFP_KERNEL);
+       size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
+       logical_package_map = kzalloc(size, GFP_KERNEL);
+
+       pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+       for_each_present_cpu(cpu) {
+               unsigned int apicid = apic->cpu_present_to_apicid(cpu);
+
+               if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
+                       continue;
+               if (!topology_update_package_map(apicid, cpu))
+                       continue;
+               pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
+               per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
+               set_cpu_possible(cpu, false);
+               set_cpu_present(cpu, false);
+       }
 }
 
 void __init smp_store_boot_cpu_info(void)
@@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void)
 
        *c = boot_cpu_data;
        c->cpu_index = id;
+       smp_init_package_map();
 }
 
 /*
index 3f92ce0..27538f1 100644 (file)
@@ -142,7 +142,6 @@ static int test_NX(void)
         * by the error message
         */
 
-#ifdef CONFIG_DEBUG_RODATA
        /* Test 3: Check if the .rodata section is executable */
        if (rodata_test_data != 0xC3) {
                printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
                printk(KERN_ERR "test_nx: .rodata section is executable\n");
                ret = -ENODEV;
        }
-#endif
 
 #if 0
        /* Test 4: Check if the .data section of a module is executable */
index 5ecbfe5..cb4a01b 100644 (file)
@@ -76,5 +76,5 @@ int rodata_test(void)
 }
 
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
index ade185a..06cbe25 100644 (file)
@@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 EXPORT_SYMBOL_GPL(used_vectors);
 
-static inline void conditional_sti(struct pt_regs *regs)
+static inline void cond_local_irq_enable(struct pt_regs *regs)
 {
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
 }
 
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
-       preempt_count_inc();
-       if (regs->flags & X86_EFLAGS_IF)
-               local_irq_enable();
-}
-
-static inline void conditional_cli(struct pt_regs *regs)
-{
-       if (regs->flags & X86_EFLAGS_IF)
-               local_irq_disable();
-}
-
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+static inline void cond_local_irq_disable(struct pt_regs *regs)
 {
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_disable();
-       preempt_count_dec();
 }
 
 void ist_enter(struct pt_regs *regs)
@@ -199,7 +185,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
        }
 
        if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                        tsk->thread.error_code = error_code;
                        tsk->thread.trap_nr = trapnr;
                        die(str, regs, error_code);
@@ -262,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = trapnr;
 
-#ifdef CONFIG_X86_64
        if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
            printk_ratelimit()) {
                pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
@@ -271,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
                print_vma_addr(" in ", regs->ip);
                pr_cont("\n");
        }
-#endif
 
        force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
 }
@@ -286,7 +270,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
                        NOTIFY_STOP) {
-               conditional_sti(regs);
+               cond_local_irq_enable(regs);
                do_trap(trapnr, signr, str, regs, error_code,
                        fill_trap_info(regs, signr, trapnr, &info));
        }
@@ -368,7 +352,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
        if (notify_die(DIE_TRAP, "bounds", regs, error_code,
                        X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
                return;
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
 
        if (!user_mode(regs))
                die("bounds", regs, error_code);
@@ -443,7 +427,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
        struct task_struct *tsk;
 
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
 
        if (v8086_mode(regs)) {
                local_irq_enable();
@@ -453,7 +437,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
        tsk = current;
        if (!user_mode(regs)) {
-               if (fixup_exception(regs))
+               if (fixup_exception(regs, X86_TRAP_GP))
                        return;
 
                tsk->thread.error_code = error_code;
@@ -517,9 +501,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
         * as we may switch to the interrupt stack.
         */
        debug_stack_usage_inc();
-       preempt_conditional_sti(regs);
+       preempt_disable();
+       cond_local_irq_enable(regs);
        do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
-       preempt_conditional_cli(regs);
+       cond_local_irq_disable(regs);
+       preempt_enable_no_resched();
        debug_stack_usage_dec();
 exit:
        ist_exit(regs);
@@ -571,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 NOKPROBE_SYMBOL(fixup_bad_iret);
 #endif
 
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+       /*
+        * We don't try for precision here.  If we're anywhere in the region of
+        * code that can be single-stepped in the SYSENTER entry path, then
+        * assume that this is a useless single-step trap due to SYSENTER
+        * being invoked with TF set.  (We don't know in advance exactly
+        * which instructions will be hit because BTF could plausibly
+        * be set.)
+        */
+#ifdef CONFIG_X86_32
+       return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+               (unsigned long)__end_SYSENTER_singlestep_region -
+               (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+       return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+               (unsigned long)__end_entry_SYSENTER_compat -
+               (unsigned long)entry_SYSENTER_compat;
+#else
+       return false;
+#endif
+}
+
 /*
  * Our handling of the processor debug registers is non-trivial.
  * We do not clear them on entry and exit from the kernel. Therefore
@@ -605,10 +614,41 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        ist_enter(regs);
 
        get_debugreg(dr6, 6);
+       /*
+        * The Intel SDM says:
+        *
+        *   Certain debug exceptions may clear bits 0-3. The remaining
+        *   contents of the DR6 register are never cleared by the
+        *   processor. To avoid confusion in identifying debug
+        *   exceptions, debug handlers should clear the register before
+        *   returning to the interrupted task.
+        *
+        * Keep it simple: clear DR6 immediately.
+        */
+       set_debugreg(0, 6);
 
        /* Filter out all the reserved bits which are preset to 1 */
        dr6 &= ~DR6_RESERVED;
 
+       /*
+        * The SDM says "The processor clears the BTF flag when it
+        * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
+        * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+        */
+       clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+       if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
+                    is_sysenter_singlestep(regs))) {
+               dr6 &= ~DR_STEP;
+               if (!dr6)
+                       goto exit;
+               /*
+                * else we might have gotten a single-step trap and hit a
+                * watchpoint at the same time, in which case we should fall
+                * through and handle the watchpoint.
+                */
+       }
+
        /*
         * If dr6 has no reason to give us about the origin of this trap,
         * then it's very likely the result of an icebp/int01 trap.
@@ -617,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        if (!dr6 && user_mode(regs))
                user_icebp = 1;
 
-       /* Catch kmemcheck conditions first of all! */
+       /* Catch kmemcheck conditions! */
        if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
                goto exit;
 
-       /* DR6 may or may not be cleared by the CPU */
-       set_debugreg(0, 6);
-
-       /*
-        * The processor cleared BTF, so don't mark that we need it set.
-        */
-       clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
-
        /* Store the virtualized DR6 value */
        tsk->thread.debugreg6 = dr6;
 
@@ -648,24 +680,25 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        debug_stack_usage_inc();
 
        /* It's safe to allow irq's after DR6 has been saved */
-       preempt_conditional_sti(regs);
+       preempt_disable();
+       cond_local_irq_enable(regs);
 
        if (v8086_mode(regs)) {
                handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
                                        X86_TRAP_DB);
-               preempt_conditional_cli(regs);
+               cond_local_irq_disable(regs);
+               preempt_enable_no_resched();
                debug_stack_usage_dec();
                goto exit;
        }
 
-       /*
-        * Single-stepping through system calls: ignore any exceptions in
-        * kernel space, but re-enable TF when returning to user mode.
-        *
-        * We already checked v86 mode above, so we can check for kernel mode
-        * by just checking the CPL of CS.
-        */
-       if ((dr6 & DR_STEP) && !user_mode(regs)) {
+       if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
+               /*
+                * Historical junk that used to handle SYSENTER single-stepping.
+                * This should be unreachable now.  If we survive for a while
+                * without anyone hitting this warning, we'll turn this into
+                * an oops.
+                */
                tsk->thread.debugreg6 &= ~DR_STEP;
                set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
                regs->flags &= ~X86_EFLAGS_TF;
@@ -673,10 +706,19 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        si_code = get_si_code(tsk->thread.debugreg6);
        if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
                send_sigtrap(tsk, regs, error_code, si_code);
-       preempt_conditional_cli(regs);
+       cond_local_irq_disable(regs);
+       preempt_enable_no_resched();
        debug_stack_usage_dec();
 
 exit:
+#if defined(CONFIG_X86_32)
+       /*
+        * This is the most likely code path that involves non-trivial use
+        * of the SYSENTER stack.  Check that we haven't overrun it.
+        */
+       WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
+            "Overran or corrupted SYSENTER stack\n");
+#endif
        ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
@@ -696,10 +738,10 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
                return;
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
 
        if (!user_mode(regs)) {
-               if (!fixup_exception(regs)) {
+               if (!fixup_exception(regs, trapnr)) {
                        task->thread.error_code = error_code;
                        task->thread.trap_nr = trapnr;
                        die(str, regs, error_code);
@@ -743,20 +785,19 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 {
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
 }
 
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
-       BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
-       if (read_cr0() & X86_CR0_EM) {
+       if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
                struct math_emu_info info = { };
 
-               conditional_sti(regs);
+               cond_local_irq_enable(regs);
 
                info.regs = regs;
                math_emulate(&info);
@@ -765,7 +806,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #endif
        fpu__restore(&current->thread.fpu); /* interrupts still off */
 #ifdef CONFIG_X86_32
-       conditional_sti(regs);
+       cond_local_irq_enable(regs);
 #endif
 }
 NOKPROBE_SYMBOL(do_device_not_available);
@@ -868,7 +909,7 @@ void __init trap_init(void)
 #endif
 
 #ifdef CONFIG_X86_32
-       set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+       set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
index 3d743da..5638044 100644 (file)
@@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
 int tsc_clocksource_reliable;
 
+static u32 art_to_tsc_numerator;
+static u32 art_to_tsc_denominator;
+static u64 art_to_tsc_offset;
+struct clocksource *art_related_clocksource;
+
 /*
  * Use a ring-buffer like data structure, where a writer advances the head by
  * writing a new data entry and a reader advances the tail when it observes a
@@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc);
 
 #endif /* CONFIG_CPU_FREQ */
 
+#define ART_CPUID_LEAF (0x15)
+#define ART_MIN_DENOMINATOR (1)
+
+
+/*
+ * If ART is present detect the numerator:denominator to convert to TSC
+ */
+static void detect_art(void)
+{
+       unsigned int unused[2];
+
+       if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+               return;
+
+       cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+             &art_to_tsc_numerator, unused, unused+1);
+
+       /* Don't enable ART in a VM, non-stop TSC required */
+       if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
+           !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+           art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+               return;
+
+       if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
+               return;
+
+       /* Make this sticky over multiple CPU init calls */
+       setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
 /* clocksource code */
 
 static struct clocksource clocksource_tsc;
@@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void)
        return 0;
 }
 
+/*
+ * Convert ART to TSC given numerator/denominator found in detect_art()
+ */
+struct system_counterval_t convert_art_to_tsc(cycle_t art)
+{
+       u64 tmp, res, rem;
+
+       rem = do_div(art, art_to_tsc_denominator);
+
+       res = art * art_to_tsc_numerator;
+       tmp = rem * art_to_tsc_numerator;
+
+       do_div(tmp, art_to_tsc_denominator);
+       res += tmp + art_to_tsc_offset;
+
+       return (struct system_counterval_t) {.cs = art_related_clocksource,
+                       .cycles = res};
+}
+EXPORT_SYMBOL(convert_art_to_tsc);
 
 static void tsc_refine_calibration_work(struct work_struct *work);
 static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
@@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)
                (unsigned long)tsc_khz % 1000);
 
 out:
+       if (boot_cpu_has(X86_FEATURE_ART))
+               art_related_clocksource = &clocksource_tsc;
        clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
 
@@ -1235,6 +1292,8 @@ void __init tsc_init(void)
                mark_tsc_unstable("TSCs unsynchronized");
 
        check_system_tsc_reliable();
+
+       detect_art();
 }
 
 #ifdef CONFIG_SMP
@@ -1246,14 +1305,14 @@ void __init tsc_init(void)
  */
 unsigned long calibrate_delay_is_known(void)
 {
-       int i, cpu = smp_processor_id();
+       int sibling, cpu = smp_processor_id();
 
        if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
                return 0;
 
-       for_each_online_cpu(i)
-               if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
-                       return cpu_data(i).loops_per_jiffy;
+       sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+       if (sibling < nr_cpu_ids)
+               return cpu_data(sibling).loops_per_jiffy;
        return 0;
 }
 #endif
index 07efb35..014ea59 100644 (file)
@@ -30,7 +30,7 @@
  *     appropriately. Either display a message or halt.
  */
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/msr-index.h>
 
 verify_cpu:
index e574b85..3dce1ca 100644 (file)
@@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
        /* make room for real-mode segments */
        tsk->thread.sp0 += 16;
 
-       if (static_cpu_has_safe(X86_FEATURE_SEP))
+       if (static_cpu_has(X86_FEATURE_SEP))
                tsk->thread.sysenter_cs = 0;
 
        load_sp0(tss, &tsk->thread);
index 74e4bf1..5af9958 100644 (file)
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
 /*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
  *
  * However, kernel identity mappings will have different RWX permissions
  * to the pages mapping to text and to the pages padding (which are freed) the
  * text section. Hence kernel identity mappings will be broken to smaller
  * pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
  */
-#define X64_ALIGN_DEBUG_RODATA_BEGIN   . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
 
-#define X64_ALIGN_DEBUG_RODATA_END                             \
+#define X64_ALIGN_RODATA_END                                   \
                . = ALIGN(HPAGE_SIZE);                          \
                __end_rodata_hpage_align = .;
 
 #else
 
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
 
 #endif
 
@@ -112,13 +111,11 @@ SECTIONS
 
        EXCEPTION_TABLE(16) :text = 0x9090
 
-#if defined(CONFIG_DEBUG_RODATA)
        /* .text should occupy whole number of pages */
        . = ALIGN(PAGE_SIZE);
-#endif
-       X64_ALIGN_DEBUG_RODATA_BEGIN
+       X64_ALIGN_RODATA_BEGIN
        RO_DATA(PAGE_SIZE)
-       X64_ALIGN_DEBUG_RODATA_END
+       X64_ALIGN_RODATA_END
 
        /* Data */
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -195,6 +192,17 @@ SECTIONS
        :init
 #endif
 
+       /*
+        * Section for code used exclusively before alternatives are run. All
+        * references to such code must be patched out by alternatives, normally
+        * by using X86_FEATURE_ALWAYS CPU feature bit.
+        *
+        * See static_cpu_has() for an example.
+        */
+       .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+               *(.altinstr_aux)
+       }
+
        INIT_DATA_SECTION(16)
 
        .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
index a0695be..cd05942 100644 (file)
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
 
+EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
index 1505587..b9b09fe 100644 (file)
@@ -650,10 +650,10 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        u16 sel;
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
-       *linear = la;
        *max_size = 0;
        switch (mode) {
        case X86EMUL_MODE_PROT64:
+               *linear = la;
                if (is_noncanonical_address(la))
                        goto bad;
 
@@ -662,6 +662,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
                        goto bad;
                break;
        default:
+               *linear = la = (u32)la;
                usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
                                                addr.seg);
                if (!usable)
@@ -689,7 +690,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
                        if (size > *max_size)
                                goto bad;
                }
-               la &= (u32)-1;
                break;
        }
        if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
index 36591fa..3a045f3 100644 (file)
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 static void apic_timer_expired(struct kvm_lapic *apic)
 {
        struct kvm_vcpu *vcpu = apic->vcpu;
-       wait_queue_head_t *q = &vcpu->wq;
+       struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
 
        if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
 
-       if (waitqueue_active(q))
-               wake_up_interruptible(q);
+       if (swait_active(q))
+               swake_up(q);
 
        if (apic_lvtt_tscdeadline(apic))
                ktimer->expired_tscdeadline = ktimer->tscdeadline;
index 95a955d..1e7a49b 100644 (file)
@@ -3721,13 +3721,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
+       bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+
        /*
         * Passing "true" to the last argument is okay; it adds a check
         * on bit 8 of the SPTEs which KVM doesn't use anyway.
         */
        __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
                                boot_cpu_data.x86_phys_bits,
-                               context->shadow_root_level, context->nx,
+                               context->shadow_root_level, uses_nx,
                                guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
                                true);
 }
index 6c9fed9..2ce4f05 100644 (file)
@@ -249,7 +249,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
                        return ret;
 
                kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
-               walker->ptes[level] = pte;
+               walker->ptes[level - 1] = pte;
        }
        return 0;
 }
index e2951b6..9bd8f44 100644 (file)
@@ -596,6 +596,8 @@ struct vcpu_vmx {
        /* Support for PML */
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
+
+       u64 current_tsc_ratio;
 };
 
 enum segment_cache_field {
@@ -1811,6 +1813,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        return;
                }
                break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
        for (i = 0; i < m->nr; ++i)
@@ -1848,26 +1857,31 @@ static void reload_tss(void)
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 {
-       u64 guest_efer;
-       u64 ignore_bits;
+       u64 guest_efer = vmx->vcpu.arch.efer;
+       u64 ignore_bits = 0;
 
-       guest_efer = vmx->vcpu.arch.efer;
+       if (!enable_ept) {
+               /*
+                * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
+                * host CPUID is more efficient than testing guest CPUID
+                * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
+                */
+               if (boot_cpu_has(X86_FEATURE_SMEP))
+                       guest_efer |= EFER_NX;
+               else if (!(guest_efer & EFER_NX))
+                       ignore_bits |= EFER_NX;
+       }
 
        /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
-        * outside long mode
+        * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
-       ignore_bits = EFER_NX | EFER_SCE;
+       ignore_bits |= EFER_SCE;
 #ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
 #endif
-       guest_efer &= ~ignore_bits;
-       guest_efer |= host_efer & ignore_bits;
-       vmx->guest_msrs[efer_offset].data = guest_efer;
-       vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 
        clear_atomic_switch_msr(vmx, MSR_EFER);
 
@@ -1878,16 +1892,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
         */
        if (cpu_has_load_ia32_efer ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
-               guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, host_efer);
                return false;
-       }
+       } else {
+               guest_efer &= ~ignore_bits;
+               guest_efer |= host_efer & ignore_bits;
 
-       return true;
+               vmx->guest_msrs[efer_offset].data = guest_efer;
+               vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+               return true;
+       }
 }
 
 static unsigned long segment_base(u16 selector)
@@ -2127,14 +2146,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 
-               /* Setup TSC multiplier */
-               if (cpu_has_vmx_tsc_scaling())
-                       vmcs_write64(TSC_MULTIPLIER,
-                                    vcpu->arch.tsc_scaling_ratio);
-
                vmx->loaded_vmcs->cpu = cpu;
        }
 
+       /* Setup TSC multiplier */
+       if (kvm_has_tsc_control &&
+           vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
+               vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+               vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+       }
+
        vmx_vcpu_pi_load(vcpu, cpu);
 }
 
index 4244c2b..eaf6ee8 100644 (file)
@@ -6618,12 +6618,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
-               int i;
-
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
-               for (i = 0; i < KVM_NR_DB_REGS; i++)
-                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+               kvm_update_dr0123(vcpu);
+               kvm_update_dr6(vcpu);
+               kvm_update_dr7(vcpu);
+               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        /*
index 4ba229a..fd57d3a 100644 (file)
@@ -1520,12 +1520,6 @@ __init void lguest_init(void)
         */
        reserve_top_address(lguest_data.reserve_mem);
 
-       /*
-        * If we don't initialize the lock dependency checker now, it crashes
-        * atomic_notifier_chain_register, then paravirt_disable_iospace.
-        */
-       lockdep_init();
-
        /* Hook in our special panic hypercall code. */
        atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
@@ -1535,7 +1529,7 @@ __init void lguest_init(void)
         */
        cpu_detect(&new_cpu_data);
        /* head.S usually sets up the first capability word, so do it here. */
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
 
        /* Math is always hard! */
        set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
index a2fe51b..65be7cf 100644 (file)
@@ -1,5 +1,5 @@
 #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 /*
index 422db00..5cc78bf 100644 (file)
@@ -21,12 +21,16 @@ static inline int myisspace(u8 c)
  * @option: option string to look for
  *
  * Returns the position of that @option (starts counting with 1)
- * or 0 on not found.
+ * or 0 on not found.  @option will only be found if it is found
+ * as an entire word in @cmdline.  For instance, if @option="car"
+ * then a cmdline which contains "cart" will not match.
  */
-int cmdline_find_option_bool(const char *cmdline, const char *option)
+static int
+__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+                          const char *option)
 {
        char c;
-       int len, pos = 0, wstart = 0;
+       int pos = 0, wstart = 0;
        const char *opptr = NULL;
        enum {
                st_wordstart = 0,       /* Start of word/after whitespace */
@@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
        if (!cmdline)
                return -1;      /* No command line */
 
-       len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
-       if (!len)
-               return 0;
-
-       while (len--) {
+       /*
+        * This 'pos' check ensures we do not overrun
+        * a non-NULL-terminated 'cmdline'
+        */
+       while (pos < max_cmdline_size) {
                c = *(char *)cmdline++;
                pos++;
 
@@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
                        /* fall through */
 
                case st_wordcmp:
-                       if (!*opptr)
+                       if (!*opptr) {
+                               /*
+                                * We matched all the way to the end of the
+                                * option we were looking for.  If the
+                                * command-line has a space _or_ ends, then
+                                * we matched!
+                                */
                                if (!c || myisspace(c))
                                        return wstart;
-                               else
-                                       state = st_wordskip;
-                       else if (!c)
+                               /*
+                                * We hit the end of the option, but _not_
+                                * the end of a word on the cmdline.  Not
+                                * a match.
+                                */
+                       } else if (!c) {
+                               /*
+                                * Hit the NULL terminator on the end of
+                                * cmdline.
+                                */
                                return 0;
-                       else if (c != *opptr++)
-                               state = st_wordskip;
-                       else if (!len)          /* last word and is matching */
-                               return wstart;
-                       break;
+                       } else if (c == *opptr++) {
+                               /*
+                                * We are currently matching, so continue
+                                * to the next character on the cmdline.
+                                */
+                               break;
+                       }
+                       state = st_wordskip;
+                       /* fall through */
 
                case st_wordskip:
                        if (!c)
@@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
 
        return 0;       /* Buffer overrun */
 }
+
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+       return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+}
index 009f982..24ef1c2 100644 (file)
@@ -1,7 +1,7 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 
 #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 /*
index 982ce34..2b0ef26 100644 (file)
@@ -10,7 +10,7 @@
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
@@ -232,17 +232,31 @@ ENDPROC(copy_user_enhanced_fast_string)
 
 /*
  * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
+ * This will force destination out of cache for more performance.
+ *
+ * Note: Cached memory copy is used when destination or size is not
+ * naturally aligned. That is:
+ *  - Require 8-byte alignment when size is 8 bytes or larger.
+ *  - Require 4-byte alignment when size is 4 bytes.
  */
 ENTRY(__copy_user_nocache)
        ASM_STAC
+
+       /* If size is less than 8 bytes, go to 4-byte copy */
        cmpl $8,%edx
-       jb 20f          /* less then 8 bytes, go to byte copy loop */
+       jb .L_4b_nocache_copy_entry
+
+       /* If destination is not 8-byte aligned, "cache" copy to align it */
        ALIGN_DESTINATION
+
+       /* Set 4x8-byte copy count and remainder */
        movl %edx,%ecx
        andl $63,%edx
        shrl $6,%ecx
-       jz 17f
+       jz .L_8b_nocache_copy_entry     /* jump if count is 0 */
+
+       /* Perform 4x8-byte nocache loop-copy */
+.L_4x8b_nocache_copy_loop:
 1:     movq (%rsi),%r8
 2:     movq 1*8(%rsi),%r9
 3:     movq 2*8(%rsi),%r10
@@ -262,60 +276,106 @@ ENTRY(__copy_user_nocache)
        leaq 64(%rsi),%rsi
        leaq 64(%rdi),%rdi
        decl %ecx
-       jnz 1b
-17:    movl %edx,%ecx
+       jnz .L_4x8b_nocache_copy_loop
+
+       /* Set 8-byte copy count and remainder */
+.L_8b_nocache_copy_entry:
+       movl %edx,%ecx
        andl $7,%edx
        shrl $3,%ecx
-       jz 20f
-18:    movq (%rsi),%r8
-19:    movnti %r8,(%rdi)
+       jz .L_4b_nocache_copy_entry     /* jump if count is 0 */
+
+       /* Perform 8-byte nocache loop-copy */
+.L_8b_nocache_copy_loop:
+20:    movq (%rsi),%r8
+21:    movnti %r8,(%rdi)
        leaq 8(%rsi),%rsi
        leaq 8(%rdi),%rdi
        decl %ecx
-       jnz 18b
-20:    andl %edx,%edx
-       jz 23f
+       jnz .L_8b_nocache_copy_loop
+
+       /* If no byte left, we're done */
+.L_4b_nocache_copy_entry:
+       andl %edx,%edx
+       jz .L_finish_copy
+
+       /* If destination is not 4-byte aligned, go to byte copy: */
+       movl %edi,%ecx
+       andl $3,%ecx
+       jnz .L_1b_cache_copy_entry
+
+       /* Set 4-byte copy count (1 or 0) and remainder */
        movl %edx,%ecx
-21:    movb (%rsi),%al
-22:    movb %al,(%rdi)
+       andl $3,%edx
+       shrl $2,%ecx
+       jz .L_1b_cache_copy_entry       /* jump if count is 0 */
+
+       /* Perform 4-byte nocache copy: */
+30:    movl (%rsi),%r8d
+31:    movnti %r8d,(%rdi)
+       leaq 4(%rsi),%rsi
+       leaq 4(%rdi),%rdi
+
+       /* If no bytes left, we're done: */
+       andl %edx,%edx
+       jz .L_finish_copy
+
+       /* Perform byte "cache" loop-copy for the remainder */
+.L_1b_cache_copy_entry:
+       movl %edx,%ecx
+.L_1b_cache_copy_loop:
+40:    movb (%rsi),%al
+41:    movb %al,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
-       jnz 21b
-23:    xorl %eax,%eax
+       jnz .L_1b_cache_copy_loop
+
+       /* Finished copying; fence the prior stores */
+.L_finish_copy:
+       xorl %eax,%eax
        ASM_CLAC
        sfence
        ret
 
        .section .fixup,"ax"
-30:    shll $6,%ecx
+.L_fixup_4x8b_copy:
+       shll $6,%ecx
        addl %ecx,%edx
-       jmp 60f
-40:    lea (%rdx,%rcx,8),%rdx
-       jmp 60f
-50:    movl %ecx,%edx
-60:    sfence
+       jmp .L_fixup_handle_tail
+.L_fixup_8b_copy:
+       lea (%rdx,%rcx,8),%rdx
+       jmp .L_fixup_handle_tail
+.L_fixup_4b_copy:
+       lea (%rdx,%rcx,4),%rdx
+       jmp .L_fixup_handle_tail
+.L_fixup_1b_copy:
+       movl %ecx,%edx
+.L_fixup_handle_tail:
+       sfence
        jmp copy_user_handle_tail
        .previous
 
-       _ASM_EXTABLE(1b,30b)
-       _ASM_EXTABLE(2b,30b)
-       _ASM_EXTABLE(3b,30b)
-       _ASM_EXTABLE(4b,30b)
-       _ASM_EXTABLE(5b,30b)
-       _ASM_EXTABLE(6b,30b)
-       _ASM_EXTABLE(7b,30b)
-       _ASM_EXTABLE(8b,30b)
-       _ASM_EXTABLE(9b,30b)
-       _ASM_EXTABLE(10b,30b)
-       _ASM_EXTABLE(11b,30b)
-       _ASM_EXTABLE(12b,30b)
-       _ASM_EXTABLE(13b,30b)
-       _ASM_EXTABLE(14b,30b)
-       _ASM_EXTABLE(15b,30b)
-       _ASM_EXTABLE(16b,30b)
-       _ASM_EXTABLE(18b,40b)
-       _ASM_EXTABLE(19b,40b)
-       _ASM_EXTABLE(21b,50b)
-       _ASM_EXTABLE(22b,50b)
+       _ASM_EXTABLE(1b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(2b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(3b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(4b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(5b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(6b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(7b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(8b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(9b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(10b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(11b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(12b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(13b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(14b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(15b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(16b,.L_fixup_4x8b_copy)
+       _ASM_EXTABLE(20b,.L_fixup_8b_copy)
+       _ASM_EXTABLE(21b,.L_fixup_8b_copy)
+       _ASM_EXTABLE(30b,.L_fixup_4b_copy)
+       _ASM_EXTABLE(31b,.L_fixup_4b_copy)
+       _ASM_EXTABLE(40b,.L_fixup_1b_copy)
+       _ASM_EXTABLE(41b,.L_fixup_1b_copy)
 ENDPROC(__copy_user_nocache)
index e912b2f..2f07c29 100644 (file)
@@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops)
                 * Use cpu_tss as a cacheline-aligned, seldomly
                 * accessed per-cpu variable as the monitor target.
                 */
-               __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
 
                /*
                 * AMD, like Intel, supports the EAX hint and EAX=0xf
index 16698bb..cbb8ee5 100644 (file)
@@ -1,7 +1,7 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 /*
@@ -177,3 +177,120 @@ ENTRY(memcpy_orig)
 .Lend:
        retq
 ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(memcpy_mcsafe)
+       cmpl $8, %edx
+       /* Less than 8 bytes? Go to byte copy loop */
+       jb .L_no_whole_words
+
+       /* Check for bad alignment of source */
+       testl $7, %esi
+       /* Already aligned */
+       jz .L_8byte_aligned
+
+       /* Copy one byte at a time until source is 8-byte aligned */
+       movl %esi, %ecx
+       andl $7, %ecx
+       subl $8, %ecx
+       negl %ecx
+       subl %ecx, %edx
+.L_copy_leading_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_leading_bytes
+
+.L_8byte_aligned:
+       /* Figure out how many whole cache lines (64-bytes) to copy */
+       movl %edx, %ecx
+       andl $63, %edx
+       shrl $6, %ecx
+       jz .L_no_whole_cache_lines
+
+       /* Loop copying whole cache lines */
+.L_cache_w0: movq (%rsi), %r8
+.L_cache_w1: movq 1*8(%rsi), %r9
+.L_cache_w2: movq 2*8(%rsi), %r10
+.L_cache_w3: movq 3*8(%rsi), %r11
+       movq %r8, (%rdi)
+       movq %r9, 1*8(%rdi)
+       movq %r10, 2*8(%rdi)
+       movq %r11, 3*8(%rdi)
+.L_cache_w4: movq 4*8(%rsi), %r8
+.L_cache_w5: movq 5*8(%rsi), %r9
+.L_cache_w6: movq 6*8(%rsi), %r10
+.L_cache_w7: movq 7*8(%rsi), %r11
+       movq %r8, 4*8(%rdi)
+       movq %r9, 5*8(%rdi)
+       movq %r10, 6*8(%rdi)
+       movq %r11, 7*8(%rdi)
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
+       decl %ecx
+       jnz .L_cache_w0
+
+       /* Are there any trailing 8-byte words? */
+.L_no_whole_cache_lines:
+       movl %edx, %ecx
+       andl $7, %edx
+       shrl $3, %ecx
+       jz .L_no_whole_words
+
+       /* Copy trailing words */
+.L_copy_trailing_words:
+       movq (%rsi), %r8
+       mov %r8, (%rdi)
+       leaq 8(%rsi), %rsi
+       leaq 8(%rdi), %rdi
+       decl %ecx
+       jnz .L_copy_trailing_words
+
+       /* Any trailing bytes? */
+.L_no_whole_words:
+       andl %edx, %edx
+       jz .L_done_memcpy_trap
+
+       /* Copy trailing bytes */
+       movl %edx, %ecx
+.L_copy_trailing_bytes:
+       movb (%rsi), %al
+       movb %al, (%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .L_copy_trailing_bytes
+
+       /* Copy successful. Return true */
+.L_done_memcpy_trap:
+       xorq %rax, %rax
+       ret
+ENDPROC(memcpy_mcsafe)
+
+       .section .fixup, "ax"
+       /* Return false for any failure */
+.L_memcpy_mcsafe_fail:
+       mov     $1, %rax
+       ret
+
+       .previous
+
+       _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+#endif
index ca2afdd..90ce01b 100644 (file)
@@ -6,7 +6,7 @@
  *     - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
 #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 #undef memmove
index 2661fad..c9c8122 100644 (file)
@@ -1,7 +1,7 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
 
 #include <linux/linkage.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 .weak memset
index 4a6f1d9..99bfb19 100644 (file)
@@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 #define pgd_none(a)  pud_none(__pud(pgd_val(a)))
 #endif
 
-#ifdef CONFIG_X86_64
 static inline bool is_hypervisor_range(int idx)
 {
+#ifdef CONFIG_X86_64
        /*
         * ffff800000000000 - ffff87ffffffffff is reserved for
         * the hypervisor.
         */
-       return paravirt_enabled() &&
-               (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
-               (idx < pgd_index(__PAGE_OFFSET));
-}
+       return  (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+               (idx <  pgd_index(__PAGE_OFFSET));
 #else
-static inline bool is_hypervisor_range(int idx) { return false; }
+       return false;
 #endif
+}
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
                                       bool checkwx)
index 903ec1e..9dd7e4b 100644 (file)
@@ -3,6 +3,9 @@
 #include <linux/sort.h>
 #include <asm/uaccess.h>
 
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+                           struct pt_regs *, int);
+
 static inline unsigned long
 ex_insn_addr(const struct exception_table_entry *x)
 {
@@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x)
 {
        return (unsigned long)&x->fixup + x->fixup;
 }
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+       return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
 
-int fixup_exception(struct pt_regs *regs)
+bool ex_handler_default(const struct exception_table_entry *fixup,
+                      struct pt_regs *regs, int trapnr)
 {
-       const struct exception_table_entry *fixup;
-       unsigned long new_ip;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+bool ex_handler_fault(const struct exception_table_entry *fixup,
+                    struct pt_regs *regs, int trapnr)
+{
+       regs->ip = ex_fixup_addr(fixup);
+       regs->ax = trapnr;
+       return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fault);
+
+bool ex_handler_ext(const struct exception_table_entry *fixup,
+                  struct pt_regs *regs, int trapnr)
+{
+       /* Special hack for uaccess_err */
+       current_thread_info()->uaccess_err = 1;
+       regs->ip = ex_fixup_addr(fixup);
+       return true;
+}
+EXPORT_SYMBOL(ex_handler_ext);
+
+bool ex_has_fault_handler(unsigned long ip)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
+
+       e = search_exception_tables(ip);
+       if (!e)
+               return false;
+       handler = ex_fixup_handler(e);
+
+       return handler == ex_handler_fault;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+       const struct exception_table_entry *e;
+       ex_handler_t handler;
 
 #ifdef CONFIG_PNPBIOS
        if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs)
        }
 #endif
 
-       fixup = search_exception_tables(regs->ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
-
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* Special hack for uaccess_err */
-                       current_thread_info()->uaccess_err = 1;
-                       new_ip -= 0x7ffffff0;
-               }
-               regs->ip = new_ip;
-               return 1;
-       }
+       e = search_exception_tables(regs->ip);
+       if (!e)
+               return 0;
 
-       return 0;
+       handler = ex_fixup_handler(e);
+       return handler(e, regs, trapnr);
 }
 
 /* Restricted version used during very early boot */
 int __init early_fixup_exception(unsigned long *ip)
 {
-       const struct exception_table_entry *fixup;
+       const struct exception_table_entry *e;
        unsigned long new_ip;
+       ex_handler_t handler;
 
-       fixup = search_exception_tables(*ip);
-       if (fixup) {
-               new_ip = ex_fixup_addr(fixup);
+       e = search_exception_tables(*ip);
+       if (!e)
+               return 0;
 
-               if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
-                       /* uaccess handling not supported during early boot */
-                       return 0;
-               }
+       new_ip  = ex_fixup_addr(e);
+       handler = ex_fixup_handler(e);
 
-               *ip = new_ip;
-               return 1;
-       }
+       /* special handling not supported during early boot */
+       if (handler != ex_handler_default)
+               return 0;
 
-       return 0;
+       *ip = new_ip;
+       return 1;
 }
 
 /*
@@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start,
                i += 4;
                p->fixup += i;
                i += 4;
+               p->handler += i;
+               i += 4;
        }
 
        sort(start, finish - start, sizeof(struct exception_table_entry),
@@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start,
                i += 4;
                p->fixup -= i;
                i += 4;
+               p->handler -= i;
+               i += 4;
        }
 }
 
index eef44d9..03898ae 100644 (file)
@@ -287,6 +287,9 @@ static noinline int vmalloc_fault(unsigned long address)
        if (!pmd_k)
                return -1;
 
+       if (pmd_huge(*pmd_k))
+               return 0;
+
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
@@ -360,8 +363,6 @@ void vmalloc_sync_all(void)
  * 64-bit:
  *
  *   Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
  */
 static noinline int vmalloc_fault(unsigned long address)
 {
@@ -403,17 +404,23 @@ static noinline int vmalloc_fault(unsigned long address)
        if (pud_none(*pud_ref))
                return -1;
 
-       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+       if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
                BUG();
 
+       if (pud_huge(*pud))
+               return 0;
+
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
 
-       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+       if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
                BUG();
 
+       if (pmd_huge(*pmd))
+               return 0;
+
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
@@ -656,7 +663,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
        int sig;
 
        /* Are we prepared to handle this kernel fault? */
-       if (fixup_exception(regs)) {
+       if (fixup_exception(regs, X86_TRAP_PF)) {
                /*
                 * Any interrupt that takes a fault gets the fixup. This makes
                 * the below recursive fault logic only apply to a faults from
index 6d5eb59..d8a798d 100644 (file)
@@ -102,7 +102,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        return 0;
                }
 
-               page = pte_page(pte);
                if (pte_devmap(pte)) {
                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
@@ -115,6 +114,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        return 0;
                }
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+               page = pte_page(pte);
                get_page(page);
                put_dev_pagemap(pgmap);
                SetPageReferenced(page);
index 42982b2..740d7ac 100644 (file)
@@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt)
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 static __init int gigantic_pages_init(void)
 {
-       /* With CMA we can allocate gigantic pages at runtime */
+       /* With compaction or CMA we can allocate gigantic pages at runtime */
        if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
        return 0;
index cb4ef3d..bd7a9b9 100644 (file)
@@ -388,7 +388,6 @@ repeat:
 }
 
 pte_t *kmap_pte;
-pgprot_t kmap_prot;
 
 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
 {
@@ -405,8 +404,6 @@ static void __init kmap_init(void)
         */
        kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
        kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
-       kmap_prot = PAGE_KERNEL;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -871,7 +868,6 @@ static noinline int do_test_wp_bit(void)
        return flag;
 }
 
-#ifdef CONFIG_DEBUG_RODATA
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
@@ -960,5 +956,3 @@ void mark_rodata_ro(void)
        if (__supported_pte_mask & _PAGE_NX)
                debug_checkwx();
 }
-#endif
-
index 5488d21..214afda 100644 (file)
@@ -53,6 +53,7 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <asm/uv/uv.h>
 #include <asm/setup.h>
 
 #include "mm_internal.h"
@@ -1074,7 +1075,6 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
 }
 
-#ifdef CONFIG_DEBUG_RODATA
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
@@ -1166,8 +1166,6 @@ void mark_rodata_ro(void)
        debug_checkwx();
 }
 
-#endif
-
 int kern_addr_valid(unsigned long addr)
 {
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -1206,26 +1204,13 @@ int kern_addr_valid(unsigned long addr)
 
 static unsigned long probe_memory_block_size(void)
 {
-       /* start from 2g */
-       unsigned long bz = 1UL<<31;
-
-       if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
-               pr_info("Using 2GB memory block size for large-memory system\n");
-               return 2UL * 1024 * 1024 * 1024;
-       }
-
-       /* less than 64g installed */
-       if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
-               return MIN_MEMORY_BLOCK_SIZE;
+       unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
 
-       /* get the tail size */
-       while (bz > MIN_MEMORY_BLOCK_SIZE) {
-               if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
-                       break;
-               bz >>= 1;
-       }
+       /* if system is UV or has 64GB of RAM or more, use large blocks */
+       if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
+               bz = 2UL << 30; /* 2GB */
 
-       printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+       pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
 
        return bz;
 }
index d470cf2..1b1110f 100644 (file)
@@ -120,11 +120,22 @@ void __init kasan_init(void)
        kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
                        (void *)KASAN_SHADOW_END);
 
-       memset(kasan_zero_page, 0, PAGE_SIZE);
-
        load_cr3(init_level4_pgt);
        __flush_tlb_all();
-       init_task.kasan_depth = 0;
 
+       /*
+        * kasan_zero_page has been used as early shadow memory, thus it may
+        * contain some garbage. Now we can clear and write protect it, since
+        * after the TLB flush no one should write to it.
+        */
+       memset(kasan_zero_page, 0, PAGE_SIZE);
+       for (i = 0; i < PTRS_PER_PTE; i++) {
+               pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+               set_pte(&kasan_zero_pte[i], pte);
+       }
+       /* Flush TLBs again to be sure that write protection applied. */
+       __flush_tlb_all();
+
+       init_task.kasan_depth = 0;
        pr_info("KernelAddressSanitizer initialized\n");
 }
index 637ab34..ddb2244 100644 (file)
@@ -33,7 +33,7 @@
 struct kmmio_fault_page {
        struct list_head list;
        struct kmmio_fault_page *release_next;
-       unsigned long page; /* location of the fault page */
+       unsigned long addr; /* the requested address */
        pteval_t old_presence; /* page presence prior to arming */
        bool armed;
 
@@ -70,9 +70,16 @@ unsigned int kmmio_count;
 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
 static LIST_HEAD(kmmio_probes);
 
-static struct list_head *kmmio_page_list(unsigned long page)
+static struct list_head *kmmio_page_list(unsigned long addr)
 {
-       return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+       unsigned int l;
+       pte_t *pte = lookup_address(addr, &l);
+
+       if (!pte)
+               return NULL;
+       addr &= page_level_mask(l);
+
+       return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
 }
 
 /* Accessed per-cpu */
@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 }
 
 /* You must be holding RCU read lock. */
-static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
 {
        struct list_head *head;
        struct kmmio_fault_page *f;
+       unsigned int l;
+       pte_t *pte = lookup_address(addr, &l);
 
-       page &= PAGE_MASK;
-       head = kmmio_page_list(page);
+       if (!pte)
+               return NULL;
+       addr &= page_level_mask(l);
+       head = kmmio_page_list(addr);
        list_for_each_entry_rcu(f, head, list) {
-               if (f->page == page)
+               if (f->addr == addr)
                        return f;
        }
        return NULL;
@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
        unsigned int level;
-       pte_t *pte = lookup_address(f->page, &level);
+       pte_t *pte = lookup_address(f->addr, &level);
 
        if (!pte) {
-               pr_err("no pte for page 0x%08lx\n", f->page);
+               pr_err("no pte for addr 0x%08lx\n", f->addr);
                return -1;
        }
 
@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
                return -1;
        }
 
-       __flush_tlb_one(f->page);
+       __flush_tlb_one(f->addr);
        return 0;
 }
 
@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
        int ret;
        WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
        if (f->armed) {
-               pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
-                          f->page, f->count, !!f->old_presence);
+               pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+                          f->addr, f->count, !!f->old_presence);
        }
        ret = clear_page_presence(f, true);
-       WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
-                 f->page);
+       WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+                 f->addr);
        f->armed = true;
        return ret;
 }
@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
        int ret = clear_page_presence(f, false);
        WARN_ONCE(ret < 0,
-                       KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+                       KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
        f->armed = false;
 }
 
@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
        struct kmmio_context *ctx;
        struct kmmio_fault_page *faultpage;
        int ret = 0; /* default to fault not handled */
+       unsigned long page_base = addr;
+       unsigned int l;
+       pte_t *pte = lookup_address(addr, &l);
+       if (!pte)
+               return -EINVAL;
+       page_base &= page_level_mask(l);
 
        /*
         * Preemption is now disabled to prevent process switch during
@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
        preempt_disable();
        rcu_read_lock();
 
-       faultpage = get_kmmio_fault_page(addr);
+       faultpage = get_kmmio_fault_page(page_base);
        if (!faultpage) {
                /*
                 * Either this page fault is not caused by kmmio, or
@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 
        ctx = &get_cpu_var(kmmio_ctx);
        if (ctx->active) {
-               if (addr == ctx->addr) {
+               if (page_base == ctx->addr) {
                        /*
                         * A second fault on the same page means some other
                         * condition needs handling by do_page_fault(), the
@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
        ctx->active++;
 
        ctx->fpage = faultpage;
-       ctx->probe = get_kmmio_probe(addr);
+       ctx->probe = get_kmmio_probe(page_base);
        ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
-       ctx->addr = addr;
+       ctx->addr = page_base;
 
        if (ctx->probe && ctx->probe->pre_handler)
                ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -354,12 +371,11 @@ out:
 }
 
 /* You must be holding kmmio_lock. */
-static int add_kmmio_fault_page(unsigned long page)
+static int add_kmmio_fault_page(unsigned long addr)
 {
        struct kmmio_fault_page *f;
 
-       page &= PAGE_MASK;
-       f = get_kmmio_fault_page(page);
+       f = get_kmmio_fault_page(addr);
        if (f) {
                if (!f->count)
                        arm_kmmio_fault_page(f);
@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
                return -1;
 
        f->count = 1;
-       f->page = page;
+       f->addr = addr;
 
        if (arm_kmmio_fault_page(f)) {
                kfree(f);
                return -1;
        }
 
-       list_add_rcu(&f->list, kmmio_page_list(f->page));
+       list_add_rcu(&f->list, kmmio_page_list(f->addr));
 
        return 0;
 }
 
 /* You must be holding kmmio_lock. */
-static void release_kmmio_fault_page(unsigned long page,
+static void release_kmmio_fault_page(unsigned long addr,
                                struct kmmio_fault_page **release_list)
 {
        struct kmmio_fault_page *f;
 
-       page &= PAGE_MASK;
-       f = get_kmmio_fault_page(page);
+       f = get_kmmio_fault_page(addr);
        if (!f)
                return;
 
@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
        int ret = 0;
        unsigned long size = 0;
        const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+       unsigned int l;
+       pte_t *pte;
 
        spin_lock_irqsave(&kmmio_lock, flags);
        if (get_kmmio_probe(p->addr)) {
                ret = -EEXIST;
                goto out;
        }
+
+       pte = lookup_address(p->addr, &l);
+       if (!pte) {
+               ret = -EINVAL;
+               goto out;
+       }
+
        kmmio_count++;
        list_add_rcu(&p->list, &kmmio_probes);
        while (size < size_lim) {
                if (add_kmmio_fault_page(p->addr + size))
                        pr_err("Unable to set page fault.\n");
-               size += PAGE_SIZE;
+               size += page_level_size(l);
        }
 out:
        spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
        const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
        struct kmmio_fault_page *release_list = NULL;
        struct kmmio_delayed_release *drelease;
+       unsigned int l;
+       pte_t *pte;
+
+       pte = lookup_address(p->addr, &l);
+       if (!pte)
+               return;
 
        spin_lock_irqsave(&kmmio_lock, flags);
        while (size < size_lim) {
                release_kmmio_fault_page(p->addr + size, &release_list);
-               size += PAGE_SIZE;
+               size += page_level_size(l);
        }
        list_del_rcu(&p->list);
        kmmio_count--;
index 96bd1e2..d2dc043 100644 (file)
@@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void)
 
        if (mmap_is_ia32())
 #ifdef CONFIG_COMPAT
-               rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+               rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
 #else
-               rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+               rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
 #endif
        else
-               rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+               rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
 
        return rnd << PAGE_SHIFT;
 }
@@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd)
        return PAGE_ALIGN(TASK_SIZE - gap - rnd);
 }
 
-/*
- * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
- * does, but not when emulating X86_32
- */
-static unsigned long mmap_legacy_base(unsigned long rnd)
-{
-       if (mmap_is_ia32())
-               return TASK_UNMAPPED_BASE;
-       else
-               return TASK_UNMAPPED_BASE + rnd;
-}
-
 /*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
@@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
        if (current->flags & PF_RANDOMIZE)
                random_factor = arch_mmap_rnd();
 
-       mm->mmap_legacy_base = mmap_legacy_base(random_factor);
+       mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
 
        if (mmap_is_legacy()) {
                mm->mmap_base = mm->mmap_legacy_base;
index b2fd67d..ef05755 100644 (file)
@@ -123,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
                break;
        }
 
-       if (regno > nr_registers) {
+       if (regno >= nr_registers) {
                WARN_ONCE(1, "decoded an instruction with an invalid register");
                return -EINVAL;
        }
index c3b3f65..f70c1ff 100644 (file)
@@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
        return true;
 }
 
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
 static void __init numa_clear_kernel_node_hotplug(void)
 {
-       int i, nid;
-       nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
-       unsigned long start, end;
-       struct memblock_region *r;
+       nodemask_t reserved_nodemask = NODE_MASK_NONE;
+       struct memblock_region *mb_region;
+       int i;
 
        /*
+        * We have to do some preprocessing of memblock regions, to
+        * make them suitable for reservation.
+        *
         * At this time, all memory regions reserved by memblock are
-        * used by the kernel. Set the nid in memblock.reserved will
-        * mark out all the nodes the kernel resides in.
+        * used by the kernel, but those regions are not split up
+        * along node boundaries yet, and don't necessarily have their
+        * node ID set yet either.
+        *
+        * So iterate over all memory known to the x86 architecture,
+        * and use those ranges to set the nid in memblock.reserved.
+        * This will split up the memblock regions along node
+        * boundaries and will set the node IDs as well.
         */
        for (i = 0; i < numa_meminfo.nr_blks; i++) {
-               struct numa_memblk *mb = &numa_meminfo.blk[i];
+               struct numa_memblk *mb = numa_meminfo.blk + i;
+               int ret;
 
-               memblock_set_node(mb->start, mb->end - mb->start,
-                                 &memblock.reserved, mb->nid);
+               ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
+               WARN_ON_ONCE(ret);
        }
 
        /*
-        * Mark all kernel nodes.
+        * Now go over all reserved memblock regions, to construct a
+        * node mask of all kernel reserved memory areas.
         *
-        * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
-        * may not include all the memblock.reserved memory ranges because
-        * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+        * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+        *   numa_meminfo might not include all memblock.reserved
+        *   memory ranges, because quirks such as trim_snb_memory()
+        *   reserve specific pages for Sandy Bridge graphics. ]
         */
-       for_each_memblock(reserved, r)
-               if (r->nid != MAX_NUMNODES)
-                       node_set(r->nid, numa_kernel_nodes);
+       for_each_memblock(reserved, mb_region) {
+               if (mb_region->nid != MAX_NUMNODES)
+                       node_set(mb_region->nid, reserved_nodemask);
+       }
 
-       /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+       /*
+        * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+        * belonging to the reserved node mask.
+        *
+        * Note that this will include memory regions that reside
+        * on nodes that contain kernel memory - entire nodes
+        * become hot-unpluggable:
+        */
        for (i = 0; i < numa_meminfo.nr_blks; i++) {
-               nid = numa_meminfo.blk[i].nid;
-               if (!node_isset(nid, numa_kernel_nodes))
-                       continue;
+               struct numa_memblk *mb = numa_meminfo.blk + i;
 
-               start = numa_meminfo.blk[i].start;
-               end = numa_meminfo.blk[i].end;
+               if (!node_isset(mb->nid, reserved_nodemask))
+                       continue;
 
-               memblock_clear_hotplug(start, end - start);
+               memblock_clear_hotplug(mb->start, mb->end - mb->start);
        }
 }
 
index fc6a4c8..007ebe2 100644 (file)
@@ -33,7 +33,7 @@ struct cpa_data {
        pgd_t           *pgd;
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
-       int             numpages;
+       unsigned long   numpages;
        int             flags;
        unsigned long   pfn;
        unsigned        force_split : 1;
@@ -283,7 +283,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_RW;
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
        /*
         * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
         * kernel text mappings for the large page aligned text, rodata sections
@@ -419,24 +419,30 @@ pmd_t *lookup_pmd_address(unsigned long address)
 phys_addr_t slow_virt_to_phys(void *__virt_addr)
 {
        unsigned long virt_addr = (unsigned long)__virt_addr;
-       unsigned long phys_addr, offset;
+       phys_addr_t phys_addr;
+       unsigned long offset;
        enum pg_level level;
        pte_t *pte;
 
        pte = lookup_address(virt_addr, &level);
        BUG_ON(!pte);
 
+       /*
+        * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
+        * before being left-shifted PAGE_SHIFT bits -- this trick is to
+        * make 32-PAE kernel work correctly.
+        */
        switch (level) {
        case PG_LEVEL_1G:
-               phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
+               phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PUD_PAGE_MASK;
                break;
        case PG_LEVEL_2M:
-               phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
+               phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PMD_PAGE_MASK;
                break;
        default:
-               phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+               phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
                offset = virt_addr & ~PAGE_MASK;
        }
 
@@ -1122,8 +1128,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
        /*
         * Ignore all non primary paths.
         */
-       if (!primary)
+       if (!primary) {
+               cpa->numpages = 1;
                return 0;
+       }
 
        /*
         * Ignore the NULL PTE for kernel identity mapping, as it is expected
@@ -1350,7 +1358,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
                 * CPA operation. Either a large page has been
                 * preserved or a single page update happened.
                 */
-               BUG_ON(cpa->numpages > numpages);
+               BUG_ON(cpa->numpages > numpages || !cpa->numpages);
                numpages -= cpa->numpages;
                if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
                        cpa->curpage++;
index f4ae536..04e2e71 100644 (file)
@@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                        return -EINVAL;
        }
 
-       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+       *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                         cachemode2protval(pcm));
 
        return 0;
@@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 
        /* Set prot based on lookup */
        pcm = lookup_memtype(pfn_t_to_phys(pfn));
-       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+       *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                         cachemode2protval(pcm));
 
        return 0;
index 92e2eac..8bea847 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/proto.h>
+#include <asm/cpufeature.h>
 
 static int disable_nx;
 
@@ -31,9 +32,8 @@ early_param("noexec", noexec_setup);
 
 void x86_configure_nx(void)
 {
-       if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
-               __supported_pte_mask |= _PAGE_NX;
-       else
+       /* If disable_nx is set, clear NX on all new mappings going forward. */
+       if (disable_nx)
                __supported_pte_mask &= ~_PAGE_NX;
 }
 
index 50d86c0..660a83c 100644 (file)
@@ -24,7 +24,6 @@
 #include <asm/nmi.h>
 #include <asm/apic.h>
 #include <asm/processor.h>
-#include <asm/cpufeature.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
index 2879efc..d34b511 100644 (file)
@@ -711,28 +711,22 @@ int pcibios_add_device(struct pci_dev *dev)
        return 0;
 }
 
-int pcibios_alloc_irq(struct pci_dev *dev)
+int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
-       /*
-        * If the PCI device was already claimed by core code and has
-        * MSI enabled, probing of the pcibios IRQ will overwrite
-        * dev->irq.  So bail out if MSI is already enabled.
-        */
-       if (pci_dev_msi_enabled(dev))
-               return -EBUSY;
+       int err;
 
-       return pcibios_enable_irq(dev);
-}
+       if ((err = pci_enable_resources(dev, mask)) < 0)
+               return err;
 
-void pcibios_free_irq(struct pci_dev *dev)
-{
-       if (pcibios_disable_irq)
-               pcibios_disable_irq(dev);
+       if (!pci_dev_msi_enabled(dev))
+               return pcibios_enable_irq(dev);
+       return 0;
 }
 
-int pcibios_enable_device(struct pci_dev *dev, int mask)
+void pcibios_disable_device (struct pci_dev *dev)
 {
-       return pci_enable_resources(dev, mask);
+       if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
+               pcibios_disable_irq(dev);
 }
 
 int pci_ext_cfg_avail(void)
index 0d24e7c..8b93e63 100644 (file)
@@ -215,7 +215,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
        int polarity;
        int ret;
 
-       if (pci_has_managed_irq(dev))
+       if (dev->irq_managed && dev->irq > 0)
                return 0;
 
        switch (intel_mid_identify_cpu()) {
@@ -256,13 +256,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 
 static void intel_mid_pci_irq_disable(struct pci_dev *dev)
 {
-       if (pci_has_managed_irq(dev)) {
+       if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+           dev->irq > 0) {
                mp_unmap_irq(dev->irq);
                dev->irq_managed = 0;
-               /*
-                * Don't reset dev->irq here, otherwise
-                * intel_mid_pci_irq_enable() will fail on next call.
-                */
        }
 }
 
index 32e7034..9bd1154 100644 (file)
@@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
                        struct pci_dev *temp_dev;
                        int irq;
 
-                       if (pci_has_managed_irq(dev))
+                       if (dev->irq_managed && dev->irq > 0)
                                return 0;
 
                        irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
@@ -1230,7 +1230,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
                        }
                        dev = temp_dev;
                        if (irq >= 0) {
-                               pci_set_managed_irq(dev, irq);
+                               dev->irq_managed = 1;
+                               dev->irq = irq;
                                dev_info(&dev->dev, "PCI->APIC IRQ transform: "
                                         "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
                                return 0;
@@ -1256,10 +1257,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
        return 0;
 }
 
+bool mp_should_keep_irq(struct device *dev)
+{
+       if (dev->power.is_prepared)
+               return true;
+#ifdef CONFIG_PM
+       if (dev->power.runtime_status == RPM_SUSPENDING)
+               return true;
+#endif
+
+       return false;
+}
+
 static void pirq_disable_irq(struct pci_dev *dev)
 {
-       if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) {
+       if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
+           dev->irq_managed && dev->irq) {
                mp_unmap_irq(dev->irq);
-               pci_reset_managed_irq(dev);
+               dev->irq = 0;
+               dev->irq_managed = 0;
        }
 }
index ff31ab4..beac4df 100644 (file)
@@ -196,7 +196,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        return 0;
 
 error:
-       dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+       if (ret == -ENOSYS)
+               dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+       else if (ret)
+               dev_err(&dev->dev, "Xen PCI frontend error: %d!\n", ret);
 free:
        kfree(v);
        return ret;
index 1c7380d..ed30e79 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/memblock.h>
 #include <linux/bootmem.h>
 #include <linux/acpi.h>
+#include <linux/dmi.h>
 #include <asm/efi.h>
 #include <asm/uv/uv.h>
 
@@ -129,6 +130,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(efi_query_variable_store);
 
+/*
+ * Helper function for efi_reserve_boot_services() to figure out if we
+ * can free regions in efi_free_boot_services().
+ *
+ * Use this function to ensure we do not free regions owned by somebody
+ * else. We must only reserve (and then free) regions:
+ *
+ * - Not within any part of the kernel
+ * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc)
+ */
+static bool can_free_region(u64 start, u64 size)
+{
+       if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
+               return false;
+
+       if (!e820_all_mapped(start, start+size, E820_RAM))
+               return false;
+
+       return true;
+}
+
 /*
  * The UEFI specification makes it clear that the operating system is free to do
  * whatever it wants with boot services code after ExitBootServices() has been
@@ -146,26 +168,50 @@ void __init efi_reserve_boot_services(void)
                efi_memory_desc_t *md = p;
                u64 start = md->phys_addr;
                u64 size = md->num_pages << EFI_PAGE_SHIFT;
+               bool already_reserved;
 
                if (md->type != EFI_BOOT_SERVICES_CODE &&
                    md->type != EFI_BOOT_SERVICES_DATA)
                        continue;
-               /* Only reserve where possible:
-                * - Not within any already allocated areas
-                * - Not over any memory area (really needed, if above?)
-                * - Not within any part of the kernel
-                * - Not the bios reserved area
-               */
-               if ((start + size > __pa_symbol(_text)
-                               && start <= __pa_symbol(_end)) ||
-                       !e820_all_mapped(start, start+size, E820_RAM) ||
-                       memblock_is_region_reserved(start, size)) {
-                       /* Could not reserve, skip it */
-                       md->num_pages = 0;
-                       memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
-                                    start, start+size-1);
-               } else
+
+               already_reserved = memblock_is_region_reserved(start, size);
+
+               /*
+                * Because the following memblock_reserve() is paired
+                * with free_bootmem_late() for this region in
+                * efi_free_boot_services(), we must be extremely
+                * careful not to reserve, and subsequently free,
+                * critical regions of memory (like the kernel image) or
+                * those regions that somebody else has already
+                * reserved.
+                *
+                * A good example of a critical region that must not be
+                * freed is page zero (first 4Kb of memory), which may
+                * contain boot services code/data but is marked
+                * E820_RESERVED by trim_bios_range().
+                */
+               if (!already_reserved) {
                        memblock_reserve(start, size);
+
+                       /*
+                        * If we are the first to reserve the region, no
+                        * one else cares about it. We own it and can
+                        * free it later.
+                        */
+                       if (can_free_region(start, size))
+                               continue;
+               }
+
+               /*
+                * We don't own the region. We must not free it.
+                *
+                * Setting this bit for a boot services region really
+                * doesn't make sense as far as the firmware is
+                * concerned, but it does provide us with a way to tag
+                * those regions that must not be paired with
+                * free_bootmem_late().
+                */
+               md->attribute |= EFI_MEMORY_RUNTIME;
        }
 }
 
@@ -182,8 +228,8 @@ void __init efi_free_boot_services(void)
                    md->type != EFI_BOOT_SERVICES_DATA)
                        continue;
 
-               /* Could not reserve boot area */
-               if (!size)
+               /* Do not free, someone else owns it: */
+               if (md->attribute & EFI_MEMORY_RUNTIME)
                        continue;
 
                free_bootmem_late(start, size);
@@ -248,6 +294,16 @@ out:
        return ret;
 }
 
+static const struct dmi_system_id sgi_uv1_dmi[] = {
+       { NULL, "SGI UV1",
+               {       DMI_MATCH(DMI_PRODUCT_NAME,     "Stoutland Platform"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,  "1.0"),
+                       DMI_MATCH(DMI_BIOS_VENDOR,      "SGI.COM"),
+               }
+       },
+       { } /* NULL entry stops DMI scanning */
+};
+
 void __init efi_apply_memmap_quirks(void)
 {
        /*
@@ -260,10 +316,8 @@ void __init efi_apply_memmap_quirks(void)
                efi_unmap_memmap();
        }
 
-       /*
-        * UV doesn't support the new EFI pagetable mapping yet.
-        */
-       if (is_uv_system())
+       /* UV2+ BIOS has a fix for this issue.  UV1 still needs the quirk. */
+       if (dmi_check_system(sgi_uv1_dmi))
                set_bit(EFI_OLD_MEMMAP, &efi.flags);
 }
 
index 76b6632..1865c19 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/string.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/leds.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
 #define BIOS_SIGNATURE_COREBOOT                0x500
 #define BIOS_REGION_SIZE               0x10000
 
+/*
+ * This driver is not modular, but to keep back compatibility
+ * with existing use cases, continuing with module_param is
+ * the easiest way forward.
+ */
 static bool force = 0;
 module_param(force, bool, 0444);
 /* FIXME: Award bios is not automatically detected as Alix platform */
@@ -192,9 +197,4 @@ static int __init alix_init(void)
 
        return 0;
 }
-
-module_init(alix_init);
-
-MODULE_AUTHOR("Ed Wildgoose <kernel@wildgooses.com>");
-MODULE_DESCRIPTION("PCEngines ALIX System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(alix_init);
index aa733fb..4fcdb91 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/string.h>
-#include <linux/module.h>
 #include <linux/leds.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
@@ -120,9 +119,4 @@ static int __init geos_init(void)
 
        return 0;
 }
-
-module_init(geos_init);
-
-MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
-MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(geos_init);
index 927e38c..a2f6b98 100644 (file)
@@ -20,7 +20,6 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/string.h>
-#include <linux/module.h>
 #include <linux/leds.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
@@ -146,9 +145,4 @@ static int __init net5501_init(void)
 
        return 0;
 }
-
-module_init(net5501_init);
-
-MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
-MODULE_DESCRIPTION("Soekris net5501 System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(net5501_init);
index 1bbc21e..90bb997 100644 (file)
@@ -138,7 +138,7 @@ static void intel_mid_arch_setup(void)
                intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
        else {
                intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
-               pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");
+               pr_info("ARCH: Unknown SoC, assuming Penwell!\n");
        }
 
 out:
@@ -214,12 +214,10 @@ static inline int __init setup_x86_intel_mid_timer(char *arg)
        else if (strcmp("lapic_and_apbt", arg) == 0)
                intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT;
        else {
-               pr_warn("X86 INTEL_MID timer option %s not recognised"
-                          " use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
-                          arg);
+               pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
+                       arg);
                return -EINVAL;
        }
        return 0;
 }
 __setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer);
-
index 23381d2..1eb47b6 100644 (file)
@@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void)
        /* mark tsc clocksource as reliable */
        set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
 
-       if (fast_calibrate)
-               return fast_calibrate;
-
-       return 0;
+       return fast_calibrate;
 }
 
 static void __init penwell_arch_setup(void)
index aaca917..bd1adc6 100644 (file)
@@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void)
        /* mark tsc clocksource as reliable */
        set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
 
-       if (fast_calibrate)
-               return fast_calibrate;
-
-       return 0;
+       return fast_calibrate;
 }
 
 static void __init tangier_arch_setup(void)
index c1bdafa..17d6d22 100644 (file)
@@ -1,5 +1,5 @@
 /**
- * imr.c
+ * imr.c -- Intel Isolated Memory Region driver
  *
  * Copyright(c) 2013 Intel Corporation.
  * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -31,7 +31,6 @@
 #include <linux/debugfs.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-#include <linux/module.h>
 #include <linux/types.h>
 
 struct imr_device {
@@ -135,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
  * @idev:      pointer to imr_device structure.
  * @imr_id:    IMR entry to write.
  * @imr:       IMR structure representing address and access masks.
- * @lock:      indicates if the IMR lock bit should be applied.
  * @return:    0 on success or error code passed from mbi_iosf on failure.
  */
-static int imr_write(struct imr_device *idev, u32 imr_id,
-                    struct imr_regs *imr, bool lock)
+static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
 {
        unsigned long flags;
        u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
@@ -163,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
        if (ret)
                goto failed;
 
-       /* Lock bit must be set separately to addr_lo address bits. */
-       if (lock) {
-               imr->addr_lo |= IMR_LOCK;
-               ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
-                                    reg - IMR_NUM_REGS, imr->addr_lo);
-               if (ret)
-                       goto failed;
-       }
-
        local_irq_restore(flags);
        return 0;
 failed:
@@ -220,11 +208,12 @@ static int imr_dbgfs_state_show(struct seq_file *s, void *unused)
                if (imr_is_enabled(&imr)) {
                        base = imr_to_phys(imr.addr_lo);
                        end = imr_to_phys(imr.addr_hi) + IMR_MASK;
+                       size = end - base + 1;
                } else {
                        base = 0;
                        end = 0;
+                       size = 0;
                }
-               size = end - base;
                seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx "
                           "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i,
                           &base, &end, size, imr.rmask, imr.wmask,
@@ -268,17 +257,6 @@ static int imr_debugfs_register(struct imr_device *idev)
        return PTR_ERR_OR_ZERO(idev->file);
 }
 
-/**
- * imr_debugfs_unregister - unregister debugfs hooks.
- *
- * @idev:      pointer to imr_device structure.
- * @return:
- */
-static void imr_debugfs_unregister(struct imr_device *idev)
-{
-       debugfs_remove(idev->file);
-}
-
 /**
  * imr_check_params - check passed address range IMR alignment and non-zero size
  *
@@ -333,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
  * @size:      physical size of region in bytes must be aligned to 1KiB.
  * @read_mask: read access mask.
  * @write_mask:        write access mask.
- * @lock:      indicates whether or not to permanently lock this region.
  * @return:    zero on success or negative value indicating error.
  */
 int imr_add_range(phys_addr_t base, size_t size,
-                 unsigned int rmask, unsigned int wmask, bool lock)
+                 unsigned int rmask, unsigned int wmask)
 {
        phys_addr_t end;
        unsigned int i;
@@ -410,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size,
        imr.rmask = rmask;
        imr.wmask = wmask;
 
-       ret = imr_write(idev, reg, &imr, lock);
+       ret = imr_write(idev, reg, &imr);
        if (ret < 0) {
                /*
                 * In the highly unlikely event iosf_mbi_write failed
@@ -421,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size,
                imr.addr_hi = 0;
                imr.rmask = IMR_READ_ACCESS_ALL;
                imr.wmask = IMR_WRITE_ACCESS_ALL;
-               imr_write(idev, reg, &imr, false);
+               imr_write(idev, reg, &imr);
        }
 failed:
        mutex_unlock(&idev->lock);
@@ -517,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
        imr.rmask = IMR_READ_ACCESS_ALL;
        imr.wmask = IMR_WRITE_ACCESS_ALL;
 
-       ret = imr_write(idev, reg, &imr, false);
+       ret = imr_write(idev, reg, &imr);
 
 failed:
        mutex_unlock(&idev->lock);
@@ -579,6 +556,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
 {
        phys_addr_t base = virt_to_phys(&_text);
        size_t size = virt_to_phys(&__end_rodata) - base;
+       unsigned long start, end;
        int i;
        int ret;
 
@@ -586,18 +564,24 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
        for (i = 0; i < idev->max_imr; i++)
                imr_clear(i);
 
+       start = (unsigned long)_text;
+       end = (unsigned long)__end_rodata - 1;
+
        /*
-        * Setup a locked IMR around the physical extent of the kernel
+        * Setup an unlocked IMR around the physical extent of the kernel
         * from the beginning of the .text secton to the end of the
         * .rodata section as one physically contiguous block.
+        *
+        * We don't round up @size since it is already PAGE_SIZE aligned.
+        * See vmlinux.lds.S for details.
         */
-       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true);
+       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        if (ret < 0) {
-               pr_err("unable to setup IMR for kernel: (%p - %p)\n",
-                       &_text, &__end_rodata);
+               pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n",
+                       size / 1024, start, end);
        } else {
-               pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n",
-                       size / 1024, &_text, &__end_rodata);
+               pr_info("protecting kernel .text - .rodata: %zu KiB (%lx - %lx)\n",
+                       size / 1024, start, end);
        }
 
 }
@@ -606,7 +590,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
        { X86_VENDOR_INTEL, 5, 9 },     /* Intel Quark SoC X1000. */
        {}
 };
-MODULE_DEVICE_TABLE(x86cpu, imr_ids);
 
 /**
  * imr_init - entry point for IMR driver.
@@ -632,22 +615,4 @@ static int __init imr_init(void)
        imr_fixup_memmap(idev);
        return 0;
 }
-
-/**
- * imr_exit - exit point for IMR code.
- *
- * Deregisters debugfs, leave IMR state as-is.
- *
- * return:
- */
-static void __exit imr_exit(void)
-{
-       imr_debugfs_unregister(&imr_dev);
-}
-
-module_init(imr_init);
-module_exit(imr_exit);
-
-MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
-MODULE_DESCRIPTION("Intel Isolated Memory Region driver");
-MODULE_LICENSE("Dual BSD/GPL");
+device_initcall(imr_init);
index 278e4da..f5bad40 100644 (file)
@@ -1,5 +1,5 @@
 /**
- * imr_selftest.c
+ * imr_selftest.c -- Intel Isolated Memory Region self-test driver
  *
  * Copyright(c) 2013 Intel Corporation.
  * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -15,7 +15,6 @@
 #include <asm/imr.h>
 #include <linux/init.h>
 #include <linux/mm.h>
-#include <linux/module.h>
 #include <linux/types.h>
 
 #define SELFTEST KBUILD_MODNAME ": "
@@ -61,30 +60,30 @@ static void __init imr_self_test(void)
        int ret;
 
        /* Test zero zero. */
-       ret = imr_add_range(0, 0, 0, 0, false);
+       ret = imr_add_range(0, 0, 0, 0);
        imr_self_test_result(ret < 0, "zero sized IMR\n");
 
        /* Test exact overlap. */
-       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
 
        /* Test overlap with base inside of existing. */
        base += size - IMR_ALIGN;
-       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
 
        /* Test overlap with end inside of existing. */
        base -= size + IMR_ALIGN * 2;
-       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+       ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
 
        /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
        ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
-                           IMR_WRITE_ACCESS_ALL, false);
+                           IMR_WRITE_ACCESS_ALL);
        imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
 
        /* Test that a 1 KiB IMR @ zero with CPU only will work. */
-       ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false);
+       ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU);
        imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
        if (ret >= 0) {
                ret = imr_remove_range(0, IMR_ALIGN);
@@ -93,8 +92,7 @@ static void __init imr_self_test(void)
 
        /* Test 2 KiB works. */
        size = IMR_ALIGN * 2;
-       ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL,
-                           IMR_WRITE_ACCESS_ALL, false);
+       ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL);
        imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
        if (ret >= 0) {
                ret = imr_remove_range(0, size);
@@ -106,7 +104,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
        { X86_VENDOR_INTEL, 5, 9 },     /* Intel Quark SoC X1000. */
        {}
 };
-MODULE_DEVICE_TABLE(x86cpu, imr_ids);
 
 /**
  * imr_self_test_init - entry point for IMR driver.
@@ -125,13 +122,4 @@ static int __init imr_self_test_init(void)
  *
  * return:
  */
-static void __exit imr_self_test_exit(void)
-{
-}
-
-module_init(imr_self_test_init);
-module_exit(imr_self_test_exit);
-
-MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
-MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver");
-MODULE_LICENSE("Dual BSD/GPL");
+device_initcall(imr_self_test_init);
index 174781a..00c3190 100644 (file)
@@ -3,7 +3,7 @@
 
 #include <asm/asm.h>
 #include <asm/segment.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cmpxchg.h>
 #include <asm/nops.h>
 
index 8502ad3..5adb6a2 100644 (file)
@@ -109,7 +109,7 @@ unsigned long os_get_top_address(void)
                exit(1);
        }
 
-       printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT);
+       printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT);
        printf("Locating the top of the address space ... ");
        fflush(stdout);
 
@@ -134,7 +134,7 @@ out:
                exit(1);
        }
        top <<= UM_KERN_PAGE_SHIFT;
-       printf("0x%x\n", top);
+       printf("0x%lx\n", top);
 
        return top;
 }
index 439c099..bfce503 100644 (file)
 
 #define old_mmap sys_old_mmap
 
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 
 #undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
index b74ea6c..f306413 100644 (file)
 #define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_64.h>
 
 #undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
index ce7e360..470564b 100644 (file)
@@ -9,14 +9,12 @@
 #include <asm/types.h>
 
 #ifdef __i386__
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
 };
 #else
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_64.h>
 };
index d09e4c9..2c26108 100644 (file)
@@ -1654,7 +1654,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
        cpu_detect(&new_cpu_data);
        set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
        new_cpu_data.wp_works_ok = 1;
-       new_cpu_data.x86_capability[0] = cpuid_edx(1);
+       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
 #endif
 
        if (xen_start_info->mod_start) {
index 724a087..9466354 100644 (file)
@@ -11,7 +11,7 @@
 #include "pmu.h"
 
 /* x86_pmu.handle_irq definition */
-#include "../kernel/cpu/perf_event.h"
+#include "../events/perf_event.h"
 
 #define XENPMU_IRQ_PROCESSING    1
 struct xenpmu {
index 3f4ebf0..3c6d17f 100644 (file)
@@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
                xen_pvh_secondary_vcpu_init(cpu);
 #endif
        cpu_bringup();
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void xen_smp_intr_free(unsigned int cpu)
index 4d02e38..fc4ad21 100644 (file)
@@ -157,7 +157,7 @@ void secondary_start_kernel(void)
 
        complete(&cpu_running);
 
-       cpu_startup_entry(CPUHP_ONLINE);
+       cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
 static void mx_cpu_start(void *p)
index 161491d..0363cd7 100644 (file)
@@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY
        T10/SCSI Data Integrity Field or the T13/ATA External Path
        Protection.  If in doubt, say N.
 
+config BLK_DEV_DAX
+       bool "Block device DAX support"
+       depends on FS_DAX
+       depends on BROKEN
+       help
+         When DAX support is available (CONFIG_FS_DAX) raw block
+         devices can also support direct userspace access to the
+         storage capacity via MMAP(2) similar to a file on a
+         DAX-enabled filesystem.  However, the DAX I/O-path disables
+         some standard I/O-statistics, and the MMAP(2) path has some
+         operational differences due to bypassing the page
+         cache.  If in doubt, say N.
+
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
        depends on BLK_CGROUP=y
index dbabd48..cf75915 100644 (file)
@@ -874,7 +874,7 @@ int submit_bio_wait(int rw, struct bio *bio)
        bio->bi_private = &ret;
        bio->bi_end_io = submit_bio_wait_endio;
        submit_bio(rw, bio);
-       wait_for_completion(&ret.event);
+       wait_for_completion_io(&ret.event);
 
        return ret.error;
 }
@@ -1090,9 +1090,12 @@ int bio_uncopy_user(struct bio *bio)
        if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
                /*
                 * if we're in a workqueue, the request is orphaned, so
-                * don't copy into a random user address space, just free.
+                * don't copy into a random user address space, just free
+                * and return -EINTR so user space doesn't expect any data.
                 */
-               if (current->mm && bio_data_dir(bio) == READ)
+               if (!current->mm)
+                       ret = -EINTR;
+               else if (bio_data_dir(bio) == READ)
                        ret = bio_copy_to_iter(bio, bmd->iter);
                if (bmd->is_our_pages)
                        bio_free_pages(bio);
index 5a37188..66e6f1a 100644 (file)
@@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 {
        struct gendisk *disk;
        struct blkcg_gq *blkg;
+       struct module *owner;
        unsigned int major, minor;
        int key_len, part, ret;
        char *body;
@@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (!disk)
                return -ENODEV;
        if (part) {
+               owner = disk->fops->owner;
                put_disk(disk);
+               module_put(owner);
                return -ENODEV;
        }
 
@@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                spin_unlock_irq(disk->queue->queue_lock);
+               owner = disk->fops->owner;
                put_disk(disk);
+               module_put(owner);
                /*
                 * If queue was bypassing, we should retry.  Do so after a
                 * short msleep().  It isn't strictly necessary but queue
@@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
+       struct module *owner;
+
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
+       owner = ctx->disk->fops->owner;
        put_disk(ctx->disk);
+       module_put(owner);
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
index ab51685..b83d297 100644 (file)
@@ -2455,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q)
 
                        rq = NULL;
                        break;
-               } else if (ret == BLKPREP_KILL) {
+               } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
+                       int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO;
+
                        rq->cmd_flags |= REQ_QUIET;
                        /*
                         * Mark this request as started so we don't trigger
                         * any debug logic in the end I/O path.
                         */
                        blk_start_request(rq);
-                       __blk_end_request_all(rq, -EIO);
+                       __blk_end_request_all(rq, err);
                } else {
                        printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
                        break;
index f565e11..a54f054 100644 (file)
@@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio)
        return ret;
 }
 
+static int __blk_rq_map_user_iov(struct request *rq,
+               struct rq_map_data *map_data, struct iov_iter *iter,
+               gfp_t gfp_mask, bool copy)
+{
+       struct request_queue *q = rq->q;
+       struct bio *bio, *orig_bio;
+       int ret;
+
+       if (copy)
+               bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+       else
+               bio = bio_map_user_iov(q, iter, gfp_mask);
+
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       if (map_data && map_data->null_mapped)
+               bio_set_flag(bio, BIO_NULL_MAPPED);
+
+       iov_iter_advance(iter, bio->bi_iter.bi_size);
+       if (map_data)
+               map_data->offset += bio->bi_iter.bi_size;
+
+       orig_bio = bio;
+       blk_queue_bounce(q, &bio);
+
+       /*
+        * We link the bounce buffer in and could have to traverse it
+        * later so we have to get a ref to prevent it from being freed
+        */
+       bio_get(bio);
+
+       ret = blk_rq_append_bio(q, rq, bio);
+       if (ret) {
+               bio_endio(bio);
+               __blk_rq_unmap_user(orig_bio);
+               bio_put(bio);
+               return ret;
+       }
+
+       return 0;
+}
+
 /**
  * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:         request queue where request should be inserted
@@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
                        struct rq_map_data *map_data,
                        const struct iov_iter *iter, gfp_t gfp_mask)
 {
-       struct bio *bio;
-       int unaligned = 0;
-       struct iov_iter i;
        struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+       bool copy = (q->dma_pad_mask & iter->count) || map_data;
+       struct bio *bio = NULL;
+       struct iov_iter i;
+       int ret;
 
        if (!iter || !iter->count)
                return -EINVAL;
@@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
                 */
                if ((uaddr & queue_dma_alignment(q)) ||
                    iovec_gap_to_prv(q, &prv, &iov))
-                       unaligned = 1;
+                       copy = true;
 
                prv.iov_base = iov.iov_base;
                prv.iov_len = iov.iov_len;
        }
 
-       if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
-               bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
-       else
-               bio = bio_map_user_iov(q, iter, gfp_mask);
-
-       if (IS_ERR(bio))
-               return PTR_ERR(bio);
-
-       if (map_data && map_data->null_mapped)
-               bio_set_flag(bio, BIO_NULL_MAPPED);
-
-       if (bio->bi_iter.bi_size != iter->count) {
-               /*
-                * Grab an extra reference to this bio, as bio_unmap_user()
-                * expects to be able to drop it twice as it happens on the
-                * normal IO completion path
-                */
-               bio_get(bio);
-               bio_endio(bio);
-               __blk_rq_unmap_user(bio);
-               return -EINVAL;
-       }
+       i = *iter;
+       do {
+               ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+               if (ret)
+                       goto unmap_rq;
+               if (!bio)
+                       bio = rq->bio;
+       } while (iov_iter_count(&i));
 
        if (!bio_flagged(bio, BIO_USER_MAPPED))
                rq->cmd_flags |= REQ_COPY_USER;
-
-       blk_queue_bounce(q, &bio);
-       bio_get(bio);
-       blk_rq_bio_prep(q, rq, bio);
        return 0;
+
+unmap_rq:
+       __blk_rq_unmap_user(bio);
+       rq->bio = NULL;
+       return -EINVAL;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
 
index 1699df5..2613531 100644 (file)
@@ -70,6 +70,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
        return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
 }
 
+static inline unsigned get_max_io_size(struct request_queue *q,
+                                      struct bio *bio)
+{
+       unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
+       unsigned mask = queue_logical_block_size(q) - 1;
+
+       /* aligned to logical block size */
+       sectors &= ~(mask >> 9);
+
+       return sectors;
+}
+
 static struct bio *blk_bio_segment_split(struct request_queue *q,
                                         struct bio *bio,
                                         struct bio_set *bs,
@@ -81,6 +93,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
        unsigned front_seg_size = bio->bi_seg_front_size;
        bool do_split = true;
        struct bio *new = NULL;
+       const unsigned max_sectors = get_max_io_size(q, bio);
 
        bio_for_each_segment(bv, bio, iter) {
                /*
@@ -90,20 +103,19 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
                if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
                        goto split;
 
-               if (sectors + (bv.bv_len >> 9) >
-                               blk_max_size_offset(q, bio->bi_iter.bi_sector)) {
+               if (sectors + (bv.bv_len >> 9) > max_sectors) {
                        /*
                         * Consider this a new segment if we're splitting in
                         * the middle of this vector.
                         */
                        if (nsegs < queue_max_segments(q) &&
-                           sectors < blk_max_size_offset(q,
-                                               bio->bi_iter.bi_sector)) {
+                           sectors < max_sectors) {
                                nsegs++;
-                               sectors = blk_max_size_offset(q,
-                                               bio->bi_iter.bi_sector);
+                               sectors = max_sectors;
                        }
-                       goto split;
+                       if (sectors)
+                               goto split;
+                       /* Make this single bvec as the 1st segment */
                }
 
                if (bvprvp && blk_queue_cluster(q)) {
@@ -292,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
                                   struct bio *nxt)
 {
        struct bio_vec end_bv = { NULL }, nxt_bv;
-       struct bvec_iter iter;
 
        if (!blk_queue_cluster(q))
                return 0;
@@ -304,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
        if (!bio_has_data(bio))
                return 1;
 
-       bio_for_each_segment(end_bv, bio, iter)
-               if (end_bv.bv_len == iter.bi_size)
-                       break;
-
-       nxt_bv = bio_iovec(nxt);
+       bio_get_last_bvec(bio, &end_bv);
+       bio_get_first_bvec(nxt, &nxt_bv);
 
        if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
                return 0;
index 4c0622f..56c0a72 100644 (file)
@@ -599,8 +599,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                 * If a request wasn't started before the queue was
                 * marked dying, kill it here or it'll go unnoticed.
                 */
-               if (unlikely(blk_queue_dying(rq->q)))
-                       blk_mq_complete_request(rq, -EIO);
+               if (unlikely(blk_queue_dying(rq->q))) {
+                       rq->errors = -EIO;
+                       blk_mq_end_request(rq, rq->errors);
+               }
                return;
        }
 
index dd49735..c7bb666 100644 (file)
@@ -91,8 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
        lim->virt_boundary_mask = 0;
        lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
-       lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors =
-               BLK_SAFE_MAX_SECTORS;
+       lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+       lim->max_dev_sectors = 0;
        lim->chunk_sectors = 0;
        lim->max_write_same_sectors = 0;
        lim->max_discard_sectors = 0;
index e140cc4..dd93763 100644 (file)
@@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 
 static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
 {
-       unsigned long long val;
 
-       val = q->limits.max_hw_discard_sectors << 9;
-       return sprintf(page, "%llu\n", val);
+       return sprintf(page, "%llu\n",
+               (unsigned long long)q->limits.max_hw_discard_sectors << 9);
 }
 
 static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
index a753df2..d0dd788 100644 (file)
@@ -39,7 +39,6 @@ struct deadline_data {
         */
        struct request *next_rq[2];
        unsigned int batching;          /* number of sequential requests made */
-       sector_t last_sector;           /* head position */
        unsigned int starved;           /* times reads have starved writes */
 
        /*
@@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
        dd->next_rq[WRITE] = NULL;
        dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-       dd->last_sector = rq_end_sector(rq);
-
        /*
         * take it off the sort and fifo list, move
         * to dispatch queue
index 77f5d17..d8996bb 100644 (file)
@@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev)
 
        return true;
 }
-
-static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
-{
-       unsigned long arg;
-       int rc = 0;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EACCES;
-
-       if (get_user(arg, (int __user *)(argp)))
-               return -EFAULT;
-       arg = !!arg;
-       if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
-               return 0;
-
-       if (arg)
-               arg = S_DAX;
-
-       if (arg && !blkdev_dax_capable(bdev))
-               return -ENOTTY;
-
-       inode_lock(bdev->bd_inode);
-       if (bdev->bd_map_count == 0)
-               inode_set_flags(bdev->bd_inode, arg, S_DAX);
-       else
-               rc = -EBUSY;
-       inode_unlock(bdev->bd_inode);
-       return rc;
-}
-#else
-static int blkdev_daxset(struct block_device *bdev, int arg)
-{
-       if (arg)
-               return -ENOTTY;
-       return 0;
-}
 #endif
 
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
@@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKTRACESETUP:
        case BLKTRACETEARDOWN:
                return blk_trace_ioctl(bdev, cmd, argp);
-       case BLKDAXSET:
-               return blkdev_daxset(bdev, arg);
        case BLKDAXGET:
                return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
                break;
index 746935a..fefd01b 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
+#include <linux/dax.h>
 #include <linux/blktrace_api.h>
 
 #include "partitions/check.h"
@@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
        return 0;
 }
 
-unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+       return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+                       NULL);
+}
+
+unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+{
        struct page *page;
 
-       page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-                                NULL);
+       /* don't populate page cache for dax capable devices */
+       if (IS_DAX(bdev->bd_inode))
+               page = read_dax_sector(bdev, n);
+       else
+               page = read_pagecache_sector(bdev, n);
+
        if (!IS_ERR(page)) {
                if (PageError(page))
                        goto fail;
index 7240821..3be07ad 100644 (file)
@@ -472,11 +472,13 @@ config CRYPTO_CRCT10DIF_PCLMUL
 config CRYPTO_GHASH
        tristate "GHASH digest algorithm"
        select CRYPTO_GF128MUL
+       select CRYPTO_HASH
        help
          GHASH is message digest algorithm for GCM (Galois/Counter Mode).
 
 config CRYPTO_POLY1305
        tristate "Poly1305 authenticator algorithm"
+       select CRYPTO_HASH
        help
          Poly1305 authenticator algorithm, RFC7539.
 
index 608a756..68a5cea 100644 (file)
@@ -54,7 +54,8 @@ static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
 
        lock_sock(sk);
        if (!ctx->more) {
-               err = crypto_ahash_init(&ctx->req);
+               err = af_alg_wait_for_completion(crypto_ahash_init(&ctx->req),
+                                               &ctx->completion);
                if (err)
                        goto unlock;
        }
@@ -125,6 +126,7 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page,
        } else {
                if (!ctx->more) {
                        err = crypto_ahash_init(&ctx->req);
+                       err = af_alg_wait_for_completion(err, &ctx->completion);
                        if (err)
                                goto unlock;
                }
index 38c1aa8..28556fc 100644 (file)
@@ -65,18 +65,10 @@ struct skcipher_async_req {
        struct skcipher_async_rsgl first_sgl;
        struct list_head list;
        struct scatterlist *tsg;
-       char iv[];
+       atomic_t *inflight;
+       struct skcipher_request req;
 };
 
-#define GET_SREQ(areq, ctx) (struct skcipher_async_req *)((char *)areq + \
-       crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req)))
-
-#define GET_REQ_SIZE(ctx) \
-       crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req))
-
-#define GET_IV_SIZE(ctx) \
-       crypto_skcipher_ivsize(crypto_skcipher_reqtfm(&ctx->req))
-
 #define MAX_SGL_ENTS ((4096 - sizeof(struct skcipher_sg_list)) / \
                      sizeof(struct scatterlist) - 1)
 
@@ -102,15 +94,12 @@ static void skcipher_free_async_sgls(struct skcipher_async_req *sreq)
 
 static void skcipher_async_cb(struct crypto_async_request *req, int err)
 {
-       struct sock *sk = req->data;
-       struct alg_sock *ask = alg_sk(sk);
-       struct skcipher_ctx *ctx = ask->private;
-       struct skcipher_async_req *sreq = GET_SREQ(req, ctx);
+       struct skcipher_async_req *sreq = req->data;
        struct kiocb *iocb = sreq->iocb;
 
-       atomic_dec(&ctx->inflight);
+       atomic_dec(sreq->inflight);
        skcipher_free_async_sgls(sreq);
-       kfree(req);
+       kzfree(sreq);
        iocb->ki_complete(iocb, err, err);
 }
 
@@ -306,8 +295,11 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg,
 {
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
+       struct sock *psk = ask->parent;
+       struct alg_sock *pask = alg_sk(psk);
        struct skcipher_ctx *ctx = ask->private;
-       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(&ctx->req);
+       struct skcipher_tfm *skc = pask->private;
+       struct crypto_skcipher *tfm = skc->skcipher;
        unsigned ivsize = crypto_skcipher_ivsize(tfm);
        struct skcipher_sg_list *sgl;
        struct af_alg_control con = {};
@@ -509,37 +501,43 @@ static int skcipher_recvmsg_async(struct socket *sock, struct msghdr *msg,
 {
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
+       struct sock *psk = ask->parent;
+       struct alg_sock *pask = alg_sk(psk);
        struct skcipher_ctx *ctx = ask->private;
+       struct skcipher_tfm *skc = pask->private;
+       struct crypto_skcipher *tfm = skc->skcipher;
        struct skcipher_sg_list *sgl;
        struct scatterlist *sg;
        struct skcipher_async_req *sreq;
        struct skcipher_request *req;
        struct skcipher_async_rsgl *last_rsgl = NULL;
-       unsigned int txbufs = 0, len = 0, tx_nents = skcipher_all_sg_nents(ctx);
-       unsigned int reqlen = sizeof(struct skcipher_async_req) +
-                               GET_REQ_SIZE(ctx) + GET_IV_SIZE(ctx);
+       unsigned int txbufs = 0, len = 0, tx_nents;
+       unsigned int reqsize = crypto_skcipher_reqsize(tfm);
+       unsigned int ivsize = crypto_skcipher_ivsize(tfm);
        int err = -ENOMEM;
        bool mark = false;
+       char *iv;
 
-       lock_sock(sk);
-       req = kmalloc(reqlen, GFP_KERNEL);
-       if (unlikely(!req))
-               goto unlock;
+       sreq = kzalloc(sizeof(*sreq) + reqsize + ivsize, GFP_KERNEL);
+       if (unlikely(!sreq))
+               goto out;
 
-       sreq = GET_SREQ(req, ctx);
+       req = &sreq->req;
+       iv = (char *)(req + 1) + reqsize;
        sreq->iocb = msg->msg_iocb;
-       memset(&sreq->first_sgl, '\0', sizeof(struct skcipher_async_rsgl));
        INIT_LIST_HEAD(&sreq->list);
+       sreq->inflight = &ctx->inflight;
+
+       lock_sock(sk);
+       tx_nents = skcipher_all_sg_nents(ctx);
        sreq->tsg = kcalloc(tx_nents, sizeof(*sg), GFP_KERNEL);
-       if (unlikely(!sreq->tsg)) {
-               kfree(req);
+       if (unlikely(!sreq->tsg))
                goto unlock;
-       }
        sg_init_table(sreq->tsg, tx_nents);
-       memcpy(sreq->iv, ctx->iv, GET_IV_SIZE(ctx));
-       skcipher_request_set_tfm(req, crypto_skcipher_reqtfm(&ctx->req));
-       skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-                                     skcipher_async_cb, sk);
+       memcpy(iv, ctx->iv, ivsize);
+       skcipher_request_set_tfm(req, tfm);
+       skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
+                                     skcipher_async_cb, sreq);
 
        while (iov_iter_count(&msg->msg_iter)) {
                struct skcipher_async_rsgl *rsgl;
@@ -615,20 +613,22 @@ static int skcipher_recvmsg_async(struct socket *sock, struct msghdr *msg,
                sg_mark_end(sreq->tsg + txbufs - 1);
 
        skcipher_request_set_crypt(req, sreq->tsg, sreq->first_sgl.sgl.sg,
-                                  len, sreq->iv);
+                                  len, iv);
        err = ctx->enc ? crypto_skcipher_encrypt(req) :
                         crypto_skcipher_decrypt(req);
        if (err == -EINPROGRESS) {
                atomic_inc(&ctx->inflight);
                err = -EIOCBQUEUED;
+               sreq = NULL;
                goto unlock;
        }
 free:
        skcipher_free_async_sgls(sreq);
-       kfree(req);
 unlock:
        skcipher_wmem_wakeup(sk);
        release_sock(sk);
+       kzfree(sreq);
+out:
        return err;
 }
 
@@ -637,9 +637,12 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg,
 {
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
+       struct sock *psk = ask->parent;
+       struct alg_sock *pask = alg_sk(psk);
        struct skcipher_ctx *ctx = ask->private;
-       unsigned bs = crypto_skcipher_blocksize(crypto_skcipher_reqtfm(
-               &ctx->req));
+       struct skcipher_tfm *skc = pask->private;
+       struct crypto_skcipher *tfm = skc->skcipher;
+       unsigned bs = crypto_skcipher_blocksize(tfm);
        struct skcipher_sg_list *sgl;
        struct scatterlist *sg;
        int err = -EAGAIN;
@@ -947,7 +950,8 @@ static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
        ask->private = ctx;
 
        skcipher_request_set_tfm(&ctx->req, skcipher);
-       skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+       skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_SLEEP |
+                                                CRYPTO_TFM_REQ_MAY_BACKLOG,
                                      af_alg_complete, &ctx->completion);
 
        sk->sk_destruct = skcipher_sock_destruct;
index 758acab..8f3056c 100644 (file)
@@ -547,9 +547,7 @@ int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen,
        struct pkcs7_signed_info *sinfo = ctx->sinfo;
 
        if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) ||
-           !test_bit(sinfo_has_message_digest, &sinfo->aa_set) ||
-           (ctx->msg->data_type == OID_msIndirectData &&
-            !test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))) {
+           !test_bit(sinfo_has_message_digest, &sinfo->aa_set)) {
                pr_warn("Missing required AuthAttr\n");
                return -EBADMSG;
        }
index 237f379..43fe85f 100644 (file)
@@ -499,6 +499,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                if (link->dump == NULL)
                        return -EINVAL;
 
+               down_read(&crypto_alg_sem);
                list_for_each_entry(alg, &crypto_alg_list, cra_list)
                        dump_alloc += CRYPTO_REPORT_MAXSIZE;
 
@@ -508,8 +509,11 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                                .done = link->done,
                                .min_dump_alloc = dump_alloc,
                        };
-                       return netlink_dump_start(crypto_nlsk, skb, nlh, &c);
+                       err = netlink_dump_start(crypto_nlsk, skb, nlh, &c);
                }
+               up_read(&crypto_alg_sem);
+
+               return err;
        }
 
        err = nlmsg_parse(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX,
index 88a27de..3597545 100644 (file)
@@ -354,11 +354,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
        crt->final = shash_async_final;
        crt->finup = shash_async_finup;
        crt->digest = shash_async_digest;
+       crt->setkey = shash_async_setkey;
+
+       crt->has_setkey = alg->setkey != shash_no_setkey;
 
-       if (alg->setkey) {
-               crt->setkey = shash_async_setkey;
-               crt->has_setkey = true;
-       }
        if (alg->export)
                crt->export = shash_async_export;
        if (alg->import)
index c570b1d..0872d5f 100644 (file)
@@ -880,7 +880,7 @@ static int acpi_lpss_platform_notify(struct notifier_block *nb,
                break;
        case BUS_NOTIFY_DRIVER_NOT_BOUND:
        case BUS_NOTIFY_UNBOUND_DRIVER:
-               pdev->dev.pm_domain = NULL;
+               dev_pm_domain_set(&pdev->dev, NULL);
                break;
        case BUS_NOTIFY_ADD_DEVICE:
                dev_pm_domain_set(&pdev->dev, &acpi_lpss_pm_domain);
index 296b7a1..b6f7fa3 100644 (file)
@@ -62,7 +62,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
        if (count < 0) {
                return NULL;
        } else if (count > 0) {
-               resources = kmalloc(count * sizeof(struct resource),
+               resources = kzalloc(count * sizeof(struct resource),
                                    GFP_KERNEL);
                if (!resources) {
                        dev_err(&adev->dev, "No memory for resources\n");
index 3052185..d48cbed 100644 (file)
@@ -269,8 +269,7 @@ acpi_ps_get_next_namepath(struct acpi_walk_state *walk_state,
         */
        if (ACPI_SUCCESS(status) &&
            possible_method_call && (node->type == ACPI_TYPE_METHOD)) {
-               if (GET_CURRENT_ARG_TYPE(walk_state->arg_types) ==
-                   ARGP_SUPERNAME) {
+               if (walk_state->opcode == AML_UNLOAD_OP) {
                        /*
                         * acpi_ps_get_next_namestring has increased the AML pointer,
                         * so we need to restore the saved AML pointer for method call.
@@ -697,7 +696,7 @@ static union acpi_parse_object *acpi_ps_get_next_field(struct acpi_parse_state
  *
  * PARAMETERS:  walk_state          - Current state
  *              parser_state        - Current parser state object
- *              arg_type            - The parser argument type (ARGP_*)
+ *              arg_type            - The argument type (AML_*_ARG)
  *              return_arg          - Where the next arg is returned
  *
  * RETURN:      Status, and an op object containing the next argument.
@@ -817,9 +816,9 @@ acpi_ps_get_next_arg(struct acpi_walk_state *walk_state,
                                return_ACPI_STATUS(AE_NO_MEMORY);
                        }
 
-                       /* super_name allows argument to be a method call */
+                       /* To support super_name arg of Unload */
 
-                       if (arg_type == ARGP_SUPERNAME) {
+                       if (walk_state->opcode == AML_UNLOAD_OP) {
                                status =
                                    acpi_ps_get_next_namepath(walk_state,
                                                              parser_state, arg,
index 0431883..559c117 100644 (file)
@@ -519,7 +519,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
                             u64 param3, u64 param4)
 {
        int rc;
-       unsigned long pfn;
+       u64 base_addr, size;
 
        /* If user manually set "flags", make sure it is legal */
        if (flags && (flags &
@@ -545,10 +545,17 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
        /*
         * Disallow crazy address masks that give BIOS leeway to pick
         * injection address almost anywhere. Insist on page or
-        * better granularity and that target address is normal RAM.
+        * better granularity and that target address is normal RAM or
+        * NVDIMM.
         */
-       pfn = PFN_DOWN(param1 & param2);
-       if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK))
+       base_addr = param1 & param2;
+       size = ~param2 + 1;
+
+       if (((param2 & PAGE_MASK) != PAGE_MASK) ||
+           ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+                               != REGION_INTERSECTS) &&
+            (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY)
+                               != REGION_INTERSECTS)))
                return -EINVAL;
 
 inject:
index ad6d8c6..35947ac 100644 (file)
@@ -469,37 +469,16 @@ static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc,
        nfit_mem->bdw = NULL;
 }
 
-static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
+static void nfit_mem_init_bdw(struct acpi_nfit_desc *acpi_desc,
                struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
 {
        u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
        struct nfit_memdev *nfit_memdev;
        struct nfit_flush *nfit_flush;
-       struct nfit_dcr *nfit_dcr;
        struct nfit_bdw *nfit_bdw;
        struct nfit_idt *nfit_idt;
        u16 idt_idx, range_index;
 
-       list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
-               if (nfit_dcr->dcr->region_index != dcr)
-                       continue;
-               nfit_mem->dcr = nfit_dcr->dcr;
-               break;
-       }
-
-       if (!nfit_mem->dcr) {
-               dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n",
-                               spa->range_index, __to_nfit_memdev(nfit_mem)
-                               ? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR");
-               return -ENODEV;
-       }
-
-       /*
-        * We've found enough to create an nvdimm, optionally
-        * find an associated BDW
-        */
-       list_add(&nfit_mem->list, &acpi_desc->dimms);
-
        list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) {
                if (nfit_bdw->bdw->region_index != dcr)
                        continue;
@@ -508,12 +487,12 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
        }
 
        if (!nfit_mem->bdw)
-               return 0;
+               return;
 
        nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
 
        if (!nfit_mem->spa_bdw)
-               return 0;
+               return;
 
        range_index = nfit_mem->spa_bdw->range_index;
        list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
@@ -538,8 +517,6 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
                }
                break;
        }
-
-       return 0;
 }
 
 static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
@@ -548,7 +525,6 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
        struct nfit_mem *nfit_mem, *found;
        struct nfit_memdev *nfit_memdev;
        int type = nfit_spa_type(spa);
-       u16 dcr;
 
        switch (type) {
        case NFIT_SPA_DCR:
@@ -559,14 +535,18 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
        }
 
        list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
-               int rc;
+               struct nfit_dcr *nfit_dcr;
+               u32 device_handle;
+               u16 dcr;
 
                if (nfit_memdev->memdev->range_index != spa->range_index)
                        continue;
                found = NULL;
                dcr = nfit_memdev->memdev->region_index;
+               device_handle = nfit_memdev->memdev->device_handle;
                list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
-                       if (__to_nfit_memdev(nfit_mem)->region_index == dcr) {
+                       if (__to_nfit_memdev(nfit_mem)->device_handle
+                                       == device_handle) {
                                found = nfit_mem;
                                break;
                        }
@@ -579,6 +559,31 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
                        if (!nfit_mem)
                                return -ENOMEM;
                        INIT_LIST_HEAD(&nfit_mem->list);
+                       list_add(&nfit_mem->list, &acpi_desc->dimms);
+               }
+
+               list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
+                       if (nfit_dcr->dcr->region_index != dcr)
+                               continue;
+                       /*
+                        * Record the control region for the dimm.  For
+                        * the ACPI 6.1 case, where there are separate
+                        * control regions for the pmem vs blk
+                        * interfaces, be sure to record the extended
+                        * blk details.
+                        */
+                       if (!nfit_mem->dcr)
+                               nfit_mem->dcr = nfit_dcr->dcr;
+                       else if (nfit_mem->dcr->windows == 0
+                                       && nfit_dcr->dcr->windows)
+                               nfit_mem->dcr = nfit_dcr->dcr;
+                       break;
+               }
+
+               if (dcr && !nfit_mem->dcr) {
+                       dev_err(acpi_desc->dev, "SPA %d missing DCR %d\n",
+                                       spa->range_index, dcr);
+                       return -ENODEV;
                }
 
                if (type == NFIT_SPA_DCR) {
@@ -595,6 +600,7 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
                                nfit_mem->idt_dcr = nfit_idt->idt;
                                break;
                        }
+                       nfit_mem_init_bdw(acpi_desc, nfit_mem, spa);
                } else {
                        /*
                         * A single dimm may belong to multiple SPA-PM
@@ -603,13 +609,6 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
                         */
                        nfit_mem->memdev_pmem = nfit_memdev->memdev;
                }
-
-               if (found)
-                       continue;
-
-               rc = nfit_mem_add(acpi_desc, nfit_mem, spa);
-               if (rc)
-                       return rc;
        }
 
        return 0;
@@ -1504,9 +1503,7 @@ static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
                case 1:
                        /* ARS unsupported, but we should never get here */
                        return 0;
-               case 2:
-                       return -EINVAL;
-               case 3:
+               case 6:
                        /* ARS is in progress */
                        msleep(1000);
                        break;
@@ -1517,13 +1514,13 @@ static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
 }
 
 static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
-               struct nd_cmd_ars_status *cmd)
+               struct nd_cmd_ars_status *cmd, u32 size)
 {
        int rc;
 
        while (1) {
                rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
-                       sizeof(*cmd));
+                       size);
                if (rc || cmd->status & 0xffff)
                        return -ENXIO;
 
@@ -1538,6 +1535,8 @@ static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
                case 2:
                        /* No ARS performed for the current boot */
                        return 0;
+               case 3:
+                       /* TODO: error list overflow support */
                default:
                        return -ENXIO;
                }
@@ -1581,6 +1580,7 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
        struct nd_cmd_ars_start *ars_start = NULL;
        struct nd_cmd_ars_cap *ars_cap = NULL;
        u64 start, len, cur, remaining;
+       u32 ars_status_size;
        int rc;
 
        ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
@@ -1590,14 +1590,21 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
        start = ndr_desc->res->start;
        len = ndr_desc->res->end - ndr_desc->res->start + 1;
 
+       /*
+        * If ARS is unimplemented, unsupported, or if the 'Persistent Memory
+        * Scrub' flag in extended status is not set, skip this but continue
+        * initialization
+        */
        rc = ars_get_cap(nd_desc, ars_cap, start, len);
+       if (rc == -ENOTTY) {
+               dev_dbg(acpi_desc->dev,
+                       "Address Range Scrub is not implemented, won't create an error list\n");
+               rc = 0;
+               goto out;
+       }
        if (rc)
                goto out;
 
-       /*
-        * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
-        * extended status is not set, skip this but continue initialization
-        */
        if ((ars_cap->status & 0xffff) ||
                !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
                dev_warn(acpi_desc->dev,
@@ -1610,14 +1617,14 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
         * Check if a full-range ARS has been run. If so, use those results
         * without having to start a new ARS.
         */
-       ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
-                       GFP_KERNEL);
+       ars_status_size = ars_cap->max_ars_out;
+       ars_status = kzalloc(ars_status_size, GFP_KERNEL);
        if (!ars_status) {
                rc = -ENOMEM;
                goto out;
        }
 
-       rc = ars_get_status(nd_desc, ars_status);
+       rc = ars_get_status(nd_desc, ars_status, ars_status_size);
        if (rc)
                goto out;
 
@@ -1647,7 +1654,7 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
                if (rc)
                        goto out;
 
-               rc = ars_get_status(nd_desc, ars_status);
+               rc = ars_get_status(nd_desc, ars_status, ars_status_size);
                if (rc)
                        goto out;
 
index d30184c..c8e169e 100644 (file)
@@ -406,7 +406,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                return 0;
        }
 
-       if (pci_has_managed_irq(dev))
+       if (dev->irq_managed && dev->irq > 0)
                return 0;
 
        entry = acpi_pci_irq_lookup(dev, pin);
@@ -451,7 +451,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                kfree(entry);
                return rc;
        }
-       pci_set_managed_irq(dev, rc);
+       dev->irq = rc;
+       dev->irq_managed = 1;
 
        if (link)
                snprintf(link_desc, sizeof(link_desc), " -> Link[%s]", link);
@@ -474,9 +475,17 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
        u8 pin;
 
        pin = dev->pin;
-       if (!pin || !pci_has_managed_irq(dev))
+       if (!pin || !dev->irq_managed || dev->irq <= 0)
                return;
 
+       /* Keep IOAPIC pin configuration when suspending */
+       if (dev->dev.power.is_prepared)
+               return;
+#ifdef CONFIG_PM
+       if (dev->dev.power.runtime_status == RPM_SUSPENDING)
+               return;
+#endif
+
        entry = acpi_pci_irq_lookup(dev, pin);
        if (!entry)
                return;
@@ -496,6 +505,6 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
        dev_dbg(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
        if (gsi >= 0) {
                acpi_unregister_gsi(gsi);
-               pci_reset_managed_irq(dev);
+               dev->irq_managed = 0;
        }
 }
index fa28635..ededa90 100644 (file)
@@ -4,7 +4,6 @@
  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
  *  Copyright (C) 2002       Dominik Brodowski <devel@brodo.de>
- *  Copyright (c) 2015, The Linux Foundation. All rights reserved.
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
@@ -438,6 +437,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq)
  * enabled system.
  */
 
+#define ACPI_MAX_IRQS          256
 #define ACPI_MAX_ISA_IRQ       16
 
 #define PIRQ_PENALTY_PCI_AVAILABLE     (0)
@@ -447,7 +447,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq)
 #define PIRQ_PENALTY_ISA_USED          (16*16*16*16*16)
 #define PIRQ_PENALTY_ISA_ALWAYS                (16*16*16*16*16*16)
 
-static int acpi_irq_isa_penalty[ACPI_MAX_ISA_IRQ] = {
+static int acpi_irq_penalty[ACPI_MAX_IRQS] = {
        PIRQ_PENALTY_ISA_ALWAYS,        /* IRQ0 timer */
        PIRQ_PENALTY_ISA_ALWAYS,        /* IRQ1 keyboard */
        PIRQ_PENALTY_ISA_ALWAYS,        /* IRQ2 cascade */
@@ -464,68 +464,9 @@ static int acpi_irq_isa_penalty[ACPI_MAX_ISA_IRQ] = {
        PIRQ_PENALTY_ISA_USED,          /* IRQ13 fpe, sometimes */
        PIRQ_PENALTY_ISA_USED,          /* IRQ14 ide0 */
        PIRQ_PENALTY_ISA_USED,          /* IRQ15 ide1 */
+       /* >IRQ15 */
 };
 
-struct irq_penalty_info {
-       int irq;
-       int penalty;
-       struct list_head node;
-};
-
-static LIST_HEAD(acpi_irq_penalty_list);
-
-static int acpi_irq_get_penalty(int irq)
-{
-       struct irq_penalty_info *irq_info;
-
-       if (irq < ACPI_MAX_ISA_IRQ)
-               return acpi_irq_isa_penalty[irq];
-
-       list_for_each_entry(irq_info, &acpi_irq_penalty_list, node) {
-               if (irq_info->irq == irq)
-                       return irq_info->penalty;
-       }
-
-       return 0;
-}
-
-static int acpi_irq_set_penalty(int irq, int new_penalty)
-{
-       struct irq_penalty_info *irq_info;
-
-       /* see if this is a ISA IRQ */
-       if (irq < ACPI_MAX_ISA_IRQ) {
-               acpi_irq_isa_penalty[irq] = new_penalty;
-               return 0;
-       }
-
-       /* next, try to locate from the dynamic list */
-       list_for_each_entry(irq_info, &acpi_irq_penalty_list, node) {
-               if (irq_info->irq == irq) {
-                       irq_info->penalty  = new_penalty;
-                       return 0;
-               }
-       }
-
-       /* nope, let's allocate a slot for this IRQ */
-       irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
-       if (!irq_info)
-               return -ENOMEM;
-
-       irq_info->irq = irq;
-       irq_info->penalty = new_penalty;
-       list_add_tail(&irq_info->node, &acpi_irq_penalty_list);
-
-       return 0;
-}
-
-static void acpi_irq_add_penalty(int irq, int penalty)
-{
-       int curpen = acpi_irq_get_penalty(irq);
-
-       acpi_irq_set_penalty(irq, curpen + penalty);
-}
-
 int __init acpi_irq_penalty_init(void)
 {
        struct acpi_pci_link *link;
@@ -546,16 +487,15 @@ int __init acpi_irq_penalty_init(void)
                            link->irq.possible_count;
 
                        for (i = 0; i < link->irq.possible_count; i++) {
-                               if (link->irq.possible[i] < ACPI_MAX_ISA_IRQ) {
-                                       int irqpos = link->irq.possible[i];
-
-                                       acpi_irq_add_penalty(irqpos, penalty);
-                               }
+                               if (link->irq.possible[i] < ACPI_MAX_ISA_IRQ)
+                                       acpi_irq_penalty[link->irq.
+                                                        possible[i]] +=
+                                           penalty;
                        }
 
                } else if (link->irq.active) {
-                       acpi_irq_add_penalty(link->irq.active,
-                                            PIRQ_PENALTY_PCI_POSSIBLE);
+                       acpi_irq_penalty[link->irq.active] +=
+                           PIRQ_PENALTY_PCI_POSSIBLE;
                }
        }
 
@@ -607,12 +547,12 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link)
                 * the use of IRQs 9, 10, 11, and >15.
                 */
                for (i = (link->irq.possible_count - 1); i >= 0; i--) {
-                       if (acpi_irq_get_penalty(irq) >
-                           acpi_irq_get_penalty(link->irq.possible[i]))
+                       if (acpi_irq_penalty[irq] >
+                           acpi_irq_penalty[link->irq.possible[i]])
                                irq = link->irq.possible[i];
                }
        }
-       if (acpi_irq_get_penalty(irq) >= PIRQ_PENALTY_ISA_ALWAYS) {
+       if (acpi_irq_penalty[irq] >= PIRQ_PENALTY_ISA_ALWAYS) {
                printk(KERN_ERR PREFIX "No IRQ available for %s [%s]. "
                            "Try pci=noacpi or acpi=off\n",
                            acpi_device_name(link->device),
@@ -628,8 +568,7 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link)
                            acpi_device_bid(link->device));
                return -ENODEV;
        } else {
-               acpi_irq_add_penalty(link->irq.active, PIRQ_PENALTY_PCI_USING);
-
+               acpi_irq_penalty[link->irq.active] += PIRQ_PENALTY_PCI_USING;
                printk(KERN_WARNING PREFIX "%s [%s] enabled at IRQ %d\n",
                       acpi_device_name(link->device),
                       acpi_device_bid(link->device), link->irq.active);
@@ -839,7 +778,7 @@ static void acpi_pci_link_remove(struct acpi_device *device)
 }
 
 /*
- * modify penalty from cmdline
+ * modify acpi_irq_penalty[] from cmdline
  */
 static int __init acpi_irq_penalty_update(char *str, int used)
 {
@@ -857,10 +796,13 @@ static int __init acpi_irq_penalty_update(char *str, int used)
                if (irq < 0)
                        continue;
 
+               if (irq >= ARRAY_SIZE(acpi_irq_penalty))
+                       continue;
+
                if (used)
-                       acpi_irq_add_penalty(irq, PIRQ_PENALTY_ISA_USED);
+                       acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_USED;
                else
-                       acpi_irq_set_penalty(irq, PIRQ_PENALTY_PCI_AVAILABLE);
+                       acpi_irq_penalty[irq] = PIRQ_PENALTY_PCI_AVAILABLE;
 
                if (retval != 2)        /* no next number */
                        break;
@@ -877,15 +819,18 @@ static int __init acpi_irq_penalty_update(char *str, int used)
  */
 void acpi_penalize_isa_irq(int irq, int active)
 {
-       if (irq >= 0)
-               acpi_irq_add_penalty(irq, active ?
-                       PIRQ_PENALTY_ISA_USED : PIRQ_PENALTY_PCI_USING);
+       if (irq >= 0 && irq < ARRAY_SIZE(acpi_irq_penalty)) {
+               if (active)
+                       acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_USED;
+               else
+                       acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
+       }
 }
 
 bool acpi_isa_irq_available(int irq)
 {
-       return irq >= 0 &&
-               (acpi_irq_get_penalty(irq) < PIRQ_PENALTY_ISA_ALWAYS);
+       return irq >= 0 && (irq >= ARRAY_SIZE(acpi_irq_penalty) ||
+                           acpi_irq_penalty[irq] < PIRQ_PENALTY_ISA_ALWAYS);
 }
 
 /*
@@ -895,18 +840,13 @@ bool acpi_isa_irq_available(int irq)
  */
 void acpi_penalize_sci_irq(int irq, int trigger, int polarity)
 {
-       int penalty;
-
-       if (irq < 0)
-               return;
-
-       if (trigger != ACPI_MADT_TRIGGER_LEVEL ||
-           polarity != ACPI_MADT_POLARITY_ACTIVE_LOW)
-               penalty = PIRQ_PENALTY_ISA_ALWAYS;
-       else
-               penalty = PIRQ_PENALTY_PCI_USING;
-
-       acpi_irq_add_penalty(irq, penalty);
+       if (irq >= 0 && irq < ARRAY_SIZE(acpi_irq_penalty)) {
+               if (trigger != ACPI_MADT_TRIGGER_LEVEL ||
+                   polarity != ACPI_MADT_POLARITY_ACTIVE_LOW)
+                       acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_ALWAYS;
+               else
+                       acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
+       }
 }
 
 /*
index 90e2d54..1316ddd 100644 (file)
@@ -135,14 +135,6 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
                DMI_MATCH(DMI_PRODUCT_NAME, "UL30A"),
                },
        },
-       {
-       .callback = video_detect_force_vendor,
-       .ident = "Dell Inspiron 5737",
-       .matches = {
-               DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-               DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 5737"),
-               },
-       },
 
        /*
         * These models have a working acpi_video backlight control, and using
index a39e85f..7d00b7a 100644 (file)
@@ -2074,7 +2074,7 @@ static int binder_thread_write(struct binder_proc *proc,
                        if (get_user(cookie, (binder_uintptr_t __user *)ptr))
                                return -EFAULT;
 
-                       ptr += sizeof(void *);
+                       ptr += sizeof(cookie);
                        list_for_each_entry(w, &proc->delivered_death, entry) {
                                struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work);
 
index 594fcab..146dc0b 100644 (file)
@@ -264,6 +264,26 @@ static const struct pci_device_id ahci_pci_tbl[] = {
        { PCI_VDEVICE(INTEL, 0x3b2b), board_ahci }, /* PCH RAID */
        { PCI_VDEVICE(INTEL, 0x3b2c), board_ahci }, /* PCH RAID */
        { PCI_VDEVICE(INTEL, 0x3b2f), board_ahci }, /* PCH AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b0), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b1), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b2), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b3), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b4), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b5), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b6), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19b7), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19bE), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19bF), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c0), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c1), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c2), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c3), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c4), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c5), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c6), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19c7), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19cE), board_ahci }, /* DNV AHCI */
+       { PCI_VDEVICE(INTEL, 0x19cF), board_ahci }, /* DNV AHCI */
        { PCI_VDEVICE(INTEL, 0x1c02), board_ahci }, /* CPT AHCI */
        { PCI_VDEVICE(INTEL, 0x1c03), board_ahci }, /* CPT AHCI */
        { PCI_VDEVICE(INTEL, 0x1c04), board_ahci }, /* CPT RAID */
@@ -347,15 +367,21 @@ static const struct pci_device_id ahci_pci_tbl[] = {
        { PCI_VDEVICE(INTEL, 0xa107), board_ahci }, /* Sunrise Point-H RAID */
        { PCI_VDEVICE(INTEL, 0xa10f), board_ahci }, /* Sunrise Point-H RAID */
        { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0x2823), board_ahci }, /* Lewisburg AHCI*/
        { PCI_VDEVICE(INTEL, 0x2826), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0x2827), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa182), board_ahci }, /* Lewisburg AHCI*/
        { PCI_VDEVICE(INTEL, 0xa184), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa186), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa18e), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0xa1d2), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0xa1d6), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa202), board_ahci }, /* Lewisburg AHCI*/
        { PCI_VDEVICE(INTEL, 0xa204), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa206), board_ahci }, /* Lewisburg RAID*/
        { PCI_VDEVICE(INTEL, 0xa20e), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0xa252), board_ahci }, /* Lewisburg RAID*/
+       { PCI_VDEVICE(INTEL, 0xa256), board_ahci }, /* Lewisburg RAID*/
 
        /* JMicron 360/1/3/5/6, match class to avoid IDE function */
        { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
@@ -1305,6 +1331,44 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host)
 {}
 #endif
 
+#ifdef CONFIG_ARM64
+/*
+ * Due to ERRATA#22536, ThunderX needs to handle HOST_IRQ_STAT differently.
+ * Workaround is to make sure all pending IRQs are served before leaving
+ * handler.
+ */
+static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
+{
+       struct ata_host *host = dev_instance;
+       struct ahci_host_priv *hpriv;
+       unsigned int rc = 0;
+       void __iomem *mmio;
+       u32 irq_stat, irq_masked;
+       unsigned int handled = 1;
+
+       VPRINTK("ENTER\n");
+       hpriv = host->private_data;
+       mmio = hpriv->mmio;
+       irq_stat = readl(mmio + HOST_IRQ_STAT);
+       if (!irq_stat)
+               return IRQ_NONE;
+
+       do {
+               irq_masked = irq_stat & hpriv->port_map;
+               spin_lock(&host->lock);
+               rc = ahci_handle_port_intr(host, irq_masked);
+               if (!rc)
+                       handled = 0;
+               writel(irq_stat, mmio + HOST_IRQ_STAT);
+               irq_stat = readl(mmio + HOST_IRQ_STAT);
+               spin_unlock(&host->lock);
+       } while (irq_stat);
+       VPRINTK("EXIT\n");
+
+       return IRQ_RETVAL(handled);
+}
+#endif
+
 /*
  * ahci_init_msix() - optionally enable per-port MSI-X otherwise defer
  * to single msi.
@@ -1540,6 +1604,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (ahci_broken_devslp(pdev))
                hpriv->flags |= AHCI_HFLAG_NO_DEVSLP;
 
+#ifdef CONFIG_ARM64
+       if (pdev->vendor == 0x177d && pdev->device == 0xa01c)
+               hpriv->irq_handler = ahci_thunderx_irq_handler;
+#endif
+
        /* save initial config */
        ahci_pci_save_initial_config(pdev, hpriv);
 
index a4faa43..167ba7e 100644 (file)
@@ -240,8 +240,7 @@ enum {
                                                        error-handling stage) */
        AHCI_HFLAG_NO_DEVSLP            = (1 << 17), /* no device sleep */
        AHCI_HFLAG_NO_FBS               = (1 << 18), /* no FBS */
-       AHCI_HFLAG_EDGE_IRQ             = (1 << 19), /* HOST_IRQ_STAT behaves as
-                                                       Edge Triggered */
+
 #ifdef CONFIG_PCI_MSI
        AHCI_HFLAG_MULTI_MSI            = (1 << 20), /* multiple PCI MSIs */
        AHCI_HFLAG_MULTI_MSIX           = (1 << 21), /* per-port MSI-X */
@@ -250,6 +249,7 @@ enum {
        AHCI_HFLAG_MULTI_MSI            = 0,
        AHCI_HFLAG_MULTI_MSIX           = 0,
 #endif
+       AHCI_HFLAG_WAKE_BEFORE_STOP     = (1 << 22), /* wake before DMA stop */
 
        /* ap->flags bits */
 
@@ -360,6 +360,7 @@ struct ahci_host_priv {
         * be overridden anytime before the host is activated.
         */
        void                    (*start_engine)(struct ata_port *ap);
+       irqreturn_t             (*irq_handler)(int irq, void *dev_instance);
 };
 
 #ifdef CONFIG_PCI_MSI
@@ -423,6 +424,7 @@ int ahci_reset_em(struct ata_host *host);
 void ahci_print_info(struct ata_host *host, const char *scc_s);
 int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht);
 void ahci_error_handler(struct ata_port *ap);
+u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked);
 
 static inline void __iomem *__ahci_port_base(struct ata_host *host,
                                             unsigned int port_no)
index b36cae2..e87bcec 100644 (file)
@@ -317,6 +317,7 @@ static int brcm_ahci_probe(struct platform_device *pdev)
        if (IS_ERR(hpriv))
                return PTR_ERR(hpriv);
        hpriv->plat_data = priv;
+       hpriv->flags = AHCI_HFLAG_WAKE_BEFORE_STOP;
 
        brcm_sata_alpm_init(hpriv);
 
index e2c6d9e..8e3f7fa 100644 (file)
@@ -548,6 +548,88 @@ softreset_retry:
        return rc;
 }
 
+/**
+ * xgene_ahci_handle_broken_edge_irq - Handle the broken irq.
+ * @ata_host: Host that recieved the irq
+ * @irq_masked: HOST_IRQ_STAT value
+ *
+ * For hardware with broken edge trigger latch
+ * the HOST_IRQ_STAT register misses the edge interrupt
+ * when clearing of HOST_IRQ_STAT register and hardware
+ * reporting the PORT_IRQ_STAT register at the
+ * same clock cycle.
+ * As such, the algorithm below outlines the workaround.
+ *
+ * 1. Read HOST_IRQ_STAT register and save the state.
+ * 2. Clear the HOST_IRQ_STAT register.
+ * 3. Read back the HOST_IRQ_STAT register.
+ * 4. If HOST_IRQ_STAT register equals to zero, then
+ *    traverse the rest of port's PORT_IRQ_STAT register
+ *    to check if an interrupt is triggered at that point else
+ *    go to step 6.
+ * 5. If PORT_IRQ_STAT register of rest ports is not equal to zero
+ *    then update the state of HOST_IRQ_STAT saved in step 1.
+ * 6. Handle port interrupts.
+ * 7. Exit
+ */
+static int xgene_ahci_handle_broken_edge_irq(struct ata_host *host,
+                                            u32 irq_masked)
+{
+       struct ahci_host_priv *hpriv = host->private_data;
+       void __iomem *port_mmio;
+       int i;
+
+       if (!readl(hpriv->mmio + HOST_IRQ_STAT)) {
+               for (i = 0; i < host->n_ports; i++) {
+                       if (irq_masked & (1 << i))
+                               continue;
+
+                       port_mmio = ahci_port_base(host->ports[i]);
+                       if (readl(port_mmio + PORT_IRQ_STAT))
+                               irq_masked |= (1 << i);
+               }
+       }
+
+       return ahci_handle_port_intr(host, irq_masked);
+}
+
+static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance)
+{
+       struct ata_host *host = dev_instance;
+       struct ahci_host_priv *hpriv;
+       unsigned int rc = 0;
+       void __iomem *mmio;
+       u32 irq_stat, irq_masked;
+
+       VPRINTK("ENTER\n");
+
+       hpriv = host->private_data;
+       mmio = hpriv->mmio;
+
+       /* sigh.  0xffffffff is a valid return from h/w */
+       irq_stat = readl(mmio + HOST_IRQ_STAT);
+       if (!irq_stat)
+               return IRQ_NONE;
+
+       irq_masked = irq_stat & hpriv->port_map;
+
+       spin_lock(&host->lock);
+
+       /*
+        * HOST_IRQ_STAT behaves as edge triggered latch meaning that
+        * it should be cleared before all the port events are cleared.
+        */
+       writel(irq_stat, mmio + HOST_IRQ_STAT);
+
+       rc = xgene_ahci_handle_broken_edge_irq(host, irq_masked);
+
+       spin_unlock(&host->lock);
+
+       VPRINTK("EXIT\n");
+
+       return IRQ_RETVAL(rc);
+}
+
 static struct ata_port_operations xgene_ahci_v1_ops = {
        .inherits = &ahci_ops,
        .host_stop = xgene_ahci_host_stop,
@@ -779,7 +861,8 @@ skip_clk_phy:
                hpriv->flags = AHCI_HFLAG_NO_NCQ;
                break;
        case XGENE_AHCI_V2:
-               hpriv->flags |= AHCI_HFLAG_YES_FBS | AHCI_HFLAG_EDGE_IRQ;
+               hpriv->flags |= AHCI_HFLAG_YES_FBS;
+               hpriv->irq_handler = xgene_ahci_irq_intr;
                break;
        default:
                break;
index d61740e..85ea514 100644 (file)
@@ -113,6 +113,7 @@ static ssize_t ahci_store_em_buffer(struct device *dev,
                                    const char *buf, size_t size);
 static ssize_t ahci_show_em_supported(struct device *dev,
                                      struct device_attribute *attr, char *buf);
+static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance);
 
 static DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL);
 static DEVICE_ATTR(ahci_host_cap2, S_IRUGO, ahci_show_host_cap2, NULL);
@@ -496,8 +497,8 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv)
                }
        }
 
-       /* fabricate port_map from cap.nr_ports */
-       if (!port_map) {
+       /* fabricate port_map from cap.nr_ports for < AHCI 1.3 */
+       if (!port_map && vers < 0x10300) {
                port_map = (1 << ahci_nr_ports(cap)) - 1;
                dev_warn(dev, "forcing PORTS_IMPL to 0x%x\n", port_map);
 
@@ -512,6 +513,9 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv)
 
        if (!hpriv->start_engine)
                hpriv->start_engine = ahci_start_engine;
+
+       if (!hpriv->irq_handler)
+               hpriv->irq_handler = ahci_single_level_irq_intr;
 }
 EXPORT_SYMBOL_GPL(ahci_save_initial_config);
 
@@ -593,8 +597,22 @@ EXPORT_SYMBOL_GPL(ahci_start_engine);
 int ahci_stop_engine(struct ata_port *ap)
 {
        void __iomem *port_mmio = ahci_port_base(ap);
+       struct ahci_host_priv *hpriv = ap->host->private_data;
        u32 tmp;
 
+       /*
+        * On some controllers, stopping a port's DMA engine while the port
+        * is in ALPM state (partial or slumber) results in failures on
+        * subsequent DMA engine starts.  For those controllers, put the
+        * port back in active state before stopping its DMA engine.
+        */
+       if ((hpriv->flags & AHCI_HFLAG_WAKE_BEFORE_STOP) &&
+           (ap->link.lpm_policy > ATA_LPM_MAX_POWER) &&
+           ahci_set_lpm(&ap->link, ATA_LPM_MAX_POWER, ATA_LPM_WAKE_ONLY)) {
+               dev_err(ap->host->dev, "Failed to wake up port before engine stop\n");
+               return -EIO;
+       }
+
        tmp = readl(port_mmio + PORT_CMD);
 
        /* check if the HBA is idle */
@@ -689,6 +707,9 @@ static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
        void __iomem *port_mmio = ahci_port_base(ap);
 
        if (policy != ATA_LPM_MAX_POWER) {
+               /* wakeup flag only applies to the max power policy */
+               hints &= ~ATA_LPM_WAKE_ONLY;
+
                /*
                 * Disable interrupts on Phy Ready. This keeps us from
                 * getting woken up due to spurious phy ready
@@ -704,7 +725,8 @@ static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
                u32 cmd = readl(port_mmio + PORT_CMD);
 
                if (policy == ATA_LPM_MAX_POWER || !(hints & ATA_LPM_HIPM)) {
-                       cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE);
+                       if (!(hints & ATA_LPM_WAKE_ONLY))
+                               cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE);
                        cmd |= PORT_CMD_ICC_ACTIVE;
 
                        writel(cmd, port_mmio + PORT_CMD);
@@ -712,6 +734,9 @@ static int ahci_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 
                        /* wait 10ms to be sure we've come out of LPM state */
                        ata_msleep(ap, 10);
+
+                       if (hints & ATA_LPM_WAKE_ONLY)
+                               return 0;
                } else {
                        cmd |= PORT_CMD_ALPE;
                        if (policy == ATA_LPM_MIN_POWER)
@@ -1143,8 +1168,7 @@ static void ahci_port_init(struct device *dev, struct ata_port *ap,
 
        /* mark esata ports */
        tmp = readl(port_mmio + PORT_CMD);
-       if ((tmp & PORT_CMD_HPCP) ||
-           ((tmp & PORT_CMD_ESP) && (hpriv->cap & HOST_CAP_SXS)))
+       if ((tmp & PORT_CMD_ESP) && (hpriv->cap & HOST_CAP_SXS))
                ap->pflags |= ATA_PFLAG_EXTERNAL;
 }
 
@@ -1825,7 +1849,7 @@ static irqreturn_t ahci_multi_irqs_intr_hard(int irq, void *dev_instance)
        return IRQ_HANDLED;
 }
 
-static u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked)
+u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked)
 {
        unsigned int i, handled = 0;
 
@@ -1851,43 +1875,7 @@ static u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked)
 
        return handled;
 }
-
-static irqreturn_t ahci_single_edge_irq_intr(int irq, void *dev_instance)
-{
-       struct ata_host *host = dev_instance;
-       struct ahci_host_priv *hpriv;
-       unsigned int rc = 0;
-       void __iomem *mmio;
-       u32 irq_stat, irq_masked;
-
-       VPRINTK("ENTER\n");
-
-       hpriv = host->private_data;
-       mmio = hpriv->mmio;
-
-       /* sigh.  0xffffffff is a valid return from h/w */
-       irq_stat = readl(mmio + HOST_IRQ_STAT);
-       if (!irq_stat)
-               return IRQ_NONE;
-
-       irq_masked = irq_stat & hpriv->port_map;
-
-       spin_lock(&host->lock);
-
-       /*
-        * HOST_IRQ_STAT behaves as edge triggered latch meaning that
-        * it should be cleared before all the port events are cleared.
-        */
-       writel(irq_stat, mmio + HOST_IRQ_STAT);
-
-       rc = ahci_handle_port_intr(host, irq_masked);
-
-       spin_unlock(&host->lock);
-
-       VPRINTK("EXIT\n");
-
-       return IRQ_RETVAL(rc);
-}
+EXPORT_SYMBOL_GPL(ahci_handle_port_intr);
 
 static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance)
 {
@@ -2514,14 +2502,18 @@ int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht)
        int irq = hpriv->irq;
        int rc;
 
-       if (hpriv->flags & (AHCI_HFLAG_MULTI_MSI | AHCI_HFLAG_MULTI_MSIX))
+       if (hpriv->flags & (AHCI_HFLAG_MULTI_MSI | AHCI_HFLAG_MULTI_MSIX)) {
+               if (hpriv->irq_handler)
+                       dev_warn(host->dev, "both AHCI_HFLAG_MULTI_MSI flag set \
+                                and custom irq handler implemented\n");
+
                rc = ahci_host_activate_multi_irqs(host, sht);
-       else if (hpriv->flags & AHCI_HFLAG_EDGE_IRQ)
-               rc = ata_host_activate(host, irq, ahci_single_edge_irq_intr,
-                                      IRQF_SHARED, sht);
-       else
-               rc = ata_host_activate(host, irq, ahci_single_level_irq_intr,
+       } else {
+               rc = ata_host_activate(host, irq, hpriv->irq_handler,
                                       IRQF_SHARED, sht);
+       }
+
+
        return rc;
 }
 EXPORT_SYMBOL_GPL(ahci_host_activate);
index cbb7471..55e257c 100644 (file)
@@ -4125,6 +4125,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
        { "SAMSUNG CD-ROM SN-124", "N001",      ATA_HORKAGE_NODMA },
        { "Seagate STT20000A", NULL,            ATA_HORKAGE_NODMA },
        { " 2GB ATA Flash Disk", "ADMA428M",    ATA_HORKAGE_NODMA },
+       { "VRFDFC22048UCHC-TE*", NULL,          ATA_HORKAGE_NODMA },
        /* Odd clown on sil3726/4726 PMPs */
        { "Config  Disk",       NULL,           ATA_HORKAGE_DISABLE },
 
index 7e959f9..e417e1a 100644 (file)
@@ -675,19 +675,18 @@ static int ata_ioc32(struct ata_port *ap)
 int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev,
                     int cmd, void __user *arg)
 {
-       int val = -EINVAL, rc = -EINVAL;
+       unsigned long val;
+       int rc = -EINVAL;
        unsigned long flags;
 
        switch (cmd) {
-       case ATA_IOC_GET_IO32:
+       case HDIO_GET_32BIT:
                spin_lock_irqsave(ap->lock, flags);
                val = ata_ioc32(ap);
                spin_unlock_irqrestore(ap->lock, flags);
-               if (copy_to_user(arg, &val, 1))
-                       return -EFAULT;
-               return 0;
+               return put_user(val, (unsigned long __user *)arg);
 
-       case ATA_IOC_SET_IO32:
+       case HDIO_SET_32BIT:
                val = (unsigned long) arg;
                rc = 0;
                spin_lock_irqsave(ap->lock, flags);
index cdf6215..051b615 100644 (file)
@@ -997,12 +997,9 @@ static inline int ata_hsm_ok_in_wq(struct ata_port *ap,
 static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
 {
        struct ata_port *ap = qc->ap;
-       unsigned long flags;
 
        if (ap->ops->error_handler) {
                if (in_wq) {
-                       spin_lock_irqsave(ap->lock, flags);
-
                        /* EH might have kicked in while host lock is
                         * released.
                         */
@@ -1014,8 +1011,6 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
                                } else
                                        ata_port_freeze(ap);
                        }
-
-                       spin_unlock_irqrestore(ap->lock, flags);
                } else {
                        if (likely(!(qc->err_mask & AC_ERR_HSM)))
                                ata_qc_complete(qc);
@@ -1024,10 +1019,8 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
                }
        } else {
                if (in_wq) {
-                       spin_lock_irqsave(ap->lock, flags);
                        ata_sff_irq_on(ap);
                        ata_qc_complete(qc);
-                       spin_unlock_irqrestore(ap->lock, flags);
                } else
                        ata_qc_complete(qc);
        }
@@ -1048,9 +1041,10 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
 {
        struct ata_link *link = qc->dev->link;
        struct ata_eh_info *ehi = &link->eh_info;
-       unsigned long flags = 0;
        int poll_next;
 
+       lockdep_assert_held(ap->lock);
+
        WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0);
 
        /* Make sure ata_sff_qc_issue() does not throw things
@@ -1112,14 +1106,6 @@ fsm_start:
                        }
                }
 
-               /* Send the CDB (atapi) or the first data block (ata pio out).
-                * During the state transition, interrupt handler shouldn't
-                * be invoked before the data transfer is complete and
-                * hsm_task_state is changed. Hence, the following locking.
-                */
-               if (in_wq)
-                       spin_lock_irqsave(ap->lock, flags);
-
                if (qc->tf.protocol == ATA_PROT_PIO) {
                        /* PIO data out protocol.
                         * send first data block.
@@ -1135,9 +1121,6 @@ fsm_start:
                        /* send CDB */
                        atapi_send_cdb(ap, qc);
 
-               if (in_wq)
-                       spin_unlock_irqrestore(ap->lock, flags);
-
                /* if polling, ata_sff_pio_task() handles the rest.
                 * otherwise, interrupt handler takes over from here.
                 */
@@ -1296,7 +1279,8 @@ fsm_start:
                break;
        default:
                poll_next = 0;
-               BUG();
+               WARN(true, "ata%d: SFF host state machine in invalid state %d",
+                    ap->print_id, ap->hsm_task_state);
        }
 
        return poll_next;
@@ -1361,12 +1345,14 @@ static void ata_sff_pio_task(struct work_struct *work)
        u8 status;
        int poll_next;
 
+       spin_lock_irq(ap->lock);
+
        BUG_ON(ap->sff_pio_task_link == NULL);
        /* qc can be NULL if timeout occurred */
        qc = ata_qc_from_tag(ap, link->active_tag);
        if (!qc) {
                ap->sff_pio_task_link = NULL;
-               return;
+               goto out_unlock;
        }
 
 fsm_start:
@@ -1381,11 +1367,14 @@ fsm_start:
         */
        status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
        if (status & ATA_BUSY) {
+               spin_unlock_irq(ap->lock);
                ata_msleep(ap, 2);
+               spin_lock_irq(ap->lock);
+
                status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
                if (status & ATA_BUSY) {
                        ata_sff_queue_pio_task(link, ATA_SHORT_PAUSE);
-                       return;
+                       goto out_unlock;
                }
        }
 
@@ -1402,6 +1391,8 @@ fsm_start:
         */
        if (poll_next)
                goto fsm_start;
+out_unlock:
+       spin_unlock_irq(ap->lock);
 }
 
 /**
index 12fe0f3..c8b6a78 100644 (file)
@@ -32,6 +32,8 @@
 #include <linux/libata.h>
 #include <scsi/scsi_host.h>
 
+#include <asm/mach-rc32434/rb.h>
+
 #define DRV_NAME       "pata-rb532-cf"
 #define DRV_VERSION    "0.1.0"
 #define DRV_DESC       "PATA driver for RouterBOARD 532 Compact Flash"
@@ -107,6 +109,7 @@ static int rb532_pata_driver_probe(struct platform_device *pdev)
        int gpio;
        struct resource *res;
        struct ata_host *ah;
+       struct cf_device *pdata;
        struct rb532_cf_info *info;
        int ret;
 
@@ -122,7 +125,13 @@ static int rb532_pata_driver_probe(struct platform_device *pdev)
                return -ENOENT;
        }
 
-       gpio = irq_to_gpio(irq);
+       pdata = dev_get_platdata(&pdev->dev);
+       if (!pdata) {
+               dev_err(&pdev->dev, "no platform data specified\n");
+               return -EINVAL;
+       }
+
+       gpio = pdata->gpio_pin;
        if (gpio < 0) {
                dev_err(&pdev->dev, "no GPIO found for irq%d\n", irq);
                return -ENOENT;
index 89f5cf6..04a1582 100644 (file)
@@ -206,6 +206,8 @@ static void component_match_release(struct device *master,
                if (mc->release)
                        mc->release(master, mc->data);
        }
+
+       kfree(match->compare);
 }
 
 static void devm_component_match_release(struct device *dev, void *res)
@@ -221,14 +223,14 @@ static int component_match_realloc(struct device *dev,
        if (match->alloc == num)
                return 0;
 
-       new = devm_kmalloc_array(dev, num, sizeof(*new), GFP_KERNEL);
+       new = kmalloc_array(num, sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
 
        if (match->compare) {
                memcpy(new, match->compare, sizeof(*new) *
                                            min(match->num, num));
-               devm_kfree(dev, match->compare);
+               kfree(match->compare);
        }
        match->compare = new;
        match->alloc = num;
@@ -283,6 +285,24 @@ void component_match_add_release(struct device *master,
 }
 EXPORT_SYMBOL(component_match_add_release);
 
+static void free_master(struct master *master)
+{
+       struct component_match *match = master->match;
+       int i;
+
+       list_del(&master->node);
+
+       if (match) {
+               for (i = 0; i < match->num; i++) {
+                       struct component *c = match->compare[i].component;
+                       if (c)
+                               c->master = NULL;
+               }
+       }
+
+       kfree(master);
+}
+
 int component_master_add_with_match(struct device *dev,
        const struct component_master_ops *ops,
        struct component_match *match)
@@ -309,11 +329,9 @@ int component_master_add_with_match(struct device *dev,
 
        ret = try_to_bring_up_master(master, NULL);
 
-       if (ret < 0) {
-               /* Delete off the list if we weren't successful */
-               list_del(&master->node);
-               kfree(master);
-       }
+       if (ret < 0)
+               free_master(master);
+
        mutex_unlock(&component_mutex);
 
        return ret < 0 ? ret : 0;
@@ -324,25 +342,12 @@ void component_master_del(struct device *dev,
        const struct component_master_ops *ops)
 {
        struct master *master;
-       int i;
 
        mutex_lock(&component_mutex);
        master = __master_find(dev, ops);
        if (master) {
-               struct component_match *match = master->match;
-
                take_down_master(master);
-
-               list_del(&master->node);
-
-               if (match) {
-                       for (i = 0; i < match->num; i++) {
-                               struct component *c = match->compare[i].component;
-                               if (c)
-                                       c->master = NULL;
-                       }
-               }
-               kfree(master);
+               free_master(master);
        }
        mutex_unlock(&component_mutex);
 }
@@ -486,6 +491,8 @@ int component_add(struct device *dev, const struct component_ops *ops)
 
        ret = try_to_bring_up_masters(component);
        if (ret < 0) {
+               if (component->master)
+                       remove_component(component->master, component);
                list_del(&component->node);
 
                kfree(component);
index 47c4338..279e539 100644 (file)
@@ -284,6 +284,7 @@ out_free_priv_data:
 
        return err;
 }
+EXPORT_SYMBOL_GPL(platform_msi_domain_alloc_irqs);
 
 /**
  * platform_msi_domain_free_irqs - Free MSI interrupts for @dev
@@ -301,6 +302,7 @@ void platform_msi_domain_free_irqs(struct device *dev)
        msi_domain_free_irqs(dev->msi_domain, dev);
        platform_msi_free_descs(dev, 0, MAX_DEV_MSIS);
 }
+EXPORT_SYMBOL_GPL(platform_msi_domain_free_irqs);
 
 /**
  * platform_msi_get_host_data - Query the private data associated with
index 73d6e5d..f437afa 100644 (file)
@@ -558,10 +558,15 @@ static int platform_drv_probe(struct device *_dev)
                return ret;
 
        ret = dev_pm_domain_attach(_dev, true);
-       if (ret != -EPROBE_DEFER && drv->probe) {
-               ret = drv->probe(dev);
-               if (ret)
-                       dev_pm_domain_detach(_dev, true);
+       if (ret != -EPROBE_DEFER) {
+               if (drv->probe) {
+                       ret = drv->probe(dev);
+                       if (ret)
+                               dev_pm_domain_detach(_dev, true);
+               } else {
+                       /* don't fail if just dev_pm_domain_attach failed */
+                       ret = 0;
+               }
        }
 
        if (drv->prevent_deferred_probe && ret == -EPROBE_DEFER) {
index 93ed14c..f6a9ad5 100644 (file)
@@ -146,7 +146,7 @@ void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd)
        if (dev->pm_domain == pd)
                return;
 
-       WARN(device_is_bound(dev),
+       WARN(pd && device_is_bound(dev),
             "PM domains can only be changed for unbound devices\n");
        dev->pm_domain = pd;
        device_pm_check_callbacks(dev);
index 6ac9a7f..301b785 100644 (file)
@@ -162,7 +162,7 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool timed)
 
 /**
  * genpd_queue_power_off_work - Queue up the execution of genpd_poweroff().
- * @genpd: PM domait to power off.
+ * @genpd: PM domain to power off.
  *
  * Queue up the execution of genpd_poweroff() unless it's already been done
  * before.
@@ -172,16 +172,15 @@ static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
        queue_work(pm_wq, &genpd->power_off_work);
 }
 
-static int genpd_poweron(struct generic_pm_domain *genpd);
-
 /**
- * __genpd_poweron - Restore power to a given PM domain and its masters.
+ * genpd_poweron - Restore power to a given PM domain and its masters.
  * @genpd: PM domain to power up.
+ * @depth: nesting count for lockdep.
  *
  * Restore power to @genpd and all of its masters so that it is possible to
  * resume a device belonging to it.
  */
-static int __genpd_poweron(struct generic_pm_domain *genpd)
+static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth)
 {
        struct gpd_link *link;
        int ret = 0;
@@ -196,11 +195,16 @@ static int __genpd_poweron(struct generic_pm_domain *genpd)
         * with it.
         */
        list_for_each_entry(link, &genpd->slave_links, slave_node) {
-               genpd_sd_counter_inc(link->master);
+               struct generic_pm_domain *master = link->master;
+
+               genpd_sd_counter_inc(master);
+
+               mutex_lock_nested(&master->lock, depth + 1);
+               ret = genpd_poweron(master, depth + 1);
+               mutex_unlock(&master->lock);
 
-               ret = genpd_poweron(link->master);
                if (ret) {
-                       genpd_sd_counter_dec(link->master);
+                       genpd_sd_counter_dec(master);
                        goto err;
                }
        }
@@ -223,20 +227,6 @@ static int __genpd_poweron(struct generic_pm_domain *genpd)
        return ret;
 }
 
-/**
- * genpd_poweron - Restore power to a given PM domain and its masters.
- * @genpd: PM domain to power up.
- */
-static int genpd_poweron(struct generic_pm_domain *genpd)
-{
-       int ret;
-
-       mutex_lock(&genpd->lock);
-       ret = __genpd_poweron(genpd);
-       mutex_unlock(&genpd->lock);
-       return ret;
-}
-
 static int genpd_save_dev(struct generic_pm_domain *genpd, struct device *dev)
 {
        return GENPD_DEV_CALLBACK(genpd, int, save_state, dev);
@@ -484,7 +474,7 @@ static int pm_genpd_runtime_resume(struct device *dev)
        }
 
        mutex_lock(&genpd->lock);
-       ret = __genpd_poweron(genpd);
+       ret = genpd_poweron(genpd, 0);
        mutex_unlock(&genpd->lock);
 
        if (ret)
@@ -1339,8 +1329,8 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
        if (!link)
                return -ENOMEM;
 
-       mutex_lock(&genpd->lock);
-       mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
+       mutex_lock(&subdomain->lock);
+       mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
 
        if (genpd->status == GPD_STATE_POWER_OFF
            &&  subdomain->status != GPD_STATE_POWER_OFF) {
@@ -1363,8 +1353,8 @@ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
                genpd_sd_counter_inc(genpd);
 
  out:
-       mutex_unlock(&subdomain->lock);
        mutex_unlock(&genpd->lock);
+       mutex_unlock(&subdomain->lock);
        if (ret)
                kfree(link);
        return ret;
@@ -1385,7 +1375,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
        if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain))
                return -EINVAL;
 
-       mutex_lock(&genpd->lock);
+       mutex_lock(&subdomain->lock);
+       mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
 
        if (!list_empty(&subdomain->slave_links) || subdomain->device_count) {
                pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
@@ -1398,22 +1389,19 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
                if (link->slave != subdomain)
                        continue;
 
-               mutex_lock_nested(&subdomain->lock, SINGLE_DEPTH_NESTING);
-
                list_del(&link->master_node);
                list_del(&link->slave_node);
                kfree(link);
                if (subdomain->status != GPD_STATE_POWER_OFF)
                        genpd_sd_counter_dec(genpd);
 
-               mutex_unlock(&subdomain->lock);
-
                ret = 0;
                break;
        }
 
 out:
        mutex_unlock(&genpd->lock);
+       mutex_unlock(&subdomain->lock);
 
        return ret;
 }
@@ -1818,8 +1806,10 @@ int genpd_dev_pm_attach(struct device *dev)
 
        dev->pm_domain->detach = genpd_dev_pm_detach;
        dev->pm_domain->sync = genpd_dev_pm_sync;
-       ret = genpd_poweron(pd);
 
+       mutex_lock(&pd->lock);
+       ret = genpd_poweron(pd, 0);
+       mutex_unlock(&pd->lock);
 out:
        return ret ? -EPROBE_DEFER : 0;
 }
index c359351..a163f2c 100644 (file)
@@ -218,7 +218,7 @@ bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname)
        bool ret;
 
        ret = __fwnode_property_present(fwnode, propname);
-       if (ret == false && fwnode && fwnode->secondary)
+       if (ret == false && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_present(fwnode->secondary, propname);
        return ret;
 }
@@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(device_property_match_string);
        int _ret_;                                                                      \
        _ret_ = FWNODE_PROP_READ(_fwnode_, _propname_, _type_, _proptype_,              \
                                 _val_, _nval_);                                        \
-       if (_ret_ == -EINVAL && _fwnode_ && _fwnode_->secondary)                        \
+       if (_ret_ == -EINVAL && _fwnode_ && !IS_ERR_OR_NULL(_fwnode_->secondary))       \
                _ret_ = FWNODE_PROP_READ(_fwnode_->secondary, _propname_, _type_,       \
                                _proptype_, _val_, _nval_);                             \
        _ret_;                                                                          \
@@ -593,7 +593,7 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode,
        int ret;
 
        ret = __fwnode_property_read_string_array(fwnode, propname, val, nval);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_read_string_array(fwnode->secondary,
                                                          propname, val, nval);
        return ret;
@@ -621,7 +621,7 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
        int ret;
 
        ret = __fwnode_property_read_string(fwnode, propname, val);
-       if (ret == -EINVAL && fwnode && fwnode->secondary)
+       if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
                ret = __fwnode_property_read_string(fwnode->secondary,
                                                    propname, val);
        return ret;
index 8812bfb..eea5156 100644 (file)
@@ -133,17 +133,17 @@ static int regmap_mmio_gather_write(void *context,
        while (val_size) {
                switch (ctx->val_bytes) {
                case 1:
-                       __raw_writeb(*(u8 *)val, ctx->regs + offset);
+                       writeb(*(u8 *)val, ctx->regs + offset);
                        break;
                case 2:
-                       __raw_writew(*(u16 *)val, ctx->regs + offset);
+                       writew(*(u16 *)val, ctx->regs + offset);
                        break;
                case 4:
-                       __raw_writel(*(u32 *)val, ctx->regs + offset);
+                       writel(*(u32 *)val, ctx->regs + offset);
                        break;
 #ifdef CONFIG_64BIT
                case 8:
-                       __raw_writeq(*(u64 *)val, ctx->regs + offset);
+                       writeq(*(u64 *)val, ctx->regs + offset);
                        break;
 #endif
                default:
@@ -193,17 +193,17 @@ static int regmap_mmio_read(void *context,
        while (val_size) {
                switch (ctx->val_bytes) {
                case 1:
-                       *(u8 *)val = __raw_readb(ctx->regs + offset);
+                       *(u8 *)val = readb(ctx->regs + offset);
                        break;
                case 2:
-                       *(u16 *)val = __raw_readw(ctx->regs + offset);
+                       *(u16 *)val = readw(ctx->regs + offset);
                        break;
                case 4:
-                       *(u32 *)val = __raw_readl(ctx->regs + offset);
+                       *(u32 *)val = readl(ctx->regs + offset);
                        break;
 #ifdef CONFIG_64BIT
                case 8:
-                       *(u64 *)val = __raw_readq(ctx->regs + offset);
+                       *(u64 *)val = readq(ctx->regs + offset);
                        break;
 #endif
                default:
index 9e25120..84708a5 100644 (file)
@@ -866,7 +866,7 @@ static void set_fdc(int drive)
 }
 
 /* locks the driver */
-static int lock_fdc(int drive, bool interruptible)
+static int lock_fdc(int drive)
 {
        if (WARN(atomic_read(&usage_count) == 0,
                 "Trying to lock fdc while usage count=0\n"))
@@ -2173,7 +2173,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req)
 {
        int ret;
 
-       if (lock_fdc(drive, true))
+       if (lock_fdc(drive))
                return -EINTR;
 
        set_floppy(drive);
@@ -2960,7 +2960,7 @@ static int user_reset_fdc(int drive, int arg, bool interruptible)
 {
        int ret;
 
-       if (lock_fdc(drive, interruptible))
+       if (lock_fdc(drive))
                return -EINTR;
 
        if (arg == FD_RESET_ALWAYS)
@@ -3243,7 +3243,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                mutex_lock(&open_lock);
-               if (lock_fdc(drive, true)) {
+               if (lock_fdc(drive)) {
                        mutex_unlock(&open_lock);
                        return -EINTR;
                }
@@ -3263,7 +3263,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
        } else {
                int oldStretch;
 
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                if (cmd != FDDEFPRM) {
                        /* notice a disk change immediately, else
@@ -3349,7 +3349,7 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
        if (type)
                *g = &floppy_type[type];
        else {
-               if (lock_fdc(drive, false))
+               if (lock_fdc(drive))
                        return -EINTR;
                if (poll_drive(false, 0) == -EINTR)
                        return -EINTR;
@@ -3433,7 +3433,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                if (UDRS->fd_ref != 1)
                        /* somebody else has this drive open */
                        return -EBUSY;
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
 
                /* do the actual eject. Fails on
@@ -3445,7 +3445,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                process_fd_request();
                return ret;
        case FDCLRPRM:
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                current_type[drive] = NULL;
                floppy_sizes[drive] = MAX_DISK_SIZE << 1;
@@ -3467,7 +3467,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                UDP->flags &= ~FTD_MSG;
                return 0;
        case FDFMTBEG:
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
                        return -EINTR;
@@ -3484,7 +3484,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                return do_format(drive, &inparam.f);
        case FDFMTEND:
        case FDFLUSH:
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                return invalidate_drive(bdev);
        case FDSETEMSGTRESH:
@@ -3507,7 +3507,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                outparam = UDP;
                break;
        case FDPOLLDRVSTAT:
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
                        return -EINTR;
@@ -3530,7 +3530,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
        case FDRAWCMD:
                if (type)
                        return -EINVAL;
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                set_floppy(drive);
                i = raw_cmd_ioctl(cmd, (void __user *)param);
@@ -3539,7 +3539,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                process_fd_request();
                return i;
        case FDTWADDLE:
-               if (lock_fdc(drive, true))
+               if (lock_fdc(drive))
                        return -EINTR;
                twaddle();
                process_fd_request();
@@ -3663,6 +3663,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 
        opened_bdev[drive] = bdev;
 
+       if (!(mode & (FMODE_READ|FMODE_WRITE))) {
+               res = -EINVAL;
+               goto out;
+       }
+
        res = -ENXIO;
 
        if (!floppy_track_buffer) {
@@ -3706,21 +3711,20 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
        if (UFDCS->rawcmd == 1)
                UFDCS->rawcmd = 2;
 
-       if (!(mode & FMODE_NDELAY)) {
-               if (mode & (FMODE_READ|FMODE_WRITE)) {
-                       UDRS->last_checked = 0;
-                       clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
-                       check_disk_change(bdev);
-                       if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
-                               goto out;
-                       if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
-                               goto out;
-               }
-               res = -EROFS;
-               if ((mode & FMODE_WRITE) &&
-                   !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags))
-                       goto out;
-       }
+       UDRS->last_checked = 0;
+       clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+       check_disk_change(bdev);
+       if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
+               goto out;
+       if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+               goto out;
+
+       res = -EROFS;
+
+       if ((mode & FMODE_WRITE) &&
+                       !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags))
+               goto out;
+
        mutex_unlock(&open_lock);
        mutex_unlock(&floppy_mutex);
        return 0;
@@ -3748,7 +3752,8 @@ static unsigned int floppy_check_events(struct gendisk *disk,
                return DISK_EVENT_MEDIA_CHANGE;
 
        if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
-               lock_fdc(drive, false);
+               if (lock_fdc(drive))
+                       return -EINTR;
                poll_drive(false, 0);
                process_fd_request();
        }
@@ -3847,7 +3852,9 @@ static int floppy_revalidate(struct gendisk *disk)
                         "VFS: revalidate called on non-open device.\n"))
                        return -EFAULT;
 
-               lock_fdc(drive, false);
+               res = lock_fdc(drive);
+               if (res)
+                       return res;
                cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
                      test_bit(FD_VERIFY_BIT, &UDRS->flags));
                if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) {
index 8ba1e97..64a7b59 100644 (file)
@@ -478,7 +478,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
        id->ver_id = 0x1;
        id->vmnt = 0;
        id->cgrps = 1;
-       id->cap = 0x3;
+       id->cap = 0x2;
        id->dom = 0x1;
 
        id->ppaf.blk_offset = 0;
@@ -707,9 +707,7 @@ static int null_add_dev(void)
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
        queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
-
        mutex_lock(&lock);
-       list_add_tail(&nullb->list, &nullb_list);
        nullb->index = nullb_indexes++;
        mutex_unlock(&lock);
 
@@ -743,6 +741,10 @@ static int null_add_dev(void)
        strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
 
        add_disk(disk);
+
+       mutex_lock(&lock);
+       list_add_tail(&nullb->list, &nullb_list);
+       mutex_unlock(&lock);
 done:
        return 0;
 
index 8a8dc91..83eb9e6 100644 (file)
@@ -1873,6 +1873,43 @@ again:
        return err;
 }
 
+static int negotiate_mq(struct blkfront_info *info)
+{
+       unsigned int backend_max_queues = 0;
+       int err;
+       unsigned int i;
+
+       BUG_ON(info->nr_rings);
+
+       /* Check if backend supports multiple queues. */
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "multi-queue-max-queues", "%u", &backend_max_queues);
+       if (err < 0)
+               backend_max_queues = 1;
+
+       info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+       /* We need at least one ring. */
+       if (!info->nr_rings)
+               info->nr_rings = 1;
+
+       info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+       if (!info->rinfo) {
+               xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < info->nr_rings; i++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[i];
+               INIT_LIST_HEAD(&rinfo->indirect_pages);
+               INIT_LIST_HEAD(&rinfo->grants);
+               rinfo->dev_info = info;
+               INIT_WORK(&rinfo->work, blkif_restart_queue);
+               spin_lock_init(&rinfo->ring_lock);
+       }
+       return 0;
+}
 /**
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures and the ring buffer for communication with the backend, and
@@ -1883,9 +1920,7 @@ static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
 {
        int err, vdevice;
-       unsigned int r_index;
        struct blkfront_info *info;
-       unsigned int backend_max_queues = 0;
 
        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1936,33 +1971,10 @@ static int blkfront_probe(struct xenbus_device *dev,
        }
 
        info->xbdev = dev;
-       /* Check if backend supports multiple queues. */
-       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
-                          "multi-queue-max-queues", "%u", &backend_max_queues);
-       if (err < 0)
-               backend_max_queues = 1;
-
-       info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
-       /* We need at least one ring. */
-       if (!info->nr_rings)
-               info->nr_rings = 1;
-
-       info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
-       if (!info->rinfo) {
-               xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+       err = negotiate_mq(info);
+       if (err) {
                kfree(info);
-               return -ENOMEM;
-       }
-
-       for (r_index = 0; r_index < info->nr_rings; r_index++) {
-               struct blkfront_ring_info *rinfo;
-
-               rinfo = &info->rinfo[r_index];
-               INIT_LIST_HEAD(&rinfo->indirect_pages);
-               INIT_LIST_HEAD(&rinfo->grants);
-               rinfo->dev_info = info;
-               INIT_WORK(&rinfo->work, blkif_restart_queue);
-               spin_lock_init(&rinfo->ring_lock);
+               return err;
        }
 
        mutex_init(&info->mutex);
@@ -2123,12 +2135,16 @@ static int blkif_recover(struct blkfront_info *info)
 static int blkfront_resume(struct xenbus_device *dev)
 {
        struct blkfront_info *info = dev_get_drvdata(&dev->dev);
-       int err;
+       int err = 0;
 
        dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
 
        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
+       err = negotiate_mq(info);
+       if (err)
+               return err;
+
        err = talk_to_blkback(dev, info);
 
        /*
index 129d47b..9a92c07 100644 (file)
@@ -132,7 +132,7 @@ config SUNXI_RSB
          and AC100/AC200 ICs.
 
 config UNIPHIER_SYSTEM_BUS
-       bool "UniPhier System Bus driver"
+       tristate "UniPhier System Bus driver"
        depends on ARCH_UNIPHIER && OF
        default y
        help
index 6575c0f..c3cb76b 100644 (file)
@@ -192,8 +192,10 @@ static int __init vexpress_config_init(void)
        /* Need the config devices early, before the "normal" devices... */
        for_each_compatible_node(node, NULL, "arm,vexpress,config-bus") {
                err = vexpress_config_populate(node);
-               if (err)
+               if (err) {
+                       of_node_put(node);
                        break;
+               }
        }
 
        return err;
index 240b6cf..be54e53 100644 (file)
@@ -42,7 +42,7 @@
 /*
  * The High Precision Event Timer driver.
  * This driver is closely modelled after the rtc.c driver.
- * http://www.intel.com/hardwaredesign/hpetspec_1.pdf
+ * See HPET spec revision 1.
  */
 #define        HPET_USER_FREQ  (64)
 #define        HPET_DRIFT      (500)
index dbf2271..ff00331 100644 (file)
@@ -372,6 +372,7 @@ config HW_RANDOM_XGENE
 config HW_RANDOM_STM32
        tristate "STMicroelectronics STM32 random number generator"
        depends on HW_RANDOM && (ARCH_STM32 || COMPILE_TEST)
+       depends on HAS_IOMEM
        help
          This driver provides kernel-side support for the Random Number
          Generator hardware found on STM32 microcontrollers.
index 9fda22e..7fddd86 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/of_platform.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/acpi.h>
 
 #ifdef CONFIG_PARISC
 #include <asm/hardware.h>      /* for register_parisc_driver() stuff */
@@ -2054,8 +2055,6 @@ static int hardcode_find_bmc(void)
 
 #ifdef CONFIG_ACPI
 
-#include <linux/acpi.h>
-
 /*
  * Once we get an ACPI failure, we don't try any more, because we go
  * through the tables sequentially.  Once we don't find a table, there
index d0da5d8..b583e53 100644 (file)
@@ -1818,6 +1818,28 @@ unsigned int get_random_int(void)
 }
 EXPORT_SYMBOL(get_random_int);
 
+/*
+ * Same as get_random_int(), but returns unsigned long.
+ */
+unsigned long get_random_long(void)
+{
+       __u32 *hash;
+       unsigned long ret;
+
+       if (arch_get_random_long(&ret))
+               return ret;
+
+       hash = get_cpu_var(get_random_int_hash);
+
+       hash[0] += current->pid + jiffies + random_get_entropy();
+       md5_transform(hash, random_int_secret);
+       ret = *(unsigned long *)hash;
+       put_cpu_var(get_random_int_hash);
+
+       return ret;
+}
+EXPORT_SYMBOL(get_random_long);
+
 /*
  * randomize_range() returns a start address such that
  *
index b038e36..bae4be6 100644 (file)
@@ -43,7 +43,7 @@ obj-$(CONFIG_COMMON_CLK_SI514)                += clk-si514.o
 obj-$(CONFIG_COMMON_CLK_SI570)         += clk-si570.o
 obj-$(CONFIG_COMMON_CLK_CDCE925)       += clk-cdce925.o
 obj-$(CONFIG_ARCH_STM32)               += clk-stm32f4.o
-obj-$(CONFIG_ARCH_TANGOX)              += clk-tango4.o
+obj-$(CONFIG_ARCH_TANGO              += clk-tango4.o
 obj-$(CONFIG_CLK_TWL6040)              += clk-twl6040.o
 obj-$(CONFIG_ARCH_U300)                        += clk-u300.o
 obj-$(CONFIG_ARCH_VT8500)              += clk-vt8500.o
index 19fed65..7b09a26 100644 (file)
@@ -289,7 +289,7 @@ static void __init of_gpio_clk_setup(struct device_node *node,
 
        num_parents = of_clk_get_parent_count(node);
        if (num_parents < 0)
-               return;
+               num_parents = 0;
 
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
index cd0f272..89e9ca7 100644 (file)
@@ -299,7 +299,7 @@ static int scpi_clocks_probe(struct platform_device *pdev)
        /* Add the virtual cpufreq device */
        cpufreq_dev = platform_device_register_simple("scpi-cpufreq",
                                                      -1, NULL, 0);
-       if (!cpufreq_dev)
+       if (IS_ERR(cpufreq_dev))
                pr_warn("unable to register cpufreq device");
 
        return 0;
index d5c5bfa..3e0b52d 100644 (file)
@@ -247,7 +247,7 @@ static struct clk_onecell_data dove_divider_data = {
 
 void __init dove_divider_clk_init(struct device_node *np)
 {
-       void *base;
+       void __iomem *base;
 
        base = of_iomap(np, 0);
        if (WARN_ON(!base))
index cf73e53..070037a 100644 (file)
@@ -3587,7 +3587,6 @@ static const struct regmap_config gcc_apq8084_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x1fc0,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_apq8084_desc = {
index b692ae8..dd5402b 100644 (file)
@@ -3005,7 +3005,6 @@ static const struct regmap_config gcc_ipq806x_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x3e40,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_ipq806x_desc = {
index f6a2b14..ad41303 100644 (file)
@@ -2702,7 +2702,6 @@ static const struct regmap_config gcc_msm8660_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x363c,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_msm8660_desc = {
index e3bf09d..8cc9b28 100644 (file)
@@ -3336,7 +3336,6 @@ static const struct regmap_config gcc_msm8916_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x80000,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_msm8916_desc = {
index f31111e..983dd7d 100644 (file)
@@ -3468,7 +3468,6 @@ static const struct regmap_config gcc_msm8960_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x3660,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct regmap_config gcc_apq8064_regmap_config = {
@@ -3477,7 +3476,6 @@ static const struct regmap_config gcc_apq8064_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x3880,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_msm8960_desc = {
index df164d6..335952d 100644 (file)
@@ -2680,7 +2680,6 @@ static const struct regmap_config gcc_msm8974_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x1fc0,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc gcc_msm8974_desc = {
index 62e79fa..db3998e 100644 (file)
@@ -419,7 +419,6 @@ static const struct regmap_config lcc_ipq806x_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0xfc,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc lcc_ipq806x_desc = {
index bf95bb0..4fcf9d1 100644 (file)
@@ -524,7 +524,6 @@ static const struct regmap_config lcc_msm8960_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0xfc,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc lcc_msm8960_desc = {
index 1e703fd..30777f9 100644 (file)
@@ -3368,7 +3368,6 @@ static const struct regmap_config mmcc_apq8084_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x5104,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc mmcc_apq8084_desc = {
index d73a048..00e3619 100644 (file)
@@ -3029,7 +3029,6 @@ static const struct regmap_config mmcc_msm8960_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x334,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct regmap_config mmcc_apq8064_regmap_config = {
@@ -3038,7 +3037,6 @@ static const struct regmap_config mmcc_apq8064_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x350,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc mmcc_msm8960_desc = {
index bbe28ed..9d790bc 100644 (file)
@@ -2594,7 +2594,6 @@ static const struct regmap_config mmcc_msm8974_regmap_config = {
        .val_bits       = 32,
        .max_register   = 0x5104,
        .fast_io        = true,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static const struct qcom_cc_desc mmcc_msm8974_desc = {
index ebce980..bc7fbac 100644 (file)
@@ -133,7 +133,7 @@ PNAME(mux_spdif_p)  = { "spdif_src", "spdif_frac", "xin12m" };
 PNAME(mux_uart0_p)     = { "uart0_src", "uart0_frac", "xin24m" };
 PNAME(mux_uart1_p)     = { "uart1_src", "uart1_frac", "xin24m" };
 PNAME(mux_uart2_p)     = { "uart2_src", "uart2_frac", "xin24m" };
-PNAME(mux_mac_p)       = { "mac_pll_src", "ext_gmac" };
+PNAME(mux_mac_p)       = { "mac_pll_src", "rmii_clkin" };
 PNAME(mux_dclk_p)      = { "dclk_lcdc", "dclk_cru" };
 
 static struct rockchip_pll_clock rk3036_pll_clks[] __initdata = {
@@ -224,16 +224,16 @@ static struct rockchip_clk_branch rk3036_clk_branches[] __initdata = {
                        RK2928_CLKGATE_CON(2), 2, GFLAGS),
 
        COMPOSITE_NODIV(SCLK_TIMER0, "sclk_timer0", mux_timer_p, CLK_IGNORE_UNUSED,
-                       RK2928_CLKSEL_CON(2), 4, 1, DFLAGS,
+                       RK2928_CLKSEL_CON(2), 4, 1, MFLAGS,
                        RK2928_CLKGATE_CON(1), 0, GFLAGS),
        COMPOSITE_NODIV(SCLK_TIMER1, "sclk_timer1", mux_timer_p, CLK_IGNORE_UNUSED,
-                       RK2928_CLKSEL_CON(2), 5, 1, DFLAGS,
+                       RK2928_CLKSEL_CON(2), 5, 1, MFLAGS,
                        RK2928_CLKGATE_CON(1), 1, GFLAGS),
        COMPOSITE_NODIV(SCLK_TIMER2, "sclk_timer2", mux_timer_p, CLK_IGNORE_UNUSED,
-                       RK2928_CLKSEL_CON(2), 6, 1, DFLAGS,
+                       RK2928_CLKSEL_CON(2), 6, 1, MFLAGS,
                        RK2928_CLKGATE_CON(2), 4, GFLAGS),
        COMPOSITE_NODIV(SCLK_TIMER3, "sclk_timer3", mux_timer_p, CLK_IGNORE_UNUSED,
-                       RK2928_CLKSEL_CON(2), 7, 1, DFLAGS,
+                       RK2928_CLKSEL_CON(2), 7, 1, MFLAGS,
                        RK2928_CLKGATE_CON(2), 5, GFLAGS),
 
        MUX(0, "uart_pll_clk", mux_pll_src_apll_dpll_gpll_usb480m_p, 0,
@@ -242,11 +242,11 @@ static struct rockchip_clk_branch rk3036_clk_branches[] __initdata = {
                        RK2928_CLKSEL_CON(13), 0, 7, DFLAGS,
                        RK2928_CLKGATE_CON(1), 8, GFLAGS),
        COMPOSITE_NOMUX(0, "uart1_src", "uart_pll_clk", 0,
-                       RK2928_CLKSEL_CON(13), 0, 7, DFLAGS,
-                       RK2928_CLKGATE_CON(1), 8, GFLAGS),
+                       RK2928_CLKSEL_CON(14), 0, 7, DFLAGS,
+                       RK2928_CLKGATE_CON(1), 10, GFLAGS),
        COMPOSITE_NOMUX(0, "uart2_src", "uart_pll_clk", 0,
-                       RK2928_CLKSEL_CON(13), 0, 7, DFLAGS,
-                       RK2928_CLKGATE_CON(1), 8, GFLAGS),
+                       RK2928_CLKSEL_CON(15), 0, 7, DFLAGS,
+                       RK2928_CLKGATE_CON(1), 12, GFLAGS),
        COMPOSITE_FRACMUX(0, "uart0_frac", "uart0_src", CLK_SET_RATE_PARENT,
                        RK2928_CLKSEL_CON(17), 0,
                        RK2928_CLKGATE_CON(1), 9, GFLAGS,
@@ -279,13 +279,13 @@ static struct rockchip_clk_branch rk3036_clk_branches[] __initdata = {
                        RK2928_CLKGATE_CON(3), 2, GFLAGS),
 
        COMPOSITE_NODIV(0, "sclk_sdmmc_src", mux_mmc_src_p, 0,
-                       RK2928_CLKSEL_CON(12), 8, 2, DFLAGS,
+                       RK2928_CLKSEL_CON(12), 8, 2, MFLAGS,
                        RK2928_CLKGATE_CON(2), 11, GFLAGS),
        DIV(SCLK_SDMMC, "sclk_sdmmc", "sclk_sdmmc_src", 0,
                        RK2928_CLKSEL_CON(11), 0, 7, DFLAGS),
 
        COMPOSITE_NODIV(0, "sclk_sdio_src", mux_mmc_src_p, 0,
-                       RK2928_CLKSEL_CON(12), 10, 2, DFLAGS,
+                       RK2928_CLKSEL_CON(12), 10, 2, MFLAGS,
                        RK2928_CLKGATE_CON(2), 13, GFLAGS),
        DIV(SCLK_SDIO, "sclk_sdio", "sclk_sdio_src", 0,
                        RK2928_CLKSEL_CON(11), 8, 7, DFLAGS),
@@ -344,12 +344,12 @@ static struct rockchip_clk_branch rk3036_clk_branches[] __initdata = {
                        RK2928_CLKGATE_CON(10), 5, GFLAGS),
 
        COMPOSITE_NOGATE(0, "mac_pll_src", mux_pll_src_3plls_p, 0,
-                       RK2928_CLKSEL_CON(21), 0, 2, MFLAGS, 4, 5, DFLAGS),
+                       RK2928_CLKSEL_CON(21), 0, 2, MFLAGS, 9, 5, DFLAGS),
        MUX(SCLK_MACREF, "mac_clk_ref", mux_mac_p, CLK_SET_RATE_PARENT,
                        RK2928_CLKSEL_CON(21), 3, 1, MFLAGS),
 
        COMPOSITE_NOMUX(SCLK_MAC, "mac_clk", "mac_clk_ref", 0,
-                       RK2928_CLKSEL_CON(21), 9, 5, DFLAGS,
+                       RK2928_CLKSEL_CON(21), 4, 5, DFLAGS,
                        RK2928_CLKGATE_CON(2), 6, GFLAGS),
 
        MUX(SCLK_HDMI, "dclk_hdmi", mux_dclk_p, 0,
index be0ede5..21f3ea9 100644 (file)
@@ -780,13 +780,13 @@ static struct rockchip_clk_branch rk3368_clk_branches[] __initdata = {
        GATE(PCLK_TSADC, "pclk_tsadc", "pclk_peri", 0, RK3368_CLKGATE_CON(20), 0, GFLAGS),
 
        /* pclk_pd_alive gates */
-       GATE(PCLK_TIMER1, "pclk_timer1", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(14), 8, GFLAGS),
-       GATE(PCLK_TIMER0, "pclk_timer0", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(14), 7, GFLAGS),
-       GATE(0, "pclk_alive_niu", "pclk_pd_alive", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(14), 12, GFLAGS),
-       GATE(PCLK_GRF, "pclk_grf", "pclk_pd_alive", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(14), 11, GFLAGS),
-       GATE(PCLK_GPIO3, "pclk_gpio3", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(14), 3, GFLAGS),
-       GATE(PCLK_GPIO2, "pclk_gpio2", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(14), 2, GFLAGS),
-       GATE(PCLK_GPIO1, "pclk_gpio1", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(14), 1, GFLAGS),
+       GATE(PCLK_TIMER1, "pclk_timer1", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(22), 13, GFLAGS),
+       GATE(PCLK_TIMER0, "pclk_timer0", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(22), 12, GFLAGS),
+       GATE(0, "pclk_alive_niu", "pclk_pd_alive", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(22), 9, GFLAGS),
+       GATE(PCLK_GRF, "pclk_grf", "pclk_pd_alive", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(22), 8, GFLAGS),
+       GATE(PCLK_GPIO3, "pclk_gpio3", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(22), 3, GFLAGS),
+       GATE(PCLK_GPIO2, "pclk_gpio2", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(22), 2, GFLAGS),
+       GATE(PCLK_GPIO1, "pclk_gpio1", "pclk_pd_alive", 0, RK3368_CLKGATE_CON(22), 1, GFLAGS),
 
        /*
         * pclk_vio gates
@@ -796,12 +796,12 @@ static struct rockchip_clk_branch rk3368_clk_branches[] __initdata = {
        GATE(0, "pclk_dphytx", "hclk_vio", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(14), 8, GFLAGS),
 
        /* pclk_pd_pmu gates */
-       GATE(PCLK_PMUGRF, "pclk_pmugrf", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(17), 0, GFLAGS),
-       GATE(PCLK_GPIO0, "pclk_gpio0", "pclk_pd_pmu", 0, RK3368_CLKGATE_CON(17), 4, GFLAGS),
-       GATE(PCLK_SGRF, "pclk_sgrf", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(17), 3, GFLAGS),
-       GATE(0, "pclk_pmu_noc", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(17), 2, GFLAGS),
-       GATE(0, "pclk_intmem1", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(17), 1, GFLAGS),
-       GATE(PCLK_PMU, "pclk_pmu", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(17), 2, GFLAGS),
+       GATE(PCLK_PMUGRF, "pclk_pmugrf", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(23), 5, GFLAGS),
+       GATE(PCLK_GPIO0, "pclk_gpio0", "pclk_pd_pmu", 0, RK3368_CLKGATE_CON(23), 4, GFLAGS),
+       GATE(PCLK_SGRF, "pclk_sgrf", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(23), 3, GFLAGS),
+       GATE(0, "pclk_pmu_noc", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(23), 2, GFLAGS),
+       GATE(0, "pclk_intmem1", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(23), 1, GFLAGS),
+       GATE(PCLK_PMU, "pclk_pmu", "pclk_pd_pmu", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(23), 0, GFLAGS),
 
        /* timer gates */
        GATE(0, "sclk_timer15", "xin24m", CLK_IGNORE_UNUSED, RK3368_CLKGATE_CON(24), 11, GFLAGS),
index e1fe8f3..74e7544 100644 (file)
@@ -450,8 +450,10 @@ static int load_timings_from_dt(struct tegra_clk_emc *tegra,
                struct emc_timing *timing = tegra->timings + (i++);
 
                err = load_one_timing_from_dt(tegra, timing, child);
-               if (err)
+               if (err) {
+                       of_node_put(child);
                        return err;
+               }
 
                timing->ram_code = ram_code;
        }
@@ -499,9 +501,9 @@ struct clk *tegra_clk_register_emc(void __iomem *base, struct device_node *np,
                 * fuses until the apbmisc driver is loaded.
                 */
                err = load_timings_from_dt(tegra, node, node_ram_code);
+               of_node_put(node);
                if (err)
                        return ERR_PTR(err);
-               of_node_put(node);
                break;
        }
 
index 19ce073..62ea381 100644 (file)
@@ -11,6 +11,7 @@ enum clk_id {
        tegra_clk_afi,
        tegra_clk_amx,
        tegra_clk_amx1,
+       tegra_clk_apb2ape,
        tegra_clk_apbdma,
        tegra_clk_apbif,
        tegra_clk_ape,
index a534bfa..6ac3f84 100644 (file)
 #define PLLE_SS_DISABLE (PLLE_SS_CNTL_BYPASS_SS | PLLE_SS_CNTL_INTERP_RESET |\
                                PLLE_SS_CNTL_SSC_BYP)
 #define PLLE_SS_MAX_MASK 0x1ff
-#define PLLE_SS_MAX_VAL 0x25
+#define PLLE_SS_MAX_VAL_TEGRA114 0x25
+#define PLLE_SS_MAX_VAL_TEGRA210 0x21
 #define PLLE_SS_INC_MASK (0xff << 16)
 #define PLLE_SS_INC_VAL (0x1 << 16)
 #define PLLE_SS_INCINTRV_MASK (0x3f << 24)
-#define PLLE_SS_INCINTRV_VAL (0x20 << 24)
+#define PLLE_SS_INCINTRV_VAL_TEGRA114 (0x20 << 24)
+#define PLLE_SS_INCINTRV_VAL_TEGRA210 (0x23 << 24)
 #define PLLE_SS_COEFFICIENTS_MASK \
        (PLLE_SS_MAX_MASK | PLLE_SS_INC_MASK | PLLE_SS_INCINTRV_MASK)
-#define PLLE_SS_COEFFICIENTS_VAL \
-       (PLLE_SS_MAX_VAL | PLLE_SS_INC_VAL | PLLE_SS_INCINTRV_VAL)
+#define PLLE_SS_COEFFICIENTS_VAL_TEGRA114 \
+       (PLLE_SS_MAX_VAL_TEGRA114 | PLLE_SS_INC_VAL |\
+        PLLE_SS_INCINTRV_VAL_TEGRA114)
+#define PLLE_SS_COEFFICIENTS_VAL_TEGRA210 \
+       (PLLE_SS_MAX_VAL_TEGRA210 | PLLE_SS_INC_VAL |\
+        PLLE_SS_INCINTRV_VAL_TEGRA210)
 
 #define PLLE_AUX_PLLP_SEL      BIT(2)
 #define PLLE_AUX_USE_LOCKDET   BIT(3)
@@ -880,7 +886,7 @@ static int clk_plle_training(struct tegra_clk_pll *pll)
 static int clk_plle_enable(struct clk_hw *hw)
 {
        struct tegra_clk_pll *pll = to_clk_pll(hw);
-       unsigned long input_rate = clk_get_rate(clk_get_parent(hw->clk));
+       unsigned long input_rate = clk_hw_get_rate(clk_hw_get_parent(hw));
        struct tegra_clk_pll_freq_table sel;
        u32 val;
        int err;
@@ -1378,7 +1384,7 @@ static int clk_plle_tegra114_enable(struct clk_hw *hw)
        u32 val;
        int ret;
        unsigned long flags = 0;
-       unsigned long input_rate = clk_get_rate(clk_get_parent(hw->clk));
+       unsigned long input_rate = clk_hw_get_rate(clk_hw_get_parent(hw));
 
        if (_get_table_rate(hw, &sel, pll->params->fixed_rate, input_rate))
                return -EINVAL;
@@ -1401,7 +1407,7 @@ static int clk_plle_tegra114_enable(struct clk_hw *hw)
        val |= PLLE_MISC_IDDQ_SW_CTRL;
        val &= ~PLLE_MISC_IDDQ_SW_VALUE;
        val |= PLLE_MISC_PLLE_PTS;
-       val |= PLLE_MISC_VREG_BG_CTRL_MASK | PLLE_MISC_VREG_CTRL_MASK;
+       val &= ~(PLLE_MISC_VREG_BG_CTRL_MASK | PLLE_MISC_VREG_CTRL_MASK);
        pll_writel_misc(val, pll);
        udelay(5);
 
@@ -1428,7 +1434,7 @@ static int clk_plle_tegra114_enable(struct clk_hw *hw)
        val = pll_readl(PLLE_SS_CTRL, pll);
        val &= ~(PLLE_SS_CNTL_CENTER | PLLE_SS_CNTL_INVERT);
        val &= ~PLLE_SS_COEFFICIENTS_MASK;
-       val |= PLLE_SS_COEFFICIENTS_VAL;
+       val |= PLLE_SS_COEFFICIENTS_VAL_TEGRA114;
        pll_writel(val, PLLE_SS_CTRL, pll);
        val &= ~(PLLE_SS_CNTL_SSC_BYP | PLLE_SS_CNTL_BYPASS_SS);
        pll_writel(val, PLLE_SS_CTRL, pll);
@@ -2012,9 +2018,9 @@ static int clk_plle_tegra210_enable(struct clk_hw *hw)
        struct tegra_clk_pll *pll = to_clk_pll(hw);
        struct tegra_clk_pll_freq_table sel;
        u32 val;
-       int ret;
+       int ret = 0;
        unsigned long flags = 0;
-       unsigned long input_rate = clk_get_rate(clk_get_parent(hw->clk));
+       unsigned long input_rate = clk_hw_get_rate(clk_hw_get_parent(hw));
 
        if (_get_table_rate(hw, &sel, pll->params->fixed_rate, input_rate))
                return -EINVAL;
@@ -2022,22 +2028,20 @@ static int clk_plle_tegra210_enable(struct clk_hw *hw)
        if (pll->lock)
                spin_lock_irqsave(pll->lock, flags);
 
+       val = pll_readl(pll->params->aux_reg, pll);
+       if (val & PLLE_AUX_SEQ_ENABLE)
+               goto out;
+
        val = pll_readl_base(pll);
        val &= ~BIT(30); /* Disable lock override */
        pll_writel_base(val, pll);
 
-       val = pll_readl(pll->params->aux_reg, pll);
-       val |= PLLE_AUX_ENABLE_SWCTL;
-       val &= ~PLLE_AUX_SEQ_ENABLE;
-       pll_writel(val, pll->params->aux_reg, pll);
-       udelay(1);
-
        val = pll_readl_misc(pll);
        val |= PLLE_MISC_LOCK_ENABLE;
        val |= PLLE_MISC_IDDQ_SW_CTRL;
        val &= ~PLLE_MISC_IDDQ_SW_VALUE;
        val |= PLLE_MISC_PLLE_PTS;
-       val |= PLLE_MISC_VREG_BG_CTRL_MASK | PLLE_MISC_VREG_CTRL_MASK;
+       val &= ~(PLLE_MISC_VREG_BG_CTRL_MASK | PLLE_MISC_VREG_CTRL_MASK);
        pll_writel_misc(val, pll);
        udelay(5);
 
@@ -2067,7 +2071,7 @@ static int clk_plle_tegra210_enable(struct clk_hw *hw)
        val = pll_readl(PLLE_SS_CTRL, pll);
        val &= ~(PLLE_SS_CNTL_CENTER | PLLE_SS_CNTL_INVERT);
        val &= ~PLLE_SS_COEFFICIENTS_MASK;
-       val |= PLLE_SS_COEFFICIENTS_VAL;
+       val |= PLLE_SS_COEFFICIENTS_VAL_TEGRA210;
        pll_writel(val, PLLE_SS_CTRL, pll);
        val &= ~(PLLE_SS_CNTL_SSC_BYP | PLLE_SS_CNTL_BYPASS_SS);
        pll_writel(val, PLLE_SS_CTRL, pll);
@@ -2104,15 +2108,25 @@ static void clk_plle_tegra210_disable(struct clk_hw *hw)
        if (pll->lock)
                spin_lock_irqsave(pll->lock, flags);
 
+       /* If PLLE HW sequencer is enabled, SW should not disable PLLE */
+       val = pll_readl(pll->params->aux_reg, pll);
+       if (val & PLLE_AUX_SEQ_ENABLE)
+               goto out;
+
        val = pll_readl_base(pll);
        val &= ~PLLE_BASE_ENABLE;
        pll_writel_base(val, pll);
 
+       val = pll_readl(pll->params->aux_reg, pll);
+       val |= PLLE_AUX_ENABLE_SWCTL | PLLE_AUX_SS_SWCTL;
+       pll_writel(val, pll->params->aux_reg, pll);
+
        val = pll_readl_misc(pll);
        val |= PLLE_MISC_IDDQ_SW_CTRL | PLLE_MISC_IDDQ_SW_VALUE;
        pll_writel_misc(val, pll);
        udelay(1);
 
+out:
        if (pll->lock)
                spin_unlock_irqrestore(pll->lock, flags);
 }
index 6ad381a..ea2b9cb 100644 (file)
@@ -773,7 +773,7 @@ static struct tegra_periph_init_data periph_clks[] = {
        XUSB("xusb_dev_src", mux_clkm_pllp_pllc_pllre, CLK_SOURCE_XUSB_DEV_SRC, 95, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_xusb_dev_src),
        XUSB("xusb_dev_src", mux_clkm_pllp_pllre, CLK_SOURCE_XUSB_DEV_SRC, 95, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_xusb_dev_src_8),
        MUX8("dbgapb", mux_pllp_clkm_2, CLK_SOURCE_DBGAPB, 185, TEGRA_PERIPH_NO_RESET, tegra_clk_dbgapb),
-       MUX8("msenc", mux_pllc2_c_c3_pllp_plla1_clkm, CLK_SOURCE_NVENC, 219, 0, tegra_clk_nvenc),
+       MUX8("nvenc", mux_pllc2_c_c3_pllp_plla1_clkm, CLK_SOURCE_NVENC, 219, 0, tegra_clk_nvenc),
        MUX8("nvdec", mux_pllc2_c_c3_pllp_plla1_clkm, CLK_SOURCE_NVDEC, 194, 0, tegra_clk_nvdec),
        MUX8("nvjpg", mux_pllc2_c_c3_pllp_plla1_clkm, CLK_SOURCE_NVJPG, 195, 0, tegra_clk_nvjpg),
        MUX8("ape", mux_plla_pllc4_out0_pllc_pllc4_out1_pllp_pllc4_out2_clkm, CLK_SOURCE_APE, 198, TEGRA_PERIPH_ON_APB, tegra_clk_ape),
@@ -782,7 +782,7 @@ static struct tegra_periph_init_data periph_clks[] = {
        NODIV("sor1", mux_clkm_sor1_brick_sor1_src, CLK_SOURCE_SOR1, 15, MASK(1), 183, 0, tegra_clk_sor1, &sor1_lock),
        MUX8("sdmmc_legacy", mux_pllp_out3_clkm_pllp_pllc4, CLK_SOURCE_SDMMC_LEGACY, 193, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_sdmmc_legacy),
        MUX8("qspi", mux_pllp_pllc_pllc_out1_pllc4_out2_pllc4_out1_clkm_pllc4_out0, CLK_SOURCE_QSPI, 211, TEGRA_PERIPH_ON_APB, tegra_clk_qspi),
-       MUX("vii2c", mux_pllp_pllc_clkm, CLK_SOURCE_VI_I2C, 208, TEGRA_PERIPH_ON_APB, tegra_clk_vi_i2c),
+       I2C("vii2c", mux_pllp_pllc_clkm, CLK_SOURCE_VI_I2C, 208, tegra_clk_vi_i2c),
        MUX("mipibif", mux_pllp_clkm, CLK_SOURCE_MIPIBIF, 173, TEGRA_PERIPH_ON_APB, tegra_clk_mipibif),
        MUX("uartape", mux_pllp_pllc_clkm, CLK_SOURCE_UARTAPE, 212, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_uartape),
        MUX8("tsecb", mux_pllp_pllc2_c_c3_clkm, CLK_SOURCE_TSECB, 206, 0, tegra_clk_tsecb),
@@ -829,6 +829,7 @@ static struct tegra_periph_init_data gate_clks[] = {
        GATE("xusb_gate", "osc", 143, 0, tegra_clk_xusb_gate, 0),
        GATE("pll_p_out_cpu", "pll_p", 223, 0, tegra_clk_pll_p_out_cpu, 0),
        GATE("pll_p_out_adsp", "pll_p", 187, 0, tegra_clk_pll_p_out_adsp, 0),
+       GATE("apb2ape", "clk_m", 107, 0, tegra_clk_apb2ape, 0),
 };
 
 static struct tegra_periph_init_data div_clks[] = {
index 4559a20..474de0f 100644 (file)
@@ -67,7 +67,7 @@ static const char *cclk_lp_parents[] = { "clk_m", "pll_c", "clk_32k", "pll_m",
                                         "pll_p", "pll_p_out4", "unused",
                                         "unused", "pll_x", "pll_x_out0" };
 
-const struct tegra_super_gen_info tegra_super_gen_info_gen4 = {
+static const struct tegra_super_gen_info tegra_super_gen_info_gen4 = {
        .gen = gen4,
        .sclk_parents = sclk_parents,
        .cclk_g_parents = cclk_g_parents,
@@ -93,7 +93,7 @@ static const char *cclk_lp_parents_gen5[] = { "clk_m", "unused", "clk_32k", "unu
                                        "unused", "unused", "unused", "unused",
                                        "dfllCPU_out" };
 
-const struct tegra_super_gen_info tegra_super_gen_info_gen5 = {
+static const struct tegra_super_gen_info tegra_super_gen_info_gen5 = {
        .gen = gen5,
        .sclk_parents = sclk_parents_gen5,
        .cclk_g_parents = cclk_g_parents_gen5,
@@ -171,7 +171,7 @@ static void __init tegra_sclk_init(void __iomem *clk_base,
        *dt_clk = clk;
 }
 
-void __init tegra_super_clk_init(void __iomem *clk_base,
+static void __init tegra_super_clk_init(void __iomem *clk_base,
                                void __iomem *pmc_base,
                                struct tegra_clk *tegra_clks,
                                struct tegra_clk_pll_params *params,
index 58514c4..637041f 100644 (file)
@@ -59,8 +59,8 @@
 #define PLLC3_MISC3 0x50c
 
 #define PLLM_BASE 0x90
-#define PLLM_MISC0 0x9c
 #define PLLM_MISC1 0x98
+#define PLLM_MISC2 0x9c
 #define PLLP_BASE 0xa0
 #define PLLP_MISC0 0xac
 #define PLLP_MISC1 0x680
@@ -99,7 +99,7 @@
 #define PLLC4_MISC0 0x5a8
 #define PLLC4_OUT 0x5e4
 #define PLLMB_BASE 0x5e8
-#define PLLMB_MISC0 0x5ec
+#define PLLMB_MISC1 0x5ec
 #define PLLA1_BASE 0x6a4
 #define PLLA1_MISC0 0x6a8
 #define PLLA1_MISC1 0x6ac
@@ -243,7 +243,8 @@ static unsigned long tegra210_input_freq[] = {
 };
 
 static const char *mux_pllmcp_clkm[] = {
-       "pll_m", "pll_c", "pll_p", "clk_m", "pll_m_ud", "pll_c2", "pll_c3",
+       "pll_m", "pll_c", "pll_p", "clk_m", "pll_m_ud", "pll_mb", "pll_mb",
+       "pll_p",
 };
 #define mux_pllmcp_clkm_idx NULL
 
@@ -367,12 +368,12 @@ static const char *mux_pllmcp_clkm[] = {
 /* PLLMB */
 #define PLLMB_BASE_LOCK                        (1 << 27)
 
-#define PLLMB_MISC0_LOCK_OVERRIDE      (1 << 18)
-#define PLLMB_MISC0_IDDQ               (1 << 17)
-#define PLLMB_MISC0_LOCK_ENABLE                (1 << 16)
+#define PLLMB_MISC1_LOCK_OVERRIDE      (1 << 18)
+#define PLLMB_MISC1_IDDQ               (1 << 17)
+#define PLLMB_MISC1_LOCK_ENABLE                (1 << 16)
 
-#define PLLMB_MISC0_DEFAULT_VALUE      0x00030000
-#define PLLMB_MISC0_WRITE_MASK         0x0007ffff
+#define PLLMB_MISC1_DEFAULT_VALUE      0x00030000
+#define PLLMB_MISC1_WRITE_MASK         0x0007ffff
 
 /* PLLP */
 #define PLLP_BASE_OVERRIDE             (1 << 28)
@@ -457,7 +458,8 @@ static void pllcx_check_defaults(struct tegra_clk_pll_params *params)
                        PLLCX_MISC3_WRITE_MASK);
 }
 
-void tegra210_pllcx_set_defaults(const char *name, struct tegra_clk_pll *pllcx)
+static void tegra210_pllcx_set_defaults(const char *name,
+                                       struct tegra_clk_pll *pllcx)
 {
        pllcx->params->defaults_set = true;
 
@@ -482,22 +484,22 @@ void tegra210_pllcx_set_defaults(const char *name, struct tegra_clk_pll *pllcx)
        udelay(1);
 }
 
-void _pllc_set_defaults(struct tegra_clk_pll *pllcx)
+static void _pllc_set_defaults(struct tegra_clk_pll *pllcx)
 {
        tegra210_pllcx_set_defaults("PLL_C", pllcx);
 }
 
-void _pllc2_set_defaults(struct tegra_clk_pll *pllcx)
+static void _pllc2_set_defaults(struct tegra_clk_pll *pllcx)
 {
        tegra210_pllcx_set_defaults("PLL_C2", pllcx);
 }
 
-void _pllc3_set_defaults(struct tegra_clk_pll *pllcx)
+static void _pllc3_set_defaults(struct tegra_clk_pll *pllcx)
 {
        tegra210_pllcx_set_defaults("PLL_C3", pllcx);
 }
 
-void _plla1_set_defaults(struct tegra_clk_pll *pllcx)
+static void _plla1_set_defaults(struct tegra_clk_pll *pllcx)
 {
        tegra210_pllcx_set_defaults("PLL_A1", pllcx);
 }
@@ -507,7 +509,7 @@ void _plla1_set_defaults(struct tegra_clk_pll *pllcx)
  * PLL with dynamic ramp and fractional SDM. Dynamic ramp is not used.
  * Fractional SDM is allowed to provide exact audio rates.
  */
-void tegra210_plla_set_defaults(struct tegra_clk_pll *plla)
+static void tegra210_plla_set_defaults(struct tegra_clk_pll *plla)
 {
        u32 mask;
        u32 val = readl_relaxed(clk_base + plla->params->base_reg);
@@ -559,7 +561,7 @@ void tegra210_plla_set_defaults(struct tegra_clk_pll *plla)
  * PLLD
  * PLL with fractional SDM.
  */
-void tegra210_plld_set_defaults(struct tegra_clk_pll *plld)
+static void tegra210_plld_set_defaults(struct tegra_clk_pll *plld)
 {
        u32 val;
        u32 mask = 0xffff;
@@ -698,7 +700,7 @@ static void plldss_defaults(const char *pll_name, struct tegra_clk_pll *plldss,
        udelay(1);
 }
 
-void tegra210_plld2_set_defaults(struct tegra_clk_pll *plld2)
+static void tegra210_plld2_set_defaults(struct tegra_clk_pll *plld2)
 {
        plldss_defaults("PLL_D2", plld2, PLLD2_MISC0_DEFAULT_VALUE,
                        PLLD2_MISC1_CFG_DEFAULT_VALUE,
@@ -706,7 +708,7 @@ void tegra210_plld2_set_defaults(struct tegra_clk_pll *plld2)
                        PLLD2_MISC3_CTRL2_DEFAULT_VALUE);
 }
 
-void tegra210_plldp_set_defaults(struct tegra_clk_pll *plldp)
+static void tegra210_plldp_set_defaults(struct tegra_clk_pll *plldp)
 {
        plldss_defaults("PLL_DP", plldp, PLLDP_MISC0_DEFAULT_VALUE,
                        PLLDP_MISC1_CFG_DEFAULT_VALUE,
@@ -719,7 +721,7 @@ void tegra210_plldp_set_defaults(struct tegra_clk_pll *plldp)
  * Base and misc0 layout is the same as PLLD2/PLLDP, but no SDM/SSC support.
  * VCO is exposed to the clock tree via fixed 1/3 and 1/5 dividers.
  */
-void tegra210_pllc4_set_defaults(struct tegra_clk_pll *pllc4)
+static void tegra210_pllc4_set_defaults(struct tegra_clk_pll *pllc4)
 {
        plldss_defaults("PLL_C4", pllc4, PLLC4_MISC0_DEFAULT_VALUE, 0, 0, 0);
 }
@@ -728,7 +730,7 @@ void tegra210_pllc4_set_defaults(struct tegra_clk_pll *pllc4)
  * PLLRE
  * VCO is exposed to the clock tree directly along with post-divider output
  */
-void tegra210_pllre_set_defaults(struct tegra_clk_pll *pllre)
+static void tegra210_pllre_set_defaults(struct tegra_clk_pll *pllre)
 {
        u32 mask;
        u32 val = readl_relaxed(clk_base + pllre->params->base_reg);
@@ -780,13 +782,13 @@ static void pllx_get_dyn_steps(struct clk_hw *hw, u32 *step_a, u32 *step_b)
 {
        unsigned long input_rate;
 
-       if (!IS_ERR_OR_NULL(hw->clk)) {
+       /* cf rate */
+       if (!IS_ERR_OR_NULL(hw->clk))
                input_rate = clk_hw_get_rate(clk_hw_get_parent(hw));
-               /* cf rate */
-               input_rate /= tegra_pll_get_fixed_mdiv(hw, input_rate);
-       } else {
+       else
                input_rate = 38400000;
-       }
+
+       input_rate /= tegra_pll_get_fixed_mdiv(hw, input_rate);
 
        switch (input_rate) {
        case 12000000:
@@ -841,7 +843,7 @@ static void pllx_check_defaults(struct tegra_clk_pll *pll)
                        PLLX_MISC5_WRITE_MASK);
 }
 
-void tegra210_pllx_set_defaults(struct tegra_clk_pll *pllx)
+static void tegra210_pllx_set_defaults(struct tegra_clk_pll *pllx)
 {
        u32 val;
        u32 step_a, step_b;
@@ -901,7 +903,7 @@ void tegra210_pllx_set_defaults(struct tegra_clk_pll *pllx)
 }
 
 /* PLLMB */
-void tegra210_pllmb_set_defaults(struct tegra_clk_pll *pllmb)
+static void tegra210_pllmb_set_defaults(struct tegra_clk_pll *pllmb)
 {
        u32 mask, val = readl_relaxed(clk_base + pllmb->params->base_reg);
 
@@ -914,15 +916,15 @@ void tegra210_pllmb_set_defaults(struct tegra_clk_pll *pllmb)
                 * PLL is ON: check if defaults already set, then set those
                 * that can be updated in flight.
                 */
-               val = PLLMB_MISC0_DEFAULT_VALUE & (~PLLMB_MISC0_IDDQ);
-               mask = PLLMB_MISC0_LOCK_ENABLE | PLLMB_MISC0_LOCK_OVERRIDE;
+               val = PLLMB_MISC1_DEFAULT_VALUE & (~PLLMB_MISC1_IDDQ);
+               mask = PLLMB_MISC1_LOCK_ENABLE | PLLMB_MISC1_LOCK_OVERRIDE;
                _pll_misc_chk_default(clk_base, pllmb->params, 0, val,
-                               ~mask & PLLMB_MISC0_WRITE_MASK);
+                               ~mask & PLLMB_MISC1_WRITE_MASK);
 
                /* Enable lock detect */
                val = readl_relaxed(clk_base + pllmb->params->ext_misc_reg[0]);
                val &= ~mask;
-               val |= PLLMB_MISC0_DEFAULT_VALUE & mask;
+               val |= PLLMB_MISC1_DEFAULT_VALUE & mask;
                writel_relaxed(val, clk_base + pllmb->params->ext_misc_reg[0]);
                udelay(1);
 
@@ -930,7 +932,7 @@ void tegra210_pllmb_set_defaults(struct tegra_clk_pll *pllmb)
        }
 
        /* set IDDQ, enable lock detect */
-       writel_relaxed(PLLMB_MISC0_DEFAULT_VALUE,
+       writel_relaxed(PLLMB_MISC1_DEFAULT_VALUE,
                        clk_base + pllmb->params->ext_misc_reg[0]);
        udelay(1);
 }
@@ -960,7 +962,7 @@ static void pllp_check_defaults(struct tegra_clk_pll *pll, bool enabled)
                        ~mask & PLLP_MISC1_WRITE_MASK);
 }
 
-void tegra210_pllp_set_defaults(struct tegra_clk_pll *pllp)
+static void tegra210_pllp_set_defaults(struct tegra_clk_pll *pllp)
 {
        u32 mask;
        u32 val = readl_relaxed(clk_base + pllp->params->base_reg);
@@ -1022,7 +1024,7 @@ static void pllu_check_defaults(struct tegra_clk_pll *pll, bool hw_control)
                        ~mask & PLLU_MISC1_WRITE_MASK);
 }
 
-void tegra210_pllu_set_defaults(struct tegra_clk_pll *pllu)
+static void tegra210_pllu_set_defaults(struct tegra_clk_pll *pllu)
 {
        u32 val = readl_relaxed(clk_base + pllu->params->base_reg);
 
@@ -1212,8 +1214,9 @@ static void tegra210_clk_pll_set_gain(struct tegra_clk_pll_freq_table *cfg)
        cfg->m *= PLL_SDM_COEFF;
 }
 
-unsigned long tegra210_clk_adjust_vco_min(struct tegra_clk_pll_params *params,
-                                         unsigned long parent_rate)
+static unsigned long
+tegra210_clk_adjust_vco_min(struct tegra_clk_pll_params *params,
+                           unsigned long parent_rate)
 {
        unsigned long vco_min = params->vco_min;
 
@@ -1386,7 +1389,7 @@ static struct tegra_clk_pll_params pll_c_params = {
        .mdiv_default = 3,
        .div_nmp = &pllc_nmp,
        .freq_table = pll_cx_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .set_defaults = _pllc_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1425,7 +1428,7 @@ static struct tegra_clk_pll_params pll_c2_params = {
        .ext_misc_reg[2] = PLLC2_MISC2,
        .ext_misc_reg[3] = PLLC2_MISC3,
        .freq_table = pll_cx_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .set_defaults = _pllc2_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1455,7 +1458,7 @@ static struct tegra_clk_pll_params pll_c3_params = {
        .ext_misc_reg[2] = PLLC3_MISC2,
        .ext_misc_reg[3] = PLLC3_MISC3,
        .freq_table = pll_cx_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .set_defaults = _pllc3_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1505,7 +1508,6 @@ static struct tegra_clk_pll_params pll_c4_vco_params = {
        .base_reg = PLLC4_BASE,
        .misc_reg = PLLC4_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLSS_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .max_p = PLL_QLIN_PDIV_MAX,
        .ext_misc_reg[0] = PLLC4_MISC0,
@@ -1517,8 +1519,7 @@ static struct tegra_clk_pll_params pll_c4_vco_params = {
        .div_nmp = &pllss_nmp,
        .freq_table = pll_c4_vco_freq_table,
        .set_defaults = tegra210_pllc4_set_defaults,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE |
-                TEGRA_PLL_VCO_OUT,
+       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_VCO_OUT,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
 
@@ -1559,15 +1560,15 @@ static struct tegra_clk_pll_params pll_m_params = {
        .vco_min = 800000000,
        .vco_max = 1866000000,
        .base_reg = PLLM_BASE,
-       .misc_reg = PLLM_MISC1,
+       .misc_reg = PLLM_MISC2,
        .lock_mask = PLL_BASE_LOCK,
        .lock_enable_bit_idx = PLLM_MISC_LOCK_ENABLE,
        .lock_delay = 300,
-       .iddq_reg = PLLM_MISC0,
+       .iddq_reg = PLLM_MISC2,
        .iddq_bit_idx = PLLM_IDDQ_BIT,
        .max_p = PLL_QLIN_PDIV_MAX,
-       .ext_misc_reg[0] = PLLM_MISC0,
-       .ext_misc_reg[0] = PLLM_MISC1,
+       .ext_misc_reg[0] = PLLM_MISC2,
+       .ext_misc_reg[1] = PLLM_MISC1,
        .round_p_to_pdiv = pll_qlin_p_to_pdiv,
        .pdiv_tohw = pll_qlin_pdiv_to_hw,
        .div_nmp = &pllm_nmp,
@@ -1586,19 +1587,18 @@ static struct tegra_clk_pll_params pll_mb_params = {
        .vco_min = 800000000,
        .vco_max = 1866000000,
        .base_reg = PLLMB_BASE,
-       .misc_reg = PLLMB_MISC0,
+       .misc_reg = PLLMB_MISC1,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLMB_MISC_LOCK_ENABLE,
        .lock_delay = 300,
-       .iddq_reg = PLLMB_MISC0,
+       .iddq_reg = PLLMB_MISC1,
        .iddq_bit_idx = PLLMB_IDDQ_BIT,
        .max_p = PLL_QLIN_PDIV_MAX,
-       .ext_misc_reg[0] = PLLMB_MISC0,
+       .ext_misc_reg[0] = PLLMB_MISC1,
        .round_p_to_pdiv = pll_qlin_p_to_pdiv,
        .pdiv_tohw = pll_qlin_pdiv_to_hw,
        .div_nmp = &pllm_nmp,
        .freq_table = pll_m_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .set_defaults = tegra210_pllmb_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1671,7 +1671,6 @@ static struct tegra_clk_pll_params pll_re_vco_params = {
        .base_reg = PLLRE_BASE,
        .misc_reg = PLLRE_MISC0,
        .lock_mask = PLLRE_MISC_LOCK,
-       .lock_enable_bit_idx = PLLRE_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .max_p = PLL_QLIN_PDIV_MAX,
        .ext_misc_reg[0] = PLLRE_MISC0,
@@ -1681,8 +1680,7 @@ static struct tegra_clk_pll_params pll_re_vco_params = {
        .pdiv_tohw = pll_qlin_pdiv_to_hw,
        .div_nmp = &pllre_nmp,
        .freq_table = pll_re_vco_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_LOCK_MISC |
-                TEGRA_PLL_HAS_LOCK_ENABLE | TEGRA_PLL_VCO_OUT,
+       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_LOCK_MISC | TEGRA_PLL_VCO_OUT,
        .set_defaults = tegra210_pllre_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1712,7 +1710,6 @@ static struct tegra_clk_pll_params pll_p_params = {
        .base_reg = PLLP_BASE,
        .misc_reg = PLLP_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLP_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .iddq_reg = PLLP_MISC0,
        .iddq_bit_idx = PLLXP_IDDQ_BIT,
@@ -1721,8 +1718,7 @@ static struct tegra_clk_pll_params pll_p_params = {
        .div_nmp = &pllp_nmp,
        .freq_table = pll_p_freq_table,
        .fixed_rate = 408000000,
-       .flags = TEGRA_PLL_FIXED | TEGRA_PLL_USE_LOCK |
-                TEGRA_PLL_HAS_LOCK_ENABLE | TEGRA_PLL_VCO_OUT,
+       .flags = TEGRA_PLL_FIXED | TEGRA_PLL_USE_LOCK | TEGRA_PLL_VCO_OUT,
        .set_defaults = tegra210_pllp_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1750,7 +1746,7 @@ static struct tegra_clk_pll_params pll_a1_params = {
        .ext_misc_reg[2] = PLLA1_MISC2,
        .ext_misc_reg[3] = PLLA1_MISC3,
        .freq_table = pll_cx_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .set_defaults = _plla1_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -1787,7 +1783,6 @@ static struct tegra_clk_pll_params pll_a_params = {
        .base_reg = PLLA_BASE,
        .misc_reg = PLLA_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLA_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .round_p_to_pdiv = pll_qlin_p_to_pdiv,
        .pdiv_tohw = pll_qlin_pdiv_to_hw,
@@ -1802,8 +1797,7 @@ static struct tegra_clk_pll_params pll_a_params = {
        .ext_misc_reg[1] = PLLA_MISC1,
        .ext_misc_reg[2] = PLLA_MISC2,
        .freq_table = pll_a_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_MDIV_NEW |
-                TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK | TEGRA_MDIV_NEW,
        .set_defaults = tegra210_plla_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
        .set_gain = tegra210_clk_pll_set_gain,
@@ -1836,7 +1830,6 @@ static struct tegra_clk_pll_params pll_d_params = {
        .base_reg = PLLD_BASE,
        .misc_reg = PLLD_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLD_MISC_LOCK_ENABLE,
        .lock_delay = 1000,
        .iddq_reg = PLLD_MISC0,
        .iddq_bit_idx = PLLD_IDDQ_BIT,
@@ -1850,7 +1843,7 @@ static struct tegra_clk_pll_params pll_d_params = {
        .ext_misc_reg[0] = PLLD_MISC0,
        .ext_misc_reg[1] = PLLD_MISC1,
        .freq_table = pll_d_freq_table,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .mdiv_default = 1,
        .set_defaults = tegra210_plld_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
@@ -1876,7 +1869,6 @@ static struct tegra_clk_pll_params pll_d2_params = {
        .base_reg = PLLD2_BASE,
        .misc_reg = PLLD2_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLSS_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .iddq_reg = PLLD2_BASE,
        .iddq_bit_idx = PLLSS_IDDQ_BIT,
@@ -1897,7 +1889,7 @@ static struct tegra_clk_pll_params pll_d2_params = {
        .mdiv_default = 1,
        .freq_table = tegra210_pll_d2_freq_table,
        .set_defaults = tegra210_plld2_set_defaults,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
        .set_gain = tegra210_clk_pll_set_gain,
        .adjust_vco = tegra210_clk_adjust_vco_min,
@@ -1920,7 +1912,6 @@ static struct tegra_clk_pll_params pll_dp_params = {
        .base_reg = PLLDP_BASE,
        .misc_reg = PLLDP_MISC,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLSS_MISC_LOCK_ENABLE,
        .lock_delay = 300,
        .iddq_reg = PLLDP_BASE,
        .iddq_bit_idx = PLLSS_IDDQ_BIT,
@@ -1941,7 +1932,7 @@ static struct tegra_clk_pll_params pll_dp_params = {
        .mdiv_default = 1,
        .freq_table = pll_dp_freq_table,
        .set_defaults = tegra210_plldp_set_defaults,
-       .flags = TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE,
+       .flags = TEGRA_PLL_USE_LOCK,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
        .set_gain = tegra210_clk_pll_set_gain,
        .adjust_vco = tegra210_clk_adjust_vco_min,
@@ -1973,7 +1964,6 @@ static struct tegra_clk_pll_params pll_u_vco_params = {
        .base_reg = PLLU_BASE,
        .misc_reg = PLLU_MISC0,
        .lock_mask = PLL_BASE_LOCK,
-       .lock_enable_bit_idx = PLLU_MISC_LOCK_ENABLE,
        .lock_delay = 1000,
        .iddq_reg = PLLU_MISC0,
        .iddq_bit_idx = PLLU_IDDQ_BIT,
@@ -1983,8 +1973,7 @@ static struct tegra_clk_pll_params pll_u_vco_params = {
        .pdiv_tohw = pll_qlin_pdiv_to_hw,
        .div_nmp = &pllu_nmp,
        .freq_table = pll_u_freq_table,
-       .flags = TEGRA_PLLU | TEGRA_PLL_USE_LOCK | TEGRA_PLL_HAS_LOCK_ENABLE |
-                TEGRA_PLL_VCO_OUT,
+       .flags = TEGRA_PLLU | TEGRA_PLL_USE_LOCK | TEGRA_PLL_VCO_OUT,
        .set_defaults = tegra210_pllu_set_defaults,
        .calc_rate = tegra210_pll_fixed_mdiv_cfg,
 };
@@ -2218,6 +2207,7 @@ static struct tegra_clk tegra210_clks[tegra_clk_max] __initdata = {
        [tegra_clk_pll_c4_out1] = { .dt_id = TEGRA210_CLK_PLL_C4_OUT1, .present = true },
        [tegra_clk_pll_c4_out2] = { .dt_id = TEGRA210_CLK_PLL_C4_OUT2, .present = true },
        [tegra_clk_pll_c4_out3] = { .dt_id = TEGRA210_CLK_PLL_C4_OUT3, .present = true },
+       [tegra_clk_apb2ape] = { .dt_id = TEGRA210_CLK_APB2APE, .present = true },
 };
 
 static struct tegra_devclk devclks[] __initdata = {
@@ -2519,7 +2509,7 @@ static void __init tegra210_pll_init(void __iomem *clk_base,
 
        /* PLLU_VCO */
        val = readl(clk_base + pll_u_vco_params.base_reg);
-       val &= ~BIT(24); /* disable PLLU_OVERRIDE */
+       val &= ~PLLU_BASE_OVERRIDE; /* disable PLLU_OVERRIDE */
        writel(val, clk_base + pll_u_vco_params.base_reg);
 
        clk = tegra_clk_register_pllre("pll_u_vco", "pll_ref", clk_base, pmc,
@@ -2738,8 +2728,6 @@ static struct tegra_clk_init_table init_table[] __initdata = {
        { TEGRA210_CLK_DFLL_REF, TEGRA210_CLK_PLL_P, 51000000, 1 },
        { TEGRA210_CLK_SBC4, TEGRA210_CLK_PLL_P, 12000000, 1 },
        { TEGRA210_CLK_PLL_RE_VCO, TEGRA210_CLK_CLK_MAX, 672000000, 1 },
-       { TEGRA210_CLK_PLL_U_OUT1, TEGRA210_CLK_CLK_MAX, 48000000, 1 },
-       { TEGRA210_CLK_PLL_U_OUT2, TEGRA210_CLK_CLK_MAX, 60000000, 1 },
        { TEGRA210_CLK_XUSB_GATE, TEGRA210_CLK_CLK_MAX, 0, 1 },
        { TEGRA210_CLK_XUSB_SS_SRC, TEGRA210_CLK_PLL_U_480M, 120000000, 0 },
        { TEGRA210_CLK_XUSB_FS_SRC, TEGRA210_CLK_PLL_U_48M, 48000000, 0 },
index 1c30038..cc73929 100644 (file)
@@ -460,7 +460,8 @@ int omap3_noncore_dpll_enable(struct clk_hw *hw)
 
        parent = clk_hw_get_parent(hw);
 
-       if (clk_hw_get_rate(hw) == clk_get_rate(dd->clk_bypass)) {
+       if (clk_hw_get_rate(hw) ==
+           clk_hw_get_rate(__clk_get_hw(dd->clk_bypass))) {
                WARN_ON(parent != __clk_get_hw(dd->clk_bypass));
                r = _omap3_noncore_dpll_bypass(clk);
        } else {
index e62f8cb..3bca438 100644 (file)
@@ -78,6 +78,9 @@ static int vco_set(struct clk_icst *icst, struct icst_vco vco)
        ret = regmap_read(icst->map, icst->vcoreg_off, &val);
        if (ret)
                return ret;
+
+       /* Mask the 18 bits used by the VCO */
+       val &= ~0x7ffff;
        val |= vco.v | (vco.r << 9) | (vco.s << 16);
 
        /* This magic unlocks the VCO so it can be controlled */
index 56777f0..c346be6 100644 (file)
@@ -30,6 +30,8 @@ config CLKSRC_MMIO
 config DIGICOLOR_TIMER
        bool "Digicolor timer driver" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       select CLKSRC_MMIO
+       depends on HAS_IOMEM
        help
          Enables the support for the digicolor timer driver.
 
@@ -55,6 +57,7 @@ config ARMADA_370_XP_TIMER
        bool "Armada 370 and XP timer driver" if COMPILE_TEST
        depends on ARM
        select CLKSRC_OF
+       select CLKSRC_MMIO
        help
          Enables the support for the Armada 370 and XP timer driver.
 
@@ -76,6 +79,7 @@ config ORION_TIMER
 config SUN4I_TIMER
        bool "Sun4i timer driver" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        select CLKSRC_MMIO
        help
          Enables support for the Sun4i timer.
@@ -89,6 +93,7 @@ config SUN5I_HSTIMER
 
 config TEGRA_TIMER
        bool "Tegra timer driver" if COMPILE_TEST
+       select CLKSRC_MMIO
        depends on ARM
        help
          Enables support for the Tegra driver.
@@ -96,6 +101,7 @@ config TEGRA_TIMER
 config VT8500_TIMER
        bool "VT8500 timer driver" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        help
          Enables support for the VT8500 driver.
 
@@ -131,6 +137,7 @@ config CLKSRC_NOMADIK_MTU_SCHED_CLOCK
 config CLKSRC_DBX500_PRCMU
        bool "Clocksource PRCMU Timer" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        help
          Use the always on PRCMU Timer as clocksource
 
@@ -153,6 +160,7 @@ config CLKSRC_EFM32
 config CLKSRC_LPC32XX
        bool "Clocksource for LPC32XX" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS && HAS_IOMEM
+       depends on ARM
        select CLKSRC_MMIO
        select CLKSRC_OF
        help
@@ -248,6 +256,7 @@ config CLKSRC_EXYNOS_MCT
 config CLKSRC_SAMSUNG_PWM
        bool "PWM timer drvier for Samsung S3C, S5P" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        help
          This is a new clocksource driver for the PWM timer found in
          Samsung S3C, S5P and Exynos SoCs, replacing an earlier driver
@@ -257,12 +266,14 @@ config CLKSRC_SAMSUNG_PWM
 config FSL_FTM_TIMER
        bool "Freescale FlexTimer Module driver" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        select CLKSRC_MMIO
        help
          Support for Freescale FlexTimer Module (FTM) timer.
 
 config VF_PIT_TIMER
        bool
+       select CLKSRC_MMIO
        help
          Support for Period Interrupt Timer on Freescale Vybrid Family SoCs.
 
@@ -360,6 +371,7 @@ config CLKSRC_TANGO_XTAL
 config CLKSRC_PXA
        bool "Clocksource for PXA or SA-11x0 platform" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
+       depends on HAS_IOMEM
        select CLKSRC_MMIO
        help
          This enables OST0 support available on PXA and SA-11x0
@@ -394,6 +406,7 @@ config CLKSRC_ST_LPC
        bool "Low power clocksource found in the LPC" if COMPILE_TEST
        select CLKSRC_OF if OF
        depends on HAS_IOMEM
+       select CLKSRC_MMIO
        help
          Enable this option to use the Low Power controller timer
          as clocksource.
index c64d543..f0dd9d4 100644 (file)
 #define CNTTIDR                0x08
 #define CNTTIDR_VIRT(n)        (BIT(1) << ((n) * 4))
 
+#define CNTACR(n)      (0x40 + ((n) * 4))
+#define CNTACR_RPCT    BIT(0)
+#define CNTACR_RVCT    BIT(1)
+#define CNTACR_RFRQ    BIT(2)
+#define CNTACR_RVOFF   BIT(3)
+#define CNTACR_RWVT    BIT(4)
+#define CNTACR_RWPT    BIT(5)
+
 #define CNTVCT_LO      0x08
 #define CNTVCT_HI      0x0c
 #define CNTFRQ         0x10
@@ -266,10 +274,12 @@ static void __arch_timer_setup(unsigned type,
                if (arch_timer_use_virtual) {
                        clk->irq = arch_timer_ppi[VIRT_PPI];
                        clk->set_state_shutdown = arch_timer_shutdown_virt;
+                       clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
                        clk->set_next_event = arch_timer_set_next_event_virt;
                } else {
                        clk->irq = arch_timer_ppi[PHYS_SECURE_PPI];
                        clk->set_state_shutdown = arch_timer_shutdown_phys;
+                       clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
                        clk->set_next_event = arch_timer_set_next_event_phys;
                }
        } else {
@@ -279,10 +289,12 @@ static void __arch_timer_setup(unsigned type,
                clk->cpumask = cpu_all_mask;
                if (arch_timer_mem_use_virtual) {
                        clk->set_state_shutdown = arch_timer_shutdown_virt_mem;
+                       clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_virt_mem;
                } else {
                        clk->set_state_shutdown = arch_timer_shutdown_phys_mem;
+                       clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_phys_mem;
                }
@@ -757,7 +769,6 @@ static void __init arch_timer_mem_init(struct device_node *np)
        }
 
        cnttidr = readl_relaxed(cntctlbase + CNTTIDR);
-       iounmap(cntctlbase);
 
        /*
         * Try to find a virtual capable frame. Otherwise fall back to a
@@ -765,20 +776,31 @@ static void __init arch_timer_mem_init(struct device_node *np)
         */
        for_each_available_child_of_node(np, frame) {
                int n;
+               u32 cntacr;
 
                if (of_property_read_u32(frame, "frame-number", &n)) {
                        pr_err("arch_timer: Missing frame-number\n");
-                       of_node_put(best_frame);
                        of_node_put(frame);
-                       return;
+                       goto out;
                }
 
-               if (cnttidr & CNTTIDR_VIRT(n)) {
+               /* Try enabling everything, and see what sticks */
+               cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
+                        CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
+               writel_relaxed(cntacr, cntctlbase + CNTACR(n));
+               cntacr = readl_relaxed(cntctlbase + CNTACR(n));
+
+               if ((cnttidr & CNTTIDR_VIRT(n)) &&
+                   !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
                        of_node_put(best_frame);
                        best_frame = frame;
                        arch_timer_mem_use_virtual = true;
                        break;
                }
+
+               if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
+                       continue;
+
                of_node_put(best_frame);
                best_frame = of_node_get(frame);
        }
@@ -786,24 +808,26 @@ static void __init arch_timer_mem_init(struct device_node *np)
        base = arch_counter_base = of_iomap(best_frame, 0);
        if (!base) {
                pr_err("arch_timer: Can't map frame's registers\n");
-               of_node_put(best_frame);
-               return;
+               goto out;
        }
 
        if (arch_timer_mem_use_virtual)
                irq = irq_of_parse_and_map(best_frame, 1);
        else
                irq = irq_of_parse_and_map(best_frame, 0);
-       of_node_put(best_frame);
+
        if (!irq) {
                pr_err("arch_timer: Frame missing %s irq",
                       arch_timer_mem_use_virtual ? "virt" : "phys");
-               return;
+               goto out;
        }
 
        arch_timer_detect_rate(base, np);
        arch_timer_mem_register(base, irq);
        arch_timer_common_init();
+out:
+       iounmap(cntctlbase);
+       of_node_put(best_frame);
 }
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
                       arch_timer_mem_init);
index d189d8c..9df0d16 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/clockchips.h>
 #include <linux/cpu.h>
 #include <linux/clk.h>
+#include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of.h>
@@ -174,6 +175,7 @@ static int gt_clockevents_init(struct clock_event_device *clk)
        clk->set_state_shutdown = gt_clockevent_shutdown;
        clk->set_state_periodic = gt_clockevent_set_periodic;
        clk->set_state_oneshot = gt_clockevent_shutdown;
+       clk->set_state_oneshot_stopped = gt_clockevent_shutdown;
        clk->set_next_event = gt_clockevent_set_next_event;
        clk->cpumask = cpumask_of(cpu);
        clk->rating = 300;
@@ -221,6 +223,21 @@ static u64 notrace gt_sched_clock_read(void)
 }
 #endif
 
+static unsigned long gt_read_long(void)
+{
+       return readl_relaxed(gt_base + GT_COUNTER0);
+}
+
+static struct delay_timer gt_delay_timer = {
+       .read_current_timer = gt_read_long,
+};
+
+static void __init gt_delay_timer_init(void)
+{
+       gt_delay_timer.freq = gt_clk_rate;
+       register_current_timer_delay(&gt_delay_timer);
+}
+
 static void __init gt_clocksource_init(void)
 {
        writel(0, gt_base + GT_CONTROL);
@@ -317,6 +334,7 @@ static void __init global_timer_of_register(struct device_node *np)
        /* Immediately configure the timer on the boot CPU */
        gt_clocksource_init();
        gt_clockevents_init(this_cpu_ptr(gt_evt));
+       gt_delay_timer_init();
 
        return;
 
index ff44082..be09bc0 100644 (file)
@@ -313,6 +313,7 @@ static struct clock_event_device mct_comp_device = {
        .set_state_periodic     = mct_set_state_periodic,
        .set_state_shutdown     = mct_set_state_shutdown,
        .set_state_oneshot      = mct_set_state_shutdown,
+       .set_state_oneshot_stopped = mct_set_state_shutdown,
        .tick_resume            = mct_set_state_shutdown,
 };
 
@@ -452,6 +453,7 @@ static int exynos4_local_timer_setup(struct mct_clock_event_device *mevt)
        evt->set_state_periodic = set_state_periodic;
        evt->set_state_shutdown = set_state_shutdown;
        evt->set_state_oneshot = set_state_shutdown;
+       evt->set_state_oneshot_stopped = set_state_shutdown;
        evt->tick_resume = set_state_shutdown;
        evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
        evt->rating = 450;
index 8c77a52..b991b28 100644 (file)
@@ -122,23 +122,23 @@ static void __init rk_timer_init(struct device_node *np)
        pclk = of_clk_get_by_name(np, "pclk");
        if (IS_ERR(pclk)) {
                pr_err("Failed to get pclk for '%s'\n", TIMER_NAME);
-               return;
+               goto out_unmap;
        }
 
        if (clk_prepare_enable(pclk)) {
                pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME);
-               return;
+               goto out_unmap;
        }
 
        timer_clk = of_clk_get_by_name(np, "timer");
        if (IS_ERR(timer_clk)) {
                pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME);
-               return;
+               goto out_timer_clk;
        }
 
        if (clk_prepare_enable(timer_clk)) {
                pr_err("Failed to enable timer clock\n");
-               return;
+               goto out_timer_clk;
        }
 
        bc_timer.freq = clk_get_rate(timer_clk);
@@ -146,7 +146,7 @@ static void __init rk_timer_init(struct device_node *np)
        irq = irq_of_parse_and_map(np, 0);
        if (!irq) {
                pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME);
-               return;
+               goto out_irq;
        }
 
        ce->name = TIMER_NAME;
@@ -164,10 +164,19 @@ static void __init rk_timer_init(struct device_node *np)
        ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce);
        if (ret) {
                pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret);
-               return;
+               goto out_irq;
        }
 
        clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX);
+
+       return;
+
+out_irq:
+       clk_disable_unprepare(timer_clk);
+out_timer_clk:
+       clk_disable_unprepare(pclk);
+out_unmap:
+       iounmap(bc_timer.base);
 }
 
 CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init);
index 6ee9140..4da2af9 100644 (file)
@@ -98,7 +98,8 @@ static int tc_shutdown(struct clock_event_device *d)
 
        __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
        __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
-       clk_disable(tcd->clk);
+       if (!clockevent_state_detached(d))
+               clk_disable(tcd->clk);
 
        return 0;
 }
index 1316876..daae61e 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/clk.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
@@ -43,6 +44,7 @@
 struct lpc32xx_clock_event_ddata {
        struct clock_event_device evtdev;
        void __iomem *base;
+       u32 ticks_per_jiffy;
 };
 
 /* Needed for the sched clock */
@@ -53,6 +55,15 @@ static u64 notrace lpc32xx_read_sched_clock(void)
        return readl(clocksource_timer_counter);
 }
 
+static unsigned long lpc32xx_delay_timer_read(void)
+{
+       return readl(clocksource_timer_counter);
+}
+
+static struct delay_timer lpc32xx_delay_timer = {
+       .read_current_timer = lpc32xx_delay_timer_read,
+};
+
 static int lpc32xx_clkevt_next_event(unsigned long delta,
                                     struct clock_event_device *evtdev)
 {
@@ -60,14 +71,13 @@ static int lpc32xx_clkevt_next_event(unsigned long delta,
                container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
 
        /*
-        * Place timer in reset and program the delta in the prescale
-        * register (PR). When the prescale counter matches the value
-        * in PR the counter register is incremented and the compare
-        * match will trigger. After setup the timer is released from
-        * reset and enabled.
+        * Place timer in reset and program the delta in the match
+        * channel 0 (MR0). When the timer counter matches the value
+        * in MR0 register the match will trigger an interrupt.
+        * After setup the timer is released from reset and enabled.
         */
        writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
-       writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR);
+       writel_relaxed(delta, ddata->base + LPC32XX_TIMER_MR0);
        writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);
 
        return 0;
@@ -86,11 +96,39 @@ static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev)
 
 static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev)
 {
+       struct lpc32xx_clock_event_ddata *ddata =
+               container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
+
        /*
         * When using oneshot, we must also disable the timer
         * to wait for the first call to set_next_event().
         */
-       return lpc32xx_clkevt_shutdown(evtdev);
+       writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR);
+
+       /* Enable interrupt, reset on match and stop on match (MCR). */
+       writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R |
+                      LPC32XX_TIMER_MCR_MR0S, ddata->base + LPC32XX_TIMER_MCR);
+       return 0;
+}
+
+static int lpc32xx_clkevt_periodic(struct clock_event_device *evtdev)
+{
+       struct lpc32xx_clock_event_ddata *ddata =
+               container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
+
+       /* Enable interrupt and reset on match. */
+       writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R,
+                      ddata->base + LPC32XX_TIMER_MCR);
+
+       /*
+        * Place timer in reset and program the delta in the match
+        * channel 0 (MR0).
+        */
+       writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
+       writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0);
+       writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);
+
+       return 0;
 }
 
 static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id)
@@ -108,11 +146,13 @@ static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id)
 static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = {
        .evtdev = {
                .name                   = "lpc3220 clockevent",
-               .features               = CLOCK_EVT_FEAT_ONESHOT,
+               .features               = CLOCK_EVT_FEAT_ONESHOT |
+                                         CLOCK_EVT_FEAT_PERIODIC,
                .rating                 = 300,
                .set_next_event         = lpc32xx_clkevt_next_event,
                .set_state_shutdown     = lpc32xx_clkevt_shutdown,
                .set_state_oneshot      = lpc32xx_clkevt_oneshot,
+               .set_state_periodic     = lpc32xx_clkevt_periodic,
        },
 };
 
@@ -162,6 +202,8 @@ static int __init lpc32xx_clocksource_init(struct device_node *np)
        }
 
        clocksource_timer_counter = base + LPC32XX_TIMER_TC;
+       lpc32xx_delay_timer.freq = rate;
+       register_current_timer_delay(&lpc32xx_delay_timer);
        sched_clock_register(lpc32xx_read_sched_clock, 32, rate);
 
        return 0;
@@ -210,18 +252,16 @@ static int __init lpc32xx_clockevent_init(struct device_node *np)
 
        /*
         * Disable timer and clear any pending interrupt (IR) on match
-        * channel 0 (MR0). Configure a compare match value of 1 on MR0
-        * and enable interrupt, reset on match and stop on match (MCR).
+        * channel 0 (MR0). Clear the prescaler as it's not used.
         */
        writel_relaxed(0, base + LPC32XX_TIMER_TCR);
+       writel_relaxed(0, base + LPC32XX_TIMER_PR);
        writel_relaxed(0, base + LPC32XX_TIMER_CTCR);
        writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR);
-       writel_relaxed(1, base + LPC32XX_TIMER_MR0);
-       writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R |
-                      LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR);
 
        rate = clk_get_rate(clk);
        lpc32xx_clk_event_ddata.base = base;
+       lpc32xx_clk_event_ddata.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ);
        clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev,
                                        rate, 1, -1);
 
index 659879a..f935110 100644 (file)
@@ -296,6 +296,7 @@ endif
 config QORIQ_CPUFREQ
        tristate "CPU frequency scaling driver for Freescale QorIQ SoCs"
        depends on OF && COMMON_CLK && (PPC_E500MC || ARM)
+       depends on !CPU_THERMAL || THERMAL
        select CLK_QORIQ
        help
          This adds the CPUFreq driver support for Freescale QorIQ SoCs
index 0031069..14b1f93 100644 (file)
@@ -84,10 +84,10 @@ config ARM_KIRKWOOD_CPUFREQ
          SoCs.
 
 config ARM_MT8173_CPUFREQ
-       bool "Mediatek MT8173 CPUFreq support"
+       tristate "Mediatek MT8173 CPUFreq support"
        depends on ARCH_MEDIATEK && REGULATOR
        depends on ARM64 || (ARM_CPU_TOPOLOGY && COMPILE_TEST)
-       depends on !CPU_THERMAL || THERMAL=y
+       depends on !CPU_THERMAL || THERMAL
        select PM_OPP
        help
          This adds the CPUFreq driver support for Mediatek MT8173 SoC.
index 9bc37c4..0ca74d0 100644 (file)
@@ -142,15 +142,16 @@ static int allocate_resources(int cpu, struct device **cdev,
 
 try_again:
        cpu_reg = regulator_get_optional(cpu_dev, reg);
-       if (IS_ERR(cpu_reg)) {
+       ret = PTR_ERR_OR_ZERO(cpu_reg);
+       if (ret) {
                /*
                 * If cpu's regulator supply node is present, but regulator is
                 * not yet registered, we should try defering probe.
                 */
-               if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
+               if (ret == -EPROBE_DEFER) {
                        dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n",
                                cpu);
-                       return -EPROBE_DEFER;
+                       return ret;
                }
 
                /* Try with "cpu-supply" */
@@ -159,18 +160,16 @@ try_again:
                        goto try_again;
                }
 
-               dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n",
-                       cpu, PTR_ERR(cpu_reg));
+               dev_dbg(cpu_dev, "no regulator for cpu%d: %d\n", cpu, ret);
        }
 
        cpu_clk = clk_get(cpu_dev, NULL);
-       if (IS_ERR(cpu_clk)) {
+       ret = PTR_ERR_OR_ZERO(cpu_clk);
+       if (ret) {
                /* put regulator */
                if (!IS_ERR(cpu_reg))
                        regulator_put(cpu_reg);
 
-               ret = PTR_ERR(cpu_clk);
-
                /*
                 * If cpu's clk node is present, but clock is not yet
                 * registered, we should try defering probe.
index c35e7da..e979ec7 100644 (file)
@@ -48,11 +48,11 @@ static struct cpufreq_policy *next_policy(struct cpufreq_policy *policy,
                                          bool active)
 {
        do {
-               policy = list_next_entry(policy, policy_list);
-
                /* No more policies in the list */
-               if (&policy->policy_list == &cpufreq_policy_list)
+               if (list_is_last(&policy->policy_list, &cpufreq_policy_list))
                        return NULL;
+
+               policy = list_next_entry(policy, policy_list);
        } while (!suitable_policy(policy, active));
 
        return policy;
index bab3a51..e0d1110 100644 (file)
@@ -387,16 +387,18 @@ static int cpufreq_governor_init(struct cpufreq_policy *policy,
        if (!have_governor_per_policy())
                cdata->gdbs_data = dbs_data;
 
+       policy->governor_data = dbs_data;
+
        ret = sysfs_create_group(get_governor_parent_kobj(policy),
                                 get_sysfs_attr(dbs_data));
        if (ret)
                goto reset_gdbs_data;
 
-       policy->governor_data = dbs_data;
-
        return 0;
 
 reset_gdbs_data:
+       policy->governor_data = NULL;
+
        if (!have_governor_per_policy())
                cdata->gdbs_data = NULL;
        cdata->exit(dbs_data, !policy->governor->initialized);
@@ -417,16 +419,19 @@ static int cpufreq_governor_exit(struct cpufreq_policy *policy,
        if (!cdbs->shared || cdbs->shared->policy)
                return -EBUSY;
 
-       policy->governor_data = NULL;
        if (!--dbs_data->usage_count) {
                sysfs_remove_group(get_governor_parent_kobj(policy),
                                   get_sysfs_attr(dbs_data));
 
+               policy->governor_data = NULL;
+
                if (!have_governor_per_policy())
                        cdata->gdbs_data = NULL;
 
                cdata->exit(dbs_data, policy->governor->initialized == 1);
                kfree(dbs_data);
+       } else {
+               policy->governor_data = NULL;
        }
 
        free_common_dbs_info(policy, cdata);
index cd83d47..3a4b39a 100644 (file)
@@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void)
        if (!all_cpu_data)
                return -ENOMEM;
 
-       if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) {
+       if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) {
                pr_info("intel_pstate: HWP enabled\n");
                hwp_active++;
        }
index 1efba34..2058e6d 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/cpu_cooling.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
index 1d99c97..0963772 100644 (file)
@@ -202,7 +202,7 @@ static void __init pxa_cpufreq_init_voltages(void)
        }
 }
 #else
-static int pxa_cpufreq_change_voltage(struct pxa_freqs *pxa_freq)
+static int pxa_cpufreq_change_voltage(const struct pxa_freqs *pxa_freq)
 {
        return 0;
 }
index 344058f..d5657d5 100644 (file)
@@ -119,7 +119,6 @@ struct cpuidle_coupled {
 
 #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
 
-static DEFINE_MUTEX(cpuidle_coupled_lock);
 static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
 
 /*
index 046423b..f996efc 100644 (file)
@@ -153,7 +153,7 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev)
         * be frozen safely.
         */
        index = find_deepest_state(drv, dev, UINT_MAX, 0, true);
-       if (index >= 0)
+       if (index > 0)
                enter_freeze_proper(drv, dev, index);
 
        return index;
index 6dd3317..3eb3f12 100644 (file)
@@ -400,7 +400,7 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)
 {
        int err;
 
-       err = clk_prepare_enable(dd->iclk);
+       err = clk_enable(dd->iclk);
        if (err)
                return err;
 
@@ -430,7 +430,7 @@ static int atmel_aes_hw_version_init(struct atmel_aes_dev *dd)
 
        dev_info(dd->dev, "version: 0x%x\n", dd->hw_version);
 
-       clk_disable_unprepare(dd->iclk);
+       clk_disable(dd->iclk);
        return 0;
 }
 
@@ -448,7 +448,7 @@ static inline bool atmel_aes_is_encrypt(const struct atmel_aes_dev *dd)
 
 static inline int atmel_aes_complete(struct atmel_aes_dev *dd, int err)
 {
-       clk_disable_unprepare(dd->iclk);
+       clk_disable(dd->iclk);
        dd->flags &= ~AES_FLAGS_BUSY;
 
        if (dd->is_async)
@@ -2091,10 +2091,14 @@ static int atmel_aes_probe(struct platform_device *pdev)
                goto res_err;
        }
 
-       err = atmel_aes_hw_version_init(aes_dd);
+       err = clk_prepare(aes_dd->iclk);
        if (err)
                goto res_err;
 
+       err = atmel_aes_hw_version_init(aes_dd);
+       if (err)
+               goto iclk_unprepare;
+
        atmel_aes_get_cap(aes_dd);
 
        err = atmel_aes_buff_init(aes_dd);
@@ -2127,6 +2131,8 @@ err_algs:
 err_aes_dma:
        atmel_aes_buff_cleanup(aes_dd);
 err_aes_buff:
+iclk_unprepare:
+       clk_unprepare(aes_dd->iclk);
 res_err:
        tasklet_kill(&aes_dd->done_task);
        tasklet_kill(&aes_dd->queue_task);
@@ -2155,6 +2161,8 @@ static int atmel_aes_remove(struct platform_device *pdev)
        atmel_aes_dma_cleanup(aes_dd);
        atmel_aes_buff_cleanup(aes_dd);
 
+       clk_unprepare(aes_dd->iclk);
+
        return 0;
 }
 
index 20de861..8bf9914 100644 (file)
@@ -782,7 +782,7 @@ static void atmel_sha_finish_req(struct ahash_request *req, int err)
        dd->flags &= ~(SHA_FLAGS_BUSY | SHA_FLAGS_FINAL | SHA_FLAGS_CPU |
                        SHA_FLAGS_DMA_READY | SHA_FLAGS_OUTPUT_READY);
 
-       clk_disable_unprepare(dd->iclk);
+       clk_disable(dd->iclk);
 
        if (req->base.complete)
                req->base.complete(&req->base, err);
@@ -795,7 +795,7 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
 {
        int err;
 
-       err = clk_prepare_enable(dd->iclk);
+       err = clk_enable(dd->iclk);
        if (err)
                return err;
 
@@ -822,7 +822,7 @@ static void atmel_sha_hw_version_init(struct atmel_sha_dev *dd)
        dev_info(dd->dev,
                        "version: 0x%x\n", dd->hw_version);
 
-       clk_disable_unprepare(dd->iclk);
+       clk_disable(dd->iclk);
 }
 
 static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
@@ -1410,6 +1410,10 @@ static int atmel_sha_probe(struct platform_device *pdev)
                goto res_err;
        }
 
+       err = clk_prepare(sha_dd->iclk);
+       if (err)
+               goto res_err;
+
        atmel_sha_hw_version_init(sha_dd);
 
        atmel_sha_get_cap(sha_dd);
@@ -1421,12 +1425,12 @@ static int atmel_sha_probe(struct platform_device *pdev)
                        if (IS_ERR(pdata)) {
                                dev_err(&pdev->dev, "platform data not available\n");
                                err = PTR_ERR(pdata);
-                               goto res_err;
+                               goto iclk_unprepare;
                        }
                }
                if (!pdata->dma_slave) {
                        err = -ENXIO;
-                       goto res_err;
+                       goto iclk_unprepare;
                }
                err = atmel_sha_dma_init(sha_dd, pdata);
                if (err)
@@ -1457,6 +1461,8 @@ err_algs:
        if (sha_dd->caps.has_dma)
                atmel_sha_dma_cleanup(sha_dd);
 err_sha_dma:
+iclk_unprepare:
+       clk_unprepare(sha_dd->iclk);
 res_err:
        tasklet_kill(&sha_dd->done_task);
 sha_dd_err:
@@ -1483,12 +1489,7 @@ static int atmel_sha_remove(struct platform_device *pdev)
        if (sha_dd->caps.has_dma)
                atmel_sha_dma_cleanup(sha_dd);
 
-       iounmap(sha_dd->io_base);
-
-       clk_put(sha_dd->iclk);
-
-       if (sha_dd->irq >= 0)
-               free_irq(sha_dd->irq, sha_dd);
+       clk_unprepare(sha_dd->iclk);
 
        return 0;
 }
index 8abb4bc..69d4a13 100644 (file)
@@ -534,8 +534,8 @@ static int caam_probe(struct platform_device *pdev)
         * long pointers in master configuration register
         */
        clrsetbits_32(&ctrl->mcr, MCFGR_AWCACHE_MASK, MCFGR_AWCACHE_CACH |
-                     MCFGR_WDENABLE | (sizeof(dma_addr_t) == sizeof(u64) ?
-                                       MCFGR_LONG_PTR : 0));
+                     MCFGR_AWCACHE_BUFF | MCFGR_WDENABLE |
+                     (sizeof(dma_addr_t) == sizeof(u64) ? MCFGR_LONG_PTR : 0));
 
        /*
         *  Read the Compile Time paramters and SCFGR to determine
index 0643e33..c0656e7 100644 (file)
@@ -306,7 +306,7 @@ static int mv_cesa_dev_dma_init(struct mv_cesa_dev *cesa)
                return -ENOMEM;
 
        dma->padding_pool = dmam_pool_create("cesa_padding", dev, 72, 1, 0);
-       if (!dma->cache_pool)
+       if (!dma->padding_pool)
                return -ENOMEM;
 
        cesa->dma = dma;
index 848b93e..fe9dce0 100644 (file)
@@ -500,6 +500,8 @@ static int tegra_devfreq_target(struct device *dev, unsigned long *freq,
        clk_set_min_rate(tegra->emc_clock, rate);
        clk_set_rate(tegra->emc_clock, 0);
 
+       *freq = rate;
+
        return 0;
 }
 
index 64f5d1b..8e304b1 100644 (file)
 #define AT_XDMAC_MAX_CHAN      0x20
 #define AT_XDMAC_MAX_CSIZE     16      /* 16 data */
 #define AT_XDMAC_MAX_DWIDTH    8       /* 64 bits */
+#define AT_XDMAC_RESIDUE_MAX_RETRIES   5
 
 #define AT_XDMAC_DMA_BUSWIDTHS\
        (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -1395,8 +1396,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
        struct at_xdmac_desc    *desc, *_desc;
        struct list_head        *descs_list;
        enum dma_status         ret;
-       int                     residue;
-       u32                     cur_nda, mask, value;
+       int                     residue, retry;
+       u32                     cur_nda, check_nda, cur_ubc, mask, value;
        u8                      dwidth = 0;
        unsigned long           flags;
 
@@ -1433,7 +1434,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                        cpu_relax();
        }
 
+       /*
+        * When processing the residue, we need to read two registers but we
+        * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where
+        * we stand in the descriptor list and AT_XDMAC_CUBC is used
+        * to know how many data are remaining for the current descriptor.
+        * Since the dma channel is not paused to not loose data, between the
+        * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of
+        * descriptor.
+        * For that reason, after reading AT_XDMAC_CUBC, we check if we are
+        * still using the same descriptor by reading a second time
+        * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to
+        * read again AT_XDMAC_CUBC.
+        * Memory barriers are used to ensure the read order of the registers.
+        * A max number of retries is set because unlikely it can never ends if
+        * we are transferring a lot of data with small buffers.
+        */
        cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+       rmb();
+       cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) {
+               rmb();
+               check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc;
+
+               if (likely(cur_nda == check_nda))
+                       break;
+
+               cur_nda = check_nda;
+               rmb();
+               cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC);
+       }
+
+       if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) {
+               ret = DMA_ERROR;
+               goto spin_unlock;
+       }
+
        /*
         * Remove size of all microblocks already transferred and the current
         * one. Then add the remaining size to transfer of the current
@@ -1446,7 +1482,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
                        break;
        }
-       residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
+       residue += cur_ubc << dwidth;
 
        dma_set_residue(txstate, residue);
 
index e893318..5ad0ec1 100644 (file)
@@ -156,7 +156,6 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
 
        /* Enable interrupts */
        channel_set_bit(dw, MASK.XFER, dwc->mask);
-       channel_set_bit(dw, MASK.BLOCK, dwc->mask);
        channel_set_bit(dw, MASK.ERROR, dwc->mask);
 
        dwc->initialized = true;
@@ -588,6 +587,9 @@ static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 
                spin_unlock_irqrestore(&dwc->lock, flags);
        }
+
+       /* Re-enable interrupts */
+       channel_set_bit(dw, MASK.BLOCK, dwc->mask);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -618,11 +620,8 @@ static void dw_dma_tasklet(unsigned long data)
                        dwc_scan_descriptors(dw, dwc);
        }
 
-       /*
-        * Re-enable interrupts.
-        */
+       /* Re-enable interrupts */
        channel_set_bit(dw, MASK.XFER, dw->all_chan_mask);
-       channel_set_bit(dw, MASK.BLOCK, dw->all_chan_mask);
        channel_set_bit(dw, MASK.ERROR, dw->all_chan_mask);
 }
 
@@ -1261,6 +1260,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 int dw_dma_cyclic_start(struct dma_chan *chan)
 {
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(chan->device);
        unsigned long           flags;
 
        if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) {
@@ -1269,7 +1269,12 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
        }
 
        spin_lock_irqsave(&dwc->lock, flags);
+
+       /* Enable interrupts to perform cyclic transfer */
+       channel_set_bit(dw, MASK.BLOCK, dwc->mask);
+
        dwc_dostart(dwc, dwc->cdesc->desc[0]);
+
        spin_unlock_irqrestore(&dwc->lock, flags);
 
        return 0;
index 4c30fdd..358f968 100644 (file)
@@ -108,6 +108,10 @@ static const struct pci_device_id dw_pci_id_table[] = {
 
        /* Haswell */
        { PCI_VDEVICE(INTEL, 0x9c60) },
+
+       /* Broadwell */
+       { PCI_VDEVICE(INTEL, 0x9ce0) },
+
        { }
 };
 MODULE_DEVICE_TABLE(pci, dw_pci_id_table);
index d92d655..e3d7fcb 100644 (file)
 #define GET_NUM_REGN(x)                ((x & 0x300000) >> 20) /* bits 20-21 */
 #define CHMAP_EXIST            BIT(24)
 
+/* CCSTAT register */
+#define EDMA_CCSTAT_ACTV       BIT(4)
+
 /*
  * Max of 20 segments per channel to conserve PaRAM slots
  * Also note that MAX_NR_SG should be atleast the no.of periods
@@ -1680,9 +1683,20 @@ static void edma_issue_pending(struct dma_chan *chan)
        spin_unlock_irqrestore(&echan->vchan.lock, flags);
 }
 
+/*
+ * This limit exists to avoid a possible infinite loop when waiting for proof
+ * that a particular transfer is completed. This limit can be hit if there
+ * are large bursts to/from slow devices or the CPU is never able to catch
+ * the DMA hardware idle. On an AM335x transfering 48 bytes from the UART
+ * RX-FIFO, as many as 55 loops have been seen.
+ */
+#define EDMA_MAX_TR_WAIT_LOOPS 1000
+
 static u32 edma_residue(struct edma_desc *edesc)
 {
        bool dst = edesc->direction == DMA_DEV_TO_MEM;
+       int loop_count = EDMA_MAX_TR_WAIT_LOOPS;
+       struct edma_chan *echan = edesc->echan;
        struct edma_pset *pset = edesc->pset;
        dma_addr_t done, pos;
        int i;
@@ -1691,7 +1705,32 @@ static u32 edma_residue(struct edma_desc *edesc)
         * We always read the dst/src position from the first RamPar
         * pset. That's the one which is active now.
         */
-       pos = edma_get_position(edesc->echan->ecc, edesc->echan->slot[0], dst);
+       pos = edma_get_position(echan->ecc, echan->slot[0], dst);
+
+       /*
+        * "pos" may represent a transfer request that is still being
+        * processed by the EDMACC or EDMATC. We will busy wait until
+        * any one of the situations occurs:
+        *   1. the DMA hardware is idle
+        *   2. a new transfer request is setup
+        *   3. we hit the loop limit
+        */
+       while (edma_read(echan->ecc, EDMA_CCSTAT) & EDMA_CCSTAT_ACTV) {
+               /* check if a new transfer request is setup */
+               if (edma_get_position(echan->ecc,
+                                     echan->slot[0], dst) != pos) {
+                       break;
+               }
+
+               if (!--loop_count) {
+                       dev_dbg_ratelimited(echan->vchan.chan.device->dev,
+                               "%s: timeout waiting for PaRAM update\n",
+                               __func__);
+                       break;
+               }
+
+               cpu_relax();
+       }
 
        /*
         * Cyclic is simple. Just subtract pset[0].addr from pos.
index 2209f75..aac85c3 100644 (file)
@@ -522,6 +522,8 @@ static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
                        chan_dbg(chan, "LD %p callback\n", desc);
                        txd->callback(txd->callback_param);
                }
+
+               dma_descriptor_unmap(txd);
        }
 
        /* Run any dependencies */
index 1d5df2e..21539d5 100644 (file)
@@ -861,32 +861,42 @@ void ioat_timer_event(unsigned long data)
                        return;
        }
 
+       spin_lock_bh(&ioat_chan->cleanup_lock);
+
+       /* handle the no-actives case */
+       if (!ioat_ring_active(ioat_chan)) {
+               spin_lock_bh(&ioat_chan->prep_lock);
+               check_active(ioat_chan);
+               spin_unlock_bh(&ioat_chan->prep_lock);
+               spin_unlock_bh(&ioat_chan->cleanup_lock);
+               return;
+       }
+
        /* if we haven't made progress and we have already
         * acknowledged a pending completion once, then be more
         * forceful with a restart
         */
-       spin_lock_bh(&ioat_chan->cleanup_lock);
        if (ioat_cleanup_preamble(ioat_chan, &phys_complete))
                __cleanup(ioat_chan, phys_complete);
        else if (test_bit(IOAT_COMPLETION_ACK, &ioat_chan->state)) {
+               u32 chanerr;
+
+               chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
+               dev_warn(to_dev(ioat_chan), "Restarting channel...\n");
+               dev_warn(to_dev(ioat_chan), "CHANSTS: %#Lx CHANERR: %#x\n",
+                        status, chanerr);
+               dev_warn(to_dev(ioat_chan), "Active descriptors: %d\n",
+                        ioat_ring_active(ioat_chan));
+
                spin_lock_bh(&ioat_chan->prep_lock);
                ioat_restart_channel(ioat_chan);
                spin_unlock_bh(&ioat_chan->prep_lock);
                spin_unlock_bh(&ioat_chan->cleanup_lock);
                return;
-       } else {
+       } else
                set_bit(IOAT_COMPLETION_ACK, &ioat_chan->state);
-               mod_timer(&ioat_chan->timer, jiffies + COMPLETION_TIMEOUT);
-       }
-
 
-       if (ioat_ring_active(ioat_chan))
-               mod_timer(&ioat_chan->timer, jiffies + COMPLETION_TIMEOUT);
-       else {
-               spin_lock_bh(&ioat_chan->prep_lock);
-               check_active(ioat_chan);
-               spin_unlock_bh(&ioat_chan->prep_lock);
-       }
+       mod_timer(&ioat_chan->timer, jiffies + COMPLETION_TIMEOUT);
        spin_unlock_bh(&ioat_chan->cleanup_lock);
 }
 
index e4f4312..f039cfa 100644 (file)
@@ -1300,10 +1300,10 @@ static int iop_adma_probe(struct platform_device *pdev)
         * note: writecombine gives slightly better performance, but
         * requires that we explicitly flush the writes
         */
-       adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev,
-                                                         plat_data->pool_size,
-                                                         &adev->dma_desc_pool,
-                                                         GFP_KERNEL);
+       adev->dma_desc_pool_virt = dma_alloc_wc(&pdev->dev,
+                                               plat_data->pool_size,
+                                               &adev->dma_desc_pool,
+                                               GFP_KERNEL);
        if (!adev->dma_desc_pool_virt) {
                ret = -ENOMEM;
                goto err_free_adev;
index 14091f8..3922a5d 100644 (file)
@@ -964,8 +964,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
         * requires that we explicitly flush the writes
         */
        mv_chan->dma_desc_pool_virt =
-         dma_alloc_writecombine(&pdev->dev, MV_XOR_POOL_SIZE,
-                                &mv_chan->dma_desc_pool, GFP_KERNEL);
+         dma_alloc_wc(&pdev->dev, MV_XOR_POOL_SIZE, &mv_chan->dma_desc_pool,
+                      GFP_KERNEL);
        if (!mv_chan->dma_desc_pool_virt)
                return ERR_PTR(-ENOMEM);
 
index f2a0310..debca82 100644 (file)
@@ -583,6 +583,8 @@ static void set_updater_desc(struct pxad_desc_sw *sw_desc,
                (PXA_DCMD_LENGTH & sizeof(u32));
        if (flags & DMA_PREP_INTERRUPT)
                updater->dcmd |= PXA_DCMD_ENDIRQEN;
+       if (sw_desc->cyclic)
+               sw_desc->hw_desc[sw_desc->nb_desc - 2]->ddadr = sw_desc->first;
 }
 
 static bool is_desc_completed(struct virt_dma_desc *vd)
@@ -673,6 +675,10 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
                dev_dbg(&chan->vc.chan.dev->device,
                        "%s(): checking txd %p[%x]: completed=%d\n",
                        __func__, vd, vd->tx.cookie, is_desc_completed(vd));
+               if (to_pxad_sw_desc(vd)->cyclic) {
+                       vchan_cyclic_callback(vd);
+                       break;
+               }
                if (is_desc_completed(vd)) {
                        list_del(&vd->node);
                        vchan_cookie_complete(vd);
@@ -1080,7 +1086,7 @@ pxad_prep_dma_cyclic(struct dma_chan *dchan,
                return NULL;
 
        pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr);
-       dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH | period_len);
+       dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH & period_len);
        dev_dbg(&chan->vc.chan.dev->device,
                "%s(): buf_addr=0x%lx len=%zu period=%zu dir=%d flags=%lx\n",
                __func__, (unsigned long)buf_addr, len, period_len, dir, flags);
index 5a250cd..d34aef7 100644 (file)
@@ -502,8 +502,8 @@ static int bam_alloc_chan(struct dma_chan *chan)
                return 0;
 
        /* allocate FIFO descriptor space, but only if necessary */
-       bchan->fifo_virt = dma_alloc_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                               &bchan->fifo_phys, GFP_KERNEL);
+       bchan->fifo_virt = dma_alloc_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                                       &bchan->fifo_phys, GFP_KERNEL);
 
        if (!bchan->fifo_virt) {
                dev_err(bdev->dev, "Failed to allocate desc fifo\n");
@@ -538,8 +538,8 @@ static void bam_free_chan(struct dma_chan *chan)
        bam_reset_channel(bchan);
        spin_unlock_irqrestore(&bchan->vc.lock, flags);
 
-       dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
-                               bchan->fifo_phys);
+       dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
+                   bchan->fifo_phys);
        bchan->fifo_virt = NULL;
 
        /* mask irq for pipe/channel */
@@ -1231,9 +1231,9 @@ static int bam_dma_remove(struct platform_device *pdev)
                bam_dma_terminate_all(&bdev->channels[i].vc.chan);
                tasklet_kill(&bdev->channels[i].vc.task);
 
-               dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE,
-                       bdev->channels[i].fifo_virt,
-                       bdev->channels[i].fifo_phys);
+               dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE,
+                           bdev->channels[i].fifo_virt,
+                           bdev->channels[i].fifo_phys);
        }
 
        tasklet_kill(&bdev->task);
index e3a945c..49768c0 100644 (file)
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
        "Status Register File",
 };
 
+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+       "Load queue parity",
+       "Store queue parity",
+       "Miss address buffer payload parity",
+       "L1 TLB parity",
+       "",                                             /* reserved */
+       "DC tag error type 6",
+       "DC tag error type 1",
+       "Internal error type 1",
+       "Internal error type 2",
+       "Sys Read data error thread 0",
+       "Sys read data error thread 1",
+       "DC tag error type 2",
+       "DC data error type 1 (poison comsumption)",
+       "DC data error type 2",
+       "DC data error type 3",
+       "DC tag error type 4",
+       "L2 TLB parity",
+       "PDC parity error",
+       "DC tag error type 3",
+       "DC tag error type 5",
+       "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+       "microtag probe port parity error",
+       "IC microtag or full tag multi-hit error",
+       "IC full tag parity",
+       "IC data array parity",
+       "Decoupling queue phys addr parity error",
+       "L0 ITLB parity error",
+       "L1 ITLB parity error",
+       "L2 ITLB parity error",
+       "BPQ snoop parity on Thread 0",
+       "BPQ snoop parity on Thread 1",
+       "L1 BTB multi-match error",
+       "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+       "L2M tag multi-way-hit error",
+       "L2M tag ECC error",
+       "L2M data ECC error",
+       "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+       "uop cache tag parity error",
+       "uop cache data parity error",
+       "Insn buffer parity error",
+       "Insn dispatch queue parity error",
+       "Fetch address FIFO parity",
+       "Patch RAM data parity",
+       "Patch RAM sequencer parity",
+       "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+       "Watchdog timeout error",
+       "Phy register file parity",
+       "Flag register file parity",
+       "Immediate displacement register file parity",
+       "Address generator payload parity",
+       "EX payload parity",
+       "Checkpoint queue parity",
+       "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+       "Physical register file parity",
+       "Freelist parity error",
+       "Schedule queue parity",
+       "NSQ parity error",
+       "Retire queue parity",
+       "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+       "Shadow tag macro ECC error",
+       "Shadow tag macro multi-way-hit error",
+       "L3M tag ECC error",
+       "L3M tag multi-way-hit error",
+       "L3M data ECC error",
+       "XI parity, L3 fill done channel error",
+       "L3 victim queue parity",
+       "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+       "Illegal request from transport layer",
+       "Address violation",
+       "Security violation",
+       "Illegal response from transport layer",
+       "Unexpected response",
+       "Parity error on incoming request or probe response data",
+       "Parity error on incoming read response data",
+       "Atomic request parity",
+       "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+       "HW assert",
+       "Internal PIE register security violation",
+       "Error on GMI link",
+       "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+       "DRAM ECC error",
+       "Data poison error on DRAM",
+       "SDP parity error",
+       "Advanced peripheral bus error",
+       "Command/address parity error",
+       "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+       "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+       "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+       "SMU RAM ECC or parity error",
+};
+
 static bool f12h_mc0_mce(u16 ec, u8 xec)
 {
        bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 }
 
+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+                                  unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       switch (mca_type) {
+       case SMCA_LS:
+               error_desc_array = f17h_ls_mce_desc;
+               len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+               if (xec == 0x4) {
+                       pr_cont("Unrecognized LS MCA error code.\n");
+                       return;
+               }
+               break;
+
+       case SMCA_IF:
+               error_desc_array = f17h_if_mce_desc;
+               len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+               break;
+
+       case SMCA_L2_CACHE:
+               error_desc_array = f17h_l2_mce_desc;
+               len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+               break;
+
+       case SMCA_DE:
+               error_desc_array = f17h_de_mce_desc;
+               len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+               break;
+
+       case SMCA_EX:
+               error_desc_array = f17h_ex_mce_desc;
+               len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+               break;
+
+       case SMCA_FP:
+               error_desc_array = f17h_fp_mce_desc;
+               len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+               break;
+
+       case SMCA_L3_CACHE:
+               error_desc_array = f17h_l3_mce_desc;
+               len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA core error info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_core_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+       const char * const *error_desc_array;
+       size_t len;
+
+       pr_emerg(HW_ERR "Data Fabric Error: ");
+
+       switch (mca_type) {
+       case  SMCA_CS:
+               error_desc_array = f17h_cs_mce_desc;
+               len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+               break;
+
+       case SMCA_PIE:
+               error_desc_array = f17h_pie_mce_desc;
+               len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+               break;
+
+       default:
+               pr_cont("Corrupted MCA Data Fabric info.\n");
+               return;
+       }
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n",
+                        amd_df_mcablock_names[mca_type]);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+       u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+       unsigned int hwid, mca_type, i;
+       u8 xec = XEC(m->status, xec_mask);
+       const char * const *error_desc_array;
+       const char *ip_name;
+       u32 low, high;
+       size_t len;
+
+       if (rdmsr_safe(addr, &low, &high)) {
+               pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+               return;
+       }
+
+       hwid = high & MCI_IPID_HWID;
+       mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+       pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+       /*
+        * Based on hwid and mca_type values, decode errors from respective IPs.
+        * Note: mca_type values make sense only in the context of an hwid.
+        */
+       for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+               if (amd_hwids[i].hwid == hwid)
+                       break;
+
+       switch (i) {
+       case SMCA_F17H_CORE:
+               ip_name = (mca_type == SMCA_L3_CACHE) ?
+                         "L3 Cache" : "F17h Core";
+               return decode_f17h_core_errors(ip_name, xec, mca_type);
+               break;
+
+       case SMCA_DF:
+               return decode_df_errors(xec, mca_type);
+               break;
+
+       case SMCA_UMC:
+               error_desc_array = f17h_umc_mce_desc;
+               len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+               break;
+
+       case SMCA_PB:
+               error_desc_array = f17h_pb_mce_desc;
+               len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+               break;
+
+       case SMCA_PSP:
+               error_desc_array = f17h_psp_mce_desc;
+               len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+               break;
+
+       case SMCA_SMU:
+               error_desc_array = f17h_smu_mce_desc;
+               len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+               break;
+
+       default:
+               pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+               return;
+       }
+
+       ip_name = amd_hwids[i].name;
+       pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+       if (xec > len) {
+               pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+               return;
+       }
+
+       pr_cont("%s.\n", error_desc_array[xec]);
+}
+
 static inline void amd_decode_err_code(u16 ec)
 {
        if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        struct mce *m = (struct mce *)data;
        struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
        int ecc;
+       u32 ebx = cpuid_ebx(0x80000007);
 
        if (amd_filter_mce(m))
                return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 
-       if (c->x86 == 0x15 || c->x86 == 0x16)
+       if (c->x86 >= 0x15)
                pr_cont("|%s|%s",
                        ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
                        ((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
 
+       if (!!(ebx & BIT(3))) {
+               u32 low, high;
+               u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+               if (!rdmsr_safe(addr, &low, &high) &&
+                   (low & MCI_CONFIG_MCAX))
+                       pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+       }
+
        /* do the two bits[14:13] together */
        ecc = (m->status >> 45) & 0x3;
        if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        if (m->status & MCI_STATUS_ADDRV)
                pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
 
+       if (!!(ebx & BIT(3))) {
+               decode_smca_errors(m);
+               goto err_code;
+       }
+
        if (!fam_ops)
                goto err_code;
 
@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
 static int __init mce_amd_init(void)
 {
        struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 ebx;
 
        if (c->x86_vendor != X86_VENDOR_AMD)
                return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
                fam_ops->mc2_mce = f16h_mc2_mce;
                break;
 
+       case 0x17:
+               ebx = cpuid_ebx(0x80000007);
+               xec_mask = 0x3f;
+               if (!(ebx & BIT(3))) {
+                       printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+                       goto err_out;
+               }
+               break;
+
        default:
                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
-               kfree(fam_ops);
-               fam_ops = NULL;
+               goto err_out;
        }
 
        pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
        mce_register_decode_chain(&amd_mce_dec_nb);
 
        return 0;
+
+err_out:
+       kfree(fam_ops);
+       fam_ops = NULL;
+       return -EINVAL;
 }
 early_initcall(mce_amd_init);
 
index e438ee5..93f0d41 100644 (file)
@@ -1574,7 +1574,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes)
                                for (cha = 0; cha < KNL_MAX_CHAS; cha++) {
                                        if (knl_get_mc_route(target,
                                                mc_route_reg[cha]) == channel
-                                               && participants[channel]) {
+                                               && !participants[channel]) {
                                                participant_count++;
                                                participants[channel] = 1;
                                                break;
@@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
                edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n",
                         n_tads, gb, (mb*1000)/1024,
                         ((u64)tmp_mb) << 20L,
-                        (u32)TAD_SOCK(reg),
-                        (u32)TAD_CH(reg),
+                        (u32)(1 << TAD_SOCK(reg)),
+                        (u32)TAD_CH(reg) + 1,
                         (u32)TAD_TGT0(reg),
                         (u32)TAD_TGT1(reg),
                         (u32)TAD_TGT2(reg),
@@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
        }
 
        ch_way = TAD_CH(reg) + 1;
-       sck_way = TAD_SOCK(reg) + 1;
+       sck_way = 1 << TAD_SOCK(reg);
 
        if (ch_way == 3)
                idx = addr >> 6;
@@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                 n_tads,
                 addr,
                 limit,
-                (u32)TAD_SOCK(reg),
+                sck_way,
                 ch_way,
                 offset,
                 idx,
@@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                        offset, addr);
                return -EINVAL;
        }
-       addr -= offset;
-       /* Store the low bits [0:6] of the addr */
-       ch_addr = addr & 0x7f;
-       /* Remove socket wayness and remove 6 bits */
-       addr >>= 6;
-       addr = div_u64(addr, sck_xch);
-#if 0
-       /* Divide by channel way */
-       addr = addr / ch_way;
-#endif
-       /* Recover the last 6 bits */
-       ch_addr |= addr << 6;
+
+       ch_addr = addr - offset;
+       ch_addr >>= (6 + shiftup);
+       ch_addr /= ch_way * sck_way;
+       ch_addr <<= (6 + shiftup);
+       ch_addr |= addr & ((1 << (6 + shiftup)) - 1);
 
        /*
         * Step 3) Decode rank
index 756eca8..10e6774 100644 (file)
@@ -221,7 +221,7 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor,
        }
 
        if ((attributes & ~EFI_VARIABLE_MASK) != 0 ||
-           efivar_validate(name, data, size) == false) {
+           efivar_validate(vendor, name, data, size) == false) {
                printk(KERN_ERR "efivars: Malformed variable content\n");
                return -EINVAL;
        }
@@ -447,7 +447,8 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
        }
 
        if ((attributes & ~EFI_VARIABLE_MASK) != 0 ||
-           efivar_validate(name, data, size) == false) {
+           efivar_validate(new_var->VendorGuid, name, data,
+                           size) == false) {
                printk(KERN_ERR "efivars: Malformed variable content\n");
                return -EINVAL;
        }
@@ -540,38 +541,30 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 static int
 efivar_create_sysfs_entry(struct efivar_entry *new_var)
 {
-       int i, short_name_size;
+       int short_name_size;
        char *short_name;
-       unsigned long variable_name_size;
-       efi_char16_t *variable_name;
+       unsigned long utf8_name_size;
+       efi_char16_t *variable_name = new_var->var.VariableName;
        int ret;
 
-       variable_name = new_var->var.VariableName;
-       variable_name_size = ucs2_strlen(variable_name) * sizeof(efi_char16_t);
-
        /*
-        * Length of the variable bytes in ASCII, plus the '-' separator,
+        * Length of the variable bytes in UTF8, plus the '-' separator,
         * plus the GUID, plus trailing NUL
         */
-       short_name_size = variable_name_size / sizeof(efi_char16_t)
-                               + 1 + EFI_VARIABLE_GUID_LEN + 1;
-
-       short_name = kzalloc(short_name_size, GFP_KERNEL);
+       utf8_name_size = ucs2_utf8size(variable_name);
+       short_name_size = utf8_name_size + 1 + EFI_VARIABLE_GUID_LEN + 1;
 
+       short_name = kmalloc(short_name_size, GFP_KERNEL);
        if (!short_name)
                return -ENOMEM;
 
-       /* Convert Unicode to normal chars (assume top bits are 0),
-          ala UTF-8 */
-       for (i=0; i < (int)(variable_name_size / sizeof(efi_char16_t)); i++) {
-               short_name[i] = variable_name[i] & 0xFF;
-       }
+       ucs2_as_utf8(short_name, variable_name, short_name_size);
+
        /* This is ugly, but necessary to separate one vendor's
           private variables from another's.         */
-
-       *(short_name + strlen(short_name)) = '-';
+       short_name[utf8_name_size] = '-';
        efi_guid_to_str(&new_var->var.VendorGuid,
-                        short_name + strlen(short_name));
+                        short_name + utf8_name_size + 1);
 
        new_var->kobj.kset = efivars_kset;
 
index 70a0fb1..7f2ea21 100644 (file)
@@ -165,67 +165,133 @@ validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer,
 }
 
 struct variable_validate {
+       efi_guid_t vendor;
        char *name;
        bool (*validate)(efi_char16_t *var_name, int match, u8 *data,
                         unsigned long len);
 };
 
+/*
+ * This is the list of variables we need to validate, as well as the
+ * whitelist for what we think is safe not to default to immutable.
+ *
+ * If it has a validate() method that's not NULL, it'll go into the
+ * validation routine.  If not, it is assumed valid, but still used for
+ * whitelisting.
+ *
+ * Note that it's sorted by {vendor,name}, but globbed names must come after
+ * any other name with the same prefix.
+ */
 static const struct variable_validate variable_validate[] = {
-       { "BootNext", validate_uint16 },
-       { "BootOrder", validate_boot_order },
-       { "DriverOrder", validate_boot_order },
-       { "Boot*", validate_load_option },
-       { "Driver*", validate_load_option },
-       { "ConIn", validate_device_path },
-       { "ConInDev", validate_device_path },
-       { "ConOut", validate_device_path },
-       { "ConOutDev", validate_device_path },
-       { "ErrOut", validate_device_path },
-       { "ErrOutDev", validate_device_path },
-       { "Timeout", validate_uint16 },
-       { "Lang", validate_ascii_string },
-       { "PlatformLang", validate_ascii_string },
-       { "", NULL },
+       { EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 },
+       { EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order },
+       { EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option },
+       { EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order },
+       { EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option },
+       { EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path },
+       { EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string },
+       { EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL },
+       { EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string },
+       { EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 },
+       { LINUX_EFI_CRASH_GUID, "*", NULL },
+       { NULL_GUID, "", NULL },
 };
 
+static bool
+variable_matches(const char *var_name, size_t len, const char *match_name,
+                int *match)
+{
+       for (*match = 0; ; (*match)++) {
+               char c = match_name[*match];
+               char u = var_name[*match];
+
+               /* Wildcard in the matching name means we've matched */
+               if (c == '*')
+                       return true;
+
+               /* Case sensitive match */
+               if (!c && *match == len)
+                       return true;
+
+               if (c != u)
+                       return false;
+
+               if (!c)
+                       return true;
+       }
+       return true;
+}
+
 bool
-efivar_validate(efi_char16_t *var_name, u8 *data, unsigned long len)
+efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+               unsigned long data_size)
 {
        int i;
-       u16 *unicode_name = var_name;
+       unsigned long utf8_size;
+       u8 *utf8_name;
 
-       for (i = 0; variable_validate[i].validate != NULL; i++) {
-               const char *name = variable_validate[i].name;
-               int match;
+       utf8_size = ucs2_utf8size(var_name);
+       utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL);
+       if (!utf8_name)
+               return false;
 
-               for (match = 0; ; match++) {
-                       char c = name[match];
-                       u16 u = unicode_name[match];
+       ucs2_as_utf8(utf8_name, var_name, utf8_size);
+       utf8_name[utf8_size] = '\0';
 
-                       /* All special variables are plain ascii */
-                       if (u > 127)
-                               return true;
+       for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+               const char *name = variable_validate[i].name;
+               int match = 0;
 
-                       /* Wildcard in the matching name means we've matched */
-                       if (c == '*')
-                               return variable_validate[i].validate(var_name,
-                                                            match, data, len);
+               if (efi_guidcmp(vendor, variable_validate[i].vendor))
+                       continue;
 
-                       /* Case sensitive match */
-                       if (c != u)
+               if (variable_matches(utf8_name, utf8_size+1, name, &match)) {
+                       if (variable_validate[i].validate == NULL)
                                break;
-
-                       /* Reached the end of the string while matching */
-                       if (!c)
-                               return variable_validate[i].validate(var_name,
-                                                            match, data, len);
+                       kfree(utf8_name);
+                       return variable_validate[i].validate(var_name, match,
+                                                            data, data_size);
                }
        }
-
+       kfree(utf8_name);
        return true;
 }
 EXPORT_SYMBOL_GPL(efivar_validate);
 
+bool
+efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
+                            size_t len)
+{
+       int i;
+       bool found = false;
+       int match = 0;
+
+       /*
+        * Check if our variable is in the validated variables list
+        */
+       for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+               if (efi_guidcmp(variable_validate[i].vendor, vendor))
+                       continue;
+
+               if (variable_matches(var_name, len,
+                                    variable_validate[i].name, &match)) {
+                       found = true;
+                       break;
+               }
+       }
+
+       /*
+        * If it's in our list, it is removable.
+        */
+       return found;
+}
+EXPORT_SYMBOL_GPL(efivar_variable_is_removable);
+
 static efi_status_t
 check_var_size(u32 attributes, unsigned long size)
 {
@@ -852,7 +918,7 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
 
        *set = false;
 
-       if (efivar_validate(name, data, *size) == false)
+       if (efivar_validate(*vendor, name, data, *size) == false)
                return -EINVAL;
 
        /*
index 2aeaebd..3f87a03 100644 (file)
@@ -312,8 +312,8 @@ static int altera_gpio_probe(struct platform_device *pdev)
                handle_simple_irq, IRQ_TYPE_NONE);
 
        if (ret) {
-               dev_info(&pdev->dev, "could not add irqchip\n");
-               return ret;
+               dev_err(&pdev->dev, "could not add irqchip\n");
+               goto teardown;
        }
 
        gpiochip_set_chained_irqchip(&altera_gc->mmchip.gc,
@@ -326,6 +326,7 @@ static int altera_gpio_probe(struct platform_device *pdev)
 skip_irq:
        return 0;
 teardown:
+       of_mm_gpiochip_remove(&altera_gc->mmchip);
        pr_err("%s: registration failed with status %d\n",
                node->full_name, ret);
 
index ec58f42..cd007a6 100644 (file)
@@ -195,7 +195,7 @@ static int davinci_gpio_of_xlate(struct gpio_chip *gc,
 static int davinci_gpio_probe(struct platform_device *pdev)
 {
        int i, base;
-       unsigned ngpio;
+       unsigned ngpio, nbank;
        struct davinci_gpio_controller *chips;
        struct davinci_gpio_platform_data *pdata;
        struct davinci_gpio_regs __iomem *regs;
@@ -224,8 +224,9 @@ static int davinci_gpio_probe(struct platform_device *pdev)
        if (WARN_ON(ARCH_NR_GPIOS < ngpio))
                ngpio = ARCH_NR_GPIOS;
 
+       nbank = DIV_ROUND_UP(ngpio, 32);
        chips = devm_kzalloc(dev,
-                            ngpio * sizeof(struct davinci_gpio_controller),
+                            nbank * sizeof(struct davinci_gpio_controller),
                             GFP_KERNEL);
        if (!chips)
                return -ENOMEM;
@@ -511,7 +512,7 @@ static int davinci_gpio_irq_setup(struct platform_device *pdev)
                        return irq;
                }
 
-               irq_domain = irq_domain_add_legacy(NULL, ngpio, irq, 0,
+               irq_domain = irq_domain_add_legacy(dev->of_node, ngpio, irq, 0,
                                                        &davinci_gpio_irq_ops,
                                                        chips);
                if (!irq_domain) {
index cf41440..d9ab0cd 100644 (file)
@@ -196,6 +196,44 @@ static int gpio_rcar_irq_set_wake(struct irq_data *d, unsigned int on)
        return 0;
 }
 
+static void gpio_rcar_irq_bus_lock(struct irq_data *d)
+{
+       struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+       struct gpio_rcar_priv *p = gpiochip_get_data(gc);
+
+       pm_runtime_get_sync(&p->pdev->dev);
+}
+
+static void gpio_rcar_irq_bus_sync_unlock(struct irq_data *d)
+{
+       struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+       struct gpio_rcar_priv *p = gpiochip_get_data(gc);
+
+       pm_runtime_put(&p->pdev->dev);
+}
+
+
+static int gpio_rcar_irq_request_resources(struct irq_data *d)
+{
+       struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+       struct gpio_rcar_priv *p = gpiochip_get_data(gc);
+       int error;
+
+       error = pm_runtime_get_sync(&p->pdev->dev);
+       if (error < 0)
+               return error;
+
+       return 0;
+}
+
+static void gpio_rcar_irq_release_resources(struct irq_data *d)
+{
+       struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+       struct gpio_rcar_priv *p = gpiochip_get_data(gc);
+
+       pm_runtime_put(&p->pdev->dev);
+}
+
 static irqreturn_t gpio_rcar_irq_handler(int irq, void *dev_id)
 {
        struct gpio_rcar_priv *p = dev_id;
@@ -450,6 +488,10 @@ static int gpio_rcar_probe(struct platform_device *pdev)
        irq_chip->irq_unmask = gpio_rcar_irq_enable;
        irq_chip->irq_set_type = gpio_rcar_irq_set_type;
        irq_chip->irq_set_wake = gpio_rcar_irq_set_wake;
+       irq_chip->irq_bus_lock = gpio_rcar_irq_bus_lock;
+       irq_chip->irq_bus_sync_unlock = gpio_rcar_irq_bus_sync_unlock;
+       irq_chip->irq_request_resources = gpio_rcar_irq_request_resources;
+       irq_chip->irq_release_resources = gpio_rcar_irq_release_resources;
        irq_chip->flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_MASK_ON_SUSPEND;
 
        ret = gpiochip_add_data(gpio_chip, p);
index 66f729e..20c9539 100644 (file)
@@ -25,7 +25,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
        amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o
 
 # add asic specific block
-amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o gmc_v7_0.o cik_ih.o kv_smc.o kv_dpm.o \
+amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
        ci_smc.o ci_dpm.o dce_v8_0.o gfx_v7_0.o cik_sdma.o uvd_v4_2.o vce_v2_0.o \
        amdgpu_amdkfd_gfx_v7.o
 
@@ -34,6 +34,7 @@ amdgpu-y += \
 
 # add GMC block
 amdgpu-y += \
+       gmc_v7_0.o \
        gmc_v8_0.o
 
 # add IH block
index 313b0cc..5e7770f 100644 (file)
@@ -87,6 +87,8 @@ extern int amdgpu_sched_jobs;
 extern int amdgpu_sched_hw_submission;
 extern int amdgpu_enable_semaphores;
 extern int amdgpu_powerplay;
+extern unsigned amdgpu_pcie_gen_cap;
+extern unsigned amdgpu_pcie_lane_cap;
 
 #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS         3000
 #define AMDGPU_MAX_USEC_TIMEOUT                        100000  /* 100 ms */
@@ -132,47 +134,6 @@ extern int amdgpu_powerplay;
 #define AMDGPU_RESET_VCE                       (1 << 13)
 #define AMDGPU_RESET_VCE1                      (1 << 14)
 
-/* CG block flags */
-#define AMDGPU_CG_BLOCK_GFX                    (1 << 0)
-#define AMDGPU_CG_BLOCK_MC                     (1 << 1)
-#define AMDGPU_CG_BLOCK_SDMA                   (1 << 2)
-#define AMDGPU_CG_BLOCK_UVD                    (1 << 3)
-#define AMDGPU_CG_BLOCK_VCE                    (1 << 4)
-#define AMDGPU_CG_BLOCK_HDP                    (1 << 5)
-#define AMDGPU_CG_BLOCK_BIF                    (1 << 6)
-
-/* CG flags */
-#define AMDGPU_CG_SUPPORT_GFX_MGCG             (1 << 0)
-#define AMDGPU_CG_SUPPORT_GFX_MGLS             (1 << 1)
-#define AMDGPU_CG_SUPPORT_GFX_CGCG             (1 << 2)
-#define AMDGPU_CG_SUPPORT_GFX_CGLS             (1 << 3)
-#define AMDGPU_CG_SUPPORT_GFX_CGTS             (1 << 4)
-#define AMDGPU_CG_SUPPORT_GFX_CGTS_LS          (1 << 5)
-#define AMDGPU_CG_SUPPORT_GFX_CP_LS            (1 << 6)
-#define AMDGPU_CG_SUPPORT_GFX_RLC_LS           (1 << 7)
-#define AMDGPU_CG_SUPPORT_MC_LS                        (1 << 8)
-#define AMDGPU_CG_SUPPORT_MC_MGCG              (1 << 9)
-#define AMDGPU_CG_SUPPORT_SDMA_LS              (1 << 10)
-#define AMDGPU_CG_SUPPORT_SDMA_MGCG            (1 << 11)
-#define AMDGPU_CG_SUPPORT_BIF_LS               (1 << 12)
-#define AMDGPU_CG_SUPPORT_UVD_MGCG             (1 << 13)
-#define AMDGPU_CG_SUPPORT_VCE_MGCG             (1 << 14)
-#define AMDGPU_CG_SUPPORT_HDP_LS               (1 << 15)
-#define AMDGPU_CG_SUPPORT_HDP_MGCG             (1 << 16)
-
-/* PG flags */
-#define AMDGPU_PG_SUPPORT_GFX_PG               (1 << 0)
-#define AMDGPU_PG_SUPPORT_GFX_SMG              (1 << 1)
-#define AMDGPU_PG_SUPPORT_GFX_DMG              (1 << 2)
-#define AMDGPU_PG_SUPPORT_UVD                  (1 << 3)
-#define AMDGPU_PG_SUPPORT_VCE                  (1 << 4)
-#define AMDGPU_PG_SUPPORT_CP                   (1 << 5)
-#define AMDGPU_PG_SUPPORT_GDS                  (1 << 6)
-#define AMDGPU_PG_SUPPORT_RLC_SMU_HS           (1 << 7)
-#define AMDGPU_PG_SUPPORT_SDMA                 (1 << 8)
-#define AMDGPU_PG_SUPPORT_ACP                  (1 << 9)
-#define AMDGPU_PG_SUPPORT_SAMU                 (1 << 10)
-
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE                 0x00000000L
 #define AMDGPU_GFX_SAFE_MODE                   0x00000001L
@@ -606,8 +567,6 @@ struct amdgpu_sa_manager {
        uint32_t                align;
 };
 
-struct amdgpu_sa_bo;
-
 /* sub-allocation buffer */
 struct amdgpu_sa_bo {
        struct list_head                olist;
@@ -2278,60 +2237,60 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_dpm_enable_bapm(adev, e) (adev)->pm.funcs->enable_bapm((adev), (e))
 
 #define amdgpu_dpm_get_temperature(adev) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->get_temperature((adev)->powerplay.pp_handle) : \
-             (adev)->pm.funcs->get_temperature((adev))
+             (adev)->pm.funcs->get_temperature((adev)))
 
 #define amdgpu_dpm_set_fan_control_mode(adev, m) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->set_fan_control_mode((adev)->powerplay.pp_handle, (m)) : \
-             (adev)->pm.funcs->set_fan_control_mode((adev), (m))
+             (adev)->pm.funcs->set_fan_control_mode((adev), (m)))
 
 #define amdgpu_dpm_get_fan_control_mode(adev) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->get_fan_control_mode((adev)->powerplay.pp_handle) : \
-             (adev)->pm.funcs->get_fan_control_mode((adev))
+             (adev)->pm.funcs->get_fan_control_mode((adev)))
 
 #define amdgpu_dpm_set_fan_speed_percent(adev, s) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->set_fan_speed_percent((adev)->powerplay.pp_handle, (s)) : \
-             (adev)->pm.funcs->set_fan_speed_percent((adev), (s))
+             (adev)->pm.funcs->set_fan_speed_percent((adev), (s)))
 
 #define amdgpu_dpm_get_fan_speed_percent(adev, s) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->get_fan_speed_percent((adev)->powerplay.pp_handle, (s)) : \
-             (adev)->pm.funcs->get_fan_speed_percent((adev), (s))
+             (adev)->pm.funcs->get_fan_speed_percent((adev), (s)))
 
 #define amdgpu_dpm_get_sclk(adev, l) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->get_sclk((adev)->powerplay.pp_handle, (l)) : \
-               (adev)->pm.funcs->get_sclk((adev), (l))
+               (adev)->pm.funcs->get_sclk((adev), (l)))
 
 #define amdgpu_dpm_get_mclk(adev, l)  \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->get_mclk((adev)->powerplay.pp_handle, (l)) : \
-             (adev)->pm.funcs->get_mclk((adev), (l))
+             (adev)->pm.funcs->get_mclk((adev), (l)))
 
 
 #define amdgpu_dpm_force_performance_level(adev, l) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->force_performance_level((adev)->powerplay.pp_handle, (l)) : \
-             (adev)->pm.funcs->force_performance_level((adev), (l))
+             (adev)->pm.funcs->force_performance_level((adev), (l)))
 
 #define amdgpu_dpm_powergate_uvd(adev, g) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->powergate_uvd((adev)->powerplay.pp_handle, (g)) : \
-             (adev)->pm.funcs->powergate_uvd((adev), (g))
+             (adev)->pm.funcs->powergate_uvd((adev), (g)))
 
 #define amdgpu_dpm_powergate_vce(adev, g) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->powergate_vce((adev)->powerplay.pp_handle, (g)) : \
-             (adev)->pm.funcs->powergate_vce((adev), (g))
+             (adev)->pm.funcs->powergate_vce((adev), (g)))
 
 #define amdgpu_dpm_debugfs_print_current_performance_level(adev, m) \
-       (adev)->pp_enabled ?                                            \
+       ((adev)->pp_enabled ?                                           \
              (adev)->powerplay.pp_funcs->print_current_performance_level((adev)->powerplay.pp_handle, (m)) : \
-             (adev)->pm.funcs->debugfs_print_current_performance_level((adev), (m))
+             (adev)->pm.funcs->debugfs_print_current_performance_level((adev), (m)))
 
 #define amdgpu_dpm_get_current_power_state(adev) \
        (adev)->powerplay.pp_funcs->get_current_power_state((adev)->powerplay.pp_handle)
@@ -2360,6 +2319,8 @@ bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
 int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
                                     uint32_t flags);
 bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm);
+bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
+                                 unsigned long end);
 bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm);
 uint32_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
                                 struct ttm_mem_reg *mem);
index 0e13763..362bedc 100644 (file)
@@ -154,7 +154,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
        .get_fw_version = get_fw_version
 };
 
-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions()
+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
 {
        return (struct kfd2kgd_calls *)&kfd2kgd;
 }
index 79fa5c7..04b744d 100644 (file)
@@ -115,7 +115,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
        .get_fw_version = get_fw_version
 };
 
-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions()
+struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void)
 {
        return (struct kfd2kgd_calls *)&kfd2kgd;
 }
index a081dda..7a4b101 100644 (file)
@@ -795,6 +795,12 @@ static int amdgpu_cgs_query_system_info(void *cgs_device,
        case CGS_SYSTEM_INFO_PCIE_MLW:
                sys_info->value = adev->pm.pcie_mlw_mask;
                break;
+       case CGS_SYSTEM_INFO_CG_FLAGS:
+               sys_info->value = adev->cg_flags;
+               break;
+       case CGS_SYSTEM_INFO_PG_FLAGS:
+               sys_info->value = adev->pg_flags;
+               break;
        default:
                return -ENODEV;
        }
index 89c3dd6..119cdc2 100644 (file)
@@ -77,7 +77,7 @@ void amdgpu_connector_hotplug(struct drm_connector *connector)
                        } else if (amdgpu_atombios_dp_needs_link_train(amdgpu_connector)) {
                                /* Don't try to start link training before we
                                 * have the dpcd */
-                               if (!amdgpu_atombios_dp_get_dpcd(amdgpu_connector))
+                               if (amdgpu_atombios_dp_get_dpcd(amdgpu_connector))
                                        return;
 
                                /* set it to OFF so that drm_helper_connector_dpms()
index 6f89f8e..b882e81 100644 (file)
@@ -478,9 +478,9 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, bo
        struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
        unsigned i;
 
-       amdgpu_vm_move_pt_bos_in_lru(parser->adev, &fpriv->vm);
-
        if (!error) {
+               amdgpu_vm_move_pt_bos_in_lru(parser->adev, &fpriv->vm);
+
                /* Sort the buffer list from the smallest to largest buffer,
                 * which affects the order of buffers in the LRU list.
                 * This assures that the smallest buffers are added first
index 6553146..51bfc11 100644 (file)
@@ -1795,15 +1795,20 @@ int amdgpu_resume_kms(struct drm_device *dev, bool resume, bool fbcon)
        }
 
        /* post card */
-       amdgpu_atom_asic_init(adev->mode_info.atom_context);
+       if (!amdgpu_card_posted(adev))
+               amdgpu_atom_asic_init(adev->mode_info.atom_context);
 
        r = amdgpu_resume(adev);
+       if (r)
+               DRM_ERROR("amdgpu_resume failed (%d).\n", r);
 
        amdgpu_fence_driver_resume(adev);
 
-       r = amdgpu_ib_ring_tests(adev);
-       if (r)
-               DRM_ERROR("ib ring test failed (%d).\n", r);
+       if (resume) {
+               r = amdgpu_ib_ring_tests(adev);
+               if (r)
+                       DRM_ERROR("ib ring test failed (%d).\n", r);
+       }
 
        r = amdgpu_late_init(adev);
        if (r)
@@ -1933,80 +1938,97 @@ retry:
        return r;
 }
 
+#define AMDGPU_DEFAULT_PCIE_GEN_MASK 0x30007  /* gen: chipset 1/2, asic 1/2/3 */
+#define AMDGPU_DEFAULT_PCIE_MLW_MASK 0x2f0000 /* 1/2/4/8/16 lanes */
+
 void amdgpu_get_pcie_info(struct amdgpu_device *adev)
 {
        u32 mask;
        int ret;
 
-       if (pci_is_root_bus(adev->pdev->bus))
-               return;
+       if (amdgpu_pcie_gen_cap)
+               adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
 
-       if (amdgpu_pcie_gen2 == 0)
-               return;
+       if (amdgpu_pcie_lane_cap)
+               adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
 
-       if (adev->flags & AMD_IS_APU)
+       /* covers APUs as well */
+       if (pci_is_root_bus(adev->pdev->bus)) {
+               if (adev->pm.pcie_gen_mask == 0)
+                       adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
+               if (adev->pm.pcie_mlw_mask == 0)
+                       adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
                return;
+       }
 
-       ret = drm_pcie_get_speed_cap_mask(adev->ddev, &mask);
-       if (!ret) {
-               adev->pm.pcie_gen_mask = (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
-                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-                                         CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
-
-               if (mask & DRM_PCIE_SPEED_25)
-                       adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
-               if (mask & DRM_PCIE_SPEED_50)
-                       adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2;
-               if (mask & DRM_PCIE_SPEED_80)
-                       adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;
-       }
-       ret = drm_pcie_get_max_link_width(adev->ddev, &mask);
-       if (!ret) {
-               switch (mask) {
-               case 32:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 16:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 12:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 8:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 4:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 2:
-                       adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
-                                                 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
-                       break;
-               case 1:
-                       adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
-                       break;
-               default:
-                       break;
+       if (adev->pm.pcie_gen_mask == 0) {
+               ret = drm_pcie_get_speed_cap_mask(adev->ddev, &mask);
+               if (!ret) {
+                       adev->pm.pcie_gen_mask = (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
+                                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
+                                                 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
+
+                       if (mask & DRM_PCIE_SPEED_25)
+                               adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
+                       if (mask & DRM_PCIE_SPEED_50)
+                               adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2;
+                       if (mask & DRM_PCIE_SPEED_80)
+                               adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;
+               } else {
+                       adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
+               }
+       }
+       if (adev->pm.pcie_mlw_mask == 0) {
+               ret = drm_pcie_get_max_link_width(adev->ddev, &mask);
+               if (!ret) {
+                       switch (mask) {
+                       case 32:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 16:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 12:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 8:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 4:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 2:
+                               adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
+                                                         CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
+                               break;
+                       case 1:
+                               adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
+                               break;
+                       default:
+                               break;
+                       }
+               } else {
+                       adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
                }
        }
 }
index acd066d..1846d65 100644 (file)
@@ -72,8 +72,8 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
 
        struct drm_crtc *crtc = &amdgpuCrtc->base;
        unsigned long flags;
-       unsigned i;
-       int vpos, hpos, stat, min_udelay;
+       unsigned i, repcnt = 4;
+       int vpos, hpos, stat, min_udelay = 0;
        struct drm_vblank_crtc *vblank = &crtc->dev->vblank[work->crtc_id];
 
        amdgpu_flip_wait_fence(adev, &work->excl);
@@ -96,7 +96,7 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
         * In practice this won't execute very often unless on very fast
         * machines because the time window for this to happen is very small.
         */
-       for (;;) {
+       while (amdgpuCrtc->enabled && --repcnt) {
                /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                 * start in hpos, and to the "fudged earlier" vblank start in
                 * vpos.
@@ -112,12 +112,24 @@ static void amdgpu_flip_work_func(struct work_struct *__work)
                        break;
 
                /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
+               if (min_udelay > vblank->framedur_ns / 2000) {
+                       /* Don't wait ridiculously long - something is wrong */
+                       repcnt = 0;
+                       break;
+               }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                usleep_range(min_udelay, 2 * min_udelay);
                spin_lock_irqsave(&crtc->dev->event_lock, flags);
        };
 
+       if (!repcnt)
+               DRM_DEBUG_DRIVER("Delay problem on crtc %d: min_udelay %d, "
+                                "framedur %d, linedur %d, stat %d, vpos %d, "
+                                "hpos %d\n", work->crtc_id, min_udelay,
+                                vblank->framedur_ns / 1000,
+                                vblank->linedur_ns / 1000, stat, vpos, hpos);
+
        /* do the flip (mmio) */
        adev->mode_info.funcs->page_flip(adev, work->crtc_id, work->base);
        /* set the flip status */
index b5dbbb5..9ef1db8 100644 (file)
@@ -83,6 +83,8 @@ int amdgpu_sched_jobs = 32;
 int amdgpu_sched_hw_submission = 2;
 int amdgpu_enable_semaphores = 0;
 int amdgpu_powerplay = -1;
+unsigned amdgpu_pcie_gen_cap = 0;
+unsigned amdgpu_pcie_lane_cap = 0;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -170,6 +172,12 @@ MODULE_PARM_DESC(powerplay, "Powerplay component (1 = enable, 0 = disable, -1 =
 module_param_named(powerplay, amdgpu_powerplay, int, 0444);
 #endif
 
+MODULE_PARM_DESC(pcie_gen_cap, "PCIE Gen Caps (0: autodetect (default))");
+module_param_named(pcie_gen_cap, amdgpu_pcie_gen_cap, uint, 0444);
+
+MODULE_PARM_DESC(pcie_lane_cap, "PCIE Lane Caps (0: autodetect (default))");
+module_param_named(pcie_lane_cap, amdgpu_pcie_lane_cap, uint, 0444);
+
 static struct pci_device_id pciidlist[] = {
 #ifdef CONFIG_DRM_AMDGPU_CIK
        /* Kaveri */
@@ -256,11 +264,11 @@ static struct pci_device_id pciidlist[] = {
        {0x1002, 0x985F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_MULLINS|AMD_IS_MOBILITY|AMD_IS_APU},
 #endif
        /* topaz */
-       {0x1002, 0x6900, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ|AMD_EXP_HW_SUPPORT},
-       {0x1002, 0x6901, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ|AMD_EXP_HW_SUPPORT},
-       {0x1002, 0x6902, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ|AMD_EXP_HW_SUPPORT},
-       {0x1002, 0x6903, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ|AMD_EXP_HW_SUPPORT},
-       {0x1002, 0x6907, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ|AMD_EXP_HW_SUPPORT},
+       {0x1002, 0x6900, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ},
+       {0x1002, 0x6901, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ},
+       {0x1002, 0x6902, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ},
+       {0x1002, 0x6903, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ},
+       {0x1002, 0x6907, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TOPAZ},
        /* tonga */
        {0x1002, 0x6920, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
        {0x1002, 0x6921, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
index cfb6caa..9191467 100644 (file)
@@ -333,6 +333,10 @@ int amdgpu_fbdev_init(struct amdgpu_device *adev)
        if (!adev->mode_info.mode_config_initialized)
                return 0;
 
+       /* don't init fbdev if there are no connectors */
+       if (list_empty(&adev->ddev->mode_config.connector_list))
+               return 0;
+
        /* select 8 bpp console on low vram cards */
        if (adev->mc.real_vram_size <= (32*1024*1024))
                bpp_sel = 8;
index 7380f78..d20c2a8 100644 (file)
@@ -596,7 +596,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                break;
        }
        ttm_eu_backoff_reservation(&ticket, &list);
-       if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE))
+       if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE) &&
+           !amdgpu_vm_debug)
                amdgpu_gem_va_update_vm(adev, bo_va, args->operation);
 
        drm_gem_object_unreference_unlocked(gobj);
index b1969f2..d4e2780 100644 (file)
@@ -142,7 +142,8 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
 
                list_for_each_entry(bo, &node->bos, mn_list) {
 
-                       if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
+                       if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start,
+                                                         end))
                                continue;
 
                        r = amdgpu_bo_reserve(bo, true);
index c3ce103..b8fbbd7 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/slab.h>
 #include <drm/drmP.h>
 #include <drm/amdgpu_drm.h>
+#include <drm/drm_cache.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 
@@ -261,6 +262,13 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev,
                                       AMDGPU_GEM_DOMAIN_OA);
 
        bo->flags = flags;
+
+       /* For architectures that don't support WC memory,
+        * mask out the WC flag from the BO
+        */
+       if (!drm_arch_can_wc_memory())
+               bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+
        amdgpu_fill_placement_to_bo(bo, placement);
        /* Kernel allocation are uninterruptible */
        r = ttm_bo_init(&adev->mman.bdev, &bo->tbo, size, type,
@@ -399,7 +407,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
                }
                if (fpfn > bo->placements[i].fpfn)
                        bo->placements[i].fpfn = fpfn;
-               if (lpfn && lpfn < bo->placements[i].lpfn)
+               if (!bo->placements[i].lpfn ||
+                   (lpfn && lpfn < bo->placements[i].lpfn))
                        bo->placements[i].lpfn = lpfn;
                bo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
        }
index 7d8d84e..95a4a25 100644 (file)
@@ -113,6 +113,10 @@ static ssize_t amdgpu_get_dpm_forced_performance_level(struct device *dev,
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = ddev->dev_private;
 
+       if  ((adev->flags & AMD_IS_PX) &&
+            (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
+               return snprintf(buf, PAGE_SIZE, "off\n");
+
        if (adev->pp_enabled) {
                enum amd_dpm_forced_level level;
 
@@ -140,6 +144,11 @@ static ssize_t amdgpu_set_dpm_forced_performance_level(struct device *dev,
        enum amdgpu_dpm_forced_level level;
        int ret = 0;
 
+       /* Can't force performance level when the card is off */
+       if  ((adev->flags & AMD_IS_PX) &&
+            (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
+               return -EINVAL;
+
        if (strncmp("low", buf, strlen("low")) == 0) {
                level = AMDGPU_DPM_FORCED_LEVEL_LOW;
        } else if (strncmp("high", buf, strlen("high")) == 0) {
@@ -157,6 +166,7 @@ static ssize_t amdgpu_set_dpm_forced_performance_level(struct device *dev,
                mutex_lock(&adev->pm.mutex);
                if (adev->pm.dpm.thermal_active) {
                        count = -EINVAL;
+                       mutex_unlock(&adev->pm.mutex);
                        goto fail;
                }
                ret = amdgpu_dpm_force_performance_level(adev, level);
@@ -167,8 +177,6 @@ static ssize_t amdgpu_set_dpm_forced_performance_level(struct device *dev,
                mutex_unlock(&adev->pm.mutex);
        }
 fail:
-       mutex_unlock(&adev->pm.mutex);
-
        return count;
 }
 
@@ -182,8 +190,14 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
                                      char *buf)
 {
        struct amdgpu_device *adev = dev_get_drvdata(dev);
+       struct drm_device *ddev = adev->ddev;
        int temp;
 
+       /* Can't get temperature when the card is off */
+       if  ((adev->flags & AMD_IS_PX) &&
+            (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
+               return -EINVAL;
+
        if (!adev->pp_enabled && !adev->pm.funcs->get_temperature)
                temp = 0;
        else
@@ -634,11 +648,6 @@ force:
 
        /* update display watermarks based on new power state */
        amdgpu_display_bandwidth_update(adev);
-       /* update displays */
-       amdgpu_dpm_display_configuration_changed(adev);
-
-       adev->pm.dpm.current_active_crtcs = adev->pm.dpm.new_active_crtcs;
-       adev->pm.dpm.current_active_crtc_count = adev->pm.dpm.new_active_crtc_count;
 
        /* wait for the rings to drain */
        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -655,6 +664,12 @@ force:
 
        amdgpu_dpm_post_set_power_state(adev);
 
+       /* update displays */
+       amdgpu_dpm_display_configuration_changed(adev);
+
+       adev->pm.dpm.current_active_crtcs = adev->pm.dpm.new_active_crtcs;
+       adev->pm.dpm.current_active_crtc_count = adev->pm.dpm.new_active_crtc_count;
+
        if (adev->pm.funcs->force_performance_level) {
                if (adev->pm.dpm.thermal_active) {
                        enum amdgpu_dpm_forced_level level = adev->pm.dpm.forced_level;
@@ -847,12 +862,16 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
        struct drm_info_node *node = (struct drm_info_node *) m->private;
        struct drm_device *dev = node->minor->dev;
        struct amdgpu_device *adev = dev->dev_private;
+       struct drm_device *ddev = adev->ddev;
 
        if (!adev->pm.dpm_enabled) {
                seq_printf(m, "dpm not enabled\n");
                return 0;
        }
-       if (adev->pp_enabled) {
+       if  ((adev->flags & AMD_IS_PX) &&
+            (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) {
+               seq_printf(m, "PX asic powered off\n");
+       } else if (adev->pp_enabled) {
                amdgpu_dpm_debugfs_print_current_performance_level(adev, m);
        } else {
                mutex_lock(&adev->pm.mutex);
index 5ee9a06..3cb6d6c 100644 (file)
@@ -99,13 +99,24 @@ static int amdgpu_pp_early_init(void *handle)
 
 #ifdef CONFIG_DRM_AMD_POWERPLAY
        switch (adev->asic_type) {
-               case CHIP_TONGA:
-               case CHIP_FIJI:
-                       adev->pp_enabled = (amdgpu_powerplay > 0) ? true : false;
-                       break;
-               default:
-                       adev->pp_enabled = (amdgpu_powerplay > 0) ? true : false;
-                       break;
+       case CHIP_TONGA:
+       case CHIP_FIJI:
+               adev->pp_enabled = (amdgpu_powerplay == 0) ? false : true;
+               break;
+       case CHIP_CARRIZO:
+       case CHIP_STONEY:
+               adev->pp_enabled = (amdgpu_powerplay > 0) ? true : false;
+               break;
+       /* These chips don't have powerplay implemenations */
+       case CHIP_BONAIRE:
+       case CHIP_HAWAII:
+       case CHIP_KABINI:
+       case CHIP_MULLINS:
+       case CHIP_KAVERI:
+       case CHIP_TOPAZ:
+       default:
+               adev->pp_enabled = false;
+               break;
        }
 #else
        adev->pp_enabled = false;
@@ -132,8 +143,10 @@ static int amdgpu_pp_late_init(void *handle)
                                        adev->powerplay.pp_handle);
 
 #ifdef CONFIG_DRM_AMD_POWERPLAY
-       if (adev->pp_enabled)
+       if (adev->pp_enabled) {
                amdgpu_pm_sysfs_init(adev);
+               amdgpu_dpm_dispatch_task(adev, AMD_PP_EVENT_COMPLETE_INIT, NULL, NULL);
+       }
 #endif
        return ret;
 }
index 78e9b0f..d1f234d 100644 (file)
@@ -487,7 +487,7 @@ static int amdgpu_debugfs_ring_info(struct seq_file *m, void *data)
        seq_printf(m, "rptr: 0x%08x [%5d]\n",
                   rptr, rptr);
 
-       rptr_next = ~0;
+       rptr_next = le32_to_cpu(*ring->next_rptr_cpu_addr);
 
        seq_printf(m, "driver's copy of the wptr: 0x%08x [%5d]\n",
                   ring->wptr, ring->wptr);
index 8b88edb..ca72a2e 100644 (file)
@@ -354,12 +354,15 @@ int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager,
 
                for (i = 0, count = 0; i < AMDGPU_MAX_RINGS; ++i)
                        if (fences[i])
-                               fences[count++] = fences[i];
+                               fences[count++] = fence_get(fences[i]);
 
                if (count) {
                        spin_unlock(&sa_manager->wq.lock);
                        t = fence_wait_any_timeout(fences, count, false,
                                                   MAX_SCHEDULE_TIMEOUT);
+                       for (i = 0; i < count; ++i)
+                               fence_put(fences[i]);
+
                        r = (t > 0) ? 0 : t;
                        spin_lock(&sa_manager->wq.lock);
                } else {
index 8a1752f..1cbb16e 100644 (file)
@@ -712,7 +712,7 @@ static int amdgpu_ttm_tt_populate(struct ttm_tt *ttm)
                                                       0, PAGE_SIZE,
                                                       PCI_DMA_BIDIRECTIONAL);
                if (pci_dma_mapping_error(adev->pdev, gtt->ttm.dma_address[i])) {
-                       while (--i) {
+                       while (i--) {
                                pci_unmap_page(adev->pdev, gtt->ttm.dma_address[i],
                                               PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
                                gtt->ttm.dma_address[i] = 0;
@@ -783,6 +783,25 @@ bool amdgpu_ttm_tt_has_userptr(struct ttm_tt *ttm)
        return !!gtt->userptr;
 }
 
+bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
+                                 unsigned long end)
+{
+       struct amdgpu_ttm_tt *gtt = (void *)ttm;
+       unsigned long size;
+
+       if (gtt == NULL)
+               return false;
+
+       if (gtt->ttm.ttm.state != tt_bound || !gtt->userptr)
+               return false;
+
+       size = (unsigned long)gtt->ttm.ttm.num_pages * PAGE_SIZE;
+       if (gtt->userptr > end || gtt->userptr + size <= start)
+               return false;
+
+       return true;
+}
+
 bool amdgpu_ttm_tt_is_readonly(struct ttm_tt *ttm)
 {
        struct amdgpu_ttm_tt *gtt = (void *)ttm;
@@ -808,7 +827,7 @@ uint32_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
                        flags |= AMDGPU_PTE_SNOOPED;
        }
 
-       if (adev->asic_type >= CHIP_TOPAZ)
+       if (adev->asic_type >= CHIP_TONGA)
                flags |= AMDGPU_PTE_EXECUTABLE;
 
        flags |= AMDGPU_PTE_READABLE;
index aefc668..9599f75 100644 (file)
@@ -1282,7 +1282,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 {
        const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
                AMDGPU_VM_PTE_COUNT * 8);
-       unsigned pd_size, pd_entries, pts_size;
+       unsigned pd_size, pd_entries;
        int i, r;
 
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -1300,8 +1300,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        pd_entries = amdgpu_vm_num_pdes(adev);
 
        /* allocate page table array */
-       pts_size = pd_entries * sizeof(struct amdgpu_vm_pt);
-       vm->page_tables = kzalloc(pts_size, GFP_KERNEL);
+       vm->page_tables = drm_calloc_large(pd_entries, sizeof(struct amdgpu_vm_pt));
        if (vm->page_tables == NULL) {
                DRM_ERROR("Cannot allocate memory for page table array\n");
                return -ENOMEM;
@@ -1361,7 +1360,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 
        for (i = 0; i < amdgpu_vm_num_pdes(adev); i++)
                amdgpu_bo_unref(&vm->page_tables[i].entry.robj);
-       kfree(vm->page_tables);
+       drm_free_large(vm->page_tables);
 
        amdgpu_bo_unref(&vm->page_directory);
        fence_put(vm->page_directory_fence);
index 21aacc1..bf731e9 100644 (file)
@@ -265,15 +265,27 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector
        unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
        unsigned lane_num, i, max_pix_clock;
 
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (amdgpu_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                        if (max_pix_clock >= pix_clock) {
                                *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                return 0;
                        }
                }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
        }
 
        return -EINVAL;
index 8b4731d..474ca02 100644 (file)
@@ -31,6 +31,7 @@
 #include "ci_dpm.h"
 #include "gfx_v7_0.h"
 #include "atom.h"
+#include "amd_pcie.h"
 #include <linux/seq_file.h>
 
 #include "smu/smu_7_0_1_d.h"
@@ -5835,18 +5836,16 @@ static int ci_dpm_init(struct amdgpu_device *adev)
        u8 frev, crev;
        struct ci_power_info *pi;
        int ret;
-       u32 mask;
 
        pi = kzalloc(sizeof(struct ci_power_info), GFP_KERNEL);
        if (pi == NULL)
                return -ENOMEM;
        adev->pm.dpm.priv = pi;
 
-       ret = drm_pcie_get_speed_cap_mask(adev->ddev, &mask);
-       if (ret)
-               pi->sys_pcie_mask = 0;
-       else
-               pi->sys_pcie_mask = mask;
+       pi->sys_pcie_mask =
+               (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_MASK) >>
+               CAIL_PCIE_LINK_SPEED_SUPPORT_SHIFT;
+
        pi->force_pcie_gen = AMDGPU_PCIE_GEN_INVALID;
 
        pi->pcie_gen_performance.max = AMDGPU_PCIE_GEN1;
index fd9c958..155965e 100644 (file)
@@ -1762,6 +1762,9 @@ static void cik_program_aspm(struct amdgpu_device *adev)
        if (amdgpu_aspm == 0)
                return;
 
+       if (pci_is_root_bus(adev->pdev->bus))
+               return;
+
        /* XXX double check APUs */
        if (adev->flags & AMD_IS_APU)
                return;
@@ -2332,72 +2335,72 @@ static int cik_common_early_init(void *handle)
        switch (adev->asic_type) {
        case CHIP_BONAIRE:
                adev->cg_flags =
-                       AMDGPU_CG_SUPPORT_GFX_MGCG |
-                       AMDGPU_CG_SUPPORT_GFX_MGLS |
-                       /*AMDGPU_CG_SUPPORT_GFX_CGCG |*/
-                       AMDGPU_CG_SUPPORT_GFX_CGLS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS_LS |
-                       AMDGPU_CG_SUPPORT_GFX_CP_LS |
-                       AMDGPU_CG_SUPPORT_MC_LS |
-                       AMDGPU_CG_SUPPORT_MC_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_LS |
-                       AMDGPU_CG_SUPPORT_BIF_LS |
-                       AMDGPU_CG_SUPPORT_VCE_MGCG |
-                       AMDGPU_CG_SUPPORT_UVD_MGCG |
-                       AMDGPU_CG_SUPPORT_HDP_LS |
-                       AMDGPU_CG_SUPPORT_HDP_MGCG;
+                       AMD_CG_SUPPORT_GFX_MGCG |
+                       AMD_CG_SUPPORT_GFX_MGLS |
+                       /*AMD_CG_SUPPORT_GFX_CGCG |*/
+                       AMD_CG_SUPPORT_GFX_CGLS |
+                       AMD_CG_SUPPORT_GFX_CGTS |
+                       AMD_CG_SUPPORT_GFX_CGTS_LS |
+                       AMD_CG_SUPPORT_GFX_CP_LS |
+                       AMD_CG_SUPPORT_MC_LS |
+                       AMD_CG_SUPPORT_MC_MGCG |
+                       AMD_CG_SUPPORT_SDMA_MGCG |
+                       AMD_CG_SUPPORT_SDMA_LS |
+                       AMD_CG_SUPPORT_BIF_LS |
+                       AMD_CG_SUPPORT_VCE_MGCG |
+                       AMD_CG_SUPPORT_UVD_MGCG |
+                       AMD_CG_SUPPORT_HDP_LS |
+                       AMD_CG_SUPPORT_HDP_MGCG;
                adev->pg_flags = 0;
                adev->external_rev_id = adev->rev_id + 0x14;
                break;
        case CHIP_HAWAII:
                adev->cg_flags =
-                       AMDGPU_CG_SUPPORT_GFX_MGCG |
-                       AMDGPU_CG_SUPPORT_GFX_MGLS |
-                       /*AMDGPU_CG_SUPPORT_GFX_CGCG |*/
-                       AMDGPU_CG_SUPPORT_GFX_CGLS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS |
-                       AMDGPU_CG_SUPPORT_GFX_CP_LS |
-                       AMDGPU_CG_SUPPORT_MC_LS |
-                       AMDGPU_CG_SUPPORT_MC_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_LS |
-                       AMDGPU_CG_SUPPORT_BIF_LS |
-                       AMDGPU_CG_SUPPORT_VCE_MGCG |
-                       AMDGPU_CG_SUPPORT_UVD_MGCG |
-                       AMDGPU_CG_SUPPORT_HDP_LS |
-                       AMDGPU_CG_SUPPORT_HDP_MGCG;
+                       AMD_CG_SUPPORT_GFX_MGCG |
+                       AMD_CG_SUPPORT_GFX_MGLS |
+                       /*AMD_CG_SUPPORT_GFX_CGCG |*/
+                       AMD_CG_SUPPORT_GFX_CGLS |
+                       AMD_CG_SUPPORT_GFX_CGTS |
+                       AMD_CG_SUPPORT_GFX_CP_LS |
+                       AMD_CG_SUPPORT_MC_LS |
+                       AMD_CG_SUPPORT_MC_MGCG |
+                       AMD_CG_SUPPORT_SDMA_MGCG |
+                       AMD_CG_SUPPORT_SDMA_LS |
+                       AMD_CG_SUPPORT_BIF_LS |
+                       AMD_CG_SUPPORT_VCE_MGCG |
+                       AMD_CG_SUPPORT_UVD_MGCG |
+                       AMD_CG_SUPPORT_HDP_LS |
+                       AMD_CG_SUPPORT_HDP_MGCG;
                adev->pg_flags = 0;
                adev->external_rev_id = 0x28;
                break;
        case CHIP_KAVERI:
                adev->cg_flags =
-                       AMDGPU_CG_SUPPORT_GFX_MGCG |
-                       AMDGPU_CG_SUPPORT_GFX_MGLS |
-                       /*AMDGPU_CG_SUPPORT_GFX_CGCG |*/
-                       AMDGPU_CG_SUPPORT_GFX_CGLS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS_LS |
-                       AMDGPU_CG_SUPPORT_GFX_CP_LS |
-                       AMDGPU_CG_SUPPORT_SDMA_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_LS |
-                       AMDGPU_CG_SUPPORT_BIF_LS |
-                       AMDGPU_CG_SUPPORT_VCE_MGCG |
-                       AMDGPU_CG_SUPPORT_UVD_MGCG |
-                       AMDGPU_CG_SUPPORT_HDP_LS |
-                       AMDGPU_CG_SUPPORT_HDP_MGCG;
+                       AMD_CG_SUPPORT_GFX_MGCG |
+                       AMD_CG_SUPPORT_GFX_MGLS |
+                       /*AMD_CG_SUPPORT_GFX_CGCG |*/
+                       AMD_CG_SUPPORT_GFX_CGLS |
+                       AMD_CG_SUPPORT_GFX_CGTS |
+                       AMD_CG_SUPPORT_GFX_CGTS_LS |
+                       AMD_CG_SUPPORT_GFX_CP_LS |
+                       AMD_CG_SUPPORT_SDMA_MGCG |
+                       AMD_CG_SUPPORT_SDMA_LS |
+                       AMD_CG_SUPPORT_BIF_LS |
+                       AMD_CG_SUPPORT_VCE_MGCG |
+                       AMD_CG_SUPPORT_UVD_MGCG |
+                       AMD_CG_SUPPORT_HDP_LS |
+                       AMD_CG_SUPPORT_HDP_MGCG;
                adev->pg_flags =
-                       /*AMDGPU_PG_SUPPORT_GFX_PG |
-                         AMDGPU_PG_SUPPORT_GFX_SMG |
-                         AMDGPU_PG_SUPPORT_GFX_DMG |*/
-                       AMDGPU_PG_SUPPORT_UVD |
-                       /*AMDGPU_PG_SUPPORT_VCE |
-                         AMDGPU_PG_SUPPORT_CP |
-                         AMDGPU_PG_SUPPORT_GDS |
-                         AMDGPU_PG_SUPPORT_RLC_SMU_HS |
-                         AMDGPU_PG_SUPPORT_ACP |
-                         AMDGPU_PG_SUPPORT_SAMU |*/
+                       /*AMD_PG_SUPPORT_GFX_PG |
+                         AMD_PG_SUPPORT_GFX_SMG |
+                         AMD_PG_SUPPORT_GFX_DMG |*/
+                       AMD_PG_SUPPORT_UVD |
+                       /*AMD_PG_SUPPORT_VCE |
+                         AMD_PG_SUPPORT_CP |
+                         AMD_PG_SUPPORT_GDS |
+                         AMD_PG_SUPPORT_RLC_SMU_HS |
+                         AMD_PG_SUPPORT_ACP |
+                         AMD_PG_SUPPORT_SAMU |*/
                        0;
                if (adev->pdev->device == 0x1312 ||
                        adev->pdev->device == 0x1316 ||
@@ -2409,29 +2412,29 @@ static int cik_common_early_init(void *handle)
        case CHIP_KABINI:
        case CHIP_MULLINS:
                adev->cg_flags =
-                       AMDGPU_CG_SUPPORT_GFX_MGCG |
-                       AMDGPU_CG_SUPPORT_GFX_MGLS |
-                       /*AMDGPU_CG_SUPPORT_GFX_CGCG |*/
-                       AMDGPU_CG_SUPPORT_GFX_CGLS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS |
-                       AMDGPU_CG_SUPPORT_GFX_CGTS_LS |
-                       AMDGPU_CG_SUPPORT_GFX_CP_LS |
-                       AMDGPU_CG_SUPPORT_SDMA_MGCG |
-                       AMDGPU_CG_SUPPORT_SDMA_LS |
-                       AMDGPU_CG_SUPPORT_BIF_LS |
-                       AMDGPU_CG_SUPPORT_VCE_MGCG |
-                       AMDGPU_CG_SUPPORT_UVD_MGCG |
-                       AMDGPU_CG_SUPPORT_HDP_LS |
-                       AMDGPU_CG_SUPPORT_HDP_MGCG;
+                       AMD_CG_SUPPORT_GFX_MGCG |
+                       AMD_CG_SUPPORT_GFX_MGLS |
+                       /*AMD_CG_SUPPORT_GFX_CGCG |*/
+                       AMD_CG_SUPPORT_GFX_CGLS |
+                       AMD_CG_SUPPORT_GFX_CGTS |
+                       AMD_CG_SUPPORT_GFX_CGTS_LS |
+                       AMD_CG_SUPPORT_GFX_CP_LS |
+                       AMD_CG_SUPPORT_SDMA_MGCG |
+                       AMD_CG_SUPPORT_SDMA_LS |
+                       AMD_CG_SUPPORT_BIF_LS |
+                       AMD_CG_SUPPORT_VCE_MGCG |
+                       AMD_CG_SUPPORT_UVD_MGCG |
+                       AMD_CG_SUPPORT_HDP_LS |
+                       AMD_CG_SUPPORT_HDP_MGCG;
                adev->pg_flags =
-                       /*AMDGPU_PG_SUPPORT_GFX_PG |
-                         AMDGPU_PG_SUPPORT_GFX_SMG | */
-                       AMDGPU_PG_SUPPORT_UVD |
-                       /*AMDGPU_PG_SUPPORT_VCE |
-                         AMDGPU_PG_SUPPORT_CP |
-                         AMDGPU_PG_SUPPORT_GDS |
-                         AMDGPU_PG_SUPPORT_RLC_SMU_HS |
-                         AMDGPU_PG_SUPPORT_SAMU |*/
+                       /*AMD_PG_SUPPORT_GFX_PG |
+                         AMD_PG_SUPPORT_GFX_SMG | */
+                       AMD_PG_SUPPORT_UVD |
+                       /*AMD_PG_SUPPORT_VCE |
+                         AMD_PG_SUPPORT_CP |
+                         AMD_PG_SUPPORT_GDS |
+                         AMD_PG_SUPPORT_RLC_SMU_HS |
+                         AMD_PG_SUPPORT_SAMU |*/
                        0;
                if (adev->asic_type == CHIP_KABINI) {
                        if (adev->rev_id == 0)
index 5f712ce..c55ecf0 100644 (file)
@@ -885,7 +885,7 @@ static void cik_enable_sdma_mgcg(struct amdgpu_device *adev,
 {
        u32 orig, data;
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_SDMA_MGCG)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_SDMA_MGCG)) {
                WREG32(mmSDMA0_CLK_CTRL + SDMA0_REGISTER_OFFSET, 0x00000100);
                WREG32(mmSDMA0_CLK_CTRL + SDMA1_REGISTER_OFFSET, 0x00000100);
        } else {
@@ -906,7 +906,7 @@ static void cik_enable_sdma_mgls(struct amdgpu_device *adev,
 {
        u32 orig, data;
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_SDMA_LS)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_SDMA_LS)) {
                orig = data = RREG32(mmSDMA0_POWER_CNTL + SDMA0_REGISTER_OFFSET);
                data |= 0x100;
                if (orig != data)
index 4dd17f2..e7ef226 100644 (file)
@@ -445,13 +445,13 @@ static int cz_dpm_init(struct amdgpu_device *adev)
        pi->gfx_pg_threshold = 500;
        pi->caps_fps = true;
        /* uvd */
-       pi->caps_uvd_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_UVD) ? true : false;
+       pi->caps_uvd_pg = (adev->pg_flags & AMD_PG_SUPPORT_UVD) ? true : false;
        pi->caps_uvd_dpm = true;
        /* vce */
-       pi->caps_vce_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_VCE) ? true : false;
+       pi->caps_vce_pg = (adev->pg_flags & AMD_PG_SUPPORT_VCE) ? true : false;
        pi->caps_vce_dpm = true;
        /* acp */
-       pi->caps_acp_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_ACP) ? true : false;
+       pi->caps_acp_pg = (adev->pg_flags & AMD_PG_SUPPORT_ACP) ? true : false;
        pi->caps_acp_dpm = true;
 
        pi->caps_stable_power_state = false;
@@ -2202,8 +2202,7 @@ static void cz_dpm_powergate_vce(struct amdgpu_device *adev, bool gate)
                                                            AMD_PG_STATE_GATE);
 
                                cz_enable_vce_dpm(adev, false);
-                               /* TODO: to figure out why vce can't be poweroff. */
-                               /* cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerOFF); */
+                               cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerOFF);
                                pi->vce_power_gated = true;
                        } else {
                                cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerON);
@@ -2226,10 +2225,8 @@ static void cz_dpm_powergate_vce(struct amdgpu_device *adev, bool gate)
                }
        } else { /*pi->caps_vce_pg*/
                cz_update_vce_dpm(adev);
-               cz_enable_vce_dpm(adev, true);
+               cz_enable_vce_dpm(adev, !gate);
        }
-
-       return;
 }
 
 const struct amd_ip_funcs cz_dpm_ip_funcs = {
index 72793f9..06602df 100644 (file)
@@ -3628,6 +3628,19 @@ static void gfx_v7_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
                                        unsigned vm_id, uint64_t pd_addr)
 {
        int usepfp = (ring->type == AMDGPU_RING_TYPE_GFX);
+       uint32_t seq = ring->fence_drv.sync_seq;
+       uint64_t addr = ring->fence_drv.gpu_addr;
+
+       amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5));
+       amdgpu_ring_write(ring, (WAIT_REG_MEM_MEM_SPACE(1) | /* memory */
+                                WAIT_REG_MEM_FUNCTION(3) | /* equal */
+                                WAIT_REG_MEM_ENGINE(usepfp)));   /* pfp or me */
+       amdgpu_ring_write(ring, addr & 0xfffffffc);
+       amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
+       amdgpu_ring_write(ring, seq);
+       amdgpu_ring_write(ring, 0xffffffff);
+       amdgpu_ring_write(ring, 4); /* poll interval */
+
        if (usepfp) {
                /* synce CE with ME to prevent CE fetch CEIB before context switch done */
                amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0));
@@ -4109,7 +4122,7 @@ static void gfx_v7_0_enable_cgcg(struct amdgpu_device *adev, bool enable)
 
        orig = data = RREG32(mmRLC_CGCG_CGLS_CTRL);
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_CGCG)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGCG)) {
                gfx_v7_0_enable_gui_idle_interrupt(adev, true);
 
                tmp = gfx_v7_0_halt_rlc(adev);
@@ -4147,9 +4160,9 @@ static void gfx_v7_0_enable_mgcg(struct amdgpu_device *adev, bool enable)
 {
        u32 data, orig, tmp = 0;
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_MGCG)) {
-               if (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_MGLS) {
-                       if (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_CP_LS) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGCG)) {
+               if (adev->cg_flags & AMD_CG_SUPPORT_GFX_MGLS) {
+                       if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CP_LS) {
                                orig = data = RREG32(mmCP_MEM_SLP_CNTL);
                                data |= CP_MEM_SLP_CNTL__CP_MEM_LS_EN_MASK;
                                if (orig != data)
@@ -4176,14 +4189,14 @@ static void gfx_v7_0_enable_mgcg(struct amdgpu_device *adev, bool enable)
 
                gfx_v7_0_update_rlc(adev, tmp);
 
-               if (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_CGTS) {
+               if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGTS) {
                        orig = data = RREG32(mmCGTS_SM_CTRL_REG);
                        data &= ~CGTS_SM_CTRL_REG__SM_MODE_MASK;
                        data |= (0x2 << CGTS_SM_CTRL_REG__SM_MODE__SHIFT);
                        data |= CGTS_SM_CTRL_REG__SM_MODE_ENABLE_MASK;
                        data &= ~CGTS_SM_CTRL_REG__OVERRIDE_MASK;
-                       if ((adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_MGLS) &&
-                           (adev->cg_flags & AMDGPU_CG_SUPPORT_GFX_CGTS_LS))
+                       if ((adev->cg_flags & AMD_CG_SUPPORT_GFX_MGLS) &&
+                           (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGTS_LS))
                                data &= ~CGTS_SM_CTRL_REG__LS_OVERRIDE_MASK;
                        data &= ~CGTS_SM_CTRL_REG__ON_MONITOR_ADD_MASK;
                        data |= CGTS_SM_CTRL_REG__ON_MONITOR_ADD_EN_MASK;
@@ -4249,7 +4262,7 @@ static void gfx_v7_0_enable_sclk_slowdown_on_pu(struct amdgpu_device *adev,
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_RLC_SMU_HS))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_RLC_SMU_HS))
                data |= RLC_PG_CNTL__SMU_CLK_SLOWDOWN_ON_PU_ENABLE_MASK;
        else
                data &= ~RLC_PG_CNTL__SMU_CLK_SLOWDOWN_ON_PU_ENABLE_MASK;
@@ -4263,7 +4276,7 @@ static void gfx_v7_0_enable_sclk_slowdown_on_pd(struct amdgpu_device *adev,
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_RLC_SMU_HS))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_RLC_SMU_HS))
                data |= RLC_PG_CNTL__SMU_CLK_SLOWDOWN_ON_PD_ENABLE_MASK;
        else
                data &= ~RLC_PG_CNTL__SMU_CLK_SLOWDOWN_ON_PD_ENABLE_MASK;
@@ -4276,7 +4289,7 @@ static void gfx_v7_0_enable_cp_pg(struct amdgpu_device *adev, bool enable)
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_CP))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_CP))
                data &= ~0x8000;
        else
                data |= 0x8000;
@@ -4289,7 +4302,7 @@ static void gfx_v7_0_enable_gds_pg(struct amdgpu_device *adev, bool enable)
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_GDS))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_GDS))
                data &= ~0x2000;
        else
                data |= 0x2000;
@@ -4370,7 +4383,7 @@ static void gfx_v7_0_enable_gfx_cgpg(struct amdgpu_device *adev,
 {
        u32 data, orig;
 
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_PG)) {
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG)) {
                orig = data = RREG32(mmRLC_PG_CNTL);
                data |= RLC_PG_CNTL__GFX_POWER_GATING_ENABLE_MASK;
                if (orig != data)
@@ -4442,7 +4455,7 @@ static void gfx_v7_0_enable_gfx_static_mgpg(struct amdgpu_device *adev,
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_SMG))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_GFX_SMG))
                data |= RLC_PG_CNTL__STATIC_PER_CU_PG_ENABLE_MASK;
        else
                data &= ~RLC_PG_CNTL__STATIC_PER_CU_PG_ENABLE_MASK;
@@ -4456,7 +4469,7 @@ static void gfx_v7_0_enable_gfx_dynamic_mgpg(struct amdgpu_device *adev,
        u32 data, orig;
 
        orig = data = RREG32(mmRLC_PG_CNTL);
-       if (enable && (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_DMG))
+       if (enable && (adev->pg_flags & AMD_PG_SUPPORT_GFX_DMG))
                data |= RLC_PG_CNTL__DYN_PER_CU_PG_ENABLE_MASK;
        else
                data &= ~RLC_PG_CNTL__DYN_PER_CU_PG_ENABLE_MASK;
@@ -4623,15 +4636,15 @@ static void gfx_v7_0_get_csb_buffer(struct amdgpu_device *adev,
 
 static void gfx_v7_0_init_pg(struct amdgpu_device *adev)
 {
-       if (adev->pg_flags & (AMDGPU_PG_SUPPORT_GFX_PG |
-                             AMDGPU_PG_SUPPORT_GFX_SMG |
-                             AMDGPU_PG_SUPPORT_GFX_DMG |
-                             AMDGPU_PG_SUPPORT_CP |
-                             AMDGPU_PG_SUPPORT_GDS |
-                             AMDGPU_PG_SUPPORT_RLC_SMU_HS)) {
+       if (adev->pg_flags & (AMD_PG_SUPPORT_GFX_PG |
+                             AMD_PG_SUPPORT_GFX_SMG |
+                             AMD_PG_SUPPORT_GFX_DMG |
+                             AMD_PG_SUPPORT_CP |
+                             AMD_PG_SUPPORT_GDS |
+                             AMD_PG_SUPPORT_RLC_SMU_HS)) {
                gfx_v7_0_enable_sclk_slowdown_on_pu(adev, true);
                gfx_v7_0_enable_sclk_slowdown_on_pd(adev, true);
-               if (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_PG) {
+               if (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG) {
                        gfx_v7_0_init_gfx_cgpg(adev);
                        gfx_v7_0_enable_cp_pg(adev, true);
                        gfx_v7_0_enable_gds_pg(adev, true);
@@ -4643,14 +4656,14 @@ static void gfx_v7_0_init_pg(struct amdgpu_device *adev)
 
 static void gfx_v7_0_fini_pg(struct amdgpu_device *adev)
 {
-       if (adev->pg_flags & (AMDGPU_PG_SUPPORT_GFX_PG |
-                             AMDGPU_PG_SUPPORT_GFX_SMG |
-                             AMDGPU_PG_SUPPORT_GFX_DMG |
-                             AMDGPU_PG_SUPPORT_CP |
-                             AMDGPU_PG_SUPPORT_GDS |
-                             AMDGPU_PG_SUPPORT_RLC_SMU_HS)) {
+       if (adev->pg_flags & (AMD_PG_SUPPORT_GFX_PG |
+                             AMD_PG_SUPPORT_GFX_SMG |
+                             AMD_PG_SUPPORT_GFX_DMG |
+                             AMD_PG_SUPPORT_CP |
+                             AMD_PG_SUPPORT_GDS |
+                             AMD_PG_SUPPORT_RLC_SMU_HS)) {
                gfx_v7_0_update_gfx_pg(adev, false);
-               if (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_PG) {
+               if (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG) {
                        gfx_v7_0_enable_cp_pg(adev, false);
                        gfx_v7_0_enable_gds_pg(adev, false);
                }
@@ -4738,6 +4751,22 @@ static int gfx_v7_0_early_init(void *handle)
        return 0;
 }
 
+static int gfx_v7_0_late_init(void *handle)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0);
+       if (r)
+               return r;
+
+       r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
+       if (r)
+               return r;
+
+       return 0;
+}
+
 static int gfx_v7_0_sw_init(void *handle)
 {
        struct amdgpu_ring *ring;
@@ -4890,6 +4919,8 @@ static int gfx_v7_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
        gfx_v7_0_cp_enable(adev, false);
        gfx_v7_0_rlc_stop(adev);
        gfx_v7_0_fini_pg(adev);
@@ -5509,14 +5540,14 @@ static int gfx_v7_0_set_powergating_state(void *handle,
        if (state == AMD_PG_STATE_GATE)
                gate = true;
 
-       if (adev->pg_flags & (AMDGPU_PG_SUPPORT_GFX_PG |
-                             AMDGPU_PG_SUPPORT_GFX_SMG |
-                             AMDGPU_PG_SUPPORT_GFX_DMG |
-                             AMDGPU_PG_SUPPORT_CP |
-                             AMDGPU_PG_SUPPORT_GDS |
-                             AMDGPU_PG_SUPPORT_RLC_SMU_HS)) {
+       if (adev->pg_flags & (AMD_PG_SUPPORT_GFX_PG |
+                             AMD_PG_SUPPORT_GFX_SMG |
+                             AMD_PG_SUPPORT_GFX_DMG |
+                             AMD_PG_SUPPORT_CP |
+                             AMD_PG_SUPPORT_GDS |
+                             AMD_PG_SUPPORT_RLC_SMU_HS)) {
                gfx_v7_0_update_gfx_pg(adev, gate);
-               if (adev->pg_flags & AMDGPU_PG_SUPPORT_GFX_PG) {
+               if (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG) {
                        gfx_v7_0_enable_cp_pg(adev, gate);
                        gfx_v7_0_enable_gds_pg(adev, gate);
                }
@@ -5527,7 +5558,7 @@ static int gfx_v7_0_set_powergating_state(void *handle,
 
 const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
        .early_init = gfx_v7_0_early_init,
-       .late_init = NULL,
+       .late_init = gfx_v7_0_late_init,
        .sw_init = gfx_v7_0_sw_init,
        .sw_fini = gfx_v7_0_sw_fini,
        .hw_init = gfx_v7_0_hw_init,
index 13235d8..7086ac1 100644 (file)
@@ -111,7 +111,6 @@ MODULE_FIRMWARE("amdgpu/topaz_ce.bin");
 MODULE_FIRMWARE("amdgpu/topaz_pfp.bin");
 MODULE_FIRMWARE("amdgpu/topaz_me.bin");
 MODULE_FIRMWARE("amdgpu/topaz_mec.bin");
-MODULE_FIRMWARE("amdgpu/topaz_mec2.bin");
 MODULE_FIRMWARE("amdgpu/topaz_rlc.bin");
 
 MODULE_FIRMWARE("amdgpu/fiji_ce.bin");
@@ -828,7 +827,8 @@ static int gfx_v8_0_init_microcode(struct amdgpu_device *adev)
        adev->gfx.mec_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
        adev->gfx.mec_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
-       if (adev->asic_type != CHIP_STONEY) {
+       if ((adev->asic_type != CHIP_STONEY) &&
+           (adev->asic_type != CHIP_TOPAZ)) {
                snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name);
                err = request_firmware(&adev->gfx.mec2_fw, fw_name, adev->dev);
                if (!err) {
@@ -3851,10 +3851,16 @@ static int gfx_v8_0_cp_resume(struct amdgpu_device *adev)
                        if (r)
                                return -EINVAL;
 
-                       r = adev->smu.smumgr_funcs->check_fw_load_finish(adev,
-                                                       AMDGPU_UCODE_ID_CP_MEC1);
-                       if (r)
-                               return -EINVAL;
+                       if (adev->asic_type == CHIP_TOPAZ) {
+                               r = gfx_v8_0_cp_compute_load_microcode(adev);
+                               if (r)
+                                       return r;
+                       } else {
+                               r = adev->smu.smumgr_funcs->check_fw_load_finish(adev,
+                                                                                AMDGPU_UCODE_ID_CP_MEC1);
+                               if (r)
+                                       return -EINVAL;
+                       }
                }
        }
 
@@ -3901,6 +3907,8 @@ static int gfx_v8_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
        gfx_v8_0_cp_enable(adev, false);
        gfx_v8_0_rlc_stop(adev);
        gfx_v8_0_cp_compute_fini(adev);
@@ -4186,7 +4194,18 @@ static int gfx_v8_0_soft_reset(void *handle)
                gfx_v8_0_cp_gfx_enable(adev, false);
 
                /* Disable MEC parsing/prefetching */
-               /* XXX todo */
+               gfx_v8_0_cp_compute_enable(adev, false);
+
+               if (grbm_soft_reset || srbm_soft_reset) {
+                       tmp = RREG32(mmGMCON_DEBUG);
+                       tmp = REG_SET_FIELD(tmp,
+                                           GMCON_DEBUG, GFX_STALL, 1);
+                       tmp = REG_SET_FIELD(tmp,
+                                           GMCON_DEBUG, GFX_CLEAR, 1);
+                       WREG32(mmGMCON_DEBUG, tmp);
+
+                       udelay(50);
+               }
 
                if (grbm_soft_reset) {
                        tmp = RREG32(mmGRBM_SOFT_RESET);
@@ -4215,6 +4234,16 @@ static int gfx_v8_0_soft_reset(void *handle)
                        WREG32(mmSRBM_SOFT_RESET, tmp);
                        tmp = RREG32(mmSRBM_SOFT_RESET);
                }
+
+               if (grbm_soft_reset || srbm_soft_reset) {
+                       tmp = RREG32(mmGMCON_DEBUG);
+                       tmp = REG_SET_FIELD(tmp,
+                                           GMCON_DEBUG, GFX_STALL, 0);
+                       tmp = REG_SET_FIELD(tmp,
+                                           GMCON_DEBUG, GFX_CLEAR, 0);
+                       WREG32(mmGMCON_DEBUG, tmp);
+               }
+
                /* Wait a little for things to settle down */
                udelay(50);
                gfx_v8_0_print_status((void *)adev);
@@ -4308,6 +4337,14 @@ static int gfx_v8_0_late_init(void *handle)
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
        int r;
 
+       r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0);
+       if (r)
+               return r;
+
+       r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0);
+       if (r)
+               return r;
+
        /* requires IBs so do in late init after IB pool is initialized */
        r = gfx_v8_0_do_edc_gpr_workarounds(adev);
        if (r)
@@ -4772,7 +4809,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 
        amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5));
        amdgpu_ring_write(ring, (WAIT_REG_MEM_MEM_SPACE(1) | /* memory */
-                WAIT_REG_MEM_FUNCTION(3))); /* equal */
+                                WAIT_REG_MEM_FUNCTION(3) | /* equal */
+                                WAIT_REG_MEM_ENGINE(usepfp))); /* pfp or me */
        amdgpu_ring_write(ring, addr & 0xfffffffc);
        amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
        amdgpu_ring_write(ring, seq);
@@ -4958,7 +4996,7 @@ static int gfx_v8_0_set_priv_reg_fault_state(struct amdgpu_device *adev,
        case AMDGPU_IRQ_STATE_ENABLE:
                cp_int_cntl = RREG32(mmCP_INT_CNTL_RING0);
                cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0,
-                                           PRIV_REG_INT_ENABLE, 0);
+                                           PRIV_REG_INT_ENABLE, 1);
                WREG32(mmCP_INT_CNTL_RING0, cp_int_cntl);
                break;
        default:
index 3f95606..b806079 100644 (file)
@@ -42,9 +42,39 @@ static void gmc_v7_0_set_irq_funcs(struct amdgpu_device *adev);
 
 MODULE_FIRMWARE("radeon/bonaire_mc.bin");
 MODULE_FIRMWARE("radeon/hawaii_mc.bin");
+MODULE_FIRMWARE("amdgpu/topaz_mc.bin");
+
+static const u32 golden_settings_iceland_a11[] =
+{
+       mmVM_PRT_APERTURE0_LOW_ADDR, 0x0fffffff, 0x0fffffff,
+       mmVM_PRT_APERTURE1_LOW_ADDR, 0x0fffffff, 0x0fffffff,
+       mmVM_PRT_APERTURE2_LOW_ADDR, 0x0fffffff, 0x0fffffff,
+       mmVM_PRT_APERTURE3_LOW_ADDR, 0x0fffffff, 0x0fffffff
+};
+
+static const u32 iceland_mgcg_cgcg_init[] =
+{
+       mmMC_MEM_POWER_LS, 0xffffffff, 0x00000104
+};
+
+static void gmc_v7_0_init_golden_registers(struct amdgpu_device *adev)
+{
+       switch (adev->asic_type) {
+       case CHIP_TOPAZ:
+               amdgpu_program_register_sequence(adev,
+                                                iceland_mgcg_cgcg_init,
+                                                (const u32)ARRAY_SIZE(iceland_mgcg_cgcg_init));
+               amdgpu_program_register_sequence(adev,
+                                                golden_settings_iceland_a11,
+                                                (const u32)ARRAY_SIZE(golden_settings_iceland_a11));
+               break;
+       default:
+               break;
+       }
+}
 
 /**
- * gmc8_mc_wait_for_idle - wait for MC idle callback.
+ * gmc7_mc_wait_for_idle - wait for MC idle callback.
  *
  * @adev: amdgpu_device pointer
  *
@@ -132,13 +162,20 @@ static int gmc_v7_0_init_microcode(struct amdgpu_device *adev)
        case CHIP_HAWAII:
                chip_name = "hawaii";
                break;
+       case CHIP_TOPAZ:
+               chip_name = "topaz";
+               break;
        case CHIP_KAVERI:
        case CHIP_KABINI:
                return 0;
        default: BUG();
        }
 
-       snprintf(fw_name, sizeof(fw_name), "radeon/%s_mc.bin", chip_name);
+       if (adev->asic_type == CHIP_TOPAZ)
+               snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mc.bin", chip_name);
+       else
+               snprintf(fw_name, sizeof(fw_name), "radeon/%s_mc.bin", chip_name);
+
        err = request_firmware(&adev->mc.fw, fw_name, adev->dev);
        if (err)
                goto out;
@@ -755,7 +792,7 @@ static void gmc_v7_0_enable_mc_ls(struct amdgpu_device *adev,
 
        for (i = 0; i < ARRAY_SIZE(mc_cg_registers); i++) {
                orig = data = RREG32(mc_cg_registers[i]);
-               if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_MC_LS))
+               if (enable && (adev->cg_flags & AMD_CG_SUPPORT_MC_LS))
                        data |= mc_cg_ls_en[i];
                else
                        data &= ~mc_cg_ls_en[i];
@@ -772,7 +809,7 @@ static void gmc_v7_0_enable_mc_mgcg(struct amdgpu_device *adev,
 
        for (i = 0; i < ARRAY_SIZE(mc_cg_registers); i++) {
                orig = data = RREG32(mc_cg_registers[i]);
-               if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_MC_MGCG))
+               if (enable && (adev->cg_flags & AMD_CG_SUPPORT_MC_MGCG))
                        data |= mc_cg_en[i];
                else
                        data &= ~mc_cg_en[i];
@@ -788,7 +825,7 @@ static void gmc_v7_0_enable_bif_mgls(struct amdgpu_device *adev,
 
        orig = data = RREG32_PCIE(ixPCIE_CNTL2);
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_BIF_LS)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_BIF_LS)) {
                data = REG_SET_FIELD(data, PCIE_CNTL2, SLV_MEM_LS_EN, 1);
                data = REG_SET_FIELD(data, PCIE_CNTL2, MST_MEM_LS_EN, 1);
                data = REG_SET_FIELD(data, PCIE_CNTL2, REPLAY_MEM_LS_EN, 1);
@@ -811,7 +848,7 @@ static void gmc_v7_0_enable_hdp_mgcg(struct amdgpu_device *adev,
 
        orig = data = RREG32(mmHDP_HOST_PATH_CNTL);
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_HDP_MGCG))
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_HDP_MGCG))
                data = REG_SET_FIELD(data, HDP_HOST_PATH_CNTL, CLOCK_GATING_DIS, 0);
        else
                data = REG_SET_FIELD(data, HDP_HOST_PATH_CNTL, CLOCK_GATING_DIS, 1);
@@ -827,7 +864,7 @@ static void gmc_v7_0_enable_hdp_ls(struct amdgpu_device *adev,
 
        orig = data = RREG32(mmHDP_MEM_POWER_LS);
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_HDP_LS))
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_HDP_LS))
                data = REG_SET_FIELD(data, HDP_MEM_POWER_LS, LS_ENABLE, 1);
        else
                data = REG_SET_FIELD(data, HDP_MEM_POWER_LS, LS_ENABLE, 0);
@@ -984,6 +1021,8 @@ static int gmc_v7_0_hw_init(void *handle)
        int r;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       gmc_v7_0_init_golden_registers(adev);
+
        gmc_v7_0_mc_program(adev);
 
        if (!(adev->flags & AMD_IS_APU)) {
index c0c9a01..3efd455 100644 (file)
@@ -42,9 +42,7 @@
 static void gmc_v8_0_set_gart_funcs(struct amdgpu_device *adev);
 static void gmc_v8_0_set_irq_funcs(struct amdgpu_device *adev);
 
-MODULE_FIRMWARE("amdgpu/topaz_mc.bin");
 MODULE_FIRMWARE("amdgpu/tonga_mc.bin");
-MODULE_FIRMWARE("amdgpu/fiji_mc.bin");
 
 static const u32 golden_settings_tonga_a11[] =
 {
@@ -75,19 +73,6 @@ static const u32 fiji_mgcg_cgcg_init[] =
        mmMC_MEM_POWER_LS, 0xffffffff, 0x00000104
 };
 
-static const u32 golden_settings_iceland_a11[] =
-{
-       mmVM_PRT_APERTURE0_LOW_ADDR, 0x0fffffff, 0x0fffffff,
-       mmVM_PRT_APERTURE1_LOW_ADDR, 0x0fffffff, 0x0fffffff,
-       mmVM_PRT_APERTURE2_LOW_ADDR, 0x0fffffff, 0x0fffffff,
-       mmVM_PRT_APERTURE3_LOW_ADDR, 0x0fffffff, 0x0fffffff
-};
-
-static const u32 iceland_mgcg_cgcg_init[] =
-{
-       mmMC_MEM_POWER_LS, 0xffffffff, 0x00000104
-};
-
 static const u32 cz_mgcg_cgcg_init[] =
 {
        mmMC_MEM_POWER_LS, 0xffffffff, 0x00000104
@@ -102,14 +87,6 @@ static const u32 stoney_mgcg_cgcg_init[] =
 static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev)
 {
        switch (adev->asic_type) {
-       case CHIP_TOPAZ:
-               amdgpu_program_register_sequence(adev,
-                                                iceland_mgcg_cgcg_init,
-                                                (const u32)ARRAY_SIZE(iceland_mgcg_cgcg_init));
-               amdgpu_program_register_sequence(adev,
-                                                golden_settings_iceland_a11,
-                                                (const u32)ARRAY_SIZE(golden_settings_iceland_a11));
-               break;
        case CHIP_FIJI:
                amdgpu_program_register_sequence(adev,
                                                 fiji_mgcg_cgcg_init,
@@ -229,15 +206,10 @@ static int gmc_v8_0_init_microcode(struct amdgpu_device *adev)
        DRM_DEBUG("\n");
 
        switch (adev->asic_type) {
-       case CHIP_TOPAZ:
-               chip_name = "topaz";
-               break;
        case CHIP_TONGA:
                chip_name = "tonga";
                break;
        case CHIP_FIJI:
-               chip_name = "fiji";
-               break;
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                return 0;
@@ -1007,7 +979,7 @@ static int gmc_v8_0_hw_init(void *handle)
 
        gmc_v8_0_mc_program(adev);
 
-       if (!(adev->flags & AMD_IS_APU)) {
+       if (adev->asic_type == CHIP_TONGA) {
                r = gmc_v8_0_mc_load_microcode(adev);
                if (r) {
                        DRM_ERROR("Failed to load MC firmware!\n");
index 966d4b2..090486c 100644 (file)
@@ -432,7 +432,7 @@ static uint32_t iceland_smu_get_mask_for_fw_type(uint32_t fw_type)
                case AMDGPU_UCODE_ID_CP_ME:
                        return UCODE_ID_CP_ME_MASK;
                case AMDGPU_UCODE_ID_CP_MEC1:
-                       return UCODE_ID_CP_MEC_MASK | UCODE_ID_CP_MEC_JT1_MASK | UCODE_ID_CP_MEC_JT2_MASK;
+                       return UCODE_ID_CP_MEC_MASK | UCODE_ID_CP_MEC_JT1_MASK;
                case AMDGPU_UCODE_ID_CP_MEC2:
                        return UCODE_ID_CP_MEC_MASK;
                case AMDGPU_UCODE_ID_RLC_G:
@@ -522,12 +522,6 @@ static int iceland_smu_request_load_fw(struct amdgpu_device *adev)
                return -EINVAL;
        }
 
-       if (iceland_smu_populate_single_firmware_entry(adev, UCODE_ID_CP_MEC_JT2,
-                       &toc->entry[toc->num_entries++])) {
-               DRM_ERROR("Failed to get firmware entry for MEC_JT2\n");
-               return -EINVAL;
-       }
-
        if (iceland_smu_populate_single_firmware_entry(adev, UCODE_ID_SDMA0,
                        &toc->entry[toc->num_entries++])) {
                DRM_ERROR("Failed to get firmware entry for SDMA0\n");
@@ -550,8 +544,8 @@ static int iceland_smu_request_load_fw(struct amdgpu_device *adev)
                        UCODE_ID_CP_ME_MASK |
                        UCODE_ID_CP_PFP_MASK |
                        UCODE_ID_CP_MEC_MASK |
-                       UCODE_ID_CP_MEC_JT1_MASK |
-                       UCODE_ID_CP_MEC_JT2_MASK;
+                       UCODE_ID_CP_MEC_JT1_MASK;
+
 
        if (iceland_send_msg_to_smc_with_parameter_without_waiting(adev, PPSMC_MSG_LoadUcodes, fw_to_load)) {
                DRM_ERROR("Fail to request SMU load ucode\n");
index 7e9154c..654d767 100644 (file)
@@ -2859,11 +2859,11 @@ static int kv_dpm_init(struct amdgpu_device *adev)
        pi->voltage_drop_t = 0;
        pi->caps_sclk_throttle_low_notification = false;
        pi->caps_fps = false; /* true? */
-       pi->caps_uvd_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_UVD) ? true : false;
+       pi->caps_uvd_pg = (adev->pg_flags & AMD_PG_SUPPORT_UVD) ? true : false;
        pi->caps_uvd_dpm = true;
-       pi->caps_vce_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_VCE) ? true : false;
-       pi->caps_samu_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_SAMU) ? true : false;
-       pi->caps_acp_pg = (adev->pg_flags & AMDGPU_PG_SUPPORT_ACP) ? true : false;
+       pi->caps_vce_pg = (adev->pg_flags & AMD_PG_SUPPORT_VCE) ? true : false;
+       pi->caps_samu_pg = (adev->pg_flags & AMD_PG_SUPPORT_SAMU) ? true : false;
+       pi->caps_acp_pg = (adev->pg_flags & AMD_PG_SUPPORT_ACP) ? true : false;
        pi->caps_stable_p_state = false;
 
        ret = kv_parse_sys_info_table(adev);
index f4a1346..0497784 100644 (file)
@@ -122,25 +122,12 @@ static int tonga_dpm_hw_fini(void *handle)
 
 static int tonga_dpm_suspend(void *handle)
 {
-       return 0;
+       return tonga_dpm_hw_fini(handle);
 }
 
 static int tonga_dpm_resume(void *handle)
 {
-       int ret;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
-       mutex_lock(&adev->pm.mutex);
-
-       ret = tonga_smu_start(adev);
-       if (ret) {
-               DRM_ERROR("SMU start failed\n");
-               goto fail;
-       }
-
-fail:
-       mutex_unlock(&adev->pm.mutex);
-       return ret;
+       return tonga_dpm_hw_init(handle);
 }
 
 static int tonga_dpm_set_clockgating_state(void *handle,
index 5e9f73a..fbd3767 100644 (file)
@@ -611,7 +611,7 @@ static void uvd_v4_2_enable_mgcg(struct amdgpu_device *adev,
 {
        u32 orig, data;
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_UVD_MGCG)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG)) {
                data = RREG32_UVD_CTX(ixUVD_CGC_MEM_CTRL);
                data = 0xfff;
                WREG32_UVD_CTX(ixUVD_CGC_MEM_CTRL, data);
@@ -830,6 +830,9 @@ static int uvd_v4_2_set_clockgating_state(void *handle,
        bool gate = false;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (!(adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG))
+               return 0;
+
        if (state == AMD_CG_STATE_GATE)
                gate = true;
 
@@ -848,7 +851,10 @@ static int uvd_v4_2_set_powergating_state(void *handle,
         * revisit this when there is a cleaner line between
         * the smc and the hw blocks
         */
-        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       if (!(adev->pg_flags & AMD_PG_SUPPORT_UVD))
+               return 0;
 
        if (state == AMD_PG_STATE_GATE) {
                uvd_v4_2_stop(adev);
index 38864f5..57f1c5b 100644 (file)
@@ -774,6 +774,11 @@ static int uvd_v5_0_process_interrupt(struct amdgpu_device *adev,
 static int uvd_v5_0_set_clockgating_state(void *handle,
                                          enum amd_clockgating_state state)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       if (!(adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG))
+               return 0;
+
        return 0;
 }
 
@@ -789,6 +794,9 @@ static int uvd_v5_0_set_powergating_state(void *handle,
         */
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (!(adev->pg_flags & AMD_PG_SUPPORT_UVD))
+               return 0;
+
        if (state == AMD_PG_STATE_GATE) {
                uvd_v5_0_stop(adev);
                return 0;
index 3d59139..0b365b7 100644 (file)
@@ -532,7 +532,7 @@ static int uvd_v6_0_start(struct amdgpu_device *adev)
        uvd_v6_0_mc_resume(adev);
 
        /* Set dynamic clock gating in S/W control mode */
-       if (adev->cg_flags & AMDGPU_CG_SUPPORT_UVD_MGCG) {
+       if (adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG) {
                if (adev->flags & AMD_IS_APU)
                        cz_set_uvd_clock_gating_branches(adev, false);
                else
@@ -1000,7 +1000,7 @@ static int uvd_v6_0_set_clockgating_state(void *handle,
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
        bool enable = (state == AMD_CG_STATE_GATE) ? true : false;
 
-       if (!(adev->cg_flags & AMDGPU_CG_SUPPORT_UVD_MGCG))
+       if (!(adev->cg_flags & AMD_CG_SUPPORT_UVD_MGCG))
                return 0;
 
        if (enable) {
@@ -1030,6 +1030,9 @@ static int uvd_v6_0_set_powergating_state(void *handle,
         */
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (!(adev->pg_flags & AMD_PG_SUPPORT_UVD))
+               return 0;
+
        if (state == AMD_PG_STATE_GATE) {
                uvd_v6_0_stop(adev);
                return 0;
index 52ac7a8..a822eda 100644 (file)
@@ -373,7 +373,7 @@ static void vce_v2_0_enable_mgcg(struct amdgpu_device *adev, bool enable)
 {
        bool sw_cg = false;
 
-       if (enable && (adev->cg_flags & AMDGPU_CG_SUPPORT_VCE_MGCG)) {
+       if (enable && (adev->cg_flags & AMD_CG_SUPPORT_VCE_MGCG)) {
                if (sw_cg)
                        vce_v2_0_set_sw_cg(adev, true);
                else
@@ -608,6 +608,9 @@ static int vce_v2_0_set_powergating_state(void *handle,
         */
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (!(adev->pg_flags & AMD_PG_SUPPORT_VCE))
+               return 0;
+
        if (state == AMD_PG_STATE_GATE)
                /* XXX do we need a vce_v2_0_stop()? */
                return 0;
index e99af81..d662fa9 100644 (file)
@@ -277,7 +277,7 @@ static int vce_v3_0_start(struct amdgpu_device *adev)
                WREG32_P(mmVCE_STATUS, 0, ~1);
 
                /* Set Clock-Gating off */
-               if (adev->cg_flags & AMDGPU_CG_SUPPORT_VCE_MGCG)
+               if (adev->cg_flags & AMD_CG_SUPPORT_VCE_MGCG)
                        vce_v3_0_set_vce_sw_clock_gating(adev, false);
 
                if (r) {
@@ -676,7 +676,7 @@ static int vce_v3_0_set_clockgating_state(void *handle,
        bool enable = (state == AMD_CG_STATE_GATE) ? true : false;
        int i;
 
-       if (!(adev->cg_flags & AMDGPU_CG_SUPPORT_VCE_MGCG))
+       if (!(adev->cg_flags & AMD_CG_SUPPORT_VCE_MGCG))
                return 0;
 
        mutex_lock(&adev->grbm_idx_mutex);
@@ -728,6 +728,9 @@ static int vce_v3_0_set_powergating_state(void *handle,
         */
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       if (!(adev->pg_flags & AMD_PG_SUPPORT_VCE))
+               return 0;
+
        if (state == AMD_PG_STATE_GATE)
                /* XXX do we need a vce_v3_0_stop()? */
                return 0;
index 652e766..0d14d10 100644 (file)
@@ -61,6 +61,7 @@
 #include "vi.h"
 #include "vi_dpm.h"
 #include "gmc_v8_0.h"
+#include "gmc_v7_0.h"
 #include "gfx_v8_0.h"
 #include "sdma_v2_4.h"
 #include "sdma_v3_0.h"
@@ -1109,10 +1110,10 @@ static const struct amdgpu_ip_block_version topaz_ip_blocks[] =
        },
        {
                .type = AMD_IP_BLOCK_TYPE_GMC,
-               .major = 8,
-               .minor = 0,
+               .major = 7,
+               .minor = 4,
                .rev = 0,
-               .funcs = &gmc_v8_0_ip_funcs,
+               .funcs = &gmc_v7_0_ip_funcs,
        },
        {
                .type = AMD_IP_BLOCK_TYPE_IH,
@@ -1442,8 +1443,7 @@ static int vi_common_early_init(void *handle)
                break;
        case CHIP_FIJI:
                adev->has_uvd = true;
-               adev->cg_flags = AMDGPU_CG_SUPPORT_UVD_MGCG |
-                               AMDGPU_CG_SUPPORT_VCE_MGCG;
+               adev->cg_flags = 0;
                adev->pg_flags = 0;
                adev->external_rev_id = adev->rev_id + 0x3c;
                break;
@@ -1457,8 +1457,7 @@ static int vi_common_early_init(void *handle)
        case CHIP_STONEY:
                adev->has_uvd = true;
                adev->cg_flags = 0;
-               /* Disable UVD pg */
-               adev->pg_flags = /* AMDGPU_PG_SUPPORT_UVD | */AMDGPU_PG_SUPPORT_VCE;
+               adev->pg_flags = 0;
                adev->external_rev_id = adev->rev_id + 0x1;
                break;
        default:
index 9be0070..a902ae0 100644 (file)
@@ -194,7 +194,7 @@ static void kfd_process_wq_release(struct work_struct *work)
 
        kfree(p);
 
-       kfree((void *)work);
+       kfree(work);
 }
 
 static void kfd_process_destroy_delayed(struct rcu_head *rcu)
index 1195d06..dbf7e64 100644 (file)
@@ -85,6 +85,38 @@ enum amd_powergating_state {
        AMD_PG_STATE_UNGATE,
 };
 
+/* CG flags */
+#define AMD_CG_SUPPORT_GFX_MGCG                        (1 << 0)
+#define AMD_CG_SUPPORT_GFX_MGLS                        (1 << 1)
+#define AMD_CG_SUPPORT_GFX_CGCG                        (1 << 2)
+#define AMD_CG_SUPPORT_GFX_CGLS                        (1 << 3)
+#define AMD_CG_SUPPORT_GFX_CGTS                        (1 << 4)
+#define AMD_CG_SUPPORT_GFX_CGTS_LS             (1 << 5)
+#define AMD_CG_SUPPORT_GFX_CP_LS               (1 << 6)
+#define AMD_CG_SUPPORT_GFX_RLC_LS              (1 << 7)
+#define AMD_CG_SUPPORT_MC_LS                   (1 << 8)
+#define AMD_CG_SUPPORT_MC_MGCG                 (1 << 9)
+#define AMD_CG_SUPPORT_SDMA_LS                 (1 << 10)
+#define AMD_CG_SUPPORT_SDMA_MGCG               (1 << 11)
+#define AMD_CG_SUPPORT_BIF_LS                  (1 << 12)
+#define AMD_CG_SUPPORT_UVD_MGCG                        (1 << 13)
+#define AMD_CG_SUPPORT_VCE_MGCG                        (1 << 14)
+#define AMD_CG_SUPPORT_HDP_LS                  (1 << 15)
+#define AMD_CG_SUPPORT_HDP_MGCG                        (1 << 16)
+
+/* PG flags */
+#define AMD_PG_SUPPORT_GFX_PG                  (1 << 0)
+#define AMD_PG_SUPPORT_GFX_SMG                 (1 << 1)
+#define AMD_PG_SUPPORT_GFX_DMG                 (1 << 2)
+#define AMD_PG_SUPPORT_UVD                     (1 << 3)
+#define AMD_PG_SUPPORT_VCE                     (1 << 4)
+#define AMD_PG_SUPPORT_CP                      (1 << 5)
+#define AMD_PG_SUPPORT_GDS                     (1 << 6)
+#define AMD_PG_SUPPORT_RLC_SMU_HS              (1 << 7)
+#define AMD_PG_SUPPORT_SDMA                    (1 << 8)
+#define AMD_PG_SUPPORT_ACP                     (1 << 9)
+#define AMD_PG_SUPPORT_SAMU                    (1 << 10)
+
 enum amd_pm_state_type {
        /* not used for dpm */
        POWER_STATE_TYPE_DEFAULT,
index 713aec9..aec38fc 100644 (file)
@@ -109,6 +109,8 @@ enum cgs_system_info_id {
        CGS_SYSTEM_INFO_ADAPTER_BDF_ID = 1,
        CGS_SYSTEM_INFO_PCIE_GEN_INFO,
        CGS_SYSTEM_INFO_PCIE_MLW,
+       CGS_SYSTEM_INFO_CG_FLAGS,
+       CGS_SYSTEM_INFO_PG_FLAGS,
        CGS_SYSTEM_INFO_ID_MAXIMUM,
 };
 
index 8f5d5ed..589599f 100644 (file)
@@ -64,6 +64,11 @@ static int pp_sw_init(void *handle)
        if (ret == 0)
                ret = hwmgr->hwmgr_func->backend_init(hwmgr);
 
+       if (ret)
+               printk("amdgpu: powerplay initialization failed\n");
+       else
+               printk("amdgpu: powerplay initialized\n");
+
        return ret;
 }
 
@@ -397,8 +402,11 @@ int pp_dpm_dispatch_tasks(void *handle, enum amd_pp_event event_id, void *input,
 
                data.requested_ui_label = power_state_convert(ps);
                ret = pem_handle_event(pp_handle->eventmgr, event_id, &data);
+               break;
        }
-       break;
+       case AMD_PP_EVENT_COMPLETE_INIT:
+               ret = pem_handle_event(pp_handle->eventmgr, event_id, &data);
+               break;
        default:
                break;
        }
index 83be3cf..6b52c78 100644 (file)
@@ -165,6 +165,7 @@ const struct action_chain resume_action_chain = {
 };
 
 static const pem_event_action *complete_init_event[] = {
+       unblock_adjust_power_state_tasks,
        adjust_power_state_tasks,
        enable_gfx_clock_gating_tasks,
        enable_gfx_voltage_island_power_gating_tasks,
index 52a3efc..46410e3 100644 (file)
@@ -31,7 +31,7 @@
 static int pem_init(struct pp_eventmgr *eventmgr)
 {
        int result = 0;
-       struct pem_event_data event_data;
+       struct pem_event_data event_data = { {0} };
 
        /* Initialize PowerPlay feature info */
        pem_init_feature_info(eventmgr);
@@ -52,7 +52,7 @@ static int pem_init(struct pp_eventmgr *eventmgr)
 
 static void pem_fini(struct pp_eventmgr *eventmgr)
 {
-       struct pem_event_data event_data;
+       struct pem_event_data event_data = { {0} };
 
        pem_uninit_featureInfo(eventmgr);
        pem_unregister_interrupts(eventmgr);
index ad77008..ff08ce4 100644 (file)
@@ -226,7 +226,7 @@ int cz_dpm_powergate_vce(struct pp_hwmgr *hwmgr, bool bgate)
                }
        } else {
                cz_dpm_update_vce_dpm(hwmgr);
-               cz_enable_disable_vce_dpm(hwmgr, true);
+               cz_enable_disable_vce_dpm(hwmgr, !bgate);
                return 0;
        }
 
index 0874ab4..cf01177 100644 (file)
@@ -174,6 +174,8 @@ static int cz_initialize_dpm_defaults(struct pp_hwmgr *hwmgr)
 {
        struct cz_hwmgr *cz_hwmgr = (struct cz_hwmgr *)(hwmgr->backend);
        uint32_t i;
+       struct cgs_system_info sys_info = {0};
+       int result;
 
        cz_hwmgr->gfx_ramp_step = 256*25/100;
 
@@ -247,6 +249,22 @@ static int cz_initialize_dpm_defaults(struct pp_hwmgr *hwmgr)
        phm_cap_set(hwmgr->platform_descriptor.platformCaps,
                                   PHM_PlatformCaps_DisableVoltageIsland);
 
+       phm_cap_unset(hwmgr->platform_descriptor.platformCaps,
+                     PHM_PlatformCaps_UVDPowerGating);
+       phm_cap_unset(hwmgr->platform_descriptor.platformCaps,
+                     PHM_PlatformCaps_VCEPowerGating);
+       sys_info.size = sizeof(struct cgs_system_info);
+       sys_info.info_id = CGS_SYSTEM_INFO_PG_FLAGS;
+       result = cgs_query_system_info(hwmgr->device, &sys_info);
+       if (!result) {
+               if (sys_info.value & AMD_PG_SUPPORT_UVD)
+                       phm_cap_set(hwmgr->platform_descriptor.platformCaps,
+                                     PHM_PlatformCaps_UVDPowerGating);
+               if (sys_info.value & AMD_PG_SUPPORT_VCE)
+                       phm_cap_set(hwmgr->platform_descriptor.platformCaps,
+                                     PHM_PlatformCaps_VCEPowerGating);
+       }
+
        return 0;
 }
 
index 44a9250..980d3bf 100644 (file)
@@ -4451,6 +4451,7 @@ int tonga_hwmgr_backend_init(struct pp_hwmgr *hwmgr)
        pp_atomctrl_gpio_pin_assignment gpio_pin_assignment;
        struct phm_ppt_v1_information *pptable_info = (struct phm_ppt_v1_information *)(hwmgr->pptable);
        phw_tonga_ulv_parm *ulv;
+       struct cgs_system_info sys_info = {0};
 
        PP_ASSERT_WITH_CODE((NULL != hwmgr),
                "Invalid Parameter!", return -1;);
@@ -4615,9 +4616,23 @@ int tonga_hwmgr_backend_init(struct pp_hwmgr *hwmgr)
 
        data->vddc_phase_shed_control = 0;
 
-       if (0 == result) {
-               struct cgs_system_info sys_info = {0};
+       phm_cap_unset(hwmgr->platform_descriptor.platformCaps,
+                     PHM_PlatformCaps_UVDPowerGating);
+       phm_cap_unset(hwmgr->platform_descriptor.platformCaps,
+                     PHM_PlatformCaps_VCEPowerGating);
+       sys_info.size = sizeof(struct cgs_system_info);
+       sys_info.info_id = CGS_SYSTEM_INFO_PG_FLAGS;
+       result = cgs_query_system_info(hwmgr->device, &sys_info);
+       if (!result) {
+               if (sys_info.value & AMD_PG_SUPPORT_UVD)
+                       phm_cap_set(hwmgr->platform_descriptor.platformCaps,
+                                     PHM_PlatformCaps_UVDPowerGating);
+               if (sys_info.value & AMD_PG_SUPPORT_VCE)
+                       phm_cap_set(hwmgr->platform_descriptor.platformCaps,
+                                     PHM_PlatformCaps_VCEPowerGating);
+       }
 
+       if (0 == result) {
                data->is_tlu_enabled = 0;
                hwmgr->platform_descriptor.hardwareActivityPerformanceLevels =
                        TONGA_MAX_HARDWARE_POWERLEVELS;
index 873a8d2..ec222c6 100644 (file)
@@ -272,6 +272,9 @@ static int cz_start_smu(struct pp_smumgr *smumgr)
                                UCODE_ID_CP_MEC_JT1_MASK |
                                UCODE_ID_CP_MEC_JT2_MASK;
 
+       if (smumgr->chip_id == CHIP_STONEY)
+               fw_to_check &= ~(UCODE_ID_SDMA1_MASK | UCODE_ID_CP_MEC_JT2_MASK);
+
        cz_request_smu_load_fw(smumgr);
        cz_check_fw_load_finish(smumgr, fw_to_check);
 
@@ -282,7 +285,7 @@ static int cz_start_smu(struct pp_smumgr *smumgr)
        return ret;
 }
 
-static uint8_t cz_translate_firmware_enum_to_arg(
+static uint8_t cz_translate_firmware_enum_to_arg(struct pp_smumgr *smumgr,
                        enum cz_scratch_entry firmware_enum)
 {
        uint8_t ret = 0;
@@ -292,7 +295,10 @@ static uint8_t cz_translate_firmware_enum_to_arg(
                ret = UCODE_ID_SDMA0;
                break;
        case CZ_SCRATCH_ENTRY_UCODE_ID_SDMA1:
-               ret = UCODE_ID_SDMA1;
+               if (smumgr->chip_id == CHIP_STONEY)
+                       ret = UCODE_ID_SDMA0;
+               else
+                       ret = UCODE_ID_SDMA1;
                break;
        case CZ_SCRATCH_ENTRY_UCODE_ID_CP_CE:
                ret = UCODE_ID_CP_CE;
@@ -307,7 +313,10 @@ static uint8_t cz_translate_firmware_enum_to_arg(
                ret = UCODE_ID_CP_MEC_JT1;
                break;
        case CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT2:
-               ret = UCODE_ID_CP_MEC_JT2;
+               if (smumgr->chip_id == CHIP_STONEY)
+                       ret = UCODE_ID_CP_MEC_JT1;
+               else
+                       ret = UCODE_ID_CP_MEC_JT2;
                break;
        case CZ_SCRATCH_ENTRY_UCODE_ID_GMCON_RENG:
                ret = UCODE_ID_GMCON_RENG;
@@ -396,7 +405,7 @@ static int cz_smu_populate_single_scratch_task(
        struct SMU_Task *task = &toc->tasks[cz_smu->toc_entry_used_count++];
 
        task->type = type;
-       task->arg = cz_translate_firmware_enum_to_arg(fw_enum);
+       task->arg = cz_translate_firmware_enum_to_arg(smumgr, fw_enum);
        task->next = is_last ? END_OF_TASK_LIST : cz_smu->toc_entry_used_count;
 
        for (i = 0; i < cz_smu->scratch_buffer_length; i++)
@@ -433,7 +442,7 @@ static int cz_smu_populate_single_ucode_load_task(
        struct SMU_Task *task = &toc->tasks[cz_smu->toc_entry_used_count++];
 
        task->type = TASK_TYPE_UCODE_LOAD;
-       task->arg = cz_translate_firmware_enum_to_arg(fw_enum);
+       task->arg = cz_translate_firmware_enum_to_arg(smumgr, fw_enum);
        task->next = is_last ? END_OF_TASK_LIST : cz_smu->toc_entry_used_count;
 
        for (i = 0; i < cz_smu->driver_buffer_length; i++)
@@ -509,8 +518,14 @@ static int cz_smu_construct_toc_for_vddgfx_exit(struct pp_smumgr *smumgr)
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_ME, false);
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT1, false);
-       cz_smu_populate_single_ucode_load_task(smumgr,
+
+       if (smumgr->chip_id == CHIP_STONEY)
+               cz_smu_populate_single_ucode_load_task(smumgr,
+                               CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT1, false);
+       else
+               cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT2, false);
+
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_RLC_G, false);
 
@@ -551,7 +566,11 @@ static int cz_smu_construct_toc_for_bootup(struct pp_smumgr *smumgr)
 
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_SDMA0, false);
-       cz_smu_populate_single_ucode_load_task(smumgr,
+       if (smumgr->chip_id == CHIP_STONEY)
+               cz_smu_populate_single_ucode_load_task(smumgr,
+                               CZ_SCRATCH_ENTRY_UCODE_ID_SDMA0, false);
+       else
+               cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_SDMA1, false);
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_CE, false);
@@ -561,7 +580,11 @@ static int cz_smu_construct_toc_for_bootup(struct pp_smumgr *smumgr)
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_ME, false);
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT1, false);
-       cz_smu_populate_single_ucode_load_task(smumgr,
+       if (smumgr->chip_id == CHIP_STONEY)
+               cz_smu_populate_single_ucode_load_task(smumgr,
+                               CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT1, false);
+       else
+               cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_CP_MEC_JT2, false);
        cz_smu_populate_single_ucode_load_task(smumgr,
                                CZ_SCRATCH_ENTRY_UCODE_ID_RLC_G, true);
@@ -618,7 +641,7 @@ static int cz_smu_populate_firmware_entries(struct pp_smumgr *smumgr)
 
        for (i = 0; i < sizeof(firmware_list)/sizeof(*firmware_list); i++) {
 
-               firmware_type = cz_translate_firmware_enum_to_arg(
+               firmware_type = cz_translate_firmware_enum_to_arg(smumgr,
                                        firmware_list[i]);
 
                ucode_id = cz_convert_fw_type_to_cgs(firmware_type);
index 9759009..b1480ac 100644 (file)
@@ -227,7 +227,7 @@ static int ast_get_dram_info(struct drm_device *dev)
        } while (ast_read32(ast, 0x10000) != 0x01);
        data = ast_read32(ast, 0x10004);
 
-       if (data & 0x400)
+       if (data & 0x40)
                ast->dram_bus_width = 16;
        else
                ast->dram_bus_width = 32;
index 3f74193..9a7b446 100644 (file)
@@ -65,8 +65,6 @@ drm_atomic_state_init(struct drm_device *dev, struct drm_atomic_state *state)
         */
        state->allow_modeset = true;
 
-       state->num_connector = ACCESS_ONCE(dev->mode_config.num_connector);
-
        state->crtcs = kcalloc(dev->mode_config.num_crtc,
                               sizeof(*state->crtcs), GFP_KERNEL);
        if (!state->crtcs)
@@ -83,16 +81,6 @@ drm_atomic_state_init(struct drm_device *dev, struct drm_atomic_state *state)
                                      sizeof(*state->plane_states), GFP_KERNEL);
        if (!state->plane_states)
                goto fail;
-       state->connectors = kcalloc(state->num_connector,
-                                   sizeof(*state->connectors),
-                                   GFP_KERNEL);
-       if (!state->connectors)
-               goto fail;
-       state->connector_states = kcalloc(state->num_connector,
-                                         sizeof(*state->connector_states),
-                                         GFP_KERNEL);
-       if (!state->connector_states)
-               goto fail;
 
        state->dev = dev;
 
@@ -823,19 +811,27 @@ drm_atomic_get_connector_state(struct drm_atomic_state *state,
 
        index = drm_connector_index(connector);
 
-       /*
-        * Construction of atomic state updates can race with a connector
-        * hot-add which might overflow. In this case flip the table and just
-        * restart the entire ioctl - no one is fast enough to livelock a cpu
-        * with physical hotplug events anyway.
-        *
-        * Note that we only grab the indexes once we have the right lock to
-        * prevent hotplug/unplugging of connectors. So removal is no problem,
-        * at most the array is a bit too large.
-        */
        if (index >= state->num_connector) {
-               DRM_DEBUG_ATOMIC("Hot-added connector would overflow state array, restarting\n");
-               return ERR_PTR(-EAGAIN);
+               struct drm_connector **c;
+               struct drm_connector_state **cs;
+               int alloc = max(index + 1, config->num_connector);
+
+               c = krealloc(state->connectors, alloc * sizeof(*state->connectors), GFP_KERNEL);
+               if (!c)
+                       return ERR_PTR(-ENOMEM);
+
+               state->connectors = c;
+               memset(&state->connectors[state->num_connector], 0,
+                      sizeof(*state->connectors) * (alloc - state->num_connector));
+
+               cs = krealloc(state->connector_states, alloc * sizeof(*state->connector_states), GFP_KERNEL);
+               if (!cs)
+                       return ERR_PTR(-ENOMEM);
+
+               state->connector_states = cs;
+               memset(&state->connector_states[state->num_connector], 0,
+                      sizeof(*state->connector_states) * (alloc - state->num_connector));
+               state->num_connector = alloc;
        }
 
        if (state->connector_states[index])
index 57cccd6..4f2d3e1 100644 (file)
@@ -946,9 +946,23 @@ static void wait_for_fences(struct drm_device *dev,
        }
 }
 
-static bool framebuffer_changed(struct drm_device *dev,
-                               struct drm_atomic_state *old_state,
-                               struct drm_crtc *crtc)
+/**
+ * drm_atomic_helper_framebuffer_changed - check if framebuffer has changed
+ * @dev: DRM device
+ * @old_state: atomic state object with old state structures
+ * @crtc: DRM crtc
+ *
+ * Checks whether the framebuffer used for this CRTC changes as a result of
+ * the atomic update.  This is useful for drivers which cannot use
+ * drm_atomic_helper_wait_for_vblanks() and need to reimplement its
+ * functionality.
+ *
+ * Returns:
+ * true if the framebuffer changed.
+ */
+bool drm_atomic_helper_framebuffer_changed(struct drm_device *dev,
+                                          struct drm_atomic_state *old_state,
+                                          struct drm_crtc *crtc)
 {
        struct drm_plane *plane;
        struct drm_plane_state *old_plane_state;
@@ -965,6 +979,7 @@ static bool framebuffer_changed(struct drm_device *dev,
 
        return false;
 }
+EXPORT_SYMBOL(drm_atomic_helper_framebuffer_changed);
 
 /**
  * drm_atomic_helper_wait_for_vblanks - wait for vblank on crtcs
@@ -999,7 +1014,8 @@ drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
                if (old_state->legacy_cursor_update)
                        continue;
 
-               if (!framebuffer_changed(dev, old_state, crtc))
+               if (!drm_atomic_helper_framebuffer_changed(dev,
+                               old_state, crtc))
                        continue;
 
                ret = drm_crtc_vblank_get(crtc);
@@ -1477,7 +1493,7 @@ void drm_atomic_helper_swap_state(struct drm_device *dev,
 {
        int i;
 
-       for (i = 0; i < dev->mode_config.num_connector; i++) {
+       for (i = 0; i < state->num_connector; i++) {
                struct drm_connector *connector = state->connectors[i];
 
                if (!connector)
index d40bab2..f619121 100644 (file)
@@ -918,12 +918,19 @@ int drm_connector_init(struct drm_device *dev,
        connector->base.properties = &connector->properties;
        connector->dev = dev;
        connector->funcs = funcs;
+
+       connector->connector_id = ida_simple_get(&config->connector_ida, 0, 0, GFP_KERNEL);
+       if (connector->connector_id < 0) {
+               ret = connector->connector_id;
+               goto out_put;
+       }
+
        connector->connector_type = connector_type;
        connector->connector_type_id =
                ida_simple_get(connector_ida, 1, 0, GFP_KERNEL);
        if (connector->connector_type_id < 0) {
                ret = connector->connector_type_id;
-               goto out_put;
+               goto out_put_id;
        }
        connector->name =
                kasprintf(GFP_KERNEL, "%s-%d",
@@ -931,7 +938,7 @@ int drm_connector_init(struct drm_device *dev,
                          connector->connector_type_id);
        if (!connector->name) {
                ret = -ENOMEM;
-               goto out_put;
+               goto out_put_type_id;
        }
 
        INIT_LIST_HEAD(&connector->probed_modes);
@@ -959,7 +966,12 @@ int drm_connector_init(struct drm_device *dev,
        }
 
        connector->debugfs_entry = NULL;
-
+out_put_type_id:
+       if (ret)
+               ida_remove(connector_ida, connector->connector_type_id);
+out_put_id:
+       if (ret)
+               ida_remove(&config->connector_ida, connector->connector_id);
 out_put:
        if (ret)
                drm_mode_object_put(dev, &connector->base);
@@ -996,6 +1008,9 @@ void drm_connector_cleanup(struct drm_connector *connector)
        ida_remove(&drm_connector_enum_list[connector->connector_type].ida,
                   connector->connector_type_id);
 
+       ida_remove(&dev->mode_config.connector_ida,
+                  connector->connector_id);
+
        kfree(connector->display_info.bus_formats);
        drm_mode_object_put(dev, &connector->base);
        kfree(connector->name);
@@ -1012,32 +1027,6 @@ void drm_connector_cleanup(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_connector_cleanup);
 
-/**
- * drm_connector_index - find the index of a registered connector
- * @connector: connector to find index for
- *
- * Given a registered connector, return the index of that connector within a DRM
- * device's list of connectors.
- */
-unsigned int drm_connector_index(struct drm_connector *connector)
-{
-       unsigned int index = 0;
-       struct drm_connector *tmp;
-       struct drm_mode_config *config = &connector->dev->mode_config;
-
-       WARN_ON(!drm_modeset_is_locked(&config->connection_mutex));
-
-       drm_for_each_connector(tmp, connector->dev) {
-               if (tmp == connector)
-                       return index;
-
-               index++;
-       }
-
-       BUG();
-}
-EXPORT_SYMBOL(drm_connector_index);
-
 /**
  * drm_connector_register - register a connector
  * @connector: the connector to register
@@ -5789,6 +5778,7 @@ void drm_mode_config_init(struct drm_device *dev)
        INIT_LIST_HEAD(&dev->mode_config.plane_list);
        idr_init(&dev->mode_config.crtc_idr);
        idr_init(&dev->mode_config.tile_idr);
+       ida_init(&dev->mode_config.connector_ida);
 
        drm_modeset_lock_all(dev);
        drm_mode_create_standard_properties(dev);
@@ -5869,6 +5859,7 @@ void drm_mode_config_cleanup(struct drm_device *dev)
                crtc->funcs->destroy(crtc);
        }
 
+       ida_destroy(&dev->mode_config.connector_ida);
        idr_destroy(&dev->mode_config.tile_idr);
        idr_destroy(&dev->mode_config.crtc_idr);
        drm_modeset_lock_fini(&dev->mode_config.connection_mutex);
index 6ed90a2..27fbd79 100644 (file)
@@ -803,12 +803,33 @@ static struct drm_dp_mst_branch *drm_dp_add_mst_branch_device(u8 lct, u8 *rad)
        return mstb;
 }
 
+static void drm_dp_free_mst_port(struct kref *kref);
+
+static void drm_dp_free_mst_branch_device(struct kref *kref)
+{
+       struct drm_dp_mst_branch *mstb = container_of(kref, struct drm_dp_mst_branch, kref);
+       if (mstb->port_parent) {
+               if (list_empty(&mstb->port_parent->next))
+                       kref_put(&mstb->port_parent->kref, drm_dp_free_mst_port);
+       }
+       kfree(mstb);
+}
+
 static void drm_dp_destroy_mst_branch_device(struct kref *kref)
 {
        struct drm_dp_mst_branch *mstb = container_of(kref, struct drm_dp_mst_branch, kref);
        struct drm_dp_mst_port *port, *tmp;
        bool wake_tx = false;
 
+       /*
+        * init kref again to be used by ports to remove mst branch when it is
+        * not needed anymore
+        */
+       kref_init(kref);
+
+       if (mstb->port_parent && list_empty(&mstb->port_parent->next))
+               kref_get(&mstb->port_parent->kref);
+
        /*
         * destroy all ports - don't need lock
         * as there are no more references to the mst branch
@@ -835,7 +856,8 @@ static void drm_dp_destroy_mst_branch_device(struct kref *kref)
 
        if (wake_tx)
                wake_up(&mstb->mgr->tx_waitq);
-       kfree(mstb);
+
+       kref_put(kref, drm_dp_free_mst_branch_device);
 }
 
 static void drm_dp_put_mst_branch_device(struct drm_dp_mst_branch *mstb)
@@ -883,6 +905,7 @@ static void drm_dp_destroy_port(struct kref *kref)
                         * from an EDID retrieval */
 
                        mutex_lock(&mgr->destroy_connector_lock);
+                       kref_get(&port->parent->kref);
                        list_add(&port->next, &mgr->destroy_connector_list);
                        mutex_unlock(&mgr->destroy_connector_lock);
                        schedule_work(&mgr->destroy_connector_work);
@@ -1018,18 +1041,27 @@ static bool drm_dp_port_setup_pdt(struct drm_dp_mst_port *port)
        return send_link;
 }
 
-static void drm_dp_check_port_guid(struct drm_dp_mst_branch *mstb,
-                                  struct drm_dp_mst_port *port)
+static void drm_dp_check_mstb_guid(struct drm_dp_mst_branch *mstb, u8 *guid)
 {
        int ret;
-       if (port->dpcd_rev >= 0x12) {
-               port->guid_valid = drm_dp_validate_guid(mstb->mgr, port->guid);
-               if (!port->guid_valid) {
-                       ret = drm_dp_send_dpcd_write(mstb->mgr,
-                                                    port,
-                                                    DP_GUID,
-                                                    16, port->guid);
-                       port->guid_valid = true;
+
+       memcpy(mstb->guid, guid, 16);
+
+       if (!drm_dp_validate_guid(mstb->mgr, mstb->guid)) {
+               if (mstb->port_parent) {
+                       ret = drm_dp_send_dpcd_write(
+                                       mstb->mgr,
+                                       mstb->port_parent,
+                                       DP_GUID,
+                                       16,
+                                       mstb->guid);
+               } else {
+
+                       ret = drm_dp_dpcd_write(
+                                       mstb->mgr->aux,
+                                       DP_GUID,
+                                       mstb->guid,
+                                       16);
                }
        }
 }
@@ -1086,7 +1118,6 @@ static void drm_dp_add_port(struct drm_dp_mst_branch *mstb,
        port->dpcd_rev = port_msg->dpcd_revision;
        port->num_sdp_streams = port_msg->num_sdp_streams;
        port->num_sdp_stream_sinks = port_msg->num_sdp_stream_sinks;
-       memcpy(port->guid, port_msg->peer_guid, 16);
 
        /* manage mstb port lists with mgr lock - take a reference
           for this list */
@@ -1099,11 +1130,9 @@ static void drm_dp_add_port(struct drm_dp_mst_branch *mstb,
 
        if (old_ddps != port->ddps) {
                if (port->ddps) {
-                       drm_dp_check_port_guid(mstb, port);
                        if (!port->input)
                                drm_dp_send_enum_path_resources(mstb->mgr, mstb, port);
                } else {
-                       port->guid_valid = false;
                        port->available_pbn = 0;
                        }
        }
@@ -1162,10 +1191,8 @@ static void drm_dp_update_port(struct drm_dp_mst_branch *mstb,
 
        if (old_ddps != port->ddps) {
                if (port->ddps) {
-                       drm_dp_check_port_guid(mstb, port);
                        dowork = true;
                } else {
-                       port->guid_valid = false;
                        port->available_pbn = 0;
                }
        }
@@ -1222,13 +1249,14 @@ static struct drm_dp_mst_branch *get_mst_branch_device_by_guid_helper(
        struct drm_dp_mst_branch *found_mstb;
        struct drm_dp_mst_port *port;
 
+       if (memcmp(mstb->guid, guid, 16) == 0)
+               return mstb;
+
+
        list_for_each_entry(port, &mstb->ports, next) {
                if (!port->mstb)
                        continue;
 
-               if (port->guid_valid && memcmp(port->guid, guid, 16) == 0)
-                       return port->mstb;
-
                found_mstb = get_mst_branch_device_by_guid_helper(port->mstb, guid);
 
                if (found_mstb)
@@ -1247,10 +1275,7 @@ static struct drm_dp_mst_branch *drm_dp_get_mst_branch_device_by_guid(
        /* find the port by iterating down */
        mutex_lock(&mgr->lock);
 
-       if (mgr->guid_valid && memcmp(mgr->guid, guid, 16) == 0)
-               mstb = mgr->mst_primary;
-       else
-               mstb = get_mst_branch_device_by_guid_helper(mgr->mst_primary, guid);
+       mstb = get_mst_branch_device_by_guid_helper(mgr->mst_primary, guid);
 
        if (mstb)
                kref_get(&mstb->kref);
@@ -1555,6 +1580,9 @@ static void drm_dp_send_link_address(struct drm_dp_mst_topology_mgr *mgr,
                                       txmsg->reply.u.link_addr.ports[i].num_sdp_streams,
                                       txmsg->reply.u.link_addr.ports[i].num_sdp_stream_sinks);
                        }
+
+                       drm_dp_check_mstb_guid(mstb, txmsg->reply.u.link_addr.guid);
+
                        for (i = 0; i < txmsg->reply.u.link_addr.nports; i++) {
                                drm_dp_add_port(mstb, mgr->dev, &txmsg->reply.u.link_addr.ports[i]);
                        }
@@ -1602,6 +1630,37 @@ static int drm_dp_send_enum_path_resources(struct drm_dp_mst_topology_mgr *mgr,
        return 0;
 }
 
+static struct drm_dp_mst_port *drm_dp_get_last_connected_port_to_mstb(struct drm_dp_mst_branch *mstb)
+{
+       if (!mstb->port_parent)
+               return NULL;
+
+       if (mstb->port_parent->mstb != mstb)
+               return mstb->port_parent;
+
+       return drm_dp_get_last_connected_port_to_mstb(mstb->port_parent->parent);
+}
+
+static struct drm_dp_mst_branch *drm_dp_get_last_connected_port_and_mstb(struct drm_dp_mst_topology_mgr *mgr,
+                                                                        struct drm_dp_mst_branch *mstb,
+                                                                        int *port_num)
+{
+       struct drm_dp_mst_branch *rmstb = NULL;
+       struct drm_dp_mst_port *found_port;
+       mutex_lock(&mgr->lock);
+       if (mgr->mst_primary) {
+               found_port = drm_dp_get_last_connected_port_to_mstb(mstb);
+
+               if (found_port) {
+                       rmstb = found_port->parent;
+                       kref_get(&rmstb->kref);
+                       *port_num = found_port->port_num;
+               }
+       }
+       mutex_unlock(&mgr->lock);
+       return rmstb;
+}
+
 static int drm_dp_payload_send_msg(struct drm_dp_mst_topology_mgr *mgr,
                                   struct drm_dp_mst_port *port,
                                   int id,
@@ -1609,13 +1668,18 @@ static int drm_dp_payload_send_msg(struct drm_dp_mst_topology_mgr *mgr,
 {
        struct drm_dp_sideband_msg_tx *txmsg;
        struct drm_dp_mst_branch *mstb;
-       int len, ret;
+       int len, ret, port_num;
        u8 sinks[DRM_DP_MAX_SDP_STREAMS];
        int i;
 
+       port_num = port->port_num;
        mstb = drm_dp_get_validated_mstb_ref(mgr, port->parent);
-       if (!mstb)
-               return -EINVAL;
+       if (!mstb) {
+               mstb = drm_dp_get_last_connected_port_and_mstb(mgr, port->parent, &port_num);
+
+               if (!mstb)
+                       return -EINVAL;
+       }
 
        txmsg = kzalloc(sizeof(*txmsg), GFP_KERNEL);
        if (!txmsg) {
@@ -1627,7 +1691,7 @@ static int drm_dp_payload_send_msg(struct drm_dp_mst_topology_mgr *mgr,
                sinks[i] = i;
 
        txmsg->dst = mstb;
-       len = build_allocate_payload(txmsg, port->port_num,
+       len = build_allocate_payload(txmsg, port_num,
                                     id,
                                     pbn, port->num_sdp_streams, sinks);
 
@@ -1983,31 +2047,17 @@ int drm_dp_mst_topology_mgr_set_mst(struct drm_dp_mst_topology_mgr *mgr, bool ms
                mgr->mst_primary = mstb;
                kref_get(&mgr->mst_primary->kref);
 
-               {
-                       struct drm_dp_payload reset_pay;
-                       reset_pay.start_slot = 0;
-                       reset_pay.num_slots = 0x3f;
-                       drm_dp_dpcd_write_payload(mgr, 0, &reset_pay);
-               }
-
                ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
-                                        DP_MST_EN | DP_UP_REQ_EN | DP_UPSTREAM_IS_SRC);
+                                                        DP_MST_EN | DP_UP_REQ_EN | DP_UPSTREAM_IS_SRC);
                if (ret < 0) {
                        goto out_unlock;
                }
 
-
-               /* sort out guid */
-               ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, mgr->guid, 16);
-               if (ret != 16) {
-                       DRM_DEBUG_KMS("failed to read DP GUID %d\n", ret);
-                       goto out_unlock;
-               }
-
-               mgr->guid_valid = drm_dp_validate_guid(mgr, mgr->guid);
-               if (!mgr->guid_valid) {
-                       ret = drm_dp_dpcd_write(mgr->aux, DP_GUID, mgr->guid, 16);
-                       mgr->guid_valid = true;
+               {
+                       struct drm_dp_payload reset_pay;
+                       reset_pay.start_slot = 0;
+                       reset_pay.num_slots = 0x3f;
+                       drm_dp_dpcd_write_payload(mgr, 0, &reset_pay);
                }
 
                queue_work(system_long_wq, &mgr->work);
@@ -2231,6 +2281,7 @@ static int drm_dp_mst_handle_up_req(struct drm_dp_mst_topology_mgr *mgr)
                        }
 
                        drm_dp_update_port(mstb, &msg.u.conn_stat);
+
                        DRM_DEBUG_KMS("Got CSN: pn: %d ldps:%d ddps: %d mcs: %d ip: %d pdt: %d\n", msg.u.conn_stat.port_number, msg.u.conn_stat.legacy_device_plug_status, msg.u.conn_stat.displayport_device_plug_status, msg.u.conn_stat.message_capability_status, msg.u.conn_stat.input_port, msg.u.conn_stat.peer_device_type);
                        (*mgr->cbs->hotplug)(mgr);
 
@@ -2446,6 +2497,7 @@ bool drm_dp_mst_allocate_vcpi(struct drm_dp_mst_topology_mgr *mgr, struct drm_dp
                DRM_DEBUG_KMS("payload: vcpi %d already allocated for pbn %d - requested pbn %d\n", port->vcpi.vcpi, port->vcpi.pbn, pbn);
                if (pbn == port->vcpi.pbn) {
                        *slots = port->vcpi.num_slots;
+                       drm_dp_put_port(port);
                        return true;
                }
        }
@@ -2605,32 +2657,31 @@ EXPORT_SYMBOL(drm_dp_check_act_status);
  */
 int drm_dp_calc_pbn_mode(int clock, int bpp)
 {
-       fixed20_12 pix_bw;
-       fixed20_12 fbpp;
-       fixed20_12 result;
-       fixed20_12 margin, tmp;
-       u32 res;
-
-       pix_bw.full = dfixed_const(clock);
-       fbpp.full = dfixed_const(bpp);
-       tmp.full = dfixed_const(8);
-       fbpp.full = dfixed_div(fbpp, tmp);
-
-       result.full = dfixed_mul(pix_bw, fbpp);
-       margin.full = dfixed_const(54);
-       tmp.full = dfixed_const(64);
-       margin.full = dfixed_div(margin, tmp);
-       result.full = dfixed_div(result, margin);
-
-       margin.full = dfixed_const(1006);
-       tmp.full = dfixed_const(1000);
-       margin.full = dfixed_div(margin, tmp);
-       result.full = dfixed_mul(result, margin);
-
-       result.full = dfixed_div(result, tmp);
-       result.full = dfixed_ceil(result);
-       res = dfixed_trunc(result);
-       return res;
+       u64 kbps;
+       s64 peak_kbps;
+       u32 numerator;
+       u32 denominator;
+
+       kbps = clock * bpp;
+
+       /*
+        * margin 5300ppm + 300ppm ~ 0.6% as per spec, factor is 1.006
+        * The unit of 54/64Mbytes/sec is an arbitrary unit chosen based on
+        * common multiplier to render an integer PBN for all link rate/lane
+        * counts combinations
+        * calculate
+        * peak_kbps *= (1006/1000)
+        * peak_kbps *= (64/54)
+        * peak_kbps *= 8    convert to bytes
+        */
+
+       numerator = 64 * 1006;
+       denominator = 54 * 8 * 1000 * 1000;
+
+       kbps *= numerator;
+       peak_kbps = drm_fixp_from_fraction(kbps, denominator);
+
+       return drm_fixp2int_ceil(peak_kbps);
 }
 EXPORT_SYMBOL(drm_dp_calc_pbn_mode);
 
@@ -2638,11 +2689,23 @@ static int test_calc_pbn_mode(void)
 {
        int ret;
        ret = drm_dp_calc_pbn_mode(154000, 30);
-       if (ret != 689)
+       if (ret != 689) {
+               DRM_ERROR("PBN calculation test failed - clock %d, bpp %d, expected PBN %d, actual PBN %d.\n",
+                               154000, 30, 689, ret);
                return -EINVAL;
+       }
        ret = drm_dp_calc_pbn_mode(234000, 30);
-       if (ret != 1047)
+       if (ret != 1047) {
+               DRM_ERROR("PBN calculation test failed - clock %d, bpp %d, expected PBN %d, actual PBN %d.\n",
+                               234000, 30, 1047, ret);
+               return -EINVAL;
+       }
+       ret = drm_dp_calc_pbn_mode(297000, 24);
+       if (ret != 1063) {
+               DRM_ERROR("PBN calculation test failed - clock %d, bpp %d, expected PBN %d, actual PBN %d.\n",
+                               297000, 24, 1063, ret);
                return -EINVAL;
+       }
        return 0;
 }
 
@@ -2783,6 +2846,13 @@ static void drm_dp_tx_work(struct work_struct *work)
        mutex_unlock(&mgr->qlock);
 }
 
+static void drm_dp_free_mst_port(struct kref *kref)
+{
+       struct drm_dp_mst_port *port = container_of(kref, struct drm_dp_mst_port, kref);
+       kref_put(&port->parent->kref, drm_dp_free_mst_branch_device);
+       kfree(port);
+}
+
 static void drm_dp_destroy_connector_work(struct work_struct *work)
 {
        struct drm_dp_mst_topology_mgr *mgr = container_of(work, struct drm_dp_mst_topology_mgr, destroy_connector_work);
@@ -2803,13 +2873,22 @@ static void drm_dp_destroy_connector_work(struct work_struct *work)
                list_del(&port->next);
                mutex_unlock(&mgr->destroy_connector_lock);
 
+               kref_init(&port->kref);
+               INIT_LIST_HEAD(&port->next);
+
                mgr->cbs->destroy_connector(mgr, port->connector);
 
                drm_dp_port_teardown_pdt(port, port->pdt);
 
-               if (!port->input && port->vcpi.vcpi > 0)
-                       drm_dp_mst_put_payload_id(mgr, port->vcpi.vcpi);
-               kfree(port);
+               if (!port->input && port->vcpi.vcpi > 0) {
+                       if (mgr->mst_state) {
+                               drm_dp_mst_reset_vcpi_slots(mgr, port);
+                               drm_dp_update_payload_part1(mgr);
+                               drm_dp_mst_put_payload_id(mgr, port->vcpi.vcpi);
+                       }
+               }
+
+               kref_put(&port->kref, drm_dp_free_mst_port);
                send_hotplug = true;
        }
        if (send_hotplug)
@@ -2847,6 +2926,9 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
        mgr->max_dpcd_transaction_bytes = max_dpcd_transaction_bytes;
        mgr->max_payloads = max_payloads;
        mgr->conn_base_id = conn_base_id;
+       if (max_payloads + 1 > sizeof(mgr->payload_mask) * 8 ||
+           max_payloads + 1 > sizeof(mgr->vcpi_mask) * 8)
+               return -EINVAL;
        mgr->payloads = kcalloc(max_payloads, sizeof(struct drm_dp_payload), GFP_KERNEL);
        if (!mgr->payloads)
                return -ENOMEM;
@@ -2854,7 +2936,9 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
        if (!mgr->proposed_vcpis)
                return -ENOMEM;
        set_bit(0, &mgr->payload_mask);
-       test_calc_pbn_mode();
+       if (test_calc_pbn_mode() < 0)
+               DRM_ERROR("MST PBN self-test failed\n");
+
        return 0;
 }
 EXPORT_SYMBOL(drm_dp_mst_topology_mgr_init);
index e5df53b..1f500a1 100644 (file)
@@ -109,8 +109,8 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm,
        if (IS_ERR(cma_obj))
                return cma_obj;
 
-       cma_obj->vaddr = dma_alloc_writecombine(drm->dev, size,
-                       &cma_obj->paddr, GFP_KERNEL | __GFP_NOWARN);
+       cma_obj->vaddr = dma_alloc_wc(drm->dev, size, &cma_obj->paddr,
+                                     GFP_KERNEL | __GFP_NOWARN);
        if (!cma_obj->vaddr) {
                dev_err(drm->dev, "failed to allocate buffer with size %zu\n",
                        size);
@@ -192,8 +192,8 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj)
        cma_obj = to_drm_gem_cma_obj(gem_obj);
 
        if (cma_obj->vaddr) {
-               dma_free_writecombine(gem_obj->dev->dev, cma_obj->base.size,
-                                     cma_obj->vaddr, cma_obj->paddr);
+               dma_free_wc(gem_obj->dev->dev, cma_obj->base.size,
+                           cma_obj->vaddr, cma_obj->paddr);
        } else if (gem_obj->import_attach) {
                drm_prime_gem_destroy(gem_obj, cma_obj->sgt);
        }
@@ -324,9 +324,8 @@ static int drm_gem_cma_mmap_obj(struct drm_gem_cma_object *cma_obj,
        vma->vm_flags &= ~VM_PFNMAP;
        vma->vm_pgoff = 0;
 
-       ret = dma_mmap_writecombine(cma_obj->base.dev->dev, vma,
-                                   cma_obj->vaddr, cma_obj->paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(cma_obj->base.dev->dev, vma, cma_obj->vaddr,
+                         cma_obj->paddr, vma->vm_end - vma->vm_start);
        if (ret)
                drm_gem_vm_close(vma);
 
index d12a4ef..1fe1457 100644 (file)
@@ -224,6 +224,64 @@ static void drm_update_vblank_count(struct drm_device *dev, unsigned int pipe,
                diff = (flags & DRM_CALLED_FROM_VBLIRQ) != 0;
        }
 
+       /*
+        * Within a drm_vblank_pre_modeset - drm_vblank_post_modeset
+        * interval? If so then vblank irqs keep running and it will likely
+        * happen that the hardware vblank counter is not trustworthy as it
+        * might reset at some point in that interval and vblank timestamps
+        * are not trustworthy either in that interval. Iow. this can result
+        * in a bogus diff >> 1 which must be avoided as it would cause
+        * random large forward jumps of the software vblank counter.
+        */
+       if (diff > 1 && (vblank->inmodeset & 0x2)) {
+               DRM_DEBUG_VBL("clamping vblank bump to 1 on crtc %u: diffr=%u"
+                             " due to pre-modeset.\n", pipe, diff);
+               diff = 1;
+       }
+
+       /*
+        * FIMXE: Need to replace this hack with proper seqlocks.
+        *
+        * Restrict the bump of the software vblank counter to a safe maximum
+        * value of +1 whenever there is the possibility that concurrent readers
+        * of vblank timestamps could be active at the moment, as the current
+        * implementation of the timestamp caching and updating is not safe
+        * against concurrent readers for calls to store_vblank() with a bump
+        * of anything but +1. A bump != 1 would very likely return corrupted
+        * timestamps to userspace, because the same slot in the cache could
+        * be concurrently written by store_vblank() and read by one of those
+        * readers without the read-retry logic detecting the collision.
+        *
+        * Concurrent readers can exist when we are called from the
+        * drm_vblank_off() or drm_vblank_on() functions and other non-vblank-
+        * irq callers. However, all those calls to us are happening with the
+        * vbl_lock locked to prevent drm_vblank_get(), so the vblank refcount
+        * can't increase while we are executing. Therefore a zero refcount at
+        * this point is safe for arbitrary counter bumps if we are called
+        * outside vblank irq, a non-zero count is not 100% safe. Unfortunately
+        * we must also accept a refcount of 1, as whenever we are called from
+        * drm_vblank_get() -> drm_vblank_enable() the refcount will be 1 and
+        * we must let that one pass through in order to not lose vblank counts
+        * during vblank irq off - which would completely defeat the whole
+        * point of this routine.
+        *
+        * Whenever we are called from vblank irq, we have to assume concurrent
+        * readers exist or can show up any time during our execution, even if
+        * the refcount is currently zero, as vblank irqs are usually only
+        * enabled due to the presence of readers, and because when we are called
+        * from vblank irq we can't hold the vbl_lock to protect us from sudden
+        * bumps in vblank refcount. Therefore also restrict bumps to +1 when
+        * called from vblank irq.
+        */
+       if ((diff > 1) && (atomic_read(&vblank->refcount) > 1 ||
+           (flags & DRM_CALLED_FROM_VBLIRQ))) {
+               DRM_DEBUG_VBL("clamping vblank bump to 1 on crtc %u: diffr=%u "
+                             "refcount %u, vblirq %u\n", pipe, diff,
+                             atomic_read(&vblank->refcount),
+                             (flags & DRM_CALLED_FROM_VBLIRQ) != 0);
+               diff = 1;
+       }
+
        DRM_DEBUG_VBL("updating vblank count on crtc %u:"
                      " current=%u, diff=%u, hw=%u hw_last=%u\n",
                      pipe, vblank->count, diff, cur_vblank, vblank->last);
@@ -1316,7 +1374,13 @@ void drm_vblank_off(struct drm_device *dev, unsigned int pipe)
        spin_lock_irqsave(&dev->event_lock, irqflags);
 
        spin_lock(&dev->vbl_lock);
-       vblank_disable_and_save(dev, pipe);
+       DRM_DEBUG_VBL("crtc %d, vblank enabled %d, inmodeset %d\n",
+                     pipe, vblank->enabled, vblank->inmodeset);
+
+       /* Avoid redundant vblank disables without previous drm_vblank_on(). */
+       if (drm_core_check_feature(dev, DRIVER_ATOMIC) || !vblank->inmodeset)
+               vblank_disable_and_save(dev, pipe);
+
        wake_up(&vblank->queue);
 
        /*
@@ -1418,6 +1482,9 @@ void drm_vblank_on(struct drm_device *dev, unsigned int pipe)
                return;
 
        spin_lock_irqsave(&dev->vbl_lock, irqflags);
+       DRM_DEBUG_VBL("crtc %d, vblank enabled %d, inmodeset %d\n",
+                     pipe, vblank->enabled, vblank->inmodeset);
+
        /* Drop our private "prevent drm_vblank_get" refcount */
        if (vblank->inmodeset) {
                atomic_dec(&vblank->refcount);
@@ -1430,8 +1497,7 @@ void drm_vblank_on(struct drm_device *dev, unsigned int pipe)
         * re-enable interrupts if there are users left, or the
         * user wishes vblank interrupts to be enabled all the time.
         */
-       if (atomic_read(&vblank->refcount) != 0 ||
-           (!dev->vblank_disable_immediate && drm_vblank_offdelay == 0))
+       if (atomic_read(&vblank->refcount) != 0 || drm_vblank_offdelay == 0)
                WARN_ON(drm_vblank_enable(dev, pipe));
        spin_unlock_irqrestore(&dev->vbl_lock, irqflags);
 }
@@ -1526,6 +1592,7 @@ void drm_vblank_post_modeset(struct drm_device *dev, unsigned int pipe)
        if (vblank->inmodeset) {
                spin_lock_irqsave(&dev->vbl_lock, irqflags);
                dev->vblank_disable_allowed = true;
+               drm_reset_vblank_timestamp(dev, pipe);
                spin_unlock_irqrestore(&dev->vbl_lock, irqflags);
 
                if (vblank->inmodeset & 0x2)
index 9e585d5..e881482 100644 (file)
@@ -8,8 +8,8 @@ http://0x04.net/cgit/index.cgi/rules-ng-ng
 git clone git://0x04.net/rules-ng-ng
 
 The rules-ng-ng source files this header was generated from are:
-- state_vg.xml (   5973 bytes, from 2015-03-25 11:26:01)
-- common.xml   (  18437 bytes, from 2015-03-25 11:27:41)
+- state_hi.xml (  24309 bytes, from 2015-12-12 09:02:53)
+- common.xml   (  18379 bytes, from 2015-12-12 09:02:53)
 
 Copyright (C) 2015
 */
@@ -30,15 +30,19 @@ Copyright (C) 2015
 #define ENDIAN_MODE_NO_SWAP                                    0x00000000
 #define ENDIAN_MODE_SWAP_16                                    0x00000001
 #define ENDIAN_MODE_SWAP_32                                    0x00000002
+#define chipModel_GC200                                                0x00000200
 #define chipModel_GC300                                                0x00000300
 #define chipModel_GC320                                                0x00000320
+#define chipModel_GC328                                                0x00000328
 #define chipModel_GC350                                                0x00000350
 #define chipModel_GC355                                                0x00000355
 #define chipModel_GC400                                                0x00000400
 #define chipModel_GC410                                                0x00000410
 #define chipModel_GC420                                                0x00000420
+#define chipModel_GC428                                                0x00000428
 #define chipModel_GC450                                                0x00000450
 #define chipModel_GC500                                                0x00000500
+#define chipModel_GC520                                                0x00000520
 #define chipModel_GC530                                                0x00000530
 #define chipModel_GC600                                                0x00000600
 #define chipModel_GC700                                                0x00000700
@@ -46,9 +50,16 @@ Copyright (C) 2015
 #define chipModel_GC860                                                0x00000860
 #define chipModel_GC880                                                0x00000880
 #define chipModel_GC1000                                       0x00001000
+#define chipModel_GC1500                                       0x00001500
 #define chipModel_GC2000                                       0x00002000
 #define chipModel_GC2100                                       0x00002100
+#define chipModel_GC2200                                       0x00002200
+#define chipModel_GC2500                                       0x00002500
+#define chipModel_GC3000                                       0x00003000
 #define chipModel_GC4000                                       0x00004000
+#define chipModel_GC5000                                       0x00005000
+#define chipModel_GC5200                                       0x00005200
+#define chipModel_GC6400                                       0x00006400
 #define RGBA_BITS_R                                            0x00000001
 #define RGBA_BITS_G                                            0x00000002
 #define RGBA_BITS_B                                            0x00000004
@@ -160,7 +171,7 @@ Copyright (C) 2015
 #define chipMinorFeatures2_UNK8                                        0x00000100
 #define chipMinorFeatures2_UNK9                                        0x00000200
 #define chipMinorFeatures2_UNK10                               0x00000400
-#define chipMinorFeatures2_SAMPLERBASE_16                      0x00000800
+#define chipMinorFeatures2_HALTI1                              0x00000800
 #define chipMinorFeatures2_UNK12                               0x00001000
 #define chipMinorFeatures2_UNK13                               0x00002000
 #define chipMinorFeatures2_UNK14                               0x00004000
@@ -189,7 +200,7 @@ Copyright (C) 2015
 #define chipMinorFeatures3_UNK5                                        0x00000020
 #define chipMinorFeatures3_UNK6                                        0x00000040
 #define chipMinorFeatures3_UNK7                                        0x00000080
-#define chipMinorFeatures3_UNK8                                        0x00000100
+#define chipMinorFeatures3_FAST_MSAA                           0x00000100
 #define chipMinorFeatures3_UNK9                                        0x00000200
 #define chipMinorFeatures3_BUG_FIXES10                         0x00000400
 #define chipMinorFeatures3_UNK11                               0x00000800
@@ -199,7 +210,7 @@ Copyright (C) 2015
 #define chipMinorFeatures3_UNK15                               0x00008000
 #define chipMinorFeatures3_UNK16                               0x00010000
 #define chipMinorFeatures3_UNK17                               0x00020000
-#define chipMinorFeatures3_UNK18                               0x00040000
+#define chipMinorFeatures3_ACE                                 0x00040000
 #define chipMinorFeatures3_UNK19                               0x00080000
 #define chipMinorFeatures3_UNK20                               0x00100000
 #define chipMinorFeatures3_UNK21                               0x00200000
@@ -207,7 +218,7 @@ Copyright (C) 2015
 #define chipMinorFeatures3_UNK23                               0x00800000
 #define chipMinorFeatures3_UNK24                               0x01000000
 #define chipMinorFeatures3_UNK25                               0x02000000
-#define chipMinorFeatures3_UNK26                               0x04000000
+#define chipMinorFeatures3_NEW_HZ                              0x04000000
 #define chipMinorFeatures3_UNK27                               0x08000000
 #define chipMinorFeatures3_UNK28                               0x10000000
 #define chipMinorFeatures3_UNK29                               0x20000000
@@ -229,9 +240,9 @@ Copyright (C) 2015
 #define chipMinorFeatures4_UNK13                               0x00002000
 #define chipMinorFeatures4_UNK14                               0x00004000
 #define chipMinorFeatures4_UNK15                               0x00008000
-#define chipMinorFeatures4_UNK16                               0x00010000
+#define chipMinorFeatures4_HALTI2                              0x00010000
 #define chipMinorFeatures4_UNK17                               0x00020000
-#define chipMinorFeatures4_UNK18                               0x00040000
+#define chipMinorFeatures4_SMALL_MSAA                          0x00040000
 #define chipMinorFeatures4_UNK19                               0x00080000
 #define chipMinorFeatures4_UNK20                               0x00100000
 #define chipMinorFeatures4_UNK21                               0x00200000
@@ -245,5 +256,37 @@ Copyright (C) 2015
 #define chipMinorFeatures4_UNK29                               0x20000000
 #define chipMinorFeatures4_UNK30                               0x40000000
 #define chipMinorFeatures4_UNK31                               0x80000000
+#define chipMinorFeatures5_UNK0                                        0x00000001
+#define chipMinorFeatures5_UNK1                                        0x00000002
+#define chipMinorFeatures5_UNK2                                        0x00000004
+#define chipMinorFeatures5_UNK3                                        0x00000008
+#define chipMinorFeatures5_UNK4                                        0x00000010
+#define chipMinorFeatures5_UNK5                                        0x00000020
+#define chipMinorFeatures5_UNK6                                        0x00000040
+#define chipMinorFeatures5_UNK7                                        0x00000080
+#define chipMinorFeatures5_UNK8                                        0x00000100
+#define chipMinorFeatures5_HALTI3                              0x00000200
+#define chipMinorFeatures5_UNK10                               0x00000400
+#define chipMinorFeatures5_UNK11                               0x00000800
+#define chipMinorFeatures5_UNK12                               0x00001000
+#define chipMinorFeatures5_UNK13                               0x00002000
+#define chipMinorFeatures5_UNK14                               0x00004000
+#define chipMinorFeatures5_UNK15                               0x00008000
+#define chipMinorFeatures5_UNK16                               0x00010000
+#define chipMinorFeatures5_UNK17                               0x00020000
+#define chipMinorFeatures5_UNK18                               0x00040000
+#define chipMinorFeatures5_UNK19                               0x00080000
+#define chipMinorFeatures5_UNK20                               0x00100000
+#define chipMinorFeatures5_UNK21                               0x00200000
+#define chipMinorFeatures5_UNK22                               0x00400000
+#define chipMinorFeatures5_UNK23                               0x00800000
+#define chipMinorFeatures5_UNK24                               0x01000000
+#define chipMinorFeatures5_UNK25                               0x02000000
+#define chipMinorFeatures5_UNK26                               0x04000000
+#define chipMinorFeatures5_UNK27                               0x08000000
+#define chipMinorFeatures5_UNK28                               0x10000000
+#define chipMinorFeatures5_UNK29                               0x20000000
+#define chipMinorFeatures5_UNK30                               0x40000000
+#define chipMinorFeatures5_UNK31                               0x80000000
 
 #endif /* COMMON_XML */
index 5c89ebb..e885898 100644 (file)
@@ -668,7 +668,6 @@ static struct platform_driver etnaviv_platform_driver = {
        .probe      = etnaviv_pdev_probe,
        .remove     = etnaviv_pdev_remove,
        .driver     = {
-               .owner  = THIS_MODULE,
                .name   = "etnaviv",
                .of_match_table = dt_match,
        },
index d6bd438..1cd6046 100644 (file)
@@ -85,7 +85,7 @@ struct drm_gem_object *etnaviv_gem_prime_import_sg_table(struct drm_device *dev,
        struct dma_buf_attachment *attach, struct sg_table *sg);
 int etnaviv_gem_prime_pin(struct drm_gem_object *obj);
 void etnaviv_gem_prime_unpin(struct drm_gem_object *obj);
-void *etnaviv_gem_vaddr(struct drm_gem_object *obj);
+void *etnaviv_gem_vmap(struct drm_gem_object *obj);
 int etnaviv_gem_cpu_prep(struct drm_gem_object *obj, u32 op,
                struct timespec *timeout);
 int etnaviv_gem_cpu_fini(struct drm_gem_object *obj);
index bf8fa85..4a29eea 100644 (file)
@@ -201,7 +201,9 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 
                obj = vram->object;
 
+               mutex_lock(&obj->lock);
                pages = etnaviv_gem_get_pages(obj);
+               mutex_unlock(&obj->lock);
                if (pages) {
                        int j;
 
@@ -213,8 +215,8 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
 
                iter.hdr->iova = cpu_to_le64(vram->iova);
 
-               vaddr = etnaviv_gem_vaddr(&obj->base);
-               if (vaddr && !IS_ERR(vaddr))
+               vaddr = etnaviv_gem_vmap(&obj->base);
+               if (vaddr)
                        memcpy(iter.data, vaddr, obj->base.size);
 
                etnaviv_core_dump_header(&iter, ETDUMP_BUF_BO, iter.data +
index 9f77c3b..4b519e4 100644 (file)
@@ -353,25 +353,39 @@ void etnaviv_gem_put_iova(struct etnaviv_gpu *gpu, struct drm_gem_object *obj)
        drm_gem_object_unreference_unlocked(obj);
 }
 
-void *etnaviv_gem_vaddr(struct drm_gem_object *obj)
+void *etnaviv_gem_vmap(struct drm_gem_object *obj)
 {
        struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 
-       mutex_lock(&etnaviv_obj->lock);
-       if (!etnaviv_obj->vaddr) {
-               struct page **pages = etnaviv_gem_get_pages(etnaviv_obj);
-
-               if (IS_ERR(pages))
-                       return ERR_CAST(pages);
+       if (etnaviv_obj->vaddr)
+               return etnaviv_obj->vaddr;
 
-               etnaviv_obj->vaddr = vmap(pages, obj->size >> PAGE_SHIFT,
-                               VM_MAP, pgprot_writecombine(PAGE_KERNEL));
-       }
+       mutex_lock(&etnaviv_obj->lock);
+       /*
+        * Need to check again, as we might have raced with another thread
+        * while waiting for the mutex.
+        */
+       if (!etnaviv_obj->vaddr)
+               etnaviv_obj->vaddr = etnaviv_obj->ops->vmap(etnaviv_obj);
        mutex_unlock(&etnaviv_obj->lock);
 
        return etnaviv_obj->vaddr;
 }
 
+static void *etnaviv_gem_vmap_impl(struct etnaviv_gem_object *obj)
+{
+       struct page **pages;
+
+       lockdep_assert_held(&obj->lock);
+
+       pages = etnaviv_gem_get_pages(obj);
+       if (IS_ERR(pages))
+               return NULL;
+
+       return vmap(pages, obj->base.size >> PAGE_SHIFT,
+                       VM_MAP, pgprot_writecombine(PAGE_KERNEL));
+}
+
 static inline enum dma_data_direction etnaviv_op_to_dma_dir(u32 op)
 {
        if (op & ETNA_PREP_READ)
@@ -522,6 +536,7 @@ static void etnaviv_gem_shmem_release(struct etnaviv_gem_object *etnaviv_obj)
 static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = {
        .get_pages = etnaviv_gem_shmem_get_pages,
        .release = etnaviv_gem_shmem_release,
+       .vmap = etnaviv_gem_vmap_impl,
 };
 
 void etnaviv_gem_free_object(struct drm_gem_object *obj)
@@ -866,6 +881,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj)
 static const struct etnaviv_gem_ops etnaviv_gem_userptr_ops = {
        .get_pages = etnaviv_gem_userptr_get_pages,
        .release = etnaviv_gem_userptr_release,
+       .vmap = etnaviv_gem_vmap_impl,
 };
 
 int etnaviv_gem_new_userptr(struct drm_device *dev, struct drm_file *file,
index a300b4b..ab5df81 100644 (file)
@@ -78,6 +78,7 @@ struct etnaviv_gem_object *to_etnaviv_bo(struct drm_gem_object *obj)
 struct etnaviv_gem_ops {
        int (*get_pages)(struct etnaviv_gem_object *);
        void (*release)(struct etnaviv_gem_object *);
+       void *(*vmap)(struct etnaviv_gem_object *);
 };
 
 static inline bool is_active(struct etnaviv_gem_object *etnaviv_obj)
index e94db4f..4e67395 100644 (file)
@@ -31,7 +31,7 @@ struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj)
 
 void *etnaviv_gem_prime_vmap(struct drm_gem_object *obj)
 {
-       return etnaviv_gem_vaddr(obj);
+       return etnaviv_gem_vmap(obj);
 }
 
 void etnaviv_gem_prime_vunmap(struct drm_gem_object *obj, void *vaddr)
@@ -77,9 +77,17 @@ static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj)
        drm_prime_gem_destroy(&etnaviv_obj->base, etnaviv_obj->sgt);
 }
 
+static void *etnaviv_gem_prime_vmap_impl(struct etnaviv_gem_object *etnaviv_obj)
+{
+       lockdep_assert_held(&etnaviv_obj->lock);
+
+       return dma_buf_vmap(etnaviv_obj->base.import_attach->dmabuf);
+}
+
 static const struct etnaviv_gem_ops etnaviv_gem_prime_ops = {
        /* .get_pages should never be called */
        .release = etnaviv_gem_prime_release,
+       .vmap = etnaviv_gem_prime_vmap_impl,
 };
 
 struct drm_gem_object *etnaviv_gem_prime_import_sg_table(struct drm_device *dev,
index 056a72e..3c1ce44 100644 (file)
@@ -72,6 +72,14 @@ int etnaviv_gpu_get_param(struct etnaviv_gpu *gpu, u32 param, u64 *value)
                *value = gpu->identity.minor_features3;
                break;
 
+       case ETNAVIV_PARAM_GPU_FEATURES_5:
+               *value = gpu->identity.minor_features4;
+               break;
+
+       case ETNAVIV_PARAM_GPU_FEATURES_6:
+               *value = gpu->identity.minor_features5;
+               break;
+
        case ETNAVIV_PARAM_GPU_STREAM_COUNT:
                *value = gpu->identity.stream_count;
                break;
@@ -112,6 +120,10 @@ int etnaviv_gpu_get_param(struct etnaviv_gpu *gpu, u32 param, u64 *value)
                *value = gpu->identity.num_constants;
                break;
 
+       case ETNAVIV_PARAM_GPU_NUM_VARYINGS:
+               *value = gpu->identity.varyings_count;
+               break;
+
        default:
                DBG("%s: invalid param: %u", dev_name(gpu->dev), param);
                return -EINVAL;
@@ -120,46 +132,56 @@ int etnaviv_gpu_get_param(struct etnaviv_gpu *gpu, u32 param, u64 *value)
        return 0;
 }
 
+
+#define etnaviv_is_model_rev(gpu, mod, rev) \
+       ((gpu)->identity.model == chipModel_##mod && \
+        (gpu)->identity.revision == rev)
+#define etnaviv_field(val, field) \
+       (((val) & field##__MASK) >> field##__SHIFT)
+
 static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
 {
        if (gpu->identity.minor_features0 &
            chipMinorFeatures0_MORE_MINOR_FEATURES) {
-               u32 specs[2];
+               u32 specs[4];
+               unsigned int streams;
 
                specs[0] = gpu_read(gpu, VIVS_HI_CHIP_SPECS);
                specs[1] = gpu_read(gpu, VIVS_HI_CHIP_SPECS_2);
-
-               gpu->identity.stream_count =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_STREAM_COUNT__MASK)
-                               >> VIVS_HI_CHIP_SPECS_STREAM_COUNT__SHIFT;
-               gpu->identity.register_max =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_REGISTER_MAX__MASK)
-                               >> VIVS_HI_CHIP_SPECS_REGISTER_MAX__SHIFT;
-               gpu->identity.thread_count =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_THREAD_COUNT__MASK)
-                               >> VIVS_HI_CHIP_SPECS_THREAD_COUNT__SHIFT;
-               gpu->identity.vertex_cache_size =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_VERTEX_CACHE_SIZE__MASK)
-                               >> VIVS_HI_CHIP_SPECS_VERTEX_CACHE_SIZE__SHIFT;
-               gpu->identity.shader_core_count =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_SHADER_CORE_COUNT__MASK)
-                               >> VIVS_HI_CHIP_SPECS_SHADER_CORE_COUNT__SHIFT;
-               gpu->identity.pixel_pipes =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_PIXEL_PIPES__MASK)
-                               >> VIVS_HI_CHIP_SPECS_PIXEL_PIPES__SHIFT;
+               specs[2] = gpu_read(gpu, VIVS_HI_CHIP_SPECS_3);
+               specs[3] = gpu_read(gpu, VIVS_HI_CHIP_SPECS_4);
+
+               gpu->identity.stream_count = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_STREAM_COUNT);
+               gpu->identity.register_max = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_REGISTER_MAX);
+               gpu->identity.thread_count = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_THREAD_COUNT);
+               gpu->identity.vertex_cache_size = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_VERTEX_CACHE_SIZE);
+               gpu->identity.shader_core_count = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_SHADER_CORE_COUNT);
+               gpu->identity.pixel_pipes = etnaviv_field(specs[0],
+                                       VIVS_HI_CHIP_SPECS_PIXEL_PIPES);
                gpu->identity.vertex_output_buffer_size =
-                       (specs[0] & VIVS_HI_CHIP_SPECS_VERTEX_OUTPUT_BUFFER_SIZE__MASK)
-                               >> VIVS_HI_CHIP_SPECS_VERTEX_OUTPUT_BUFFER_SIZE__SHIFT;
-
-               gpu->identity.buffer_size =
-                       (specs[1] & VIVS_HI_CHIP_SPECS_2_BUFFER_SIZE__MASK)
-                               >> VIVS_HI_CHIP_SPECS_2_BUFFER_SIZE__SHIFT;
-               gpu->identity.instruction_count =
-                       (specs[1] & VIVS_HI_CHIP_SPECS_2_INSTRUCTION_COUNT__MASK)
-                               >> VIVS_HI_CHIP_SPECS_2_INSTRUCTION_COUNT__SHIFT;
-               gpu->identity.num_constants =
-                       (specs[1] & VIVS_HI_CHIP_SPECS_2_NUM_CONSTANTS__MASK)
-                               >> VIVS_HI_CHIP_SPECS_2_NUM_CONSTANTS__SHIFT;
+                       etnaviv_field(specs[0],
+                               VIVS_HI_CHIP_SPECS_VERTEX_OUTPUT_BUFFER_SIZE);
+
+               gpu->identity.buffer_size = etnaviv_field(specs[1],
+                                       VIVS_HI_CHIP_SPECS_2_BUFFER_SIZE);
+               gpu->identity.instruction_count = etnaviv_field(specs[1],
+                                       VIVS_HI_CHIP_SPECS_2_INSTRUCTION_COUNT);
+               gpu->identity.num_constants = etnaviv_field(specs[1],
+                                       VIVS_HI_CHIP_SPECS_2_NUM_CONSTANTS);
+
+               gpu->identity.varyings_count = etnaviv_field(specs[2],
+                                       VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT);
+
+               /* This overrides the value from older register if non-zero */
+               streams = etnaviv_field(specs[3],
+                                       VIVS_HI_CHIP_SPECS_4_STREAM_COUNT);
+               if (streams)
+                       gpu->identity.stream_count = streams;
        }
 
        /* Fill in the stream count if not specified */
@@ -173,7 +195,7 @@ static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
        /* Convert the register max value */
        if (gpu->identity.register_max)
                gpu->identity.register_max = 1 << gpu->identity.register_max;
-       else if (gpu->identity.model == 0x0400)
+       else if (gpu->identity.model == chipModel_GC400)
                gpu->identity.register_max = 32;
        else
                gpu->identity.register_max = 64;
@@ -181,10 +203,10 @@ static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
        /* Convert thread count */
        if (gpu->identity.thread_count)
                gpu->identity.thread_count = 1 << gpu->identity.thread_count;
-       else if (gpu->identity.model == 0x0400)
+       else if (gpu->identity.model == chipModel_GC400)
                gpu->identity.thread_count = 64;
-       else if (gpu->identity.model == 0x0500 ||
-                gpu->identity.model == 0x0530)
+       else if (gpu->identity.model == chipModel_GC500 ||
+                gpu->identity.model == chipModel_GC530)
                gpu->identity.thread_count = 128;
        else
                gpu->identity.thread_count = 256;
@@ -206,7 +228,7 @@ static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
        if (gpu->identity.vertex_output_buffer_size) {
                gpu->identity.vertex_output_buffer_size =
                        1 << gpu->identity.vertex_output_buffer_size;
-       } else if (gpu->identity.model == 0x0400) {
+       } else if (gpu->identity.model == chipModel_GC400) {
                if (gpu->identity.revision < 0x4000)
                        gpu->identity.vertex_output_buffer_size = 512;
                else if (gpu->identity.revision < 0x4200)
@@ -219,9 +241,8 @@ static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
 
        switch (gpu->identity.instruction_count) {
        case 0:
-               if ((gpu->identity.model == 0x2000 &&
-                    gpu->identity.revision == 0x5108) ||
-                   gpu->identity.model == 0x880)
+               if (etnaviv_is_model_rev(gpu, GC2000, 0x5108) ||
+                   gpu->identity.model == chipModel_GC880)
                        gpu->identity.instruction_count = 512;
                else
                        gpu->identity.instruction_count = 256;
@@ -242,6 +263,30 @@ static void etnaviv_hw_specs(struct etnaviv_gpu *gpu)
 
        if (gpu->identity.num_constants == 0)
                gpu->identity.num_constants = 168;
+
+       if (gpu->identity.varyings_count == 0) {
+               if (gpu->identity.minor_features1 & chipMinorFeatures1_HALTI0)
+                       gpu->identity.varyings_count = 12;
+               else
+                       gpu->identity.varyings_count = 8;
+       }
+
+       /*
+        * For some cores, two varyings are consumed for position, so the
+        * maximum varying count needs to be reduced by one.
+        */
+       if (etnaviv_is_model_rev(gpu, GC5000, 0x5434) ||
+           etnaviv_is_model_rev(gpu, GC4000, 0x5222) ||
+           etnaviv_is_model_rev(gpu, GC4000, 0x5245) ||
+           etnaviv_is_model_rev(gpu, GC4000, 0x5208) ||
+           etnaviv_is_model_rev(gpu, GC3000, 0x5435) ||
+           etnaviv_is_model_rev(gpu, GC2200, 0x5244) ||
+           etnaviv_is_model_rev(gpu, GC2100, 0x5108) ||
+           etnaviv_is_model_rev(gpu, GC2000, 0x5108) ||
+           etnaviv_is_model_rev(gpu, GC1500, 0x5246) ||
+           etnaviv_is_model_rev(gpu, GC880, 0x5107) ||
+           etnaviv_is_model_rev(gpu, GC880, 0x5106))
+               gpu->identity.varyings_count -= 1;
 }
 
 static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
@@ -251,12 +296,10 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
        chipIdentity = gpu_read(gpu, VIVS_HI_CHIP_IDENTITY);
 
        /* Special case for older graphic cores. */
-       if (((chipIdentity & VIVS_HI_CHIP_IDENTITY_FAMILY__MASK)
-            >> VIVS_HI_CHIP_IDENTITY_FAMILY__SHIFT) ==  0x01) {
-               gpu->identity.model    = 0x500; /* gc500 */
-               gpu->identity.revision =
-                       (chipIdentity & VIVS_HI_CHIP_IDENTITY_REVISION__MASK)
-                       >> VIVS_HI_CHIP_IDENTITY_REVISION__SHIFT;
+       if (etnaviv_field(chipIdentity, VIVS_HI_CHIP_IDENTITY_FAMILY) == 0x01) {
+               gpu->identity.model    = chipModel_GC500;
+               gpu->identity.revision = etnaviv_field(chipIdentity,
+                                        VIVS_HI_CHIP_IDENTITY_REVISION);
        } else {
 
                gpu->identity.model = gpu_read(gpu, VIVS_HI_CHIP_MODEL);
@@ -269,13 +312,12 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
                 * same.  Only for GC400 family.
                 */
                if ((gpu->identity.model & 0xff00) == 0x0400 &&
-                   gpu->identity.model != 0x0420) {
+                   gpu->identity.model != chipModel_GC420) {
                        gpu->identity.model = gpu->identity.model & 0x0400;
                }
 
                /* Another special case */
-               if (gpu->identity.model == 0x300 &&
-                   gpu->identity.revision == 0x2201) {
+               if (etnaviv_is_model_rev(gpu, GC300, 0x2201)) {
                        u32 chipDate = gpu_read(gpu, VIVS_HI_CHIP_DATE);
                        u32 chipTime = gpu_read(gpu, VIVS_HI_CHIP_TIME);
 
@@ -295,11 +337,13 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
        gpu->identity.features = gpu_read(gpu, VIVS_HI_CHIP_FEATURE);
 
        /* Disable fast clear on GC700. */
-       if (gpu->identity.model == 0x700)
+       if (gpu->identity.model == chipModel_GC700)
                gpu->identity.features &= ~chipFeatures_FAST_CLEAR;
 
-       if ((gpu->identity.model == 0x500 && gpu->identity.revision < 2) ||
-           (gpu->identity.model == 0x300 && gpu->identity.revision < 0x2000)) {
+       if ((gpu->identity.model == chipModel_GC500 &&
+            gpu->identity.revision < 2) ||
+           (gpu->identity.model == chipModel_GC300 &&
+            gpu->identity.revision < 0x2000)) {
 
                /*
                 * GC500 rev 1.x and GC300 rev < 2.0 doesn't have these
@@ -309,6 +353,8 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
                gpu->identity.minor_features1 = 0;
                gpu->identity.minor_features2 = 0;
                gpu->identity.minor_features3 = 0;
+               gpu->identity.minor_features4 = 0;
+               gpu->identity.minor_features5 = 0;
        } else
                gpu->identity.minor_features0 =
                                gpu_read(gpu, VIVS_HI_CHIP_MINOR_FEATURE_0);
@@ -321,6 +367,10 @@ static void etnaviv_hw_identify(struct etnaviv_gpu *gpu)
                                gpu_read(gpu, VIVS_HI_CHIP_MINOR_FEATURE_2);
                gpu->identity.minor_features3 =
                                gpu_read(gpu, VIVS_HI_CHIP_MINOR_FEATURE_3);
+               gpu->identity.minor_features4 =
+                               gpu_read(gpu, VIVS_HI_CHIP_MINOR_FEATURE_4);
+               gpu->identity.minor_features5 =
+                               gpu_read(gpu, VIVS_HI_CHIP_MINOR_FEATURE_5);
        }
 
        /* GC600 idle register reports zero bits where modules aren't present */
@@ -441,10 +491,9 @@ static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
 {
        u16 prefetch;
 
-       if (gpu->identity.model == chipModel_GC320 &&
-           gpu_read(gpu, VIVS_HI_CHIP_TIME) != 0x2062400 &&
-           (gpu->identity.revision == 0x5007 ||
-            gpu->identity.revision == 0x5220)) {
+       if ((etnaviv_is_model_rev(gpu, GC320, 0x5007) ||
+            etnaviv_is_model_rev(gpu, GC320, 0x5220)) &&
+           gpu_read(gpu, VIVS_HI_CHIP_TIME) != 0x2062400) {
                u32 mc_memory_debug;
 
                mc_memory_debug = gpu_read(gpu, VIVS_MC_DEBUG_MEMORY) & ~0xff;
@@ -466,7 +515,7 @@ static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
                  VIVS_HI_AXI_CONFIG_ARCACHE(2));
 
        /* GC2000 rev 5108 needs a special bus config */
-       if (gpu->identity.model == 0x2000 && gpu->identity.revision == 0x5108) {
+       if (etnaviv_is_model_rev(gpu, GC2000, 0x5108)) {
                u32 bus_config = gpu_read(gpu, VIVS_MC_BUS_CONFIG);
                bus_config &= ~(VIVS_MC_BUS_CONFIG_FE_BUS_CONFIG__MASK |
                                VIVS_MC_BUS_CONFIG_TX_BUS_CONFIG__MASK);
@@ -511,8 +560,16 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 
        if (gpu->identity.model == 0) {
                dev_err(gpu->dev, "Unknown GPU model\n");
-               pm_runtime_put_autosuspend(gpu->dev);
-               return -ENXIO;
+               ret = -ENXIO;
+               goto fail;
+       }
+
+       /* Exclude VG cores with FE2.0 */
+       if (gpu->identity.features & chipFeatures_PIPE_VG &&
+           gpu->identity.features & chipFeatures_FE20) {
+               dev_info(gpu->dev, "Ignoring GPU with VG and FE2.0\n");
+               ret = -ENXIO;
+               goto fail;
        }
 
        ret = etnaviv_hw_reset(gpu);
@@ -539,10 +596,9 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
                goto fail;
        }
 
-       /* TODO: we will leak here memory - fix it! */
-
        gpu->mmu = etnaviv_iommu_new(gpu, iommu, version);
        if (!gpu->mmu) {
+               iommu_domain_free(iommu);
                ret = -ENOMEM;
                goto fail;
        }
@@ -552,7 +608,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
        if (!gpu->buffer) {
                ret = -ENOMEM;
                dev_err(gpu->dev, "could not create command buffer\n");
-               goto fail;
+               goto destroy_iommu;
        }
        if (gpu->buffer->paddr - gpu->memory_base > 0x80000000) {
                ret = -EINVAL;
@@ -582,6 +638,9 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 free_buffer:
        etnaviv_gpu_cmdbuf_free(gpu->buffer);
        gpu->buffer = NULL;
+destroy_iommu:
+       etnaviv_iommu_destroy(gpu->mmu);
+       gpu->mmu = NULL;
 fail:
        pm_runtime_mark_last_busy(gpu->dev);
        pm_runtime_put_autosuspend(gpu->dev);
@@ -642,6 +701,10 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
                   gpu->identity.minor_features2);
        seq_printf(m, "\t minor_features3: 0x%08x\n",
                   gpu->identity.minor_features3);
+       seq_printf(m, "\t minor_features4: 0x%08x\n",
+                  gpu->identity.minor_features4);
+       seq_printf(m, "\t minor_features5: 0x%08x\n",
+                  gpu->identity.minor_features5);
 
        seq_puts(m, "\tspecs\n");
        seq_printf(m, "\t stream_count:  %d\n",
@@ -664,6 +727,8 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
                        gpu->identity.instruction_count);
        seq_printf(m, "\t num_constants: %d\n",
                        gpu->identity.num_constants);
+       seq_printf(m, "\t varyings_count: %d\n",
+                       gpu->identity.varyings_count);
 
        seq_printf(m, "\taxi: 0x%08x\n", axi);
        seq_printf(m, "\tidle: 0x%08x\n", idle);
@@ -1048,8 +1113,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
        if (!cmdbuf)
                return NULL;
 
-       cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr,
-                                              GFP_KERNEL);
+       cmdbuf->vaddr = dma_alloc_wc(gpu->dev, size, &cmdbuf->paddr,
+                                    GFP_KERNEL);
        if (!cmdbuf->vaddr) {
                kfree(cmdbuf);
                return NULL;
@@ -1063,8 +1128,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size,
 
 void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf)
 {
-       dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size,
-                             cmdbuf->vaddr, cmdbuf->paddr);
+       dma_free_wc(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr,
+                   cmdbuf->paddr);
        kfree(cmdbuf);
 }
 
index c75d503..f233ac4 100644 (file)
@@ -46,6 +46,12 @@ struct etnaviv_chip_identity {
        /* Supported minor feature 3 fields. */
        u32 minor_features3;
 
+       /* Supported minor feature 4 fields. */
+       u32 minor_features4;
+
+       /* Supported minor feature 5 fields. */
+       u32 minor_features5;
+
        /* Number of streams supported. */
        u32 stream_count;
 
@@ -75,6 +81,9 @@ struct etnaviv_chip_identity {
 
        /* Buffer size */
        u32 buffer_size;
+
+       /* Number of varyings */
+       u8 varyings_count;
 };
 
 struct etnaviv_event {
index 0064f26..6a7de5f 100644 (file)
@@ -8,8 +8,8 @@ http://0x04.net/cgit/index.cgi/rules-ng-ng
 git clone git://0x04.net/rules-ng-ng
 
 The rules-ng-ng source files this header was generated from are:
-- state_hi.xml (  23420 bytes, from 2015-03-25 11:47:21)
-- common.xml   (  18437 bytes, from 2015-03-25 11:27:41)
+- state_hi.xml (  24309 bytes, from 2015-12-12 09:02:53)
+- common.xml   (  18437 bytes, from 2015-12-12 09:02:53)
 
 Copyright (C) 2015
 */
@@ -182,8 +182,25 @@ Copyright (C) 2015
 
 #define VIVS_HI_CHIP_MINOR_FEATURE_3                           0x00000088
 
+#define VIVS_HI_CHIP_SPECS_3                                   0x0000008c
+#define VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT__MASK              0x000001f0
+#define VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT__SHIFT             4
+#define VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT(x)                 (((x) << VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT__SHIFT) & VIVS_HI_CHIP_SPECS_3_VARYINGS_COUNT__MASK)
+#define VIVS_HI_CHIP_SPECS_3_GPU_CORE_COUNT__MASK              0x00000007
+#define VIVS_HI_CHIP_SPECS_3_GPU_CORE_COUNT__SHIFT             0
+#define VIVS_HI_CHIP_SPECS_3_GPU_CORE_COUNT(x)                 (((x) << VIVS_HI_CHIP_SPECS_3_GPU_CORE_COUNT__SHIFT) & VIVS_HI_CHIP_SPECS_3_GPU_CORE_COUNT__MASK)
+
 #define VIVS_HI_CHIP_MINOR_FEATURE_4                           0x00000094
 
+#define VIVS_HI_CHIP_SPECS_4                                   0x0000009c
+#define VIVS_HI_CHIP_SPECS_4_STREAM_COUNT__MASK                        0x0001f000
+#define VIVS_HI_CHIP_SPECS_4_STREAM_COUNT__SHIFT               12
+#define VIVS_HI_CHIP_SPECS_4_STREAM_COUNT(x)                   (((x) << VIVS_HI_CHIP_SPECS_4_STREAM_COUNT__SHIFT) & VIVS_HI_CHIP_SPECS_4_STREAM_COUNT__MASK)
+
+#define VIVS_HI_CHIP_MINOR_FEATURE_5                           0x000000a0
+
+#define VIVS_HI_CHIP_PRODUCT_ID                                        0x000000a8
+
 #define VIVS_PM                                                        0x00000000
 
 #define VIVS_PM_POWER_CONTROLS                                 0x00000100
@@ -206,6 +223,11 @@ Copyright (C) 2015
 #define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_FE            0x00000001
 #define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_DE            0x00000002
 #define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_PE            0x00000004
+#define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_SH            0x00000008
+#define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_PA            0x00000010
+#define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_SE            0x00000020
+#define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_RA            0x00000040
+#define VIVS_PM_MODULE_STATUS_MODULE_CLOCK_GATED_TX            0x00000080
 
 #define VIVS_PM_PULSE_EATER                                    0x0000010c
 
index 83efca9..f17d392 100644 (file)
@@ -1,6 +1,6 @@
 config DRM_EXYNOS
        tristate "DRM Support for Samsung SoC EXYNOS Series"
-       depends on OF && DRM && (PLAT_SAMSUNG || ARCH_MULTIPLATFORM)
+       depends on OF && DRM && (ARCH_S3C64XX || ARCH_EXYNOS || ARCH_MULTIPLATFORM)
        select DRM_KMS_HELPER
        select DRM_KMS_FB_HELPER
        select FB_CFB_FILLRECT
index 1bf6a21..162ab93 100644 (file)
@@ -93,7 +93,7 @@ static int decon_enable_vblank(struct exynos_drm_crtc *crtc)
        if (test_bit(BIT_SUSPENDED, &ctx->flags))
                return -EPERM;
 
-       if (test_and_set_bit(BIT_IRQS_ENABLED, &ctx->flags)) {
+       if (!test_and_set_bit(BIT_IRQS_ENABLED, &ctx->flags)) {
                val = VIDINTCON0_INTEN;
                if (ctx->out_type == IFTYPE_I80)
                        val |= VIDINTCON0_FRAMEDONE;
@@ -402,8 +402,6 @@ static void decon_enable(struct exynos_drm_crtc *crtc)
                decon_enable_vblank(ctx->crtc);
 
        decon_commit(ctx->crtc);
-
-       set_bit(BIT_SUSPENDED, &ctx->flags);
 }
 
 static void decon_disable(struct exynos_drm_crtc *crtc)
@@ -582,9 +580,9 @@ out:
 static int exynos5433_decon_suspend(struct device *dev)
 {
        struct decon_context *ctx = dev_get_drvdata(dev);
-       int i;
+       int i = ARRAY_SIZE(decon_clks_name);
 
-       for (i = 0; i < ARRAY_SIZE(decon_clks_name); i++)
+       while (--i >= 0)
                clk_disable_unprepare(ctx->clks[i]);
 
        return 0;
index b79c316..673164b 100644 (file)
@@ -1392,7 +1392,7 @@ static const struct component_ops exynos_dp_ops = {
 static int exynos_dp_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
-       struct device_node *panel_node = NULL, *bridge_node, *endpoint = NULL;
+       struct device_node *np = NULL, *endpoint = NULL;
        struct exynos_dp_device *dp;
        int ret;
 
@@ -1404,41 +1404,36 @@ static int exynos_dp_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, dp);
 
        /* This is for the backward compatibility. */
-       panel_node = of_parse_phandle(dev->of_node, "panel", 0);
-       if (panel_node) {
-               dp->panel = of_drm_find_panel(panel_node);
-               of_node_put(panel_node);
+       np = of_parse_phandle(dev->of_node, "panel", 0);
+       if (np) {
+               dp->panel = of_drm_find_panel(np);
+               of_node_put(np);
                if (!dp->panel)
                        return -EPROBE_DEFER;
-       } else {
-               endpoint = of_graph_get_next_endpoint(dev->of_node, NULL);
-               if (endpoint) {
-                       panel_node = of_graph_get_remote_port_parent(endpoint);
-                       if (panel_node) {
-                               dp->panel = of_drm_find_panel(panel_node);
-                               of_node_put(panel_node);
-                               if (!dp->panel)
-                                       return -EPROBE_DEFER;
-                       } else {
-                               DRM_ERROR("no port node for panel device.\n");
-                               return -EINVAL;
-                       }
-               }
-       }
-
-       if (endpoint)
                goto out;
+       }
 
        endpoint = of_graph_get_next_endpoint(dev->of_node, NULL);
        if (endpoint) {
-               bridge_node = of_graph_get_remote_port_parent(endpoint);
-               if (bridge_node) {
-                       dp->ptn_bridge = of_drm_find_bridge(bridge_node);
-                       of_node_put(bridge_node);
-                       if (!dp->ptn_bridge)
-                               return -EPROBE_DEFER;
-               } else
-                       return -EPROBE_DEFER;
+               np = of_graph_get_remote_port_parent(endpoint);
+               if (np) {
+                       /* The remote port can be either a panel or a bridge */
+                       dp->panel = of_drm_find_panel(np);
+                       if (!dp->panel) {
+                               dp->ptn_bridge = of_drm_find_bridge(np);
+                               if (!dp->ptn_bridge) {
+                                       of_node_put(np);
+                                       return -EPROBE_DEFER;
+                               }
+                       }
+                       of_node_put(np);
+               } else {
+                       DRM_ERROR("no remote endpoint device node found.\n");
+                       return -EINVAL;
+               }
+       } else {
+               DRM_ERROR("no port endpoint subnode found.\n");
+               return -EINVAL;
        }
 
 out:
index d84a498..26e81d1 100644 (file)
@@ -1782,6 +1782,7 @@ static int exynos_dsi_bind(struct device *dev, struct device *master,
 
        bridge = of_drm_find_bridge(dsi->bridge_node);
        if (bridge) {
+               encoder->bridge = bridge;
                drm_bridge_attach(drm_dev, bridge);
        }
 
@@ -1906,8 +1907,7 @@ static int exynos_dsi_remove(struct platform_device *pdev)
        return 0;
 }
 
-#ifdef CONFIG_PM
-static int exynos_dsi_suspend(struct device *dev)
+static int __maybe_unused exynos_dsi_suspend(struct device *dev)
 {
        struct drm_encoder *encoder = dev_get_drvdata(dev);
        struct exynos_dsi *dsi = encoder_to_dsi(encoder);
@@ -1938,7 +1938,7 @@ static int exynos_dsi_suspend(struct device *dev)
        return 0;
 }
 
-static int exynos_dsi_resume(struct device *dev)
+static int __maybe_unused exynos_dsi_resume(struct device *dev)
 {
        struct drm_encoder *encoder = dev_get_drvdata(dev);
        struct exynos_dsi *dsi = encoder_to_dsi(encoder);
@@ -1972,7 +1972,6 @@ err_clk:
 
        return ret;
 }
-#endif
 
 static const struct dev_pm_ops exynos_dsi_pm_ops = {
        SET_RUNTIME_PM_OPS(exynos_dsi_suspend, exynos_dsi_resume, NULL)
index f6118ba..8baabd8 100644 (file)
@@ -50,7 +50,7 @@ static int exynos_drm_fb_mmap(struct fb_info *info,
        if (vm_size > exynos_gem->size)
                return -EINVAL;
 
-       ret = dma_mmap_attrs(helper->dev->dev, vma, exynos_gem->pages,
+       ret = dma_mmap_attrs(helper->dev->dev, vma, exynos_gem->cookie,
                             exynos_gem->dma_addr, exynos_gem->size,
                             &exynos_gem->dma_attrs);
        if (ret < 0) {
index c747824..8a4f4a0 100644 (file)
@@ -1723,7 +1723,7 @@ static int fimc_probe(struct platform_device *pdev)
                goto err_put_clk;
        }
 
-       DRM_DEBUG_KMS("id[%d]ippdrv[0x%x]\n", ctx->id, (int)ippdrv);
+       DRM_DEBUG_KMS("id[%d]ippdrv[%p]\n", ctx->id, ippdrv);
 
        spin_lock_init(&ctx->lock);
        platform_set_drvdata(pdev, ctx);
index c17efdb..8dfe6e1 100644 (file)
@@ -1166,7 +1166,7 @@ int exynos_g2d_set_cmdlist_ioctl(struct drm_device *drm_dev, void *data,
                goto err_free_event;
        }
 
-       cmd = (struct drm_exynos_g2d_cmd *)(uint32_t)req->cmd;
+       cmd = (struct drm_exynos_g2d_cmd *)(unsigned long)req->cmd;
 
        if (copy_from_user(cmdlist->data + cmdlist->last,
                                (void __user *)cmd,
@@ -1184,7 +1184,8 @@ int exynos_g2d_set_cmdlist_ioctl(struct drm_device *drm_dev, void *data,
        if (req->cmd_buf_nr) {
                struct drm_exynos_g2d_cmd *cmd_buf;
 
-               cmd_buf = (struct drm_exynos_g2d_cmd *)(uint32_t)req->cmd_buf;
+               cmd_buf = (struct drm_exynos_g2d_cmd *)
+                               (unsigned long)req->cmd_buf;
 
                if (copy_from_user(cmdlist->data + cmdlist->last,
                                        (void __user *)cmd_buf,
index 32358c5..26b5e4b 100644 (file)
@@ -218,7 +218,7 @@ static struct exynos_drm_gem *exynos_drm_gem_init(struct drm_device *dev,
                return ERR_PTR(ret);
        }
 
-       DRM_DEBUG_KMS("created file object = 0x%x\n", (unsigned int)obj->filp);
+       DRM_DEBUG_KMS("created file object = %p\n", obj->filp);
 
        return exynos_gem;
 }
@@ -335,7 +335,7 @@ static int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem *exynos_gem,
        if (vm_size > exynos_gem->size)
                return -EINVAL;
 
-       ret = dma_mmap_attrs(drm_dev->dev, vma, exynos_gem->pages,
+       ret = dma_mmap_attrs(drm_dev->dev, vma, exynos_gem->cookie,
                             exynos_gem->dma_addr, exynos_gem->size,
                             &exynos_gem->dma_attrs);
        if (ret < 0) {
index 7aecd23..5d20da8 100644 (file)
@@ -1723,7 +1723,7 @@ static int gsc_probe(struct platform_device *pdev)
                return ret;
        }
 
-       DRM_DEBUG_KMS("id[%d]ippdrv[0x%x]\n", ctx->id, (int)ippdrv);
+       DRM_DEBUG_KMS("id[%d]ippdrv[%p]\n", ctx->id, ippdrv);
 
        mutex_init(&ctx->lock);
        platform_set_drvdata(pdev, ctx);
index 67d2423..95eeb91 100644 (file)
@@ -208,7 +208,7 @@ static struct exynos_drm_ippdrv *ipp_find_drv_by_handle(u32 prop_id)
         * e.g PAUSE state, queue buf, command control.
         */
        list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) {
-               DRM_DEBUG_KMS("count[%d]ippdrv[0x%x]\n", count++, (int)ippdrv);
+               DRM_DEBUG_KMS("count[%d]ippdrv[%p]\n", count++, ippdrv);
 
                mutex_lock(&ippdrv->cmd_lock);
                list_for_each_entry(c_node, &ippdrv->cmd_list, list) {
@@ -388,8 +388,8 @@ int exynos_drm_ipp_set_property(struct drm_device *drm_dev, void *data,
        }
        property->prop_id = ret;
 
-       DRM_DEBUG_KMS("created prop_id[%d]cmd[%d]ippdrv[0x%x]\n",
-               property->prop_id, property->cmd, (int)ippdrv);
+       DRM_DEBUG_KMS("created prop_id[%d]cmd[%d]ippdrv[%p]\n",
+               property->prop_id, property->cmd, ippdrv);
 
        /* stored property information and ippdrv in private data */
        c_node->property = *property;
@@ -518,7 +518,7 @@ static int ipp_put_mem_node(struct drm_device *drm_dev,
 {
        int i;
 
-       DRM_DEBUG_KMS("node[0x%x]\n", (int)m_node);
+       DRM_DEBUG_KMS("node[%p]\n", m_node);
 
        if (!m_node) {
                DRM_ERROR("invalid dequeue node.\n");
@@ -562,7 +562,7 @@ static struct drm_exynos_ipp_mem_node
        m_node->buf_id = qbuf->buf_id;
        INIT_LIST_HEAD(&m_node->list);
 
-       DRM_DEBUG_KMS("m_node[0x%x]ops_id[%d]\n", (int)m_node, qbuf->ops_id);
+       DRM_DEBUG_KMS("m_node[%p]ops_id[%d]\n", m_node, qbuf->ops_id);
        DRM_DEBUG_KMS("prop_id[%d]buf_id[%d]\n", qbuf->prop_id, m_node->buf_id);
 
        for_each_ipp_planar(i) {
@@ -582,8 +582,8 @@ static struct drm_exynos_ipp_mem_node
 
                        buf_info->handles[i] = qbuf->handle[i];
                        buf_info->base[i] = *addr;
-                       DRM_DEBUG_KMS("i[%d]base[0x%x]hd[0x%lx]\n", i,
-                                     buf_info->base[i], buf_info->handles[i]);
+                       DRM_DEBUG_KMS("i[%d]base[%pad]hd[0x%lx]\n", i,
+                                     &buf_info->base[i], buf_info->handles[i]);
                }
        }
 
@@ -664,7 +664,7 @@ static void ipp_put_event(struct drm_exynos_ipp_cmd_node *c_node,
 
        mutex_lock(&c_node->event_lock);
        list_for_each_entry_safe(e, te, &c_node->event_list, base.link) {
-               DRM_DEBUG_KMS("count[%d]e[0x%x]\n", count++, (int)e);
+               DRM_DEBUG_KMS("count[%d]e[%p]\n", count++, e);
 
                /*
                 * qbuf == NULL condition means all event deletion.
@@ -755,7 +755,7 @@ static struct drm_exynos_ipp_mem_node
 
        /* find memory node from memory list */
        list_for_each_entry(m_node, head, list) {
-               DRM_DEBUG_KMS("count[%d]m_node[0x%x]\n", count++, (int)m_node);
+               DRM_DEBUG_KMS("count[%d]m_node[%p]\n", count++, m_node);
 
                /* compare buffer id */
                if (m_node->buf_id == qbuf->buf_id)
@@ -772,7 +772,7 @@ static int ipp_set_mem_node(struct exynos_drm_ippdrv *ippdrv,
        struct exynos_drm_ipp_ops *ops = NULL;
        int ret = 0;
 
-       DRM_DEBUG_KMS("node[0x%x]\n", (int)m_node);
+       DRM_DEBUG_KMS("node[%p]\n", m_node);
 
        if (!m_node) {
                DRM_ERROR("invalid queue node.\n");
@@ -1237,7 +1237,7 @@ static int ipp_start_property(struct exynos_drm_ippdrv *ippdrv,
                        m_node = list_first_entry(head,
                                struct drm_exynos_ipp_mem_node, list);
 
-                       DRM_DEBUG_KMS("m_node[0x%x]\n", (int)m_node);
+                       DRM_DEBUG_KMS("m_node[%p]\n", m_node);
 
                        ret = ipp_set_mem_node(ippdrv, c_node, m_node);
                        if (ret) {
@@ -1610,8 +1610,8 @@ static int ipp_subdrv_probe(struct drm_device *drm_dev, struct device *dev)
                }
                ippdrv->prop_list.ipp_id = ret;
 
-               DRM_DEBUG_KMS("count[%d]ippdrv[0x%x]ipp_id[%d]\n",
-                       count++, (int)ippdrv, ret);
+               DRM_DEBUG_KMS("count[%d]ippdrv[%p]ipp_id[%d]\n",
+                       count++, ippdrv, ret);
 
                /* store parent device for node */
                ippdrv->parent_dev = dev;
@@ -1668,7 +1668,7 @@ static int ipp_subdrv_open(struct drm_device *drm_dev, struct device *dev,
 
        file_priv->ipp_dev = dev;
 
-       DRM_DEBUG_KMS("done priv[0x%x]\n", (int)dev);
+       DRM_DEBUG_KMS("done priv[%p]\n", dev);
 
        return 0;
 }
@@ -1685,8 +1685,8 @@ static void ipp_subdrv_close(struct drm_device *drm_dev, struct device *dev,
                mutex_lock(&ippdrv->cmd_lock);
                list_for_each_entry_safe(c_node, tc_node,
                        &ippdrv->cmd_list, list) {
-                       DRM_DEBUG_KMS("count[%d]ippdrv[0x%x]\n",
-                               count++, (int)ippdrv);
+                       DRM_DEBUG_KMS("count[%d]ippdrv[%p]\n",
+                               count++, ippdrv);
 
                        if (c_node->filp == file) {
                                /*
index 4eaef36..9869d70 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/of.h>
 #include <linux/of_graph.h>
 #include <linux/clk.h>
+#include <linux/component.h>
 #include <drm/drmP.h>
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
@@ -306,9 +307,9 @@ exit:
        return ret;
 }
 
-void mic_disable(struct drm_bridge *bridge) { }
+static void mic_disable(struct drm_bridge *bridge) { }
 
-void mic_post_disable(struct drm_bridge *bridge)
+static void mic_post_disable(struct drm_bridge *bridge)
 {
        struct exynos_mic *mic = bridge->driver_private;
        int i;
@@ -328,7 +329,7 @@ already_disabled:
        mutex_unlock(&mic_mutex);
 }
 
-void mic_pre_enable(struct drm_bridge *bridge)
+static void mic_pre_enable(struct drm_bridge *bridge)
 {
        struct exynos_mic *mic = bridge->driver_private;
        int ret, i;
@@ -371,11 +372,35 @@ already_enabled:
        mutex_unlock(&mic_mutex);
 }
 
-void mic_enable(struct drm_bridge *bridge) { }
+static void mic_enable(struct drm_bridge *bridge) { }
 
-void mic_destroy(struct drm_bridge *bridge)
+static const struct drm_bridge_funcs mic_bridge_funcs = {
+       .disable = mic_disable,
+       .post_disable = mic_post_disable,
+       .pre_enable = mic_pre_enable,
+       .enable = mic_enable,
+};
+
+static int exynos_mic_bind(struct device *dev, struct device *master,
+                          void *data)
 {
-       struct exynos_mic *mic = bridge->driver_private;
+       struct exynos_mic *mic = dev_get_drvdata(dev);
+       int ret;
+
+       mic->bridge.funcs = &mic_bridge_funcs;
+       mic->bridge.of_node = dev->of_node;
+       mic->bridge.driver_private = mic;
+       ret = drm_bridge_add(&mic->bridge);
+       if (ret)
+               DRM_ERROR("mic: Failed to add MIC to the global bridge list\n");
+
+       return ret;
+}
+
+static void exynos_mic_unbind(struct device *dev, struct device *master,
+                             void *data)
+{
+       struct exynos_mic *mic = dev_get_drvdata(dev);
        int i;
 
        mutex_lock(&mic_mutex);
@@ -387,16 +412,16 @@ void mic_destroy(struct drm_bridge *bridge)
 
 already_disabled:
        mutex_unlock(&mic_mutex);
+
+       drm_bridge_remove(&mic->bridge);
 }
 
-static const struct drm_bridge_funcs mic_bridge_funcs = {
-       .disable = mic_disable,
-       .post_disable = mic_post_disable,
-       .pre_enable = mic_pre_enable,
-       .enable = mic_enable,
+static const struct component_ops exynos_mic_component_ops = {
+       .bind   = exynos_mic_bind,
+       .unbind = exynos_mic_unbind,
 };
 
-int exynos_mic_probe(struct platform_device *pdev)
+static int exynos_mic_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct exynos_mic *mic;
@@ -435,17 +460,8 @@ int exynos_mic_probe(struct platform_device *pdev)
                goto err;
        }
 
-       mic->bridge.funcs = &mic_bridge_funcs;
-       mic->bridge.of_node = dev->of_node;
-       mic->bridge.driver_private = mic;
-       ret = drm_bridge_add(&mic->bridge);
-       if (ret) {
-               DRM_ERROR("mic: Failed to add MIC to the global bridge list\n");
-               goto err;
-       }
-
        for (i = 0; i < NUM_CLKS; i++) {
-               mic->clks[i] = of_clk_get_by_name(dev->of_node, clk_names[i]);
+               mic->clks[i] = devm_clk_get(dev, clk_names[i]);
                if (IS_ERR(mic->clks[i])) {
                        DRM_ERROR("mic: Failed to get clock (%s)\n",
                                                                clk_names[i]);
@@ -454,7 +470,10 @@ int exynos_mic_probe(struct platform_device *pdev)
                }
        }
 
+       platform_set_drvdata(pdev, mic);
+
        DRM_DEBUG_KMS("MIC has been probed\n");
+       return component_add(dev, &exynos_mic_component_ops);
 
 err:
        return ret;
@@ -462,14 +481,7 @@ err:
 
 static int exynos_mic_remove(struct platform_device *pdev)
 {
-       struct exynos_mic *mic = platform_get_drvdata(pdev);
-       int i;
-
-       drm_bridge_remove(&mic->bridge);
-
-       for (i = NUM_CLKS - 1; i > -1; i--)
-               clk_put(mic->clks[i]);
-
+       component_del(&pdev->dev, &exynos_mic_component_ops);
        return 0;
 }
 
index bea0f78..ce59f44 100644 (file)
@@ -754,7 +754,7 @@ static int rotator_probe(struct platform_device *pdev)
                goto err_ippdrv_register;
        }
 
-       DRM_DEBUG_KMS("ippdrv[0x%x]\n", (int)ippdrv);
+       DRM_DEBUG_KMS("ippdrv[%p]\n", ippdrv);
 
        platform_set_drvdata(pdev, rot);
 
index 62ac4e5..b605bd7 100644 (file)
@@ -223,7 +223,7 @@ static void vidi_fake_vblank_handler(struct work_struct *work)
        }
 }
 
-static int vidi_show_connection(struct device *dev,
+static ssize_t vidi_show_connection(struct device *dev,
                                struct device_attribute *attr, char *buf)
 {
        struct vidi_context *ctx = dev_get_drvdata(dev);
@@ -238,7 +238,7 @@ static int vidi_show_connection(struct device *dev,
        return rc;
 }
 
-static int vidi_store_connection(struct device *dev,
+static ssize_t vidi_store_connection(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t len)
 {
@@ -294,7 +294,9 @@ int vidi_connection_ioctl(struct drm_device *drm_dev, void *data,
        }
 
        if (vidi->connection) {
-               struct edid *raw_edid  = (struct edid *)(uint32_t)vidi->edid;
+               struct edid *raw_edid;
+
+               raw_edid = (struct edid *)(unsigned long)vidi->edid;
                if (!drm_edid_is_valid(raw_edid)) {
                        DRM_DEBUG_KMS("edid data is invalid.\n");
                        return -EINVAL;
index b5fbc1c..0a5a600 100644 (file)
@@ -1289,8 +1289,7 @@ static int mixer_remove(struct platform_device *pdev)
        return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int exynos_mixer_suspend(struct device *dev)
+static int __maybe_unused exynos_mixer_suspend(struct device *dev)
 {
        struct mixer_context *ctx = dev_get_drvdata(dev);
        struct mixer_resources *res = &ctx->mixer_res;
@@ -1306,7 +1305,7 @@ static int exynos_mixer_suspend(struct device *dev)
        return 0;
 }
 
-static int exynos_mixer_resume(struct device *dev)
+static int __maybe_unused exynos_mixer_resume(struct device *dev)
 {
        struct mixer_context *ctx = dev_get_drvdata(dev);
        struct mixer_resources *res = &ctx->mixer_res;
@@ -1342,7 +1341,6 @@ static int exynos_mixer_resume(struct device *dev)
 
        return 0;
 }
-#endif
 
 static const struct dev_pm_ops exynos_mixer_pm_ops = {
        SET_RUNTIME_PM_OPS(exynos_mixer_suspend, exynos_mixer_resume, NULL)
index 533d1e3..a02112b 100644 (file)
@@ -136,6 +136,7 @@ static bool adv7511_register_volatile(struct device *dev, unsigned int reg)
        case ADV7511_REG_BKSV(3):
        case ADV7511_REG_BKSV(4):
        case ADV7511_REG_DDC_STATUS:
+       case ADV7511_REG_EDID_READ_CTRL:
        case ADV7511_REG_BSTATUS(0):
        case ADV7511_REG_BSTATUS(1):
        case ADV7511_REG_CHIP_ID_HIGH:
@@ -362,24 +363,31 @@ static void adv7511_power_on(struct adv7511 *adv7511)
 {
        adv7511->current_edid_segment = -1;
 
-       regmap_write(adv7511->regmap, ADV7511_REG_INT(0),
-                    ADV7511_INT0_EDID_READY);
-       regmap_write(adv7511->regmap, ADV7511_REG_INT(1),
-                    ADV7511_INT1_DDC_ERROR);
        regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER,
                           ADV7511_POWER_POWER_DOWN, 0);
+       if (adv7511->i2c_main->irq) {
+               /*
+                * Documentation says the INT_ENABLE registers are reset in
+                * POWER_DOWN mode. My 7511w preserved the bits, however.
+                * Still, let's be safe and stick to the documentation.
+                */
+               regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(0),
+                            ADV7511_INT0_EDID_READY);
+               regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(1),
+                            ADV7511_INT1_DDC_ERROR);
+       }
 
        /*
-        * Per spec it is allowed to pulse the HDP signal to indicate that the
+        * Per spec it is allowed to pulse the HPD signal to indicate that the
         * EDID information has changed. Some monitors do this when they wakeup
-        * from standby or are enabled. When the HDP goes low the adv7511 is
+        * from standby or are enabled. When the HPD goes low the adv7511 is
         * reset and the outputs are disabled which might cause the monitor to
-        * go to standby again. To avoid this we ignore the HDP pin for the
+        * go to standby again. To avoid this we ignore the HPD pin for the
         * first few seconds after enabling the output.
         */
        regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER2,
-                          ADV7511_REG_POWER2_HDP_SRC_MASK,
-                          ADV7511_REG_POWER2_HDP_SRC_NONE);
+                          ADV7511_REG_POWER2_HPD_SRC_MASK,
+                          ADV7511_REG_POWER2_HPD_SRC_NONE);
 
        /*
         * Most of the registers are reset during power down or when HPD is low.
@@ -413,9 +421,9 @@ static bool adv7511_hpd(struct adv7511 *adv7511)
        if (ret < 0)
                return false;
 
-       if (irq0 & ADV7511_INT0_HDP) {
+       if (irq0 & ADV7511_INT0_HPD) {
                regmap_write(adv7511->regmap, ADV7511_REG_INT(0),
-                            ADV7511_INT0_HDP);
+                            ADV7511_INT0_HPD);
                return true;
        }
 
@@ -438,7 +446,7 @@ static int adv7511_irq_process(struct adv7511 *adv7511)
        regmap_write(adv7511->regmap, ADV7511_REG_INT(0), irq0);
        regmap_write(adv7511->regmap, ADV7511_REG_INT(1), irq1);
 
-       if (irq0 & ADV7511_INT0_HDP && adv7511->encoder)
+       if (irq0 & ADV7511_INT0_HPD && adv7511->encoder)
                drm_helper_hpd_irq_event(adv7511->encoder->dev);
 
        if (irq0 & ADV7511_INT0_EDID_READY || irq1 & ADV7511_INT1_DDC_ERROR) {
@@ -567,12 +575,14 @@ static int adv7511_get_modes(struct drm_encoder *encoder,
 
        /* Reading the EDID only works if the device is powered */
        if (!adv7511->powered) {
-               regmap_write(adv7511->regmap, ADV7511_REG_INT(0),
-                            ADV7511_INT0_EDID_READY);
-               regmap_write(adv7511->regmap, ADV7511_REG_INT(1),
-                            ADV7511_INT1_DDC_ERROR);
                regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER,
                                   ADV7511_POWER_POWER_DOWN, 0);
+               if (adv7511->i2c_main->irq) {
+                       regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(0),
+                                    ADV7511_INT0_EDID_READY);
+                       regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(1),
+                                    ADV7511_INT1_DDC_ERROR);
+               }
                adv7511->current_edid_segment = -1;
        }
 
@@ -638,10 +648,10 @@ adv7511_encoder_detect(struct drm_encoder *encoder,
                if (adv7511->status == connector_status_connected)
                        status = connector_status_disconnected;
        } else {
-               /* Renable HDP sensing */
+               /* Renable HPD sensing */
                regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER2,
-                                  ADV7511_REG_POWER2_HDP_SRC_MASK,
-                                  ADV7511_REG_POWER2_HDP_SRC_BOTH);
+                                  ADV7511_REG_POWER2_HPD_SRC_MASK,
+                                  ADV7511_REG_POWER2_HPD_SRC_BOTH);
        }
 
        adv7511->status = status;
index 6599ed5..38515b3 100644 (file)
@@ -90,7 +90,7 @@
 #define ADV7511_CSC_ENABLE                     BIT(7)
 #define ADV7511_CSC_UPDATE_MODE                        BIT(5)
 
-#define ADV7511_INT0_HDP                       BIT(7)
+#define ADV7511_INT0_HPD                       BIT(7)
 #define ADV7511_INT0_VSYNC                     BIT(5)
 #define ADV7511_INT0_AUDIO_FIFO_FULL           BIT(4)
 #define ADV7511_INT0_EDID_READY                        BIT(2)
 #define ADV7511_PACKET_ENABLE_SPARE2           BIT(1)
 #define ADV7511_PACKET_ENABLE_SPARE1           BIT(0)
 
-#define ADV7511_REG_POWER2_HDP_SRC_MASK                0xc0
-#define ADV7511_REG_POWER2_HDP_SRC_BOTH                0x00
-#define ADV7511_REG_POWER2_HDP_SRC_HDP         0x40
-#define ADV7511_REG_POWER2_HDP_SRC_CEC         0x80
-#define ADV7511_REG_POWER2_HDP_SRC_NONE                0xc0
+#define ADV7511_REG_POWER2_HPD_SRC_MASK                0xc0
+#define ADV7511_REG_POWER2_HPD_SRC_BOTH                0x00
+#define ADV7511_REG_POWER2_HPD_SRC_HPD         0x40
+#define ADV7511_REG_POWER2_HPD_SRC_CEC         0x80
+#define ADV7511_REG_POWER2_HPD_SRC_NONE                0xc0
 #define ADV7511_REG_POWER2_TDMS_ENABLE         BIT(4)
 #define ADV7511_REG_POWER2_GATE_INPUT_CLK      BIT(0)
 
index 34e3874..f8ee740 100644 (file)
@@ -1382,8 +1382,16 @@ static void tda998x_connector_destroy(struct drm_connector *connector)
        drm_connector_cleanup(connector);
 }
 
+static int tda998x_connector_dpms(struct drm_connector *connector, int mode)
+{
+       if (drm_core_check_feature(connector->dev, DRIVER_ATOMIC))
+               return drm_atomic_helper_connector_dpms(connector, mode);
+       else
+               return drm_helper_connector_dpms(connector, mode);
+}
+
 static const struct drm_connector_funcs tda998x_connector_funcs = {
-       .dpms = drm_atomic_helper_connector_dpms,
+       .dpms = tda998x_connector_dpms,
        .reset = drm_atomic_helper_connector_reset,
        .fill_modes = drm_helper_probe_single_connector_modes,
        .detect = tda998x_connector_detect,
index fcd77b2..051eab3 100644 (file)
@@ -10,7 +10,6 @@ config DRM_I915
        # the shmem_readpage() which depends upon tmpfs
        select SHMEM
        select TMPFS
-       select STOP_MACHINE
        select DRM_KMS_HELPER
        select DRM_PANEL
        select DRM_MIPI_DSI
index 0fc38bb..cf39ed3 100644 (file)
@@ -825,8 +825,11 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
                }
 
                for_each_pipe(dev_priv, pipe) {
-                       if (!intel_display_power_is_enabled(dev_priv,
-                                               POWER_DOMAIN_PIPE(pipe))) {
+                       enum intel_display_power_domain power_domain;
+
+                       power_domain = POWER_DOMAIN_PIPE(pipe);
+                       if (!intel_display_power_get_if_enabled(dev_priv,
+                                                               power_domain)) {
                                seq_printf(m, "Pipe %c power disabled\n",
                                           pipe_name(pipe));
                                continue;
@@ -840,6 +843,8 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
                        seq_printf(m, "Pipe %c IER:\t%08x\n",
                                   pipe_name(pipe),
                                   I915_READ(GEN8_DE_PIPE_IER(pipe)));
+
+                       intel_display_power_put(dev_priv, power_domain);
                }
 
                seq_printf(m, "Display Engine port interrupt mask:\t%08x\n",
@@ -3985,6 +3990,7 @@ static int pipe_crc_set_source(struct drm_device *dev, enum pipe pipe,
        struct intel_pipe_crc *pipe_crc = &dev_priv->pipe_crc[pipe];
        struct intel_crtc *crtc = to_intel_crtc(intel_get_crtc_for_pipe(dev,
                                                                        pipe));
+       enum intel_display_power_domain power_domain;
        u32 val = 0; /* shut up gcc */
        int ret;
 
@@ -3995,7 +4001,8 @@ static int pipe_crc_set_source(struct drm_device *dev, enum pipe pipe,
        if (pipe_crc->source && source)
                return -EINVAL;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PIPE(pipe))) {
+       power_domain = POWER_DOMAIN_PIPE(pipe);
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain)) {
                DRM_DEBUG_KMS("Trying to capture CRC while pipe is off\n");
                return -EIO;
        }
@@ -4012,7 +4019,7 @@ static int pipe_crc_set_source(struct drm_device *dev, enum pipe pipe,
                ret = ivb_pipe_crc_ctl_reg(dev, pipe, &source, &val);
 
        if (ret != 0)
-               return ret;
+               goto out;
 
        /* none -> real source transition */
        if (source) {
@@ -4024,8 +4031,10 @@ static int pipe_crc_set_source(struct drm_device *dev, enum pipe pipe,
                entries = kcalloc(INTEL_PIPE_CRC_ENTRIES_NR,
                                  sizeof(pipe_crc->entries[0]),
                                  GFP_KERNEL);
-               if (!entries)
-                       return -ENOMEM;
+               if (!entries) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
 
                /*
                 * When IPS gets enabled, the pipe CRC changes. Since IPS gets
@@ -4081,7 +4090,12 @@ static int pipe_crc_set_source(struct drm_device *dev, enum pipe pipe,
                hsw_enable_ips(crtc);
        }
 
-       return 0;
+       ret = 0;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 /*
index 3ac616d..f357058 100644 (file)
@@ -501,7 +501,9 @@ void intel_detect_pch(struct drm_device *dev)
                                WARN_ON(!IS_SKYLAKE(dev) &&
                                        !IS_KABYLAKE(dev));
                        } else if ((id == INTEL_PCH_P2X_DEVICE_ID_TYPE) ||
-                                  (id == INTEL_PCH_QEMU_DEVICE_ID_TYPE)) {
+                                  ((id == INTEL_PCH_QEMU_DEVICE_ID_TYPE) &&
+                                   pch->subsystem_vendor == 0x1af4 &&
+                                   pch->subsystem_device == 0x1100)) {
                                dev_priv->pch_type = intel_virt_detect_pch(dev);
                        } else
                                continue;
index f0f75d7..b0847b9 100644 (file)
@@ -751,6 +751,7 @@ struct intel_csr {
        uint32_t mmio_count;
        i915_reg_t mmioaddr[8];
        uint32_t mmiodata[8];
+       uint32_t dc_state;
 };
 
 #define DEV_INFO_FOR_EACH_FLAG(func, sep) \
@@ -1988,6 +1989,9 @@ enum hdmi_force_audio {
 #define I915_GTT_OFFSET_NONE ((u32)-1)
 
 struct drm_i915_gem_object_ops {
+       unsigned int flags;
+#define I915_GEM_OBJECT_HAS_STRUCT_PAGE 0x1
+
        /* Interface between the GEM object and its backing storage.
         * get_pages() is called once prior to the use of the associated set
         * of pages before to binding them into the GTT, and put_pages() is
@@ -2003,6 +2007,7 @@ struct drm_i915_gem_object_ops {
         */
        int (*get_pages)(struct drm_i915_gem_object *);
        void (*put_pages)(struct drm_i915_gem_object *);
+
        int (*dmabuf_export)(struct drm_i915_gem_object *);
        void (*release)(struct drm_i915_gem_object *);
 };
index ddc21d4..bb44bad 100644 (file)
@@ -4425,6 +4425,7 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 }
 
 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
+       .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE,
        .get_pages = i915_gem_object_get_pages_gtt,
        .put_pages = i915_gem_object_put_pages_gtt,
 };
@@ -5261,7 +5262,7 @@ i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n)
        struct page *page;
 
        /* Only default objects have per-page dirty tracking */
-       if (WARN_ON(obj->ops != &i915_gem_object_ops))
+       if (WARN_ON((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0))
                return NULL;
 
        page = i915_gem_object_get_page(obj, n);
index 19fb0bd..59e45b3 100644 (file)
@@ -789,9 +789,10 @@ i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
 }
 
 static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
-       .dmabuf_export = i915_gem_userptr_dmabuf_export,
+       .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE,
        .get_pages = i915_gem_userptr_get_pages,
        .put_pages = i915_gem_userptr_put_pages,
+       .dmabuf_export = i915_gem_userptr_dmabuf_export,
        .release = i915_gem_userptr_release,
 };
 
index 007ae83..4897728 100644 (file)
@@ -3287,19 +3287,20 @@ enum skl_disp_power_wells {
 
 #define PORT_HOTPLUG_STAT      _MMIO(dev_priv->info.display_mmio_offset + 0x61114)
 /*
- * HDMI/DP bits are gen4+
+ * HDMI/DP bits are g4x+
  *
  * WARNING: Bspec for hpd status bits on gen4 seems to be completely confused.
  * Please check the detailed lore in the commit message for for experimental
  * evidence.
  */
-#define   PORTD_HOTPLUG_LIVE_STATUS_G4X                (1 << 29)
+/* Bspec says GM45 should match G4X/VLV/CHV, but reality disagrees */
+#define   PORTD_HOTPLUG_LIVE_STATUS_GM45       (1 << 29)
+#define   PORTC_HOTPLUG_LIVE_STATUS_GM45       (1 << 28)
+#define   PORTB_HOTPLUG_LIVE_STATUS_GM45       (1 << 27)
+/* G4X/VLV/CHV DP/HDMI bits again match Bspec */
+#define   PORTD_HOTPLUG_LIVE_STATUS_G4X                (1 << 27)
 #define   PORTC_HOTPLUG_LIVE_STATUS_G4X                (1 << 28)
-#define   PORTB_HOTPLUG_LIVE_STATUS_G4X                (1 << 27)
-/* VLV DP/HDMI bits again match Bspec */
-#define   PORTD_HOTPLUG_LIVE_STATUS_VLV                (1 << 27)
-#define   PORTC_HOTPLUG_LIVE_STATUS_VLV                (1 << 28)
-#define   PORTB_HOTPLUG_LIVE_STATUS_VLV                (1 << 29)
+#define   PORTB_HOTPLUG_LIVE_STATUS_G4X                (1 << 29)
 #define   PORTD_HOTPLUG_INT_STATUS             (3 << 21)
 #define   PORTD_HOTPLUG_INT_LONG_PULSE         (2 << 21)
 #define   PORTD_HOTPLUG_INT_SHORT_PULSE                (1 << 21)
@@ -7514,7 +7515,7 @@ enum skl_disp_power_wells {
 #define  DPLL_CFGCR2_PDIV_7 (4<<2)
 #define  DPLL_CFGCR2_CENTRAL_FREQ_MASK (3)
 
-#define DPLL_CFGCR1(id)        _MMIO_PIPE((id) - SKL_DPLL1, _DPLL1_CFGCR1, _DPLL2_CFGCR2)
+#define DPLL_CFGCR1(id)        _MMIO_PIPE((id) - SKL_DPLL1, _DPLL1_CFGCR1, _DPLL2_CFGCR1)
 #define DPLL_CFGCR2(id)        _MMIO_PIPE((id) - SKL_DPLL1, _DPLL1_CFGCR2, _DPLL2_CFGCR2)
 
 /* BXT display engine PLL */
index a2aa09c..a8af594 100644 (file)
@@ -49,7 +49,7 @@ static void i915_save_display(struct drm_device *dev)
                dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PCH_PP_ON_DELAYS);
                dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PCH_PP_OFF_DELAYS);
                dev_priv->regfile.savePP_DIVISOR = I915_READ(PCH_PP_DIVISOR);
-       } else if (!IS_VALLEYVIEW(dev) && !IS_CHERRYVIEW(dev)) {
+       } else if (INTEL_INFO(dev)->gen <= 4) {
                dev_priv->regfile.savePP_CONTROL = I915_READ(PP_CONTROL);
                dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PP_ON_DELAYS);
                dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PP_OFF_DELAYS);
@@ -84,7 +84,7 @@ static void i915_restore_display(struct drm_device *dev)
                I915_WRITE(PCH_PP_OFF_DELAYS, dev_priv->regfile.savePP_OFF_DELAYS);
                I915_WRITE(PCH_PP_DIVISOR, dev_priv->regfile.savePP_DIVISOR);
                I915_WRITE(PCH_PP_CONTROL, dev_priv->regfile.savePP_CONTROL);
-       } else if (!IS_VALLEYVIEW(dev) && !IS_CHERRYVIEW(dev)) {
+       } else if (INTEL_INFO(dev)->gen <= 4) {
                I915_WRITE(PP_ON_DELAYS, dev_priv->regfile.savePP_ON_DELAYS);
                I915_WRITE(PP_OFF_DELAYS, dev_priv->regfile.savePP_OFF_DELAYS);
                I915_WRITE(PP_DIVISOR, dev_priv->regfile.savePP_DIVISOR);
index 31f6d21..30f9214 100644 (file)
@@ -527,6 +527,8 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder)
 
        mutex_lock(&dev_priv->av_mutex);
        intel_dig_port->audio_connector = connector;
+       /* referred in audio callbacks */
+       dev_priv->dig_port_map[port] = intel_encoder;
        mutex_unlock(&dev_priv->av_mutex);
 
        if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
@@ -554,6 +556,7 @@ void intel_audio_codec_disable(struct intel_encoder *intel_encoder)
 
        mutex_lock(&dev_priv->av_mutex);
        intel_dig_port->audio_connector = NULL;
+       dev_priv->dig_port_map[port] = NULL;
        mutex_unlock(&dev_priv->av_mutex);
 
        if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify)
index 9c89df1..a7b4a52 100644 (file)
@@ -71,22 +71,29 @@ static bool intel_crt_get_hw_state(struct intel_encoder *encoder,
        struct intel_crt *crt = intel_encoder_to_crt(encoder);
        enum intel_display_power_domain power_domain;
        u32 tmp;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        tmp = I915_READ(crt->adpa_reg);
 
        if (!(tmp & ADPA_DAC_ENABLE))
-               return false;
+               goto out;
 
        if (HAS_PCH_CPT(dev))
                *pipe = PORT_TO_PIPE_CPT(tmp);
        else
                *pipe = PORT_TO_PIPE(tmp);
 
-       return true;
+       ret = true;
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static unsigned int intel_crt_get_flags(struct intel_encoder *encoder)
index 9bb63a8..647d85e 100644 (file)
@@ -240,6 +240,8 @@ void intel_csr_load_program(struct drm_i915_private *dev_priv)
                I915_WRITE(dev_priv->csr.mmioaddr[i],
                           dev_priv->csr.mmiodata[i]);
        }
+
+       dev_priv->csr.dc_state = 0;
 }
 
 static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv,
index e6408e5..084d558 100644 (file)
@@ -1589,7 +1589,8 @@ skl_ddi_pll_select(struct intel_crtc *intel_crtc,
                         DPLL_CFGCR2_KDIV(wrpll_params.kdiv) |
                         DPLL_CFGCR2_PDIV(wrpll_params.pdiv) |
                         wrpll_params.central_freq;
-       } else if (intel_encoder->type == INTEL_OUTPUT_DISPLAYPORT) {
+       } else if (intel_encoder->type == INTEL_OUTPUT_DISPLAYPORT ||
+                  intel_encoder->type == INTEL_OUTPUT_DP_MST) {
                switch (crtc_state->port_clock / 2) {
                case 81000:
                        ctrl1 |= DPLL_CTRL1_LINK_RATE(DPLL_CTRL1_LINK_RATE_810, 0);
@@ -1968,13 +1969,16 @@ bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector)
        enum transcoder cpu_transcoder;
        enum intel_display_power_domain power_domain;
        uint32_t tmp;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(intel_encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
-       if (!intel_encoder->get_hw_state(intel_encoder, &pipe))
-               return false;
+       if (!intel_encoder->get_hw_state(intel_encoder, &pipe)) {
+               ret = false;
+               goto out;
+       }
 
        if (port == PORT_A)
                cpu_transcoder = TRANSCODER_EDP;
@@ -1986,23 +1990,33 @@ bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector)
        switch (tmp & TRANS_DDI_MODE_SELECT_MASK) {
        case TRANS_DDI_MODE_SELECT_HDMI:
        case TRANS_DDI_MODE_SELECT_DVI:
-               return (type == DRM_MODE_CONNECTOR_HDMIA);
+               ret = type == DRM_MODE_CONNECTOR_HDMIA;
+               break;
 
        case TRANS_DDI_MODE_SELECT_DP_SST:
-               if (type == DRM_MODE_CONNECTOR_eDP)
-                       return true;
-               return (type == DRM_MODE_CONNECTOR_DisplayPort);
+               ret = type == DRM_MODE_CONNECTOR_eDP ||
+                     type == DRM_MODE_CONNECTOR_DisplayPort;
+               break;
+
        case TRANS_DDI_MODE_SELECT_DP_MST:
                /* if the transcoder is in MST state then
                 * connector isn't connected */
-               return false;
+               ret = false;
+               break;
 
        case TRANS_DDI_MODE_SELECT_FDI:
-               return (type == DRM_MODE_CONNECTOR_VGA);
+               ret = type == DRM_MODE_CONNECTOR_VGA;
+               break;
 
        default:
-               return false;
+               ret = false;
+               break;
        }
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 bool intel_ddi_get_hw_state(struct intel_encoder *encoder,
@@ -2014,15 +2028,18 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder,
        enum intel_display_power_domain power_domain;
        u32 tmp;
        int i;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        tmp = I915_READ(DDI_BUF_CTL(port));
 
        if (!(tmp & DDI_BUF_CTL_ENABLE))
-               return false;
+               goto out;
 
        if (port == PORT_A) {
                tmp = I915_READ(TRANS_DDI_FUNC_CTL(TRANSCODER_EDP));
@@ -2040,25 +2057,32 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder,
                        break;
                }
 
-               return true;
-       } else {
-               for (i = TRANSCODER_A; i <= TRANSCODER_C; i++) {
-                       tmp = I915_READ(TRANS_DDI_FUNC_CTL(i));
+               ret = true;
 
-                       if ((tmp & TRANS_DDI_PORT_MASK)
-                           == TRANS_DDI_SELECT_PORT(port)) {
-                               if ((tmp & TRANS_DDI_MODE_SELECT_MASK) == TRANS_DDI_MODE_SELECT_DP_MST)
-                                       return false;
+               goto out;
+       }
 
-                               *pipe = i;
-                               return true;
-                       }
+       for (i = TRANSCODER_A; i <= TRANSCODER_C; i++) {
+               tmp = I915_READ(TRANS_DDI_FUNC_CTL(i));
+
+               if ((tmp & TRANS_DDI_PORT_MASK) == TRANS_DDI_SELECT_PORT(port)) {
+                       if ((tmp & TRANS_DDI_MODE_SELECT_MASK) ==
+                           TRANS_DDI_MODE_SELECT_DP_MST)
+                               goto out;
+
+                       *pipe = i;
+                       ret = true;
+
+                       goto out;
                }
        }
 
        DRM_DEBUG_KMS("No pipe for ddi port %c found\n", port_name(port));
 
-       return false;
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 void intel_ddi_enable_pipe_clock(struct intel_crtc *intel_crtc)
@@ -2507,12 +2531,14 @@ static bool hsw_ddi_wrpll_get_hw_state(struct drm_i915_private *dev_priv,
 {
        uint32_t val;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PLLS))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_PLLS))
                return false;
 
        val = I915_READ(WRPLL_CTL(pll->id));
        hw_state->wrpll = val;
 
+       intel_display_power_put(dev_priv, POWER_DOMAIN_PLLS);
+
        return val & WRPLL_PLL_ENABLE;
 }
 
@@ -2522,12 +2548,14 @@ static bool hsw_ddi_spll_get_hw_state(struct drm_i915_private *dev_priv,
 {
        uint32_t val;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PLLS))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_PLLS))
                return false;
 
        val = I915_READ(SPLL_CTL);
        hw_state->spll = val;
 
+       intel_display_power_put(dev_priv, POWER_DOMAIN_PLLS);
+
        return val & SPLL_PLL_ENABLE;
 }
 
@@ -2644,16 +2672,19 @@ static bool skl_ddi_pll_get_hw_state(struct drm_i915_private *dev_priv,
        uint32_t val;
        unsigned int dpll;
        const struct skl_dpll_regs *regs = skl_dpll_regs;
+       bool ret;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PLLS))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_PLLS))
                return false;
 
+       ret = false;
+
        /* DPLL0 is not part of the shared DPLLs, so pll->id is 0 for DPLL1 */
        dpll = pll->id + 1;
 
        val = I915_READ(regs[pll->id].ctl);
        if (!(val & LCPLL_PLL_ENABLE))
-               return false;
+               goto out;
 
        val = I915_READ(DPLL_CTRL1);
        hw_state->ctrl1 = (val >> (dpll * 6)) & 0x3f;
@@ -2663,8 +2694,12 @@ static bool skl_ddi_pll_get_hw_state(struct drm_i915_private *dev_priv,
                hw_state->cfgcr1 = I915_READ(regs[pll->id].cfgcr1);
                hw_state->cfgcr2 = I915_READ(regs[pll->id].cfgcr2);
        }
+       ret = true;
 
-       return true;
+out:
+       intel_display_power_put(dev_priv, POWER_DOMAIN_PLLS);
+
+       return ret;
 }
 
 static void skl_shared_dplls_init(struct drm_i915_private *dev_priv)
@@ -2931,13 +2966,16 @@ static bool bxt_ddi_pll_get_hw_state(struct drm_i915_private *dev_priv,
 {
        enum port port = (enum port)pll->id;    /* 1:1 port->PLL mapping */
        uint32_t val;
+       bool ret;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PLLS))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_PLLS))
                return false;
 
+       ret = false;
+
        val = I915_READ(BXT_PORT_PLL_ENABLE(port));
        if (!(val & PORT_PLL_ENABLE))
-               return false;
+               goto out;
 
        hw_state->ebb0 = I915_READ(BXT_PORT_PLL_EBB_0(port));
        hw_state->ebb0 &= PORT_PLL_P1_MASK | PORT_PLL_P2_MASK;
@@ -2984,7 +3022,12 @@ static bool bxt_ddi_pll_get_hw_state(struct drm_i915_private *dev_priv,
                                 I915_READ(BXT_PORT_PCS_DW12_LN23(port)));
        hw_state->pcsdw12 &= LANE_STAGGER_MASK | LANESTAGGER_STRAP_OVRD;
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, POWER_DOMAIN_PLLS);
+
+       return ret;
 }
 
 static void bxt_shared_dplls_init(struct drm_i915_private *dev_priv)
@@ -3119,11 +3162,15 @@ bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv,
 {
        u32 temp;
 
-       if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) {
+       if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) {
                temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD);
+
+               intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO);
+
                if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe))
                        return true;
        }
+
        return false;
 }
 
@@ -3311,7 +3358,6 @@ void intel_ddi_init(struct drm_device *dev, enum port port)
        intel_encoder->get_config = intel_ddi_get_config;
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->saved_port_bits = I915_READ(DDI_BUF_CTL(port)) &
                                          (DDI_BUF_PORT_REVERSAL |
                                           DDI_A_4_LANES);
index 2f00828..46947ff 100644 (file)
@@ -1351,18 +1351,21 @@ void assert_pipe(struct drm_i915_private *dev_priv,
        bool cur_state;
        enum transcoder cpu_transcoder = intel_pipe_to_cpu_transcoder(dev_priv,
                                                                      pipe);
+       enum intel_display_power_domain power_domain;
 
        /* if we need the pipe quirk it must be always on */
        if ((pipe == PIPE_A && dev_priv->quirks & QUIRK_PIPEA_FORCE) ||
            (pipe == PIPE_B && dev_priv->quirks & QUIRK_PIPEB_FORCE))
                state = true;
 
-       if (!intel_display_power_is_enabled(dev_priv,
-                               POWER_DOMAIN_TRANSCODER(cpu_transcoder))) {
-               cur_state = false;
-       } else {
+       power_domain = POWER_DOMAIN_TRANSCODER(cpu_transcoder);
+       if (intel_display_power_get_if_enabled(dev_priv, power_domain)) {
                u32 val = I915_READ(PIPECONF(cpu_transcoder));
                cur_state = !!(val & PIPECONF_ENABLE);
+
+               intel_display_power_put(dev_priv, power_domain);
+       } else {
+               cur_state = false;
        }
 
        I915_STATE_WARN(cur_state != state,
@@ -2946,7 +2949,7 @@ u32 intel_plane_obj_offset(struct intel_plane *intel_plane,
        struct i915_vma *vma;
        u64 offset;
 
-       intel_fill_fb_ggtt_view(&view, intel_plane->base.fb,
+       intel_fill_fb_ggtt_view(&view, intel_plane->base.state->fb,
                                intel_plane->base.state);
 
        vma = i915_gem_obj_to_ggtt_view(obj, &view);
@@ -8171,18 +8174,22 @@ static bool i9xx_get_pipe_config(struct intel_crtc *crtc,
 {
        struct drm_device *dev = crtc->base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
+       enum intel_display_power_domain power_domain;
        uint32_t tmp;
+       bool ret;
 
-       if (!intel_display_power_is_enabled(dev_priv,
-                                           POWER_DOMAIN_PIPE(crtc->pipe)))
+       power_domain = POWER_DOMAIN_PIPE(crtc->pipe);
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
        pipe_config->cpu_transcoder = (enum transcoder) crtc->pipe;
        pipe_config->shared_dpll = DPLL_ID_PRIVATE;
 
+       ret = false;
+
        tmp = I915_READ(PIPECONF(crtc->pipe));
        if (!(tmp & PIPECONF_ENABLE))
-               return false;
+               goto out;
 
        if (IS_G4X(dev) || IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) {
                switch (tmp & PIPECONF_BPC_MASK) {
@@ -8262,7 +8269,12 @@ static bool i9xx_get_pipe_config(struct intel_crtc *crtc,
        pipe_config->base.adjusted_mode.crtc_clock =
                pipe_config->port_clock / pipe_config->pixel_multiplier;
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void ironlake_init_pch_refclk(struct drm_device *dev)
@@ -9366,18 +9378,21 @@ static bool ironlake_get_pipe_config(struct intel_crtc *crtc,
 {
        struct drm_device *dev = crtc->base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
+       enum intel_display_power_domain power_domain;
        uint32_t tmp;
+       bool ret;
 
-       if (!intel_display_power_is_enabled(dev_priv,
-                                           POWER_DOMAIN_PIPE(crtc->pipe)))
+       power_domain = POWER_DOMAIN_PIPE(crtc->pipe);
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
        pipe_config->cpu_transcoder = (enum transcoder) crtc->pipe;
        pipe_config->shared_dpll = DPLL_ID_PRIVATE;
 
+       ret = false;
        tmp = I915_READ(PIPECONF(crtc->pipe));
        if (!(tmp & PIPECONF_ENABLE))
-               return false;
+               goto out;
 
        switch (tmp & PIPECONF_BPC_MASK) {
        case PIPECONF_6BPC:
@@ -9440,7 +9455,12 @@ static bool ironlake_get_pipe_config(struct intel_crtc *crtc,
 
        ironlake_get_pfit_config(crtc, pipe_config);
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void assert_can_disable_lcpll(struct drm_i915_private *dev_priv)
@@ -9950,12 +9970,17 @@ static bool haswell_get_pipe_config(struct intel_crtc *crtc,
 {
        struct drm_device *dev = crtc->base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       enum intel_display_power_domain pfit_domain;
+       enum intel_display_power_domain power_domain;
+       unsigned long power_domain_mask;
        uint32_t tmp;
+       bool ret;
 
-       if (!intel_display_power_is_enabled(dev_priv,
-                                        POWER_DOMAIN_PIPE(crtc->pipe)))
+       power_domain = POWER_DOMAIN_PIPE(crtc->pipe);
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
+       power_domain_mask = BIT(power_domain);
+
+       ret = false;
 
        pipe_config->cpu_transcoder = (enum transcoder) crtc->pipe;
        pipe_config->shared_dpll = DPLL_ID_PRIVATE;
@@ -9982,13 +10007,14 @@ static bool haswell_get_pipe_config(struct intel_crtc *crtc,
                        pipe_config->cpu_transcoder = TRANSCODER_EDP;
        }
 
-       if (!intel_display_power_is_enabled(dev_priv,
-                       POWER_DOMAIN_TRANSCODER(pipe_config->cpu_transcoder)))
-               return false;
+       power_domain = POWER_DOMAIN_TRANSCODER(pipe_config->cpu_transcoder);
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
+               goto out;
+       power_domain_mask |= BIT(power_domain);
 
        tmp = I915_READ(PIPECONF(pipe_config->cpu_transcoder));
        if (!(tmp & PIPECONF_ENABLE))
-               return false;
+               goto out;
 
        haswell_get_ddi_port_state(crtc, pipe_config);
 
@@ -9998,14 +10024,14 @@ static bool haswell_get_pipe_config(struct intel_crtc *crtc,
                skl_init_scalers(dev, crtc, pipe_config);
        }
 
-       pfit_domain = POWER_DOMAIN_PIPE_PANEL_FITTER(crtc->pipe);
-
        if (INTEL_INFO(dev)->gen >= 9) {
                pipe_config->scaler_state.scaler_id = -1;
                pipe_config->scaler_state.scaler_users &= ~(1 << SKL_CRTC_INDEX);
        }
 
-       if (intel_display_power_is_enabled(dev_priv, pfit_domain)) {
+       power_domain = POWER_DOMAIN_PIPE_PANEL_FITTER(crtc->pipe);
+       if (intel_display_power_get_if_enabled(dev_priv, power_domain)) {
+               power_domain_mask |= BIT(power_domain);
                if (INTEL_INFO(dev)->gen >= 9)
                        skylake_get_pfit_config(crtc, pipe_config);
                else
@@ -10023,7 +10049,13 @@ static bool haswell_get_pipe_config(struct intel_crtc *crtc,
                pipe_config->pixel_multiplier = 1;
        }
 
-       return true;
+       ret = true;
+
+out:
+       for_each_power_domain(power_domain, power_domain_mask)
+               intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void i845_update_cursor(struct drm_crtc *crtc, u32 base, bool on)
@@ -12075,11 +12107,21 @@ connected_sink_compute_bpp(struct intel_connector *connector,
                pipe_config->pipe_bpp = connector->base.display_info.bpc*3;
        }
 
-       /* Clamp bpp to 8 on screens without EDID 1.4 */
-       if (connector->base.display_info.bpc == 0 && bpp > 24) {
-               DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of 24\n",
-                             bpp);
-               pipe_config->pipe_bpp = 24;
+       /* Clamp bpp to default limit on screens without EDID 1.4 */
+       if (connector->base.display_info.bpc == 0) {
+               int type = connector->base.connector_type;
+               int clamp_bpp = 24;
+
+               /* Fall back to 18 bpp when DP sink capability is unknown. */
+               if (type == DRM_MODE_CONNECTOR_DisplayPort ||
+                   type == DRM_MODE_CONNECTOR_eDP)
+                       clamp_bpp = 18;
+
+               if (bpp > clamp_bpp) {
+                       DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of %d\n",
+                                     bpp, clamp_bpp);
+                       pipe_config->pipe_bpp = clamp_bpp;
+               }
        }
 }
 
@@ -13620,7 +13662,7 @@ static bool ibx_pch_dpll_get_hw_state(struct drm_i915_private *dev_priv,
 {
        uint32_t val;
 
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PLLS))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_PLLS))
                return false;
 
        val = I915_READ(PCH_DPLL(pll->id));
@@ -13628,6 +13670,8 @@ static bool ibx_pch_dpll_get_hw_state(struct drm_i915_private *dev_priv,
        hw_state->fp0 = I915_READ(PCH_FP0(pll->id));
        hw_state->fp1 = I915_READ(PCH_FP1(pll->id));
 
+       intel_display_power_put(dev_priv, POWER_DOMAIN_PLLS);
+
        return val & DPLL_VCO_ENABLE;
 }
 
@@ -13883,11 +13927,12 @@ intel_check_primary_plane(struct drm_plane *plane,
        int max_scale = DRM_PLANE_HELPER_NO_SCALING;
        bool can_position = false;
 
-       /* use scaler when colorkey is not required */
-       if (INTEL_INFO(plane->dev)->gen >= 9 &&
-           state->ckey.flags == I915_SET_COLORKEY_NONE) {
-               min_scale = 1;
-               max_scale = skl_max_scale(to_intel_crtc(crtc), crtc_state);
+       if (INTEL_INFO(plane->dev)->gen >= 9) {
+               /* use scaler when colorkey is not required */
+               if (state->ckey.flags == I915_SET_COLORKEY_NONE) {
+                       min_scale = 1;
+                       max_scale = skl_max_scale(to_intel_crtc(crtc), crtc_state);
+               }
                can_position = true;
        }
 
@@ -15557,10 +15602,12 @@ void i915_redisable_vga(struct drm_device *dev)
         * level, just check if the power well is enabled instead of trying to
         * follow the "don't touch the power well if we don't need it" policy
         * the rest of the driver uses. */
-       if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_VGA))
+       if (!intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_VGA))
                return;
 
        i915_redisable_vga_power_on(dev);
+
+       intel_display_power_put(dev_priv, POWER_DOMAIN_VGA);
 }
 
 static bool primary_get_hw_state(struct intel_plane *plane)
index 796e3d3..cdc2c15 100644 (file)
@@ -2362,15 +2362,18 @@ static bool intel_dp_get_hw_state(struct intel_encoder *encoder,
        struct drm_i915_private *dev_priv = dev->dev_private;
        enum intel_display_power_domain power_domain;
        u32 tmp;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        tmp = I915_READ(intel_dp->output_reg);
 
        if (!(tmp & DP_PORT_EN))
-               return false;
+               goto out;
 
        if (IS_GEN7(dev) && port == PORT_A) {
                *pipe = PORT_TO_PIPE_CPT(tmp);
@@ -2381,7 +2384,9 @@ static bool intel_dp_get_hw_state(struct intel_encoder *encoder,
                        u32 trans_dp = I915_READ(TRANS_DP_CTL(p));
                        if (TRANS_DP_PIPE_TO_PORT(trans_dp) == port) {
                                *pipe = p;
-                               return true;
+                               ret = true;
+
+                               goto out;
                        }
                }
 
@@ -2393,7 +2398,12 @@ static bool intel_dp_get_hw_state(struct intel_encoder *encoder,
                *pipe = PORT_TO_PIPE(tmp);
        }
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void intel_dp_get_config(struct intel_encoder *encoder,
@@ -4493,20 +4503,20 @@ static bool g4x_digital_port_connected(struct drm_i915_private *dev_priv,
        return I915_READ(PORT_HOTPLUG_STAT) & bit;
 }
 
-static bool vlv_digital_port_connected(struct drm_i915_private *dev_priv,
-                                      struct intel_digital_port *port)
+static bool gm45_digital_port_connected(struct drm_i915_private *dev_priv,
+                                       struct intel_digital_port *port)
 {
        u32 bit;
 
        switch (port->port) {
        case PORT_B:
-               bit = PORTB_HOTPLUG_LIVE_STATUS_VLV;
+               bit = PORTB_HOTPLUG_LIVE_STATUS_GM45;
                break;
        case PORT_C:
-               bit = PORTC_HOTPLUG_LIVE_STATUS_VLV;
+               bit = PORTC_HOTPLUG_LIVE_STATUS_GM45;
                break;
        case PORT_D:
-               bit = PORTD_HOTPLUG_LIVE_STATUS_VLV;
+               bit = PORTD_HOTPLUG_LIVE_STATUS_GM45;
                break;
        default:
                MISSING_CASE(port->port);
@@ -4558,8 +4568,8 @@ bool intel_digital_port_connected(struct drm_i915_private *dev_priv,
                return cpt_digital_port_connected(dev_priv, port);
        else if (IS_BROXTON(dev_priv))
                return bxt_digital_port_connected(dev_priv, port);
-       else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
-               return vlv_digital_port_connected(dev_priv, port);
+       else if (IS_GM45(dev_priv))
+               return gm45_digital_port_connected(dev_priv, port);
        else
                return g4x_digital_port_connected(dev_priv, port);
 }
@@ -6035,7 +6045,6 @@ intel_dp_init(struct drm_device *dev,
        }
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->dp.output_reg = output_reg;
 
        intel_encoder->type = INTEL_OUTPUT_DISPLAYPORT;
index 8888793..0b8eefc 100644 (file)
@@ -215,27 +215,46 @@ intel_dp_link_training_clock_recovery(struct intel_dp *intel_dp)
        }
 }
 
-static void
-intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp)
+/*
+ * Pick training pattern for channel equalization. Training Pattern 3 for HBR2
+ * or 1.2 devices that support it, Training Pattern 2 otherwise.
+ */
+static u32 intel_dp_training_pattern(struct intel_dp *intel_dp)
 {
-       bool channel_eq = false;
-       int tries, cr_tries;
-       uint32_t training_pattern = DP_TRAINING_PATTERN_2;
+       u32 training_pattern = DP_TRAINING_PATTERN_2;
+       bool source_tps3, sink_tps3;
 
        /*
-        * Training Pattern 3 for HBR2 or 1.2 devices that support it.
-        *
         * Intel platforms that support HBR2 also support TPS3. TPS3 support is
-        * also mandatory for downstream devices that support HBR2.
+        * also mandatory for downstream devices that support HBR2. However, not
+        * all sinks follow the spec.
         *
         * Due to WaDisableHBR2 SKL < B0 is the only exception where TPS3 is
-        * supported but still not enabled.
+        * supported in source but still not enabled.
         */
-       if (intel_dp_source_supports_hbr2(intel_dp) &&
-           drm_dp_tps3_supported(intel_dp->dpcd))
+       source_tps3 = intel_dp_source_supports_hbr2(intel_dp);
+       sink_tps3 = drm_dp_tps3_supported(intel_dp->dpcd);
+
+       if (source_tps3 && sink_tps3) {
                training_pattern = DP_TRAINING_PATTERN_3;
-       else if (intel_dp->link_rate == 540000)
-               DRM_ERROR("5.4 Gbps link rate without HBR2/TPS3 support\n");
+       } else if (intel_dp->link_rate == 540000) {
+               if (!source_tps3)
+                       DRM_DEBUG_KMS("5.4 Gbps link rate without source HBR2/TPS3 support\n");
+               if (!sink_tps3)
+                       DRM_DEBUG_KMS("5.4 Gbps link rate without sink TPS3 support\n");
+       }
+
+       return training_pattern;
+}
+
+static void
+intel_dp_link_training_channel_equalization(struct intel_dp *intel_dp)
+{
+       bool channel_eq = false;
+       int tries, cr_tries;
+       u32 training_pattern;
+
+       training_pattern = intel_dp_training_pattern(intel_dp);
 
        /* channel equalization */
        if (!intel_dp_set_link_train(intel_dp,
index ea54158..df7f3cb 100644 (file)
@@ -1428,6 +1428,8 @@ bool __intel_display_power_is_enabled(struct drm_i915_private *dev_priv,
                                      enum intel_display_power_domain domain);
 void intel_display_power_get(struct drm_i915_private *dev_priv,
                             enum intel_display_power_domain domain);
+bool intel_display_power_get_if_enabled(struct drm_i915_private *dev_priv,
+                                       enum intel_display_power_domain domain);
 void intel_display_power_put(struct drm_i915_private *dev_priv,
                             enum intel_display_power_domain domain);
 
@@ -1514,6 +1516,7 @@ enable_rpm_wakeref_asserts(struct drm_i915_private *dev_priv)
        enable_rpm_wakeref_asserts(dev_priv)
 
 void intel_runtime_pm_get(struct drm_i915_private *dev_priv);
+bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv);
 void intel_runtime_pm_get_noresume(struct drm_i915_private *dev_priv);
 void intel_runtime_pm_put(struct drm_i915_private *dev_priv);
 
index 44742fa..0193c62 100644 (file)
@@ -664,13 +664,16 @@ static bool intel_dsi_get_hw_state(struct intel_encoder *encoder,
        struct drm_device *dev = encoder->base.dev;
        enum intel_display_power_domain power_domain;
        enum port port;
+       bool ret;
 
        DRM_DEBUG_KMS("\n");
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        /* XXX: this only works for one DSI output */
        for_each_dsi_port(port, intel_dsi->ports) {
                i915_reg_t ctrl_reg = IS_BROXTON(dev) ?
@@ -691,12 +694,16 @@ static bool intel_dsi_get_hw_state(struct intel_encoder *encoder,
                if (dpi_enabled || (func & CMD_MODE_DATA_WIDTH_MASK)) {
                        if (I915_READ(MIPI_DEVICE_READY(port)) & DEVICE_READY) {
                                *pipe = port == PORT_A ? PIPE_A : PIPE_B;
-                               return true;
+                               ret = true;
+
+                               goto out;
                        }
                }
        }
+out:
+       intel_display_power_put(dev_priv, power_domain);
 
-       return false;
+       return ret;
 }
 
 static void intel_dsi_get_config(struct intel_encoder *encoder,
index a5e99ac..e8113ad 100644 (file)
@@ -204,10 +204,28 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data)
        struct drm_device *dev = intel_dsi->base.base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
+       if (dev_priv->vbt.dsi.seq_version >= 3)
+               data++;
+
        gpio = *data++;
 
        /* pull up/down */
-       action = *data++;
+       action = *data++ & 1;
+
+       if (gpio >= ARRAY_SIZE(gtable)) {
+               DRM_DEBUG_KMS("unknown gpio %u\n", gpio);
+               goto out;
+       }
+
+       if (!IS_VALLEYVIEW(dev_priv)) {
+               DRM_DEBUG_KMS("GPIO element not supported on this platform\n");
+               goto out;
+       }
+
+       if (dev_priv->vbt.dsi.seq_version >= 3) {
+               DRM_DEBUG_KMS("GPIO element v3 not supported\n");
+               goto out;
+       }
 
        function = gtable[gpio].function_reg;
        pad = gtable[gpio].pad_reg;
@@ -226,6 +244,7 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data)
        vlv_gpio_nc_write(dev_priv, pad, val);
        mutex_unlock(&dev_priv->sb_lock);
 
+out:
        return data;
 }
 
index 4a77639..616108c 100644 (file)
@@ -880,15 +880,18 @@ static bool intel_hdmi_get_hw_state(struct intel_encoder *encoder,
        struct intel_hdmi *intel_hdmi = enc_to_intel_hdmi(&encoder->base);
        enum intel_display_power_domain power_domain;
        u32 tmp;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        tmp = I915_READ(intel_hdmi->hdmi_reg);
 
        if (!(tmp & SDVO_ENABLE))
-               return false;
+               goto out;
 
        if (HAS_PCH_CPT(dev))
                *pipe = PORT_TO_PIPE_CPT(tmp);
@@ -897,7 +900,12 @@ static bool intel_hdmi_get_hw_state(struct intel_encoder *encoder,
        else
                *pipe = PORT_TO_PIPE(tmp);
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void intel_hdmi_get_config(struct intel_encoder *encoder,
@@ -2146,7 +2154,6 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
 void intel_hdmi_init(struct drm_device *dev,
                     i915_reg_t hdmi_reg, enum port port)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_digital_port *intel_dig_port;
        struct intel_encoder *intel_encoder;
        struct intel_connector *intel_connector;
@@ -2215,7 +2222,6 @@ void intel_hdmi_init(struct drm_device *dev,
                intel_encoder->cloneable |= 1 << INTEL_OUTPUT_HDMI;
 
        intel_dig_port->port = port;
-       dev_priv->dig_port_map[port] = intel_encoder;
        intel_dig_port->hdmi.hdmi_reg = hdmi_reg;
        intel_dig_port->dp.output_reg = INVALID_MMIO_REG;
 
index 25254b5..52fbe53 100644 (file)
@@ -664,6 +664,12 @@ int intel_setup_gmbus(struct drm_device *dev)
 
                bus->adapter.algo = &gmbus_algorithm;
 
+               /*
+                * We wish to retry with bit banging
+                * after a timed out GMBUS attempt.
+                */
+               bus->adapter.retries = 1;
+
                /* By default use a conservative clock rate */
                bus->reg0 = pin | GMBUS_RATE_100KHZ;
 
@@ -683,7 +689,7 @@ int intel_setup_gmbus(struct drm_device *dev)
        return 0;
 
 err:
-       while (--pin) {
+       while (pin--) {
                if (!intel_gmbus_is_valid_pin(dev_priv, pin))
                        continue;
 
index 3aa6147..f1fa756 100644 (file)
@@ -1707,6 +1707,7 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+               flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
 
index 0da0240..bc04d8d 100644 (file)
@@ -75,22 +75,30 @@ static bool intel_lvds_get_hw_state(struct intel_encoder *encoder,
        struct intel_lvds_encoder *lvds_encoder = to_lvds_encoder(&encoder->base);
        enum intel_display_power_domain power_domain;
        u32 tmp;
+       bool ret;
 
        power_domain = intel_display_port_power_domain(encoder);
-       if (!intel_display_power_is_enabled(dev_priv, power_domain))
+       if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                return false;
 
+       ret = false;
+
        tmp = I915_READ(lvds_encoder->reg);
 
        if (!(tmp & LVDS_PORT_EN))
-               return false;
+               goto out;
 
        if (HAS_PCH_CPT(dev))
                *pipe = PORT_TO_PIPE_CPT(tmp);
        else
                *pipe = PORT_TO_PIPE(tmp);
 
-       return true;
+       ret = true;
+
+out:
+       intel_display_power_put(dev_priv, power_domain);
+
+       return ret;
 }
 
 static void intel_lvds_get_config(struct intel_encoder *encoder,
index eb5fa05..b28c29f 100644 (file)
@@ -1783,16 +1783,20 @@ static uint32_t ilk_compute_cur_wm(const struct intel_crtc_state *cstate,
                                   const struct intel_plane_state *pstate,
                                   uint32_t mem_value)
 {
-       int bpp = pstate->base.fb ? pstate->base.fb->bits_per_pixel / 8 : 0;
+       /*
+        * We treat the cursor plane as always-on for the purposes of watermark
+        * calculation.  Until we have two-stage watermark programming merged,
+        * this is necessary to avoid flickering.
+        */
+       int cpp = 4;
+       int width = pstate->visible ? pstate->base.crtc_w : 64;
 
-       if (!cstate->base.active || !pstate->visible)
+       if (!cstate->base.active)
                return 0;
 
        return ilk_wm_method2(ilk_pipe_pixel_rate(cstate),
                              cstate->base.adjusted_mode.crtc_htotal,
-                             drm_rect_width(&pstate->dst),
-                             bpp,
-                             mem_value);
+                             width, cpp, mem_value);
 }
 
 /* Only for WM_LP. */
@@ -2825,7 +2829,10 @@ void skl_ddb_get_hw_state(struct drm_i915_private *dev_priv,
        memset(ddb, 0, sizeof(*ddb));
 
        for_each_pipe(dev_priv, pipe) {
-               if (!intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_PIPE(pipe)))
+               enum intel_display_power_domain power_domain;
+
+               power_domain = POWER_DOMAIN_PIPE(pipe);
+               if (!intel_display_power_get_if_enabled(dev_priv, power_domain))
                        continue;
 
                for_each_plane(dev_priv, pipe, plane) {
@@ -2837,6 +2844,8 @@ void skl_ddb_get_hw_state(struct drm_i915_private *dev_priv,
                val = I915_READ(CUR_BUF_CFG(pipe));
                skl_ddb_entry_init_from_hw(&ddb->plane[pipe][PLANE_CURSOR],
                                           val);
+
+               intel_display_power_put(dev_priv, power_domain);
        }
 }
 
index 339701d..40c6aff 100644 (file)
@@ -331,6 +331,7 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req,
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+               flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (invalidate_domains) {
@@ -403,6 +404,7 @@ gen8_render_ring_flush(struct drm_i915_gem_request *req,
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+               flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (invalidate_domains) {
index ddbdbff..4f43d9b 100644 (file)
@@ -470,6 +470,43 @@ static void gen9_set_dc_state_debugmask_memory_up(
        }
 }
 
+static void gen9_write_dc_state(struct drm_i915_private *dev_priv,
+                               u32 state)
+{
+       int rewrites = 0;
+       int rereads = 0;
+       u32 v;
+
+       I915_WRITE(DC_STATE_EN, state);
+
+       /* It has been observed that disabling the dc6 state sometimes
+        * doesn't stick and dmc keeps returning old value. Make sure
+        * the write really sticks enough times and also force rewrite until
+        * we are confident that state is exactly what we want.
+        */
+       do  {
+               v = I915_READ(DC_STATE_EN);
+
+               if (v != state) {
+                       I915_WRITE(DC_STATE_EN, state);
+                       rewrites++;
+                       rereads = 0;
+               } else if (rereads++ > 5) {
+                       break;
+               }
+
+       } while (rewrites < 100);
+
+       if (v != state)
+               DRM_ERROR("Writing dc state to 0x%x failed, now 0x%x\n",
+                         state, v);
+
+       /* Most of the times we need one retry, avoid spam */
+       if (rewrites > 1)
+               DRM_DEBUG_KMS("Rewrote dc state to 0x%x %d times\n",
+                             state, rewrites);
+}
+
 static void gen9_set_dc_state(struct drm_i915_private *dev_priv, uint32_t state)
 {
        uint32_t val;
@@ -494,10 +531,18 @@ static void gen9_set_dc_state(struct drm_i915_private *dev_priv, uint32_t state)
        val = I915_READ(DC_STATE_EN);
        DRM_DEBUG_KMS("Setting DC state from %02x to %02x\n",
                      val & mask, state);
+
+       /* Check if DMC is ignoring our DC state requests */
+       if ((val & mask) != dev_priv->csr.dc_state)
+               DRM_ERROR("DC state mismatch (0x%x -> 0x%x)\n",
+                         dev_priv->csr.dc_state, val & mask);
+
        val &= ~mask;
        val |= state;
-       I915_WRITE(DC_STATE_EN, val);
-       POSTING_READ(DC_STATE_EN);
+
+       gen9_write_dc_state(dev_priv, val);
+
+       dev_priv->csr.dc_state = val & mask;
 }
 
 void bxt_enable_dc9(struct drm_i915_private *dev_priv)
@@ -1442,6 +1487,22 @@ static void chv_pipe_power_well_disable(struct drm_i915_private *dev_priv,
        chv_set_pipe_power_well(dev_priv, power_well, false);
 }
 
+static void
+__intel_display_power_get_domain(struct drm_i915_private *dev_priv,
+                                enum intel_display_power_domain domain)
+{
+       struct i915_power_domains *power_domains = &dev_priv->power_domains;
+       struct i915_power_well *power_well;
+       int i;
+
+       for_each_power_well(i, power_well, BIT(domain), power_domains) {
+               if (!power_well->count++)
+                       intel_power_well_enable(dev_priv, power_well);
+       }
+
+       power_domains->domain_use_count[domain]++;
+}
+
 /**
  * intel_display_power_get - grab a power domain reference
  * @dev_priv: i915 device instance
@@ -1457,24 +1518,53 @@ static void chv_pipe_power_well_disable(struct drm_i915_private *dev_priv,
 void intel_display_power_get(struct drm_i915_private *dev_priv,
                             enum intel_display_power_domain domain)
 {
-       struct i915_power_domains *power_domains;
-       struct i915_power_well *power_well;
-       int i;
+       struct i915_power_domains *power_domains = &dev_priv->power_domains;
 
        intel_runtime_pm_get(dev_priv);
 
-       power_domains = &dev_priv->power_domains;
+       mutex_lock(&power_domains->lock);
+
+       __intel_display_power_get_domain(dev_priv, domain);
+
+       mutex_unlock(&power_domains->lock);
+}
+
+/**
+ * intel_display_power_get_if_enabled - grab a reference for an enabled display power domain
+ * @dev_priv: i915 device instance
+ * @domain: power domain to reference
+ *
+ * This function grabs a power domain reference for @domain and ensures that the
+ * power domain and all its parents are powered up. Therefore users should only
+ * grab a reference to the innermost power domain they need.
+ *
+ * Any power domain reference obtained by this function must have a symmetric
+ * call to intel_display_power_put() to release the reference again.
+ */
+bool intel_display_power_get_if_enabled(struct drm_i915_private *dev_priv,
+                                       enum intel_display_power_domain domain)
+{
+       struct i915_power_domains *power_domains = &dev_priv->power_domains;
+       bool is_enabled;
+
+       if (!intel_runtime_pm_get_if_in_use(dev_priv))
+               return false;
 
        mutex_lock(&power_domains->lock);
 
-       for_each_power_well(i, power_well, BIT(domain), power_domains) {
-               if (!power_well->count++)
-                       intel_power_well_enable(dev_priv, power_well);
+       if (__intel_display_power_is_enabled(dev_priv, domain)) {
+               __intel_display_power_get_domain(dev_priv, domain);
+               is_enabled = true;
+       } else {
+               is_enabled = false;
        }
 
-       power_domains->domain_use_count[domain]++;
-
        mutex_unlock(&power_domains->lock);
+
+       if (!is_enabled)
+               intel_runtime_pm_put(dev_priv);
+
+       return is_enabled;
 }
 
 /**
@@ -2213,15 +2303,15 @@ void intel_power_domains_init_hw(struct drm_i915_private *dev_priv, bool resume)
  */
 void intel_power_domains_suspend(struct drm_i915_private *dev_priv)
 {
-       if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-               skl_display_core_uninit(dev_priv);
-
        /*
         * Even if power well support was disabled we still want to disable
         * power wells while we are system suspended.
         */
        if (!i915.disable_power_well)
                intel_display_power_put(dev_priv, POWER_DOMAIN_INIT);
+
+       if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
+               skl_display_core_uninit(dev_priv);
 }
 
 /**
@@ -2245,6 +2335,41 @@ void intel_runtime_pm_get(struct drm_i915_private *dev_priv)
        assert_rpm_wakelock_held(dev_priv);
 }
 
+/**
+ * intel_runtime_pm_get_if_in_use - grab a runtime pm reference if device in use
+ * @dev_priv: i915 device instance
+ *
+ * This function grabs a device-level runtime pm reference if the device is
+ * already in use and ensures that it is powered up.
+ *
+ * Any runtime pm reference obtained by this function must have a symmetric
+ * call to intel_runtime_pm_put() to release the reference again.
+ */
+bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv)
+{
+       struct drm_device *dev = dev_priv->dev;
+       struct device *device = &dev->pdev->dev;
+
+       if (IS_ENABLED(CONFIG_PM)) {
+               int ret = pm_runtime_get_if_in_use(device);
+
+               /*
+                * In cases runtime PM is disabled by the RPM core and we get
+                * an -EINVAL return value we are not supposed to call this
+                * function, since the power state is undefined. This applies
+                * atm to the late/early system suspend/resume handlers.
+                */
+               WARN_ON_ONCE(ret < 0);
+               if (ret <= 0)
+                       return false;
+       }
+
+       atomic_inc(&dev_priv->pm.wakeref_count);
+       assert_rpm_wakelock_held(dev_priv);
+
+       return true;
+}
+
 /**
  * intel_runtime_pm_get_noresume - grab a runtime pm reference
  * @dev_priv: i915 device instance
index 30a5718..2872263 100644 (file)
@@ -64,6 +64,7 @@ static void ipu_fb_enable(struct ipu_crtc *ipu_crtc)
        /* Start DC channel and DI after IDMAC */
        ipu_dc_enable_channel(ipu_crtc->dc);
        ipu_di_enable(ipu_crtc->di);
+       drm_crtc_vblank_on(&ipu_crtc->base);
 
        ipu_crtc->enabled = 1;
 }
@@ -80,6 +81,7 @@ static void ipu_fb_disable(struct ipu_crtc *ipu_crtc)
        ipu_di_disable(ipu_crtc->di);
        ipu_plane_disable(ipu_crtc->plane[0]);
        ipu_dc_disable(ipu);
+       drm_crtc_vblank_off(&ipu_crtc->base);
 
        ipu_crtc->enabled = 0;
 }
index 591ba2f..26bb1b6 100644 (file)
@@ -42,6 +42,7 @@ static const uint32_t ipu_plane_formats[] = {
        DRM_FORMAT_YVYU,
        DRM_FORMAT_YUV420,
        DRM_FORMAT_YVU420,
+       DRM_FORMAT_RGB565,
 };
 
 int ipu_plane_irq(struct ipu_plane *ipu_plane)
index 78f520d..e3acc35 100644 (file)
@@ -1520,7 +1520,7 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm)
                                    DMA_BIDIRECTIONAL);
 
                if (dma_mapping_error(pdev, addr)) {
-                       while (--i) {
+                       while (i--) {
                                dma_unmap_page(pdev, ttm_dma->dma_address[i],
                                               PAGE_SIZE, DMA_BIDIRECTIONAL);
                                ttm_dma->dma_address[i] = 0;
index 24be27d..20935eb 100644 (file)
@@ -635,10 +635,6 @@ nouveau_display_resume(struct drm_device *dev, bool runtime)
                nv_crtc->lut.depth = 0;
        }
 
-       /* Make sure that drm and hw vblank irqs get resumed if needed. */
-       for (head = 0; head < dev->mode_config.num_crtc; head++)
-               drm_vblank_on(dev, head);
-
        /* This should ensure we don't hit a locking problem when someone
         * wakes us up via a connector.  We should never go into suspend
         * while the display is on anyways.
@@ -648,6 +644,10 @@ nouveau_display_resume(struct drm_device *dev, bool runtime)
 
        drm_helper_resume_force_mode(dev);
 
+       /* Make sure that drm and hw vblank irqs get resumed if needed. */
+       for (head = 0; head < dev->mode_config.num_crtc; head++)
+               drm_vblank_on(dev, head);
+
        list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
                struct nouveau_crtc *nv_crtc = nouveau_crtc(crtc);
 
index 8a70cec..2dfe58a 100644 (file)
@@ -24,7 +24,7 @@
 static int nouveau_platform_probe(struct platform_device *pdev)
 {
        const struct nvkm_device_tegra_func *func;
-       struct nvkm_device *device;
+       struct nvkm_device *device = NULL;
        struct drm_device *drm;
        int ret;
 
index 7f8a427..e7e581d 100644 (file)
@@ -252,32 +252,40 @@ nvkm_device_tegra_new(const struct nvkm_device_tegra_func *func,
 
        if (!(tdev = kzalloc(sizeof(*tdev), GFP_KERNEL)))
                return -ENOMEM;
-       *pdevice = &tdev->device;
+
        tdev->func = func;
        tdev->pdev = pdev;
        tdev->irq = -1;
 
        tdev->vdd = devm_regulator_get(&pdev->dev, "vdd");
-       if (IS_ERR(tdev->vdd))
-               return PTR_ERR(tdev->vdd);
+       if (IS_ERR(tdev->vdd)) {
+               ret = PTR_ERR(tdev->vdd);
+               goto free;
+       }
 
        tdev->rst = devm_reset_control_get(&pdev->dev, "gpu");
-       if (IS_ERR(tdev->rst))
-               return PTR_ERR(tdev->rst);
+       if (IS_ERR(tdev->rst)) {
+               ret = PTR_ERR(tdev->rst);
+               goto free;
+       }
 
        tdev->clk = devm_clk_get(&pdev->dev, "gpu");
-       if (IS_ERR(tdev->clk))
-               return PTR_ERR(tdev->clk);
+       if (IS_ERR(tdev->clk)) {
+               ret = PTR_ERR(tdev->clk);
+               goto free;
+       }
 
        tdev->clk_pwr = devm_clk_get(&pdev->dev, "pwr");
-       if (IS_ERR(tdev->clk_pwr))
-               return PTR_ERR(tdev->clk_pwr);
+       if (IS_ERR(tdev->clk_pwr)) {
+               ret = PTR_ERR(tdev->clk_pwr);
+               goto free;
+       }
 
        nvkm_device_tegra_probe_iommu(tdev);
 
        ret = nvkm_device_tegra_power_up(tdev);
        if (ret)
-               return ret;
+               goto remove;
 
        tdev->gpu_speedo = tegra_sku_info.gpu_speedo_value;
        ret = nvkm_device_ctor(&nvkm_device_tegra_func, NULL, &pdev->dev,
@@ -285,9 +293,19 @@ nvkm_device_tegra_new(const struct nvkm_device_tegra_func *func,
                               cfg, dbg, detect, mmio, subdev_mask,
                               &tdev->device);
        if (ret)
-               return ret;
+               goto powerdown;
+
+       *pdevice = &tdev->device;
 
        return 0;
+
+powerdown:
+       nvkm_device_tegra_power_down(tdev);
+remove:
+       nvkm_device_tegra_remove_iommu(tdev);
+free:
+       kfree(tdev);
+       return ret;
 }
 #else
 int
index 74e2f7c..9688970 100644 (file)
@@ -328,6 +328,7 @@ nvkm_dp_train(struct work_struct *w)
                .outp = outp,
        }, *dp = &_dp;
        u32 datarate = 0;
+       u8  pwr;
        int ret;
 
        if (!outp->base.info.location && disp->func->sor.magic)
@@ -355,6 +356,15 @@ nvkm_dp_train(struct work_struct *w)
        /* disable link interrupt handling during link training */
        nvkm_notify_put(&outp->irq);
 
+       /* ensure sink is not in a low-power state */
+       if (!nvkm_rdaux(outp->aux, DPCD_SC00, &pwr, 1)) {
+               if ((pwr & DPCD_SC00_SET_POWER) != DPCD_SC00_SET_POWER_D0) {
+                       pwr &= ~DPCD_SC00_SET_POWER;
+                       pwr |=  DPCD_SC00_SET_POWER_D0;
+                       nvkm_wraux(outp->aux, DPCD_SC00, &pwr, 1);
+               }
+       }
+
        /* enable down-spreading and execute pre-train script from vbios */
        dp_link_train_init(dp, outp->dpcd[3] & 0x01);
 
index 9596290..6e10c5e 100644 (file)
 #define DPCD_LS0C_LANE1_POST_CURSOR2                                       0x0c
 #define DPCD_LS0C_LANE0_POST_CURSOR2                                       0x03
 
+/* DPCD Sink Control */
+#define DPCD_SC00                                                       0x00600
+#define DPCD_SC00_SET_POWER                                                0x03
+#define DPCD_SC00_SET_POWER_D0                                             0x01
+#define DPCD_SC00_SET_POWER_D3                                             0x03
+
 void nvkm_dp_train(struct work_struct *);
 #endif
index dfebdc4..85dfe36 100644 (file)
@@ -573,10 +573,9 @@ static int omap_dmm_remove(struct platform_device *dev)
 
                kfree(omap_dmm->engines);
                if (omap_dmm->refill_va)
-                       dma_free_writecombine(omap_dmm->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               omap_dmm->refill_va,
-                               omap_dmm->refill_pa);
+                       dma_free_wc(omap_dmm->dev,
+                                   REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                   omap_dmm->refill_va, omap_dmm->refill_pa);
                if (omap_dmm->dummy_page)
                        __free_page(omap_dmm->dummy_page);
 
@@ -701,9 +700,9 @@ static int omap_dmm_probe(struct platform_device *dev)
        omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page);
 
        /* alloc refill memory */
-       omap_dmm->refill_va = dma_alloc_writecombine(&dev->dev,
-                               REFILL_BUFFER_SIZE * omap_dmm->num_engines,
-                               &omap_dmm->refill_pa, GFP_KERNEL);
+       omap_dmm->refill_va = dma_alloc_wc(&dev->dev,
+                                          REFILL_BUFFER_SIZE * omap_dmm->num_engines,
+                                          &omap_dmm->refill_pa, GFP_KERNEL);
        if (!omap_dmm->refill_va) {
                dev_err(&dev->dev, "could not allocate refill memory\n");
                goto fail;
index 8495a1a..359b0d7 100644 (file)
@@ -1330,8 +1330,8 @@ void omap_gem_free_object(struct drm_gem_object *obj)
                        omap_gem_detach_pages(obj);
 
                if (!is_shmem(obj)) {
-                       dma_free_writecombine(dev->dev, obj->size,
-                                       omap_obj->vaddr, omap_obj->paddr);
+                       dma_free_wc(dev->dev, obj->size, omap_obj->vaddr,
+                                   omap_obj->paddr);
                } else if (omap_obj->vaddr) {
                        vunmap(omap_obj->vaddr);
                }
@@ -1395,8 +1395,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev,
                /* attempt to allocate contiguous memory if we don't
                 * have DMM for remappign discontiguous buffers
                 */
-               omap_obj->vaddr =  dma_alloc_writecombine(dev->dev, size,
-                               &omap_obj->paddr, GFP_KERNEL);
+               omap_obj->vaddr =  dma_alloc_wc(dev->dev, size,
+                                               &omap_obj->paddr, GFP_KERNEL);
                if (!omap_obj->vaddr) {
                        kfree(omap_obj);
 
index 2ae8577..7c2e782 100644 (file)
@@ -168,7 +168,8 @@ static int qxl_process_single_command(struct qxl_device *qdev,
                       cmd->command_size))
                return -EFAULT;
 
-       reloc_info = kmalloc(sizeof(struct qxl_reloc_info) * cmd->relocs_num, GFP_KERNEL);
+       reloc_info = kmalloc_array(cmd->relocs_num,
+                                  sizeof(struct qxl_reloc_info), GFP_KERNEL);
        if (!reloc_info)
                return -ENOMEM;
 
index 3d031b5..9f029dd 100644 (file)
@@ -68,5 +68,5 @@ int qxl_gem_prime_mmap(struct drm_gem_object *obj,
                       struct vm_area_struct *area)
 {
        WARN_ONCE(1, "not implemented");
-       return ENOSYS;
+       return -ENOSYS;
 }
index 44ee72e..6af8325 100644 (file)
@@ -315,15 +315,27 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector,
        unsigned max_lane_num = drm_dp_max_lane_count(dpcd);
        unsigned lane_num, i, max_pix_clock;
 
-       for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
-               for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
-                       max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+       if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) ==
+           ENCODER_OBJECT_ID_NUTMEG) {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       max_pix_clock = (lane_num * 270000 * 8) / bpp;
                        if (max_pix_clock >= pix_clock) {
                                *dp_lanes = lane_num;
-                               *dp_rate = link_rates[i];
+                               *dp_rate = 270000;
                                return 0;
                        }
                }
+       } else {
+               for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
+                       for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+                               max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
+                               if (max_pix_clock >= pix_clock) {
+                                       *dp_lanes = lane_num;
+                                       *dp_rate = link_rates[i];
+                                       return 0;
+                               }
+                       }
+               }
        }
 
        return -EINVAL;
index 6bfc463..367a916 100644 (file)
@@ -304,18 +304,10 @@ void dce6_dp_audio_set_dto(struct radeon_device *rdev,
                unsigned int div = (RREG32(DENTIST_DISPCLK_CNTL) &
                        DENTIST_DPREFCLK_WDIVIDER_MASK) >>
                        DENTIST_DPREFCLK_WDIVIDER_SHIFT;
-
-               if (div < 128 && div >= 96)
-                       div -= 64;
-               else if (div >= 64)
-                       div = div / 2 - 16;
-               else if (div >= 8)
-                       div /= 4;
-               else
-                       div = 0;
+               div = radeon_audio_decode_dfs_div(div);
 
                if (div)
-                       clock = rdev->clock.gpupll_outputfreq * 10 / div;
+                       clock = clock * 100 / div;
 
                WREG32(DCE8_DCCG_AUDIO_DTO1_PHASE, 24000);
                WREG32(DCE8_DCCG_AUDIO_DTO1_MODULE, clock);
index 9953356..3cf04a2 100644 (file)
@@ -289,6 +289,16 @@ void dce4_dp_audio_set_dto(struct radeon_device *rdev,
         * number (coefficient of two integer numbers.  DCCG_AUDIO_DTOx_PHASE
         * is the numerator, DCCG_AUDIO_DTOx_MODULE is the denominator
         */
+       if (ASIC_IS_DCE41(rdev)) {
+               unsigned int div = (RREG32(DCE41_DENTIST_DISPCLK_CNTL) &
+                       DENTIST_DPREFCLK_WDIVIDER_MASK) >>
+                       DENTIST_DPREFCLK_WDIVIDER_SHIFT;
+               div = radeon_audio_decode_dfs_div(div);
+
+               if (div)
+                       clock = 100 * clock / div;
+       }
+
        WREG32(DCCG_AUDIO_DTO1_PHASE, 24000);
        WREG32(DCCG_AUDIO_DTO1_MODULE, clock);
 }
index 4aa5f75..13b6029 100644 (file)
 #define DCCG_AUDIO_DTO1_CNTL              0x05cc
 #       define DCCG_AUDIO_DTO1_USE_512FBR_DTO (1 << 3)
 
+#define DCE41_DENTIST_DISPCLK_CNTL                     0x049c
+#       define DENTIST_DPREFCLK_WDIVIDER(x)            (((x) & 0x7f) << 24)
+#       define DENTIST_DPREFCLK_WDIVIDER_MASK          (0x7f << 24)
+#       define DENTIST_DPREFCLK_WDIVIDER_SHIFT         24
+
 /* DCE 4.0 AFMT */
 #define HDMI_CONTROL                         0x7030
 #       define HDMI_KEEPOUT_MODE             (1 << 0)
index 5ae6db9..78a51b3 100644 (file)
@@ -268,7 +268,7 @@ struct radeon_clock {
        uint32_t current_dispclk;
        uint32_t dp_extclk;
        uint32_t max_pixel_clock;
-       uint32_t gpupll_outputfreq;
+       uint32_t vco_freq;
 };
 
 /*
index 08fc1b5..de9a2ff 100644 (file)
@@ -1106,6 +1106,31 @@ union firmware_info {
        ATOM_FIRMWARE_INFO_V2_2 info_22;
 };
 
+union igp_info {
+       struct _ATOM_INTEGRATED_SYSTEM_INFO info;
+       struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 info_2;
+       struct _ATOM_INTEGRATED_SYSTEM_INFO_V6 info_6;
+       struct _ATOM_INTEGRATED_SYSTEM_INFO_V1_7 info_7;
+       struct _ATOM_INTEGRATED_SYSTEM_INFO_V1_8 info_8;
+};
+
+static void radeon_atombios_get_dentist_vco_freq(struct radeon_device *rdev)
+{
+       struct radeon_mode_info *mode_info = &rdev->mode_info;
+       int index = GetIndexIntoMasterTable(DATA, IntegratedSystemInfo);
+       union igp_info *igp_info;
+       u8 frev, crev;
+       u16 data_offset;
+
+       if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+                       &frev, &crev, &data_offset)) {
+               igp_info = (union igp_info *)(mode_info->atom_context->bios +
+                       data_offset);
+               rdev->clock.vco_freq =
+                       le32_to_cpu(igp_info->info_6.ulDentistVCOFreq);
+       }
+}
+
 bool radeon_atom_get_clock_info(struct drm_device *dev)
 {
        struct radeon_device *rdev = dev->dev_private;
@@ -1257,12 +1282,18 @@ bool radeon_atom_get_clock_info(struct drm_device *dev)
                rdev->mode_info.firmware_flags =
                        le16_to_cpu(firmware_info->info.usFirmwareCapability.susAccess);
 
-               if (ASIC_IS_DCE8(rdev)) {
-                       rdev->clock.gpupll_outputfreq =
+               if (ASIC_IS_DCE8(rdev))
+                       rdev->clock.vco_freq =
                                le32_to_cpu(firmware_info->info_22.ulGPUPLL_OutputFreq);
-                       if (rdev->clock.gpupll_outputfreq == 0)
-                               rdev->clock.gpupll_outputfreq = 360000; /* 3.6 GHz */
-               }
+               else if (ASIC_IS_DCE5(rdev))
+                       rdev->clock.vco_freq = rdev->clock.current_dispclk;
+               else if (ASIC_IS_DCE41(rdev))
+                       radeon_atombios_get_dentist_vco_freq(rdev);
+               else
+                       rdev->clock.vco_freq = rdev->clock.current_dispclk;
+
+               if (rdev->clock.vco_freq == 0)
+                       rdev->clock.vco_freq = 360000;  /* 3.6 GHz */
 
                return true;
        }
@@ -1270,14 +1301,6 @@ bool radeon_atom_get_clock_info(struct drm_device *dev)
        return false;
 }
 
-union igp_info {
-       struct _ATOM_INTEGRATED_SYSTEM_INFO info;
-       struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 info_2;
-       struct _ATOM_INTEGRATED_SYSTEM_INFO_V6 info_6;
-       struct _ATOM_INTEGRATED_SYSTEM_INFO_V1_7 info_7;
-       struct _ATOM_INTEGRATED_SYSTEM_INFO_V1_8 info_8;
-};
-
 bool radeon_atombios_sideport_present(struct radeon_device *rdev)
 {
        struct radeon_mode_info *mode_info = &rdev->mode_info;
index 2c02e99..b214663 100644 (file)
@@ -739,9 +739,6 @@ static void radeon_audio_dp_mode_set(struct drm_encoder *encoder,
        struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
        struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
        struct drm_connector *connector = radeon_get_connector_for_encoder(encoder);
-       struct radeon_connector *radeon_connector = to_radeon_connector(connector);
-       struct radeon_connector_atom_dig *dig_connector =
-               radeon_connector->con_priv;
 
        if (!dig || !dig->afmt)
                return;
@@ -753,10 +750,7 @@ static void radeon_audio_dp_mode_set(struct drm_encoder *encoder,
                radeon_audio_write_speaker_allocation(encoder);
                radeon_audio_write_sad_regs(encoder);
                radeon_audio_write_latency_fields(encoder, mode);
-               if (rdev->clock.dp_extclk || ASIC_IS_DCE5(rdev))
-                       radeon_audio_set_dto(encoder, rdev->clock.default_dispclk * 10);
-               else
-                       radeon_audio_set_dto(encoder, dig_connector->dp_clock);
+               radeon_audio_set_dto(encoder, rdev->clock.vco_freq * 10);
                radeon_audio_set_audio_packet(encoder);
                radeon_audio_select_pin(encoder);
 
@@ -781,3 +775,15 @@ void radeon_audio_dpms(struct drm_encoder *encoder, int mode)
        if (radeon_encoder->audio && radeon_encoder->audio->dpms)
                radeon_encoder->audio->dpms(encoder, mode == DRM_MODE_DPMS_ON);
 }
+
+unsigned int radeon_audio_decode_dfs_div(unsigned int div)
+{
+       if (div >= 8 && div < 64)
+               return (div - 8) * 25 + 200;
+       else if (div >= 64 && div < 96)
+               return (div - 64) * 50 + 1600;
+       else if (div >= 96 && div < 128)
+               return (div - 96) * 100 + 3200;
+       else
+               return 0;
+}
index 059cc30..5c70cce 100644 (file)
@@ -79,5 +79,6 @@ void radeon_audio_fini(struct radeon_device *rdev);
 void radeon_audio_mode_set(struct drm_encoder *encoder,
        struct drm_display_mode *mode);
 void radeon_audio_dpms(struct drm_encoder *encoder, int mode);
+unsigned int radeon_audio_decode_dfs_div(unsigned int div);
 
 #endif
index 902b59c..4197ca1 100644 (file)
@@ -1744,7 +1744,6 @@ int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon)
        }
 
        drm_kms_helper_poll_enable(dev);
-       drm_helper_hpd_irq_event(dev);
 
        /* set the power state here in case we are a PX system or headless */
        if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled)
index b3bb923..2d9196a 100644 (file)
@@ -403,7 +403,8 @@ static void radeon_flip_work_func(struct work_struct *__work)
        struct drm_crtc *crtc = &radeon_crtc->base;
        unsigned long flags;
        int r;
-       int vpos, hpos, stat, min_udelay;
+       int vpos, hpos, stat, min_udelay = 0;
+       unsigned repcnt = 4;
        struct drm_vblank_crtc *vblank = &crtc->dev->vblank[work->crtc_id];
 
         down_read(&rdev->exclusive_lock);
@@ -454,7 +455,7 @@ static void radeon_flip_work_func(struct work_struct *__work)
         * In practice this won't execute very often unless on very fast
         * machines because the time window for this to happen is very small.
         */
-       for (;;) {
+       while (radeon_crtc->enabled && --repcnt) {
                /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank
                 * start in hpos, and to the "fudged earlier" vblank start in
                 * vpos.
@@ -470,12 +471,24 @@ static void radeon_flip_work_func(struct work_struct *__work)
                        break;
 
                /* Sleep at least until estimated real start of hw vblank */
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5);
+               if (min_udelay > vblank->framedur_ns / 2000) {
+                       /* Don't wait ridiculously long - something is wrong */
+                       repcnt = 0;
+                       break;
+               }
+               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
                usleep_range(min_udelay, 2 * min_udelay);
                spin_lock_irqsave(&crtc->dev->event_lock, flags);
        };
 
+       if (!repcnt)
+               DRM_DEBUG_DRIVER("Delay problem on crtc %d: min_udelay %d, "
+                                "framedur %d, linedur %d, stat %d, vpos %d, "
+                                "hpos %d\n", work->crtc_id, min_udelay,
+                                vblank->framedur_ns / 1000,
+                                vblank->linedur_ns / 1000, stat, vpos, hpos);
+
        /* do the flip (mmio) */
        radeon_page_flip(rdev, radeon_crtc->crtc_id, work->base);
 
@@ -1670,8 +1683,10 @@ int radeon_modeset_init(struct radeon_device *rdev)
        /* setup afmt */
        radeon_afmt_init(rdev);
 
-       radeon_fbdev_init(rdev);
-       drm_kms_helper_poll_init(rdev->ddev);
+       if (!list_empty(&rdev->ddev->mode_config.connector_list)) {
+               radeon_fbdev_init(rdev);
+               drm_kms_helper_poll_init(rdev->ddev);
+       }
 
        /* do pm late init */
        ret = radeon_pm_late_init(rdev);
index 3dcc573..e26c963 100644 (file)
@@ -663,6 +663,7 @@ int radeon_gem_va_ioctl(struct drm_device *dev, void *data,
        bo_va = radeon_vm_bo_find(&fpriv->vm, rbo);
        if (!bo_va) {
                args->operation = RADEON_VA_RESULT_ERROR;
+               radeon_bo_unreserve(rbo);
                drm_gem_object_unreference_unlocked(gobj);
                return -ENOENT;
        }
index 84d4563..fb6ad14 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/slab.h>
 #include <drm/drmP.h>
 #include <drm/radeon_drm.h>
+#include <drm/drm_cache.h>
 #include "radeon.h"
 #include "radeon_trace.h"
 
@@ -245,6 +246,12 @@ int radeon_bo_create(struct radeon_device *rdev,
                DRM_INFO_ONCE("Please enable CONFIG_MTRR and CONFIG_X86_PAT for "
                              "better performance thanks to write-combining\n");
        bo->flags &= ~(RADEON_GEM_GTT_WC | RADEON_GEM_GTT_UC);
+#else
+       /* For architectures that don't support WC memory,
+        * mask out the WC flag from the BO
+        */
+       if (!drm_arch_can_wc_memory())
+               bo->flags &= ~RADEON_GEM_GTT_WC;
 #endif
 
        radeon_ttm_placement_from_domain(bo, domain);
index 460c8f2..7a98823 100644 (file)
@@ -276,8 +276,12 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev)
        if (rdev->irq.installed) {
                for (i = 0; i < rdev->num_crtc; i++) {
                        if (rdev->pm.active_crtcs & (1 << i)) {
-                               rdev->pm.req_vblank |= (1 << i);
-                               drm_vblank_get(rdev->ddev, i);
+                               /* This can fail if a modeset is in progress */
+                               if (drm_vblank_get(rdev->ddev, i) == 0)
+                                       rdev->pm.req_vblank |= (1 << i);
+                               else
+                                       DRM_DEBUG_DRIVER("crtc %d no vblank, can glitch\n",
+                                                        i);
                        }
                }
        }
@@ -1078,10 +1082,6 @@ force:
        /* update displays */
        radeon_dpm_display_configuration_changed(rdev);
 
-       rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
-       rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
-       rdev->pm.dpm.single_display = single_display;
-
        /* wait for the rings to drain */
        for (i = 0; i < RADEON_NUM_RINGS; i++) {
                struct radeon_ring *ring = &rdev->ring[i];
@@ -1097,6 +1097,10 @@ force:
 
        radeon_dpm_post_set_power_state(rdev);
 
+       rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
+       rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
+       rdev->pm.dpm.single_display = single_display;
+
        if (rdev->asic->dpm.force_performance_level) {
                if (rdev->pm.dpm.thermal_active) {
                        enum radeon_dpm_forced_level level = rdev->pm.dpm.forced_level;
index c507896..197b157 100644 (file)
@@ -349,8 +349,13 @@ int radeon_sa_bo_new(struct radeon_device *rdev,
                        /* see if we can skip over some allocations */
                } while (radeon_sa_bo_next_hole(sa_manager, fences, tries));
 
+               for (i = 0; i < RADEON_NUM_RINGS; ++i)
+                       radeon_fence_ref(fences[i]);
+
                spin_unlock(&sa_manager->wq.lock);
                r = radeon_fence_wait_any(rdev, fences, false);
+               for (i = 0; i < RADEON_NUM_RINGS; ++i)
+                       radeon_fence_unref(&fences[i]);
                spin_lock(&sa_manager->wq.lock);
                /* if we have nothing to wait for block */
                if (r == -ENOENT) {
index e343074..e06ac54 100644 (file)
@@ -758,7 +758,7 @@ static int radeon_ttm_tt_populate(struct ttm_tt *ttm)
                                                       0, PAGE_SIZE,
                                                       PCI_DMA_BIDIRECTIONAL);
                if (pci_dma_mapping_error(rdev->pdev, gtt->ttm.dma_address[i])) {
-                       while (--i) {
+                       while (i--) {
                                pci_unmap_page(rdev->pdev, gtt->ttm.dma_address[i],
                                               PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
                                gtt->ttm.dma_address[i] = 0;
index 07a0d37..a01efe3 100644 (file)
@@ -178,12 +178,12 @@ int vce_v1_0_load_fw(struct radeon_device *rdev, uint32_t *data)
                return -EINVAL;
        }
 
-       for (i = 0; i < sign->num; ++i) {
-               if (sign->val[i].chip_id == chip_id)
+       for (i = 0; i < le32_to_cpu(sign->num); ++i) {
+               if (le32_to_cpu(sign->val[i].chip_id) == chip_id)
                        break;
        }
 
-       if (i == sign->num)
+       if (i == le32_to_cpu(sign->num))
                return -EINVAL;
 
        data += (256 - 64) / 4;
@@ -191,18 +191,18 @@ int vce_v1_0_load_fw(struct radeon_device *rdev, uint32_t *data)
        data[1] = sign->val[i].nonce[1];
        data[2] = sign->val[i].nonce[2];
        data[3] = sign->val[i].nonce[3];
-       data[4] = sign->len + 64;
+       data[4] = cpu_to_le32(le32_to_cpu(sign->len) + 64);
 
        memset(&data[5], 0, 44);
        memcpy(&data[16], &sign[1], rdev->vce_fw->size - sizeof(*sign));
 
-       data += data[4] / 4;
+       data += le32_to_cpu(data[4]) / 4;
        data[0] = sign->val[i].sigval[0];
        data[1] = sign->val[i].sigval[1];
        data[2] = sign->val[i].sigval[2];
        data[3] = sign->val[i].sigval[3];
 
-       rdev->vce.keyselect = sign->val[i].keyselect;
+       rdev->vce.keyselect = le32_to_cpu(sign->val[i].keyselect);
 
        return 0;
 }
index d1dc0f7..f6a809a 100644 (file)
@@ -2,11 +2,11 @@
 # Makefile for the drm device driver.  This driver provides support for the
 # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
 
-rockchipdrm-y := rockchip_drm_drv.o rockchip_drm_fb.o rockchip_drm_fbdev.o \
-               rockchip_drm_gem.o
+rockchipdrm-y := rockchip_drm_drv.o rockchip_drm_fb.o \
+               rockchip_drm_gem.o rockchip_drm_vop.o
+rockchipdrm-$(CONFIG_DRM_FBDEV_EMULATION) += rockchip_drm_fbdev.o
 
 obj-$(CONFIG_ROCKCHIP_DW_HDMI) += dw_hdmi-rockchip.o
 obj-$(CONFIG_ROCKCHIP_DW_MIPI_DSI) += dw-mipi-dsi.o
 
-obj-$(CONFIG_DRM_ROCKCHIP) += rockchipdrm.o rockchip_drm_vop.o \
-                               rockchip_vop_reg.o
+obj-$(CONFIG_DRM_ROCKCHIP) += rockchipdrm.o rockchip_vop_reg.o
index 7bfe243..f8f8f29 100644 (file)
@@ -461,10 +461,11 @@ static int dw_mipi_dsi_phy_init(struct dw_mipi_dsi *dsi)
 
 static int dw_mipi_dsi_get_lane_bps(struct dw_mipi_dsi *dsi)
 {
-       unsigned int bpp, i, pre;
+       unsigned int i, pre;
        unsigned long mpclk, pllref, tmp;
        unsigned int m = 1, n = 1, target_mbps = 1000;
        unsigned int max_mbps = dptdin_map[ARRAY_SIZE(dptdin_map) - 1].max_mbps;
+       int bpp;
 
        bpp = mipi_dsi_pixel_format_to_bpp(dsi->format);
        if (bpp < 0) {
index 8397d1b..a0d51cc 100644 (file)
@@ -55,14 +55,12 @@ int rockchip_drm_dma_attach_device(struct drm_device *drm_dev,
 
        return arm_iommu_attach_device(dev, mapping);
 }
-EXPORT_SYMBOL_GPL(rockchip_drm_dma_attach_device);
 
 void rockchip_drm_dma_detach_device(struct drm_device *drm_dev,
                                    struct device *dev)
 {
        arm_iommu_detach_device(dev);
 }
-EXPORT_SYMBOL_GPL(rockchip_drm_dma_detach_device);
 
 int rockchip_register_crtc_funcs(struct drm_crtc *crtc,
                                 const struct rockchip_crtc_funcs *crtc_funcs)
@@ -77,7 +75,6 @@ int rockchip_register_crtc_funcs(struct drm_crtc *crtc,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(rockchip_register_crtc_funcs);
 
 void rockchip_unregister_crtc_funcs(struct drm_crtc *crtc)
 {
@@ -89,7 +86,6 @@ void rockchip_unregister_crtc_funcs(struct drm_crtc *crtc)
 
        priv->crtc_funcs[pipe] = NULL;
 }
-EXPORT_SYMBOL_GPL(rockchip_unregister_crtc_funcs);
 
 static struct drm_crtc *rockchip_crtc_from_pipe(struct drm_device *drm,
                                                int pipe)
index f784488..3b8f652 100644 (file)
@@ -39,7 +39,6 @@ struct drm_gem_object *rockchip_fb_get_gem_obj(struct drm_framebuffer *fb,
 
        return rk_fb->obj[plane];
 }
-EXPORT_SYMBOL_GPL(rockchip_fb_get_gem_obj);
 
 static void rockchip_drm_fb_destroy(struct drm_framebuffer *fb)
 {
@@ -177,8 +176,23 @@ static void rockchip_crtc_wait_for_update(struct drm_crtc *crtc)
                crtc_funcs->wait_for_update(crtc);
 }
 
+/*
+ * We can't use drm_atomic_helper_wait_for_vblanks() because rk3288 and rk3066
+ * have hardware counters for neither vblanks nor scanlines, which results in
+ * a race where:
+ *                             | <-- HW vsync irq and reg take effect
+ *            plane_commit --> |
+ *     get_vblank and wait --> |
+ *                             | <-- handle_vblank, vblank->count + 1
+ *              cleanup_fb --> |
+ *             iommu crash --> |
+ *                             | <-- HW vsync irq and reg take effect
+ *
+ * This function is equivalent but uses rockchip_crtc_wait_for_update() instead
+ * of waiting for vblank_count to change.
+ */
 static void
-rockchip_atomic_wait_for_complete(struct drm_atomic_state *old_state)
+rockchip_atomic_wait_for_complete(struct drm_device *dev, struct drm_atomic_state *old_state)
 {
        struct drm_crtc_state *old_crtc_state;
        struct drm_crtc *crtc;
@@ -194,6 +208,10 @@ rockchip_atomic_wait_for_complete(struct drm_atomic_state *old_state)
                if (!crtc->state->active)
                        continue;
 
+               if (!drm_atomic_helper_framebuffer_changed(dev,
+                               old_state, crtc))
+                       continue;
+
                ret = drm_crtc_vblank_get(crtc);
                if (ret != 0)
                        continue;
@@ -241,7 +259,7 @@ rockchip_atomic_commit_complete(struct rockchip_atomic_commit *commit)
 
        drm_atomic_helper_commit_planes(dev, state, true);
 
-       rockchip_atomic_wait_for_complete(state);
+       rockchip_atomic_wait_for_complete(dev, state);
 
        drm_atomic_helper_cleanup_planes(dev, state);
 
index 50432e9..73718c5 100644 (file)
 #ifndef _ROCKCHIP_DRM_FBDEV_H
 #define _ROCKCHIP_DRM_FBDEV_H
 
+#ifdef CONFIG_DRM_FBDEV_EMULATION
 int rockchip_drm_fbdev_init(struct drm_device *dev);
 void rockchip_drm_fbdev_fini(struct drm_device *dev);
+#else
+static inline int rockchip_drm_fbdev_init(struct drm_device *dev)
+{
+       return 0;
+}
+
+static inline void rockchip_drm_fbdev_fini(struct drm_device *dev)
+{
+}
+#endif
 
 #endif /* _ROCKCHIP_DRM_FBDEV_H */
index d908321..18e0733 100644 (file)
@@ -234,13 +234,8 @@ int rockchip_gem_dumb_create(struct drm_file *file_priv,
        /*
         * align to 64 bytes since Mali requires it.
         */
-       min_pitch = ALIGN(min_pitch, 64);
-
-       if (args->pitch < min_pitch)
-               args->pitch = min_pitch;
-
-       if (args->size < args->pitch * args->height)
-               args->size = args->pitch * args->height;
+       args->pitch = ALIGN(min_pitch, 64);
+       args->size = args->pitch * args->height;
 
        rk_obj = rockchip_gem_create_with_handle(file_priv, dev, args->size,
                                                 &args->handle);
index 46c2a8d..fd37054 100644 (file)
@@ -43,8 +43,8 @@
 
 #define REG_SET(x, base, reg, v, mode) \
                __REG_SET_##mode(x, base + reg.offset, reg.mask, reg.shift, v)
-#define REG_SET_MASK(x, base, reg, v, mode) \
-               __REG_SET_##mode(x, base + reg.offset, reg.mask, reg.shift, v)
+#define REG_SET_MASK(x, base, reg, mask, v, mode) \
+               __REG_SET_##mode(x, base + reg.offset, mask, reg.shift, v)
 
 #define VOP_WIN_SET(x, win, name, v) \
                REG_SET(x, win->base, win->phy->name, v, RELAXED)
 #define VOP_INTR_GET(vop, name) \
                vop_read_reg(vop, 0, &vop->data->ctrl->name)
 
-#define VOP_INTR_SET(vop, name, v) \
-               REG_SET(vop, 0, vop->data->intr->name, v, NORMAL)
+#define VOP_INTR_SET(vop, name, mask, v) \
+               REG_SET_MASK(vop, 0, vop->data->intr->name, mask, v, NORMAL)
 #define VOP_INTR_SET_TYPE(vop, name, type, v) \
        do { \
-               int i, reg = 0; \
+               int i, reg = 0, mask = 0; \
                for (i = 0; i < vop->data->intr->nintrs; i++) { \
-                       if (vop->data->intr->intrs[i] & type) \
+                       if (vop->data->intr->intrs[i] & type) \
                                reg |= (v) << i; \
+                               mask |= 1 << i; \
+                       } \
                } \
-               VOP_INTR_SET(vop, name, reg); \
+               VOP_INTR_SET(vop, name, mask, reg); \
        } while (0)
 #define VOP_INTR_GET_TYPE(vop, name, type) \
                vop_get_intr_type(vop, &vop->data->intr->name, type)
index 8078631..bd736ac 100644 (file)
@@ -157,17 +157,15 @@ static void sti_cursor_atomic_update(struct drm_plane *drm_plane,
                cursor->height = src_h;
 
                if (cursor->pixmap.base)
-                       dma_free_writecombine(cursor->dev,
-                                             cursor->pixmap.size,
-                                             cursor->pixmap.base,
-                                             cursor->pixmap.paddr);
+                       dma_free_wc(cursor->dev, cursor->pixmap.size,
+                                   cursor->pixmap.base, cursor->pixmap.paddr);
 
                cursor->pixmap.size = cursor->width * cursor->height;
 
-               cursor->pixmap.base = dma_alloc_writecombine(cursor->dev,
-                                                       cursor->pixmap.size,
-                                                       &cursor->pixmap.paddr,
-                                                       GFP_KERNEL | GFP_DMA);
+               cursor->pixmap.base = dma_alloc_wc(cursor->dev,
+                                                  cursor->pixmap.size,
+                                                  &cursor->pixmap.paddr,
+                                                  GFP_KERNEL | GFP_DMA);
                if (!cursor->pixmap.base) {
                        DRM_ERROR("Failed to allocate memory for pixmap\n");
                        return;
@@ -252,8 +250,8 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
 
        /* Allocate clut buffer */
        size = 0x100 * sizeof(unsigned short);
-       cursor->clut = dma_alloc_writecombine(dev, size, &cursor->clut_paddr,
-                                             GFP_KERNEL | GFP_DMA);
+       cursor->clut = dma_alloc_wc(dev, size, &cursor->clut_paddr,
+                                   GFP_KERNEL | GFP_DMA);
 
        if (!cursor->clut) {
                DRM_ERROR("Failed to allocate memory for cursor clut\n");
@@ -286,7 +284,7 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev,
        return &cursor->plane.drm_plane;
 
 err_plane:
-       dma_free_writecombine(dev, size, cursor->clut, cursor->clut_paddr);
+       dma_free_wc(dev, size, cursor->clut, cursor->clut_paddr);
 err_clut:
        devm_kfree(dev, cursor);
        return NULL;
index f9a1d92..514551c 100644 (file)
@@ -312,8 +312,7 @@ static void sti_gdp_init(struct sti_gdp *gdp)
        /* Allocate all the nodes within a single memory page */
        size = sizeof(struct sti_gdp_node) *
            GDP_NODE_PER_FIELD * GDP_NODE_NB_BANK;
-       base = dma_alloc_writecombine(gdp->dev,
-                                     size, &dma_addr, GFP_KERNEL | GFP_DMA);
+       base = dma_alloc_wc(gdp->dev, size, &dma_addr, GFP_KERNEL | GFP_DMA);
 
        if (!base) {
                DRM_ERROR("Failed to allocate memory for GDP node\n");
index 43861b5..1d3c3d0 100644 (file)
@@ -617,9 +617,9 @@ static void sti_hqvdp_init(struct sti_hqvdp *hqvdp)
 
        /* Allocate memory for the VDP commands */
        size = NB_VDP_CMD * sizeof(struct sti_hqvdp_cmd);
-       hqvdp->hqvdp_cmd = dma_alloc_writecombine(hqvdp->dev, size,
-                                        &hqvdp->hqvdp_cmd_paddr,
-                                        GFP_KERNEL | GFP_DMA);
+       hqvdp->hqvdp_cmd = dma_alloc_wc(hqvdp->dev, size,
+                                       &hqvdp->hqvdp_cmd_paddr,
+                                       GFP_KERNEL | GFP_DMA);
        if (!hqvdp->hqvdp_cmd) {
                DRM_ERROR("Failed to allocate memory for VDP cmd\n");
                return;
index 33add93..3b0d8c3 100644 (file)
@@ -175,8 +175,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
                sg_free_table(bo->sgt);
                kfree(bo->sgt);
        } else if (bo->vaddr) {
-               dma_free_writecombine(drm->dev, bo->gem.size, bo->vaddr,
-                                     bo->paddr);
+               dma_free_wc(drm->dev, bo->gem.size, bo->vaddr, bo->paddr);
        }
 }
 
@@ -233,8 +232,8 @@ static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
        } else {
                size_t size = bo->gem.size;
 
-               bo->vaddr = dma_alloc_writecombine(drm->dev, size, &bo->paddr,
-                                                  GFP_KERNEL | __GFP_NOWARN);
+               bo->vaddr = dma_alloc_wc(drm->dev, size, &bo->paddr,
+                                        GFP_KERNEL | __GFP_NOWARN);
                if (!bo->vaddr) {
                        dev_err(drm->dev,
                                "failed to allocate buffer of size %zu\n",
@@ -472,8 +471,8 @@ int tegra_drm_mmap(struct file *file, struct vm_area_struct *vma)
                vma->vm_flags &= ~VM_PFNMAP;
                vma->vm_pgoff = 0;
 
-               ret = dma_mmap_writecombine(gem->dev->dev, vma, bo->vaddr,
-                                           bo->paddr, gem->size);
+               ret = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->paddr,
+                                 gem->size);
                if (ret) {
                        drm_gem_vm_close(vma);
                        return ret;
index 18dfe3e..034ef2d 100644 (file)
@@ -215,7 +215,7 @@ struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
        struct drm_gem_cma_object *cma_obj;
 
        if (size == 0)
-               return NULL;
+               return ERR_PTR(-EINVAL);
 
        /* First, try to get a vc4_bo from the kernel BO cache. */
        if (from_cache) {
@@ -237,7 +237,7 @@ struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
                if (IS_ERR(cma_obj)) {
                        DRM_ERROR("Failed to allocate from CMA:\n");
                        vc4_bo_stats_dump(vc4);
-                       return NULL;
+                       return ERR_PTR(-ENOMEM);
                }
        }
 
@@ -259,8 +259,8 @@ int vc4_dumb_create(struct drm_file *file_priv,
                args->size = args->pitch * args->height;
 
        bo = vc4_bo_create(dev, args->size, false);
-       if (!bo)
-               return -ENOMEM;
+       if (IS_ERR(bo))
+               return PTR_ERR(bo);
 
        ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle);
        drm_gem_object_unreference_unlocked(&bo->base.base);
@@ -398,9 +398,8 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
        vma->vm_flags &= ~VM_PFNMAP;
        vma->vm_pgoff = 0;
 
-       ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
-                                   bo->base.vaddr, bo->base.paddr,
-                                   vma->vm_end - vma->vm_start);
+       ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
+                         bo->base.paddr, vma->vm_end - vma->vm_start);
        if (ret)
                drm_gem_vm_close(vma);
 
@@ -443,8 +442,8 @@ int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
         * get zeroed, and that might leak data between users.
         */
        bo = vc4_bo_create(dev, args->size, false);
-       if (!bo)
-               return -ENOMEM;
+       if (IS_ERR(bo))
+               return PTR_ERR(bo);
 
        ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle);
        drm_gem_object_unreference_unlocked(&bo->base.base);
@@ -496,8 +495,8 @@ vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
        }
 
        bo = vc4_bo_create(dev, args->size, true);
-       if (!bo)
-               return -ENOMEM;
+       if (IS_ERR(bo))
+               return PTR_ERR(bo);
 
        ret = copy_from_user(bo->base.vaddr,
                             (void __user *)(uintptr_t)args->data,
index 080865e..51a6333 100644 (file)
@@ -91,8 +91,12 @@ struct vc4_dev {
        struct vc4_bo *overflow_mem;
        struct work_struct overflow_mem_work;
 
+       int power_refcount;
+
+       /* Mutex controlling the power refcount. */
+       struct mutex power_lock;
+
        struct {
-               uint32_t last_ct0ca, last_ct1ca;
                struct timer_list timer;
                struct work_struct reset_work;
        } hangcheck;
@@ -142,6 +146,7 @@ struct vc4_seqno_cb {
 };
 
 struct vc4_v3d {
+       struct vc4_dev *vc4;
        struct platform_device *pdev;
        void __iomem *regs;
 };
@@ -192,6 +197,11 @@ struct vc4_exec_info {
        /* Sequence number for this bin/render job. */
        uint64_t seqno;
 
+       /* Last current addresses the hardware was processing when the
+        * hangcheck timer checked on us.
+        */
+       uint32_t last_ct0ca, last_ct1ca;
+
        /* Kernel-space copy of the ioctl arguments */
        struct drm_vc4_submit_cl *args;
 
@@ -434,7 +444,6 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
-int vc4_v3d_set_power(struct vc4_dev *vc4, bool on);
 
 /* vc4_validate.c */
 int
index 48ce30a..202aa15 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/device.h>
 #include <linux/io.h>
 
@@ -228,8 +229,16 @@ vc4_reset(struct drm_device *dev)
        struct vc4_dev *vc4 = to_vc4_dev(dev);
 
        DRM_INFO("Resetting GPU.\n");
-       vc4_v3d_set_power(vc4, false);
-       vc4_v3d_set_power(vc4, true);
+
+       mutex_lock(&vc4->power_lock);
+       if (vc4->power_refcount) {
+               /* Power the device off and back on the by dropping the
+                * reference on runtime PM.
+                */
+               pm_runtime_put_sync_suspend(&vc4->v3d->pdev->dev);
+               pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+       }
+       mutex_unlock(&vc4->power_lock);
 
        vc4_irq_reset(dev);
 
@@ -257,10 +266,17 @@ vc4_hangcheck_elapsed(unsigned long data)
        struct drm_device *dev = (struct drm_device *)data;
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        uint32_t ct0ca, ct1ca;
+       unsigned long irqflags;
+       struct vc4_exec_info *exec;
+
+       spin_lock_irqsave(&vc4->job_lock, irqflags);
+       exec = vc4_first_job(vc4);
 
        /* If idle, we can stop watching for hangs. */
-       if (list_empty(&vc4->job_list))
+       if (!exec) {
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                return;
+       }
 
        ct0ca = V3D_READ(V3D_CTNCA(0));
        ct1ca = V3D_READ(V3D_CTNCA(1));
@@ -268,14 +284,16 @@ vc4_hangcheck_elapsed(unsigned long data)
        /* If we've made any progress in execution, rearm the timer
         * and wait.
         */
-       if (ct0ca != vc4->hangcheck.last_ct0ca ||
-           ct1ca != vc4->hangcheck.last_ct1ca) {
-               vc4->hangcheck.last_ct0ca = ct0ca;
-               vc4->hangcheck.last_ct1ca = ct1ca;
+       if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
+               exec->last_ct0ca = ct0ca;
+               exec->last_ct1ca = ct1ca;
+               spin_unlock_irqrestore(&vc4->job_lock, irqflags);
                vc4_queue_hangcheck(dev);
                return;
        }
 
+       spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
        /* We've gone too long with no progress, reset.  This has to
         * be done from a work struct, since resetting can sleep and
         * this timer hook isn't allowed to.
@@ -340,12 +358,7 @@ vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns,
        finish_wait(&vc4->job_wait_queue, &wait);
        trace_vc4_wait_for_seqno_end(dev, seqno);
 
-       if (ret && ret != -ERESTARTSYS) {
-               DRM_ERROR("timeout waiting for render thread idle\n");
-               return ret;
-       }
-
-       return 0;
+       return ret;
 }
 
 static void
@@ -578,9 +591,9 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
        }
 
        bo = vc4_bo_create(dev, exec_size, true);
-       if (!bo) {
+       if (IS_ERR(bo)) {
                DRM_ERROR("Couldn't allocate BO for binning\n");
-               ret = -ENOMEM;
+               ret = PTR_ERR(bo);
                goto fail;
        }
        exec->exec_bo = &bo->base;
@@ -617,6 +630,7 @@ fail:
 static void
 vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 {
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
        unsigned i;
 
        /* Need the struct lock for drm_gem_object_unreference(). */
@@ -635,6 +649,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
        }
        mutex_unlock(&dev->struct_mutex);
 
+       mutex_lock(&vc4->power_lock);
+       if (--vc4->power_refcount == 0)
+               pm_runtime_put(&vc4->v3d->pdev->dev);
+       mutex_unlock(&vc4->power_lock);
+
        kfree(exec);
 }
 
@@ -746,6 +765,9 @@ vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
        struct drm_gem_object *gem_obj;
        struct vc4_bo *bo;
 
+       if (args->pad != 0)
+               return -EINVAL;
+
        gem_obj = drm_gem_object_lookup(dev, file_priv, args->handle);
        if (!gem_obj) {
                DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
@@ -772,7 +794,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
        struct vc4_dev *vc4 = to_vc4_dev(dev);
        struct drm_vc4_submit_cl *args = data;
        struct vc4_exec_info *exec;
-       int ret;
+       int ret = 0;
 
        if ((args->flags & ~VC4_SUBMIT_CL_USE_CLEAR_COLOR) != 0) {
                DRM_ERROR("Unknown flags: 0x%02x\n", args->flags);
@@ -785,6 +807,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                return -ENOMEM;
        }
 
+       mutex_lock(&vc4->power_lock);
+       if (vc4->power_refcount++ == 0)
+               ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+       mutex_unlock(&vc4->power_lock);
+       if (ret < 0) {
+               kfree(exec);
+               return ret;
+       }
+
        exec->args = args;
        INIT_LIST_HEAD(&exec->unref_list);
 
@@ -839,6 +870,8 @@ vc4_gem_init(struct drm_device *dev)
                    (unsigned long)dev);
 
        INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
+
+       mutex_init(&vc4->power_lock);
 }
 
 void
index b68060e..78a2135 100644 (file)
@@ -57,7 +57,7 @@ vc4_overflow_mem_work(struct work_struct *work)
        struct vc4_bo *bo;
 
        bo = vc4_bo_create(dev, 256 * 1024, true);
-       if (!bo) {
+       if (IS_ERR(bo)) {
                DRM_ERROR("Couldn't allocate binner overflow mem\n");
                return;
        }
index 8a2a312..0f12418 100644 (file)
@@ -316,20 +316,11 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
        size += xtiles * ytiles * loop_body_size;
 
        setup->rcl = &vc4_bo_create(dev, size, true)->base;
-       if (!setup->rcl)
-               return -ENOMEM;
+       if (IS_ERR(setup->rcl))
+               return PTR_ERR(setup->rcl);
        list_add_tail(&to_vc4_bo(&setup->rcl->base)->unref_head,
                      &exec->unref_list);
 
-       rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
-       rcl_u32(setup,
-               (setup->color_write ? (setup->color_write->paddr +
-                                      args->color_write.offset) :
-                0));
-       rcl_u16(setup, args->width);
-       rcl_u16(setup, args->height);
-       rcl_u16(setup, args->color_write.bits);
-
        /* The tile buffer gets cleared when the previous tile is stored.  If
         * the clear values changed between frames, then the tile buffer has
         * stale clear values in it, so we have to do a store in None mode (no
@@ -349,6 +340,15 @@ static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
                rcl_u32(setup, 0); /* no address, since we're in None mode */
        }
 
+       rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+       rcl_u32(setup,
+               (setup->color_write ? (setup->color_write->paddr +
+                                      args->color_write.offset) :
+                0));
+       rcl_u16(setup, args->width);
+       rcl_u16(setup, args->height);
+       rcl_u16(setup, args->color_write.bits);
+
        for (y = min_y_tile; y <= max_y_tile; y++) {
                for (x = min_x_tile; x <= max_x_tile; x++) {
                        bool first = (x == min_x_tile && y == min_y_tile);
index 424d515..31de5d1 100644 (file)
@@ -17,6 +17,7 @@
  */
 
 #include "linux/component.h"
+#include "linux/pm_runtime.h"
 #include "vc4_drv.h"
 #include "vc4_regs.h"
 
@@ -144,21 +145,6 @@ int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused)
 }
 #endif /* CONFIG_DEBUG_FS */
 
-/*
- * Asks the firmware to turn on power to the V3D engine.
- *
- * This may be doable with just the clocks interface, though this
- * packet does some other register setup from the firmware, too.
- */
-int
-vc4_v3d_set_power(struct vc4_dev *vc4, bool on)
-{
-       if (on)
-               return pm_generic_poweroff(&vc4->v3d->pdev->dev);
-       else
-               return pm_generic_resume(&vc4->v3d->pdev->dev);
-}
-
 static void vc4_v3d_init_hw(struct drm_device *dev)
 {
        struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -170,6 +156,29 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
        V3D_WRITE(V3D_VPMBASE, 0);
 }
 
+#ifdef CONFIG_PM
+static int vc4_v3d_runtime_suspend(struct device *dev)
+{
+       struct vc4_v3d *v3d = dev_get_drvdata(dev);
+       struct vc4_dev *vc4 = v3d->vc4;
+
+       vc4_irq_uninstall(vc4->dev);
+
+       return 0;
+}
+
+static int vc4_v3d_runtime_resume(struct device *dev)
+{
+       struct vc4_v3d *v3d = dev_get_drvdata(dev);
+       struct vc4_dev *vc4 = v3d->vc4;
+
+       vc4_v3d_init_hw(vc4->dev);
+       vc4_irq_postinstall(vc4->dev);
+
+       return 0;
+}
+#endif
+
 static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 {
        struct platform_device *pdev = to_platform_device(dev);
@@ -182,6 +191,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
        if (!v3d)
                return -ENOMEM;
 
+       dev_set_drvdata(dev, v3d);
+
        v3d->pdev = pdev;
 
        v3d->regs = vc4_ioremap_regs(pdev, 0);
@@ -189,6 +200,7 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
                return PTR_ERR(v3d->regs);
 
        vc4->v3d = v3d;
+       v3d->vc4 = vc4;
 
        if (V3D_READ(V3D_IDENT0) != V3D_EXPECTED_IDENT0) {
                DRM_ERROR("V3D_IDENT0 read 0x%08x instead of 0x%08x\n",
@@ -210,6 +222,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
                return ret;
        }
 
+       pm_runtime_enable(dev);
+
        return 0;
 }
 
@@ -219,6 +233,8 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
        struct drm_device *drm = dev_get_drvdata(master);
        struct vc4_dev *vc4 = to_vc4_dev(drm);
 
+       pm_runtime_disable(dev);
+
        drm_irq_uninstall(drm);
 
        /* Disable the binner's overflow memory address, so the next
@@ -231,6 +247,10 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
        vc4->v3d = NULL;
 }
 
+static const struct dev_pm_ops vc4_v3d_pm_ops = {
+       SET_RUNTIME_PM_OPS(vc4_v3d_runtime_suspend, vc4_v3d_runtime_resume, NULL)
+};
+
 static const struct component_ops vc4_v3d_ops = {
        .bind   = vc4_v3d_bind,
        .unbind = vc4_v3d_unbind,
@@ -258,5 +278,6 @@ struct platform_driver vc4_v3d_driver = {
        .driver = {
                .name = "vc4_v3d",
                .of_match_table = vc4_v3d_dt_match,
+               .pm = &vc4_v3d_pm_ops,
        },
 };
index e26d9f6..24c2c74 100644 (file)
@@ -401,8 +401,8 @@ validate_tile_binning_config(VALIDATE_ARGS)
        tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
                                true);
        exec->tile_bo = &tile_bo->base;
-       if (!exec->tile_bo)
-               return -ENOMEM;
+       if (IS_ERR(exec->tile_bo))
+               return PTR_ERR(exec->tile_bo);
        list_add_tail(&tile_bo->unref_head, &exec->unref_list);
 
        /* tile alloc address. */
index c49812b..24fb348 100644 (file)
@@ -25,6 +25,7 @@
  *
  **************************************************************************/
 #include <linux/module.h>
+#include <linux/console.h>
 
 #include <drm/drmP.h>
 #include "vmwgfx_drv.h"
@@ -1538,6 +1539,12 @@ static int vmw_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 static int __init vmwgfx_init(void)
 {
        int ret;
+
+#ifdef CONFIG_VGA_CONSOLE
+       if (vgacon_text_force())
+               return -EINVAL;
+#endif
+
        ret = drm_pci_init(&driver, &vmw_pci_driver);
        if (ret)
                DRM_ERROR("Failed initializing DRM.\n");
index db082be..c5a1a08 100644 (file)
@@ -563,6 +563,8 @@ static void vmw_sou_connector_destroy(struct drm_connector *connector)
 
 static const struct drm_connector_funcs vmw_sou_connector_funcs = {
        .dpms = vmw_du_connector_dpms,
+       .detect = vmw_du_connector_detect,
+       .fill_modes = vmw_du_connector_fill_modes,
        .set_property = vmw_du_connector_set_property,
        .destroy = vmw_sou_connector_destroy,
 };
index da462af..dd2dbb9 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/host1x.h>
 #include <linux/of.h>
 #include <linux/slab.h>
+#include <linux/of_device.h>
 
 #include "bus.h"
 #include "dev.h"
@@ -394,6 +395,7 @@ static int host1x_device_add(struct host1x *host1x,
        device->dev.coherent_dma_mask = host1x->dev->coherent_dma_mask;
        device->dev.dma_mask = &device->dev.coherent_dma_mask;
        dev_set_name(&device->dev, "%s", driver->driver.name);
+       of_dma_configure(&device->dev, host1x->dev->of_node);
        device->dev.release = host1x_device_release;
        device->dev.bus = &host1x_bus_type;
        device->dev.parent = host1x->dev;
index 5a8c8d5..a18db4d 100644 (file)
@@ -52,8 +52,8 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb)
        struct host1x *host1x = cdma_to_host1x(cdma);
 
        if (pb->phys != 0)
-               dma_free_writecombine(host1x->dev, pb->size_bytes + 4,
-                                     pb->mapped, pb->phys);
+               dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped,
+                           pb->phys);
 
        pb->mapped = NULL;
        pb->phys = 0;
@@ -76,8 +76,8 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
        pb->pos = 0;
 
        /* allocate and map pushbuffer memory */
-       pb->mapped = dma_alloc_writecombine(host1x->dev, pb->size_bytes + 4,
-                                           &pb->phys, GFP_KERNEL);
+       pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys,
+                                 GFP_KERNEL);
        if (!pb->mapped)
                goto fail;
 
index 314bf37..ff34869 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/of_device.h>
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/dma-mapping.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/host1x.h>
@@ -68,6 +69,7 @@ static const struct host1x_info host1x01_info = {
        .nb_bases       = 8,
        .init           = host1x01_init,
        .sync_offset    = 0x3000,
+       .dma_mask       = DMA_BIT_MASK(32),
 };
 
 static const struct host1x_info host1x02_info = {
@@ -77,6 +79,7 @@ static const struct host1x_info host1x02_info = {
        .nb_bases = 12,
        .init = host1x02_init,
        .sync_offset = 0x3000,
+       .dma_mask = DMA_BIT_MASK(32),
 };
 
 static const struct host1x_info host1x04_info = {
@@ -86,6 +89,7 @@ static const struct host1x_info host1x04_info = {
        .nb_bases = 64,
        .init = host1x04_init,
        .sync_offset = 0x2100,
+       .dma_mask = DMA_BIT_MASK(34),
 };
 
 static const struct host1x_info host1x05_info = {
@@ -95,6 +99,7 @@ static const struct host1x_info host1x05_info = {
        .nb_bases = 64,
        .init = host1x05_init,
        .sync_offset = 0x2100,
+       .dma_mask = DMA_BIT_MASK(34),
 };
 
 static struct of_device_id host1x_of_match[] = {
@@ -148,6 +153,8 @@ static int host1x_probe(struct platform_device *pdev)
        if (IS_ERR(host->regs))
                return PTR_ERR(host->regs);
 
+       dma_set_mask_and_coherent(host->dev, host->info->dma_mask);
+
        if (host->info->init) {
                err = host->info->init(host);
                if (err)
index 0b6e8e9..dace124 100644 (file)
@@ -96,6 +96,7 @@ struct host1x_info {
        int     nb_mlocks;              /* host1x: number of mlocks */
        int     (*init)(struct host1x *); /* initialize per SoC ops */
        int     sync_offset;
+       u64     dma_mask;               /* mask of addressable memory */
 };
 
 struct host1x {
index 63bd63f..defa799 100644 (file)
@@ -467,9 +467,8 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev)
                size += g->words * sizeof(u32);
        }
 
-       job->gather_copy_mapped = dma_alloc_writecombine(dev, size,
-                                                        &job->gather_copy,
-                                                        GFP_KERNEL);
+       job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy,
+                                              GFP_KERNEL);
        if (!job->gather_copy_mapped) {
                job->gather_copy_mapped = NULL;
                return -ENOMEM;
@@ -578,9 +577,8 @@ void host1x_job_unpin(struct host1x_job *job)
        job->num_unpins = 0;
 
        if (job->gather_copy_size)
-               dma_free_writecombine(job->channel->dev, job->gather_copy_size,
-                                     job->gather_copy_mapped,
-                                     job->gather_copy);
+               dma_free_wc(job->channel->dev, job->gather_copy_size,
+                           job->gather_copy_mapped, job->gather_copy);
 }
 EXPORT_SYMBOL(host1x_job_unpin);
 
index f2e13eb..e00db3f 100644 (file)
@@ -1050,6 +1050,17 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
        for (i = 0; i < ARRAY_SIZE(client_reg); i++) {
                const struct ipu_platform_reg *reg = &client_reg[i];
                struct platform_device *pdev;
+               struct device_node *of_node;
+
+               /* Associate subdevice with the corresponding port node */
+               of_node = of_graph_get_port_by_id(dev->of_node, i);
+               if (!of_node) {
+                       dev_info(dev,
+                                "no port@%d node in %s, not using %s%d\n",
+                                i, dev->of_node->full_name,
+                                (i / 2) ? "DI" : "CSI", i % 2);
+                       continue;
+               }
 
                pdev = platform_device_alloc(reg->name, id++);
                if (!pdev) {
@@ -1057,17 +1068,9 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
                        goto err_register;
                }
 
+               pdev->dev.of_node = of_node;
                pdev->dev.parent = dev;
 
-               /* Associate subdevice with the corresponding port node */
-               pdev->dev.of_node = of_graph_get_port_by_id(dev->of_node, i);
-               if (!pdev->dev.of_node) {
-                       dev_err(dev, "missing port@%d node in %s\n", i,
-                               dev->of_node->full_name);
-                       ret = -ENODEV;
-                       goto err_register;
-               }
-
                ret = platform_device_add_data(pdev, &reg->pdata,
                                               sizeof(reg->pdata));
                if (!ret)
@@ -1289,10 +1292,6 @@ static int ipu_probe(struct platform_device *pdev)
        ipu->irq_sync = irq_sync;
        ipu->irq_err = irq_err;
 
-       ret = ipu_irq_init(ipu);
-       if (ret)
-               goto out_failed_irq;
-
        ret = device_reset(&pdev->dev);
        if (ret) {
                dev_err(&pdev->dev, "failed to reset: %d\n", ret);
@@ -1302,6 +1301,10 @@ static int ipu_probe(struct platform_device *pdev)
        if (ret)
                goto out_failed_reset;
 
+       ret = ipu_irq_init(ipu);
+       if (ret)
+               goto out_failed_irq;
+
        /* Set MCU_T to divide MCU access window into 2 */
        ipu_cm_write(ipu, 0x00400000L | (IPU_MCU_T_DEFAULT << 18),
                        IPU_DISP_GEN);
@@ -1324,9 +1327,9 @@ static int ipu_probe(struct platform_device *pdev)
 failed_add_clients:
        ipu_submodules_exit(ipu);
 failed_submodules_init:
-out_failed_reset:
        ipu_irq_exit(ipu);
 out_failed_irq:
+out_failed_reset:
        clk_disable_unprepare(ipu->clk);
        return ret;
 }
index f155b83..2b3105c 100644 (file)
@@ -126,7 +126,7 @@ static int ads1015_reg_to_mv(struct i2c_client *client, unsigned int channel,
        struct ads1015_data *data = i2c_get_clientdata(client);
        unsigned int pga = data->channel_data[channel].pga;
        int fullscale = fullscale_table[pga];
-       const unsigned mask = data->id == ads1115 ? 0x7fff : 0x7ff0;
+       const int mask = data->id == ads1115 ? 0x7fff : 0x7ff0;
 
        return DIV_ROUND_CLOSEST(reg * fullscale, mask);
 }
index c848789..c43318d 100644 (file)
@@ -930,6 +930,17 @@ static struct dmi_system_id i8k_dmi_table[] __initdata = {
 MODULE_DEVICE_TABLE(dmi, i8k_dmi_table);
 
 static struct dmi_system_id i8k_blacklist_dmi_table[] __initdata = {
+       {
+               /*
+                * CPU fan speed going up and down on Dell Studio XPS 8000
+                * for unknown reasons.
+                */
+               .ident = "Dell Studio XPS 8000",
+               .matches = {
+                       DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Studio XPS 8000"),
+               },
+       },
        {
                /*
                 * CPU fan speed going up and down on Dell Studio XPS 8100
index f77eb97..4f695d8 100644 (file)
@@ -90,7 +90,15 @@ static ssize_t show_power(struct device *dev,
        pci_bus_read_config_dword(f4->bus, PCI_DEVFN(PCI_SLOT(f4->devfn), 5),
                                  REG_TDP_LIMIT3, &val);
 
-       tdp_limit = val >> 16;
+       /*
+        * On Carrizo and later platforms, ApmTdpLimit bit field
+        * is extended to 16:31 from 16:28.
+        */
+       if (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model >= 0x60)
+               tdp_limit = val >> 16;
+       else
+               tdp_limit = (val >> 16) & 0x1fff;
+
        curr_pwr_watts = ((u64)(tdp_limit +
                                data->base_tdp)) << running_avg_range;
        curr_pwr_watts -= running_avg_capture;
index 82de3de..685568b 100644 (file)
@@ -406,16 +406,11 @@ static int gpio_fan_get_cur_state(struct thermal_cooling_device *cdev,
                                  unsigned long *state)
 {
        struct gpio_fan_data *fan_data = cdev->devdata;
-       int r;
 
        if (!fan_data)
                return -EINVAL;
 
-       r = get_fan_speed_index(fan_data);
-       if (r < 0)
-               return r;
-
-       *state = r;
+       *state = fan_data->speed_index;
        return 0;
 }
 
index 52f708b..d50c701 100644 (file)
@@ -313,6 +313,10 @@ int of_hwspin_lock_get_id(struct device_node *np, int index)
                hwlock = radix_tree_deref_slot(slot);
                if (unlikely(!hwlock))
                        continue;
+               if (radix_tree_is_indirect_ptr(hwlock)) {
+                       slot = radix_tree_iter_retry(&iter);
+                       continue;
+               }
 
                if (hwlock->bank->dev->of_node == args.np) {
                        ret = 0;
index 3711df1..4a45408 100644 (file)
@@ -586,8 +586,7 @@ static int brcmstb_i2c_probe(struct platform_device *pdev)
        if (!dev)
                return -ENOMEM;
 
-       dev->bsc_regmap = devm_kzalloc(&pdev->dev, sizeof(struct bsc_regs *),
-                                      GFP_KERNEL);
+       dev->bsc_regmap = devm_kzalloc(&pdev->dev, sizeof(*dev->bsc_regmap), GFP_KERNEL);
        if (!dev->bsc_regmap)
                return -ENOMEM;
 
index ba9732c..10fbd6d 100644 (file)
@@ -874,7 +874,8 @@ int i2c_dw_probe(struct dw_i2c_dev *dev)
        i2c_set_adapdata(adap, dev);
 
        i2c_dw_disable_int(dev);
-       r = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr, IRQF_SHARED,
+       r = devm_request_irq(dev->dev, dev->irq, i2c_dw_isr,
+                            IRQF_SHARED | IRQF_COND_SUSPEND,
                             dev_name(dev->dev), dev);
        if (r) {
                dev_err(dev->dev, "failure requesting irq %i: %d\n",
index f62d697..27fa0cb 100644 (file)
@@ -1271,6 +1271,8 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id)
        switch (dev->device) {
        case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS:
        case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS:
+       case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS:
+       case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS:
        case PCI_DEVICE_ID_INTEL_DNV_SMBUS:
                priv->features |= FEATURE_I2C_BLOCK_READ;
                priv->features |= FEATURE_IRQ;
index 08d26ba..13c4529 100644 (file)
@@ -1450,7 +1450,8 @@ omap_i2c_probe(struct platform_device *pdev)
 
 err_unuse_clocks:
        omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0);
-       pm_runtime_put(omap->dev);
+       pm_runtime_dont_use_autosuspend(omap->dev);
+       pm_runtime_put_sync(omap->dev);
        pm_runtime_disable(&pdev->dev);
 err_free_mem:
 
@@ -1468,6 +1469,7 @@ static int omap_i2c_remove(struct platform_device *pdev)
                return ret;
 
        omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0);
+       pm_runtime_dont_use_autosuspend(&pdev->dev);
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
        return 0;
index e045985..93f2895 100644 (file)
@@ -137,10 +137,11 @@ static const struct dmi_system_id piix4_dmi_ibm[] = {
 };
 
 /* SB800 globals */
+static DEFINE_MUTEX(piix4_mutex_sb800);
 static const char *piix4_main_port_names_sb800[PIIX4_MAX_ADAPTERS] = {
-       "SDA0", "SDA2", "SDA3", "SDA4"
+       " port 0", " port 2", " port 3", " port 4"
 };
-static const char *piix4_aux_port_name_sb800 = "SDA1";
+static const char *piix4_aux_port_name_sb800 = " port 1";
 
 struct i2c_piix4_adapdata {
        unsigned short smba;
@@ -148,7 +149,6 @@ struct i2c_piix4_adapdata {
        /* SB800 */
        bool sb800_main;
        unsigned short port;
-       struct mutex *mutex;
 };
 
 static int piix4_setup(struct pci_dev *PIIX4_dev,
@@ -275,10 +275,12 @@ static int piix4_setup_sb800(struct pci_dev *PIIX4_dev,
        else
                smb_en = (aux) ? 0x28 : 0x2c;
 
+       mutex_lock(&piix4_mutex_sb800);
        outb_p(smb_en, SB800_PIIX4_SMB_IDX);
        smba_en_lo = inb_p(SB800_PIIX4_SMB_IDX + 1);
        outb_p(smb_en + 1, SB800_PIIX4_SMB_IDX);
        smba_en_hi = inb_p(SB800_PIIX4_SMB_IDX + 1);
+       mutex_unlock(&piix4_mutex_sb800);
 
        if (!smb_en) {
                smb_en_status = smba_en_lo & 0x10;
@@ -559,7 +561,7 @@ static s32 piix4_access_sb800(struct i2c_adapter *adap, u16 addr,
        u8 port;
        int retval;
 
-       mutex_lock(adapdata->mutex);
+       mutex_lock(&piix4_mutex_sb800);
 
        outb_p(SB800_PIIX4_PORT_IDX, SB800_PIIX4_SMB_IDX);
        smba_en_lo = inb_p(SB800_PIIX4_SMB_IDX + 1);
@@ -574,7 +576,7 @@ static s32 piix4_access_sb800(struct i2c_adapter *adap, u16 addr,
 
        outb_p(smba_en_lo, SB800_PIIX4_SMB_IDX + 1);
 
-       mutex_unlock(adapdata->mutex);
+       mutex_unlock(&piix4_mutex_sb800);
 
        return retval;
 }
@@ -625,6 +627,7 @@ static struct i2c_adapter *piix4_main_adapters[PIIX4_MAX_ADAPTERS];
 static struct i2c_adapter *piix4_aux_adapter;
 
 static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba,
+                            bool sb800_main, unsigned short port,
                             const char *name, struct i2c_adapter **padap)
 {
        struct i2c_adapter *adap;
@@ -639,7 +642,8 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba,
 
        adap->owner = THIS_MODULE;
        adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD;
-       adap->algo = &smbus_algorithm;
+       adap->algo = sb800_main ? &piix4_smbus_algorithm_sb800
+                               : &smbus_algorithm;
 
        adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL);
        if (adapdata == NULL) {
@@ -649,12 +653,14 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba,
        }
 
        adapdata->smba = smba;
+       adapdata->sb800_main = sb800_main;
+       adapdata->port = port;
 
        /* set up the sysfs linkage to our parent device */
        adap->dev.parent = &dev->dev;
 
        snprintf(adap->name, sizeof(adap->name),
-               "SMBus PIIX4 adapter %s at %04x", name, smba);
+               "SMBus PIIX4 adapter%s at %04x", name, smba);
 
        i2c_set_adapdata(adap, adapdata);
 
@@ -673,30 +679,16 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba,
 
 static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba)
 {
-       struct mutex *mutex;
        struct i2c_piix4_adapdata *adapdata;
        int port;
        int retval;
 
-       mutex = kzalloc(sizeof(*mutex), GFP_KERNEL);
-       if (mutex == NULL)
-               return -ENOMEM;
-
-       mutex_init(mutex);
-
        for (port = 0; port < PIIX4_MAX_ADAPTERS; port++) {
-               retval = piix4_add_adapter(dev, smba,
+               retval = piix4_add_adapter(dev, smba, true, port,
                                           piix4_main_port_names_sb800[port],
                                           &piix4_main_adapters[port]);
                if (retval < 0)
                        goto error;
-
-               piix4_main_adapters[port]->algo = &piix4_smbus_algorithm_sb800;
-
-               adapdata = i2c_get_adapdata(piix4_main_adapters[port]);
-               adapdata->sb800_main = true;
-               adapdata->port = port;
-               adapdata->mutex = mutex;
        }
 
        return retval;
@@ -714,19 +706,20 @@ error:
                }
        }
 
-       kfree(mutex);
-
        return retval;
 }
 
 static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id)
 {
        int retval;
+       bool is_sb800 = false;
 
        if ((dev->vendor == PCI_VENDOR_ID_ATI &&
             dev->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
             dev->revision >= 0x40) ||
            dev->vendor == PCI_VENDOR_ID_AMD) {
+               is_sb800 = true;
+
                if (!request_region(SB800_PIIX4_SMB_IDX, 2, "smba_idx")) {
                        dev_err(&dev->dev,
                        "SMBus base address index region 0x%x already in use!\n",
@@ -756,7 +749,7 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id)
                        return retval;
 
                /* Try to register main SMBus adapter, give up if we can't */
-               retval = piix4_add_adapter(dev, retval, "main",
+               retval = piix4_add_adapter(dev, retval, false, 0, "",
                                           &piix4_main_adapters[0]);
                if (retval < 0)
                        return retval;
@@ -783,7 +776,8 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id)
        if (retval > 0) {
                /* Try to add the aux adapter if it exists,
                 * piix4_add_adapter will clean up if this fails */
-               piix4_add_adapter(dev, retval, piix4_aux_port_name_sb800,
+               piix4_add_adapter(dev, retval, false, 0,
+                                 is_sb800 ? piix4_aux_port_name_sb800 : "",
                                  &piix4_aux_adapter);
        }
 
@@ -798,10 +792,8 @@ static void piix4_adap_remove(struct i2c_adapter *adap)
                i2c_del_adapter(adap);
                if (adapdata->port == 0) {
                        release_region(adapdata->smba, SMBIOSIZE);
-                       if (adapdata->sb800_main) {
-                               kfree(adapdata->mutex);
+                       if (adapdata->sb800_main)
                                release_region(SB800_PIIX4_SMB_IDX, 2);
-                       }
                }
                kfree(adapdata);
                kfree(adap);
index f3e5ff8..213ba55 100644 (file)
@@ -467,7 +467,7 @@ static int uniphier_fi2c_clk_init(struct device *dev,
                bus_speed = UNIPHIER_FI2C_DEFAULT_SPEED;
 
        if (!bus_speed) {
-               dev_err(dev, "clock-freqyency should not be zero\n");
+               dev_err(dev, "clock-frequency should not be zero\n");
                return -EINVAL;
        }
 
index 1f4f3f5..89eaa8a 100644 (file)
@@ -328,7 +328,7 @@ static int uniphier_i2c_clk_init(struct device *dev,
                bus_speed = UNIPHIER_I2C_DEFAULT_SPEED;
 
        if (!bus_speed) {
-               dev_err(dev, "clock-freqyency should not be zero\n");
+               dev_err(dev, "clock-frequency should not be zero\n");
                return -EINVAL;
        }
 
index edc29b1..833ea9d 100644 (file)
@@ -213,6 +213,7 @@ config STK8312
 config STK8BA50
        tristate "Sensortek STK8BA50 3-Axis Accelerometer Driver"
        depends on I2C
+       depends on IIO_TRIGGER
        help
          Say yes here to get support for the Sensortek STK8BA50 3-axis
          accelerometer.
index 605ff42..283ded7 100644 (file)
@@ -175,6 +175,7 @@ config DA9150_GPADC
 config EXYNOS_ADC
        tristate "Exynos ADC driver support"
        depends on ARCH_EXYNOS || ARCH_S3C24XX || ARCH_S3C64XX || (OF && COMPILE_TEST)
+       depends on HAS_IOMEM
        help
          Core support for the ADC block found in the Samsung EXYNOS series
          of SoCs for drivers such as the touchscreen and hwmon to use to share
@@ -207,6 +208,7 @@ config INA2XX_ADC
 config IMX7D_ADC
        tristate "IMX7D ADC driver"
        depends on ARCH_MXC || COMPILE_TEST
+       depends on HAS_IOMEM
        help
          Say yes here to build support for IMX7D ADC.
 
@@ -409,6 +411,7 @@ config TWL6030_GPADC
 config VF610_ADC
        tristate "Freescale vf610 ADC driver"
        depends on OF
+       depends on HAS_IOMEM
        select IIO_BUFFER
        select IIO_TRIGGERED_BUFFER
        help
index 942320e..c1e0553 100644 (file)
@@ -289,7 +289,7 @@ static int tiadc_iio_buffered_hardware_setup(struct iio_dev *indio_dev,
                goto error_kfifo_free;
 
        indio_dev->setup_ops = setup_ops;
-       indio_dev->modes |= INDIO_BUFFER_HARDWARE;
+       indio_dev->modes |= INDIO_BUFFER_SOFTWARE;
 
        return 0;
 
index 43d1458..b4dde83 100644 (file)
@@ -300,6 +300,7 @@ static int mcp4725_probe(struct i2c_client *client,
        data->client = client;
 
        indio_dev->dev.parent = &client->dev;
+       indio_dev->name = id->name;
        indio_dev->info = &mcp4725_info;
        indio_dev->channels = &mcp4725_channel;
        indio_dev->num_channels = 1;
index 1165b1c..cfc5a05 100644 (file)
@@ -117,7 +117,7 @@ static int dht11_decode(struct dht11 *dht11, int offset, int timeres)
        if (((hum_int + hum_dec + temp_int + temp_dec) & 0xff) != checksum)
                return -EIO;
 
-       dht11->timestamp = ktime_get_real_ns();
+       dht11->timestamp = ktime_get_boot_ns();
        if (hum_int < 20) {  /* DHT22 */
                dht11->temperature = (((temp_int & 0x7f) << 8) + temp_dec) *
                                        ((temp_int & 0x80) ? -100 : 100);
@@ -145,7 +145,7 @@ static irqreturn_t dht11_handle_irq(int irq, void *data)
 
        /* TODO: Consider making the handler safe for IRQ sharing */
        if (dht11->num_edges < DHT11_EDGES_PER_READ && dht11->num_edges >= 0) {
-               dht11->edges[dht11->num_edges].ts = ktime_get_real_ns();
+               dht11->edges[dht11->num_edges].ts = ktime_get_boot_ns();
                dht11->edges[dht11->num_edges++].value =
                                                gpio_get_value(dht11->gpio);
 
@@ -164,7 +164,7 @@ static int dht11_read_raw(struct iio_dev *iio_dev,
        int ret, timeres;
 
        mutex_lock(&dht11->lock);
-       if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_real_ns()) {
+       if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_boot_ns()) {
                timeres = ktime_get_resolution_ns();
                if (DHT11_DATA_BIT_HIGH < 2 * timeres) {
                        dev_err(dht11->dev, "timeresolution %dns too low\n",
@@ -279,7 +279,7 @@ static int dht11_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       dht11->timestamp = ktime_get_real_ns() - DHT11_DATA_VALID_TIME - 1;
+       dht11->timestamp = ktime_get_boot_ns() - DHT11_DATA_VALID_TIME - 1;
        dht11->num_edges = -1;
 
        platform_set_drvdata(pdev, iio);
index cb32b59..36607d5 100644 (file)
@@ -43,7 +43,7 @@ int adis_update_scan_mode(struct iio_dev *indio_dev,
                return -ENOMEM;
 
        rx = adis->buffer;
-       tx = rx + indio_dev->scan_bytes;
+       tx = rx + scan_count;
 
        spi_message_init(&adis->msg);
 
index 48fbc0b..8f8d137 100644 (file)
@@ -5,9 +5,9 @@
 config INV_MPU6050_IIO
        tristate "Invensense MPU6050 devices"
        depends on I2C && SYSFS
+       depends on I2C_MUX
        select IIO_BUFFER
        select IIO_TRIGGERED_BUFFER
-       select I2C_MUX
        help
          This driver supports the Invensense MPU6050 devices.
          This driver can also support MPU6500 in MPU6050 compatibility mode
index 80fbbfd..734a004 100644 (file)
@@ -349,6 +349,8 @@ EXPORT_SYMBOL_GPL(iio_channel_get);
 
 void iio_channel_release(struct iio_channel *channel)
 {
+       if (!channel)
+               return;
        iio_device_put(channel->indio_dev);
        kfree(channel);
 }
index 60537ec..53201d9 100644 (file)
@@ -54,7 +54,9 @@ static const struct iio_chan_spec acpi_als_channels[] = {
                        .realbits       = 32,
                        .storagebits    = 32,
                },
-               .info_mask_separate     = BIT(IIO_CHAN_INFO_RAW),
+               /* _RAW is here for backward ABI compatibility */
+               .info_mask_separate     = BIT(IIO_CHAN_INFO_RAW) |
+                                         BIT(IIO_CHAN_INFO_PROCESSED),
        },
 };
 
@@ -152,7 +154,7 @@ static int acpi_als_read_raw(struct iio_dev *indio_dev,
        s32 temp_val;
        int ret;
 
-       if (mask != IIO_CHAN_INFO_RAW)
+       if ((mask != IIO_CHAN_INFO_PROCESSED) && (mask != IIO_CHAN_INFO_RAW))
                return -EINVAL;
 
        /* we support only illumination (_ALI) so far. */
index 809a961..6bf89d8 100644 (file)
@@ -180,7 +180,7 @@ static const struct ltr501_samp_table ltr501_ps_samp_table[] = {
                        {500000, 2000000}
 };
 
-static unsigned int ltr501_match_samp_freq(const struct ltr501_samp_table *tab,
+static int ltr501_match_samp_freq(const struct ltr501_samp_table *tab,
                                           int len, int val, int val2)
 {
        int i, freq;
index f5ecd6e..a0d7dee 100644 (file)
@@ -117,7 +117,7 @@ static int mpl115_read_raw(struct iio_dev *indio_dev,
                *val = ret >> 6;
                return IIO_VAL_INT;
        case IIO_CHAN_INFO_OFFSET:
-               *val = 605;
+               *val = -605;
                *val2 = 750000;
                return IIO_VAL_INT_PLUS_MICRO;
        case IIO_CHAN_INFO_SCALE:
index 93e29fb..db35e04 100644 (file)
@@ -87,7 +87,7 @@ static int lidar_i2c_xfer(struct lidar_data *data, u8 reg, u8 *val, int len)
 
        ret = i2c_transfer(client->adapter, msg, 2);
 
-       return (ret == 2) ? 0 : ret;
+       return (ret == 2) ? 0 : -EIO;
 }
 
 static int lidar_smbus_xfer(struct lidar_data *data, u8 reg, u8 *val, int len)
index 00da80e..94b80a5 100644 (file)
@@ -358,6 +358,7 @@ int ib_register_device(struct ib_device *device,
        ret = device->query_device(device, &device->attrs, &uhw);
        if (ret) {
                printk(KERN_WARNING "Couldn't query the device attributes\n");
+               ib_cache_cleanup_one(device);
                goto out;
        }
 
index f334090..1e37f35 100644 (file)
@@ -1071,7 +1071,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
                }
        }
 
-       if (rec->hop_limit > 1 || use_roce) {
+       if (rec->hop_limit > 0 || use_roce) {
                ah_attr->ah_flags = IB_AH_GRH;
                ah_attr->grh.dgid = rec->dgid;
 
index 3de9351..14606af 100644 (file)
@@ -336,7 +336,6 @@ static ssize_t _show_port_gid_attr(struct ib_port *p,
        union ib_gid gid;
        struct ib_gid_attr gid_attr = {};
        ssize_t ret;
-       va_list args;
 
        ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid,
                           &gid_attr);
@@ -348,7 +347,6 @@ static ssize_t _show_port_gid_attr(struct ib_port *p,
 err:
        if (gid_attr.ndev)
                dev_put(gid_attr.ndev);
-       va_end(args);
        return ret;
 }
 
@@ -722,12 +720,11 @@ static struct attribute_group *get_counter_table(struct ib_device *dev,
 
        if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
                                &cpi, 40, sizeof(cpi)) >= 0) {
-
-               if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH)
+               if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH)
                        /* We have extended counters */
                        return &pma_group_ext;
 
-               if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+               if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
                        /* But not the IETF ones */
                        return &pma_group_noietf;
        }
index 19837d2..2116132 100644 (file)
@@ -322,6 +322,8 @@ int ib_ud_header_init(int     payload_bytes,
                      int    immediate_present,
                      struct ib_ud_header *header)
 {
+       size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
        grh_present = grh_present && !ip_version;
        memset(header, 0, sizeof *header);
 
@@ -353,7 +355,8 @@ int ib_ud_header_init(int     payload_bytes,
        if (ip_version == 6 || grh_present) {
                header->grh.ip_version      = 6;
                header->grh.payload_length  =
-                       cpu_to_be16((IB_BTH_BYTES     +
+                       cpu_to_be16((udp_bytes        +
+                                    IB_BTH_BYTES     +
                                     IB_DETH_BYTES    +
                                     payload_bytes    +
                                     4                + /* ICRC     */
@@ -362,8 +365,6 @@ int ib_ud_header_init(int     payload_bytes,
        }
 
        if (ip_version == 4) {
-               int udp_bytes = udp_present ? IB_UDP_BYTES : 0;
-
                header->ip4.ver = 4; /* version 4 */
                header->ip4.hdr_len = 5; /* 5 words */
                header->ip4.tot_len =
index 6ffc9c4..6c6fbff 100644 (file)
@@ -1970,7 +1970,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
                   resp_size);
        INIT_UDATA(&uhw, buf + sizeof(cmd),
                   (unsigned long)cmd.response + resp_size,
-                  in_len - sizeof(cmd), out_len - resp_size);
+                  in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+                  out_len - resp_size);
 
        memset(&cmd_ex, 0, sizeof(cmd_ex));
        cmd_ex.user_handle = cmd.user_handle;
@@ -3413,7 +3414,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 
        INIT_UDATA(&udata, buf + sizeof cmd,
                   (unsigned long) cmd.response + sizeof resp,
-                  in_len - sizeof cmd, out_len - sizeof resp);
+                  in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr),
+                  out_len - sizeof resp);
 
        ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
        if (ret)
@@ -3439,7 +3441,8 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 
        INIT_UDATA(&udata, buf + sizeof cmd,
                   (unsigned long) cmd.response + sizeof resp,
-                  in_len - sizeof cmd, out_len - sizeof resp);
+                  in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr),
+                  out_len - sizeof resp);
 
        ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
        if (ret)
index 26833bf..d68f506 100644 (file)
@@ -817,17 +817,48 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
        return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 }
 
-static void edit_counter(struct mlx4_counter *cnt,
-                                       struct ib_pma_portcounters *pma_cnt)
+static void edit_counter(struct mlx4_counter *cnt, void *counters,
+                        __be16 attr_id)
 {
-       ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
-                            (be64_to_cpu(cnt->tx_bytes) >> 2));
-       ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
-                            (be64_to_cpu(cnt->rx_bytes) >> 2));
-       ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
-                            be64_to_cpu(cnt->tx_frames));
-       ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
-                            be64_to_cpu(cnt->rx_frames));
+       switch (attr_id) {
+       case IB_PMA_PORT_COUNTERS:
+       {
+               struct ib_pma_portcounters *pma_cnt =
+                       (struct ib_pma_portcounters *)counters;
+
+               ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
+                                    (be64_to_cpu(cnt->tx_bytes) >> 2));
+               ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
+                                    (be64_to_cpu(cnt->rx_bytes) >> 2));
+               ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
+                                    be64_to_cpu(cnt->tx_frames));
+               ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
+                                    be64_to_cpu(cnt->rx_frames));
+               break;
+       }
+       case IB_PMA_PORT_COUNTERS_EXT:
+       {
+               struct ib_pma_portcounters_ext *pma_cnt_ext =
+                       (struct ib_pma_portcounters_ext *)counters;
+
+               pma_cnt_ext->port_xmit_data =
+                       cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2);
+               pma_cnt_ext->port_rcv_data =
+                       cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2);
+               pma_cnt_ext->port_xmit_packets = cnt->tx_frames;
+               pma_cnt_ext->port_rcv_packets = cnt->rx_frames;
+               break;
+       }
+       }
+}
+
+static int iboe_process_mad_port_info(void *out_mad)
+{
+       struct ib_class_port_info cpi = {};
+
+       cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+       memcpy(out_mad, &cpi, sizeof(cpi));
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 }
 
 static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -842,6 +873,9 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
        if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
                return -EINVAL;
 
+       if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)
+               return iboe_process_mad_port_info((void *)(out_mad->data + 40));
+
        memset(&counter_stats, 0, sizeof(counter_stats));
        mutex_lock(&dev->counters_table[port_num - 1].mutex);
        list_for_each_entry(tmp_counter,
@@ -863,7 +897,8 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
                switch (counter_stats.counter_mode & 0xf) {
                case 0:
                        edit_counter(&counter_stats,
-                                    (void *)(out_mad->data + 40));
+                                    (void *)(out_mad->data + 40),
+                                    in_mad->mad_hdr.attr_id);
                        err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
                        break;
                default:
@@ -894,8 +929,10 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
         */
        if (link == IB_LINK_LAYER_INFINIBAND) {
                if (mlx4_is_slave(dev->dev) &&
-                   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
-                   in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS)
+                   (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
+                    (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS ||
+                     in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT ||
+                     in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)))
                        return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
                                                in_grh, in_mad, out_mad);
 
index bc5536f..fd97534 100644 (file)
@@ -1681,9 +1681,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        }
 
        if (qp->ibqp.uobject)
-               context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+               context->usr_page = cpu_to_be32(
+                       mlx4_to_hw_uar_index(dev->dev,
+                                            to_mucontext(ibqp->uobject->context)->uar.index));
        else
-               context->usr_page = cpu_to_be32(dev->priv_uar.index);
+               context->usr_page = cpu_to_be32(
+                       mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
 
        if (attr_mask & IB_QP_DEST_QPN)
                context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
index ec737e2..03c418c 100644 (file)
@@ -844,6 +844,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        int err;
        int i;
        size_t reqlen;
+       size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
+                                    max_cqe_version);
 
        if (!dev->ib_active)
                return ERR_PTR(-EAGAIN);
@@ -854,7 +856,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
        if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
                ver = 0;
-       else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+       else if (reqlen >= min_req_v2)
                ver = 2;
        else
                return ERR_PTR(-EINVAL);
@@ -2214,7 +2216,9 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
                (1ull << IB_USER_VERBS_CMD_OPEN_QP);
        dev->ib_dev.uverbs_ex_cmd_mask =
-               (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
+               (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
+               (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
+               (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
 
        dev->ib_dev.query_device        = mlx5_ib_query_device;
        dev->ib_dev.query_port          = mlx5_ib_query_port;
index 8fb9c27..34cb8e8 100644 (file)
@@ -270,8 +270,10 @@ static int sq_overhead(enum ib_qp_type qp_type)
                /* fall through */
        case IB_QPT_RC:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
-                       sizeof(struct mlx5_wqe_atomic_seg) +
-                       sizeof(struct mlx5_wqe_raddr_seg);
+                       max(sizeof(struct mlx5_wqe_atomic_seg) +
+                           sizeof(struct mlx5_wqe_raddr_seg),
+                           sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+                           sizeof(struct mlx5_mkey_seg));
                break;
 
        case IB_QPT_XRC_TGT:
@@ -279,9 +281,9 @@ static int sq_overhead(enum ib_qp_type qp_type)
 
        case IB_QPT_UC:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
-                       sizeof(struct mlx5_wqe_raddr_seg) +
-                       sizeof(struct mlx5_wqe_umr_ctrl_seg) +
-                       sizeof(struct mlx5_mkey_seg);
+                       max(sizeof(struct mlx5_wqe_raddr_seg),
+                           sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+                           sizeof(struct mlx5_mkey_seg));
                break;
 
        case IB_QPT_UD:
@@ -1036,7 +1038,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
        wq = MLX5_ADDR_OF(rqc, rqc, wq);
        MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
        MLX5_SET(wq, wq, end_padding_mode,
-                MLX5_GET64(qpc, qpc, end_padding_mode));
+                MLX5_GET(qpc, qpc, end_padding_mode));
        MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset));
        MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
        MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
@@ -1615,15 +1617,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 
        if (pd) {
                dev = to_mdev(pd->device);
-       } else {
-               /* being cautious here */
-               if (init_attr->qp_type != IB_QPT_XRC_TGT &&
-                   init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
-                       pr_warn("%s: no PD for transport %s\n", __func__,
-                               ib_qp_type_str(init_attr->qp_type));
-                       return ERR_PTR(-EINVAL);
-               }
-               dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
 
                if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
                        if (!pd->uobject) {
@@ -1634,6 +1627,15 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
                                return ERR_PTR(-EINVAL);
                        }
                }
+       } else {
+               /* being cautious here */
+               if (init_attr->qp_type != IB_QPT_XRC_TGT &&
+                   init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
+                       pr_warn("%s: no PD for transport %s\n", __func__,
+                               ib_qp_type_str(init_attr->qp_type));
+                       return ERR_PTR(-EINVAL);
+               }
+               dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
        }
 
        switch (init_attr->qp_type) {
index 4659256..3b2ddd6 100644 (file)
@@ -75,7 +75,8 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
 
 static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                           struct mlx5_create_srq_mbox_in **in,
-                          struct ib_udata *udata, int buf_size, int *inlen)
+                          struct ib_udata *udata, int buf_size, int *inlen,
+                          int is_xrc)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_create_srq ucmd = {};
@@ -87,13 +88,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
        int ncont;
        u32 offset;
        u32 uidx = MLX5_IB_DEFAULT_UIDX;
-       int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
 
-       if (drv_data < 0)
-               return -EINVAL;
-
-       ucmdlen = (drv_data < sizeof(ucmd)) ?
-                 drv_data : sizeof(ucmd);
+       ucmdlen = min(udata->inlen, sizeof(ucmd));
 
        if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
                mlx5_ib_dbg(dev, "failed copy udata\n");
@@ -103,15 +99,17 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
        if (ucmd.reserved0 || ucmd.reserved1)
                return -EINVAL;
 
-       if (drv_data > sizeof(ucmd) &&
+       if (udata->inlen > sizeof(ucmd) &&
            !ib_is_udata_cleared(udata, sizeof(ucmd),
-                                drv_data - sizeof(ucmd)))
+                                udata->inlen - sizeof(ucmd)))
                return -EINVAL;
 
-       err = get_srq_user_index(to_mucontext(pd->uobject->context),
-                                &ucmd, udata->inlen, &uidx);
-       if (err)
-               return err;
+       if (is_xrc) {
+               err = get_srq_user_index(to_mucontext(pd->uobject->context),
+                                        &ucmd, udata->inlen, &uidx);
+               if (err)
+                       return err;
+       }
 
        srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
@@ -151,7 +149,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
        (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
 
-       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
+            is_xrc){
                xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
                                     xrc_srq_context_entry);
                MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
@@ -170,7 +169,7 @@ err_umem:
 
 static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
                             struct mlx5_create_srq_mbox_in **in, int buf_size,
-                            int *inlen)
+                            int *inlen, int is_xrc)
 {
        int err;
        int i;
@@ -224,7 +223,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
 
        (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
-       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+       if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
+            is_xrc){
                xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
                                     xrc_srq_context_entry);
                /* 0xffffff means we ask to work with cqe version 0 */
@@ -302,10 +302,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
                    srq->msrq.max_avail_gather);
 
+       is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
+
        if (pd->uobject)
-               err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen);
+               err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen,
+                                     is_xrc);
        else
-               err = create_srq_kernel(dev, srq, &in, buf_size, &inlen);
+               err = create_srq_kernel(dev, srq, &in, buf_size, &inlen,
+                                       is_xrc);
 
        if (err) {
                mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
@@ -313,7 +317,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                goto err_srq;
        }
 
-       is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
        in->ctx.state_log_sz = ilog2(srq->msrq.max);
        flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
        xrcdn = 0;
index 040bb8b..12503f1 100644 (file)
@@ -323,9 +323,6 @@ struct ocrdma_cq {
                         */
        u32 max_hw_cqe;
        bool phase_change;
-       bool deferred_arm, deferred_sol;
-       bool first_arm;
-
        spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization
                                                   * to cq polling
                                                   */
index 5738493..f387430 100644 (file)
@@ -228,6 +228,11 @@ static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
 
        ocrdma_alloc_pd_pool(dev);
 
+       if (!ocrdma_alloc_stats_resources(dev)) {
+               pr_err("%s: stats resource allocation failed\n", __func__);
+               goto alloc_err;
+       }
+
        spin_lock_init(&dev->av_tbl.lock);
        spin_lock_init(&dev->flush_q_lock);
        return 0;
@@ -238,6 +243,7 @@ alloc_err:
 
 static void ocrdma_free_resources(struct ocrdma_dev *dev)
 {
+       ocrdma_release_stats_resources(dev);
        kfree(dev->stag_arr);
        kfree(dev->qp_tbl);
        kfree(dev->cq_tbl);
index 86c303a..255f774 100644 (file)
@@ -64,10 +64,11 @@ static int ocrdma_add_stat(char *start, char *pcur,
        return cpy_len;
 }
 
-static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev)
+bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev)
 {
        struct stats_mem *mem = &dev->stats_mem;
 
+       mutex_init(&dev->stats_lock);
        /* Alloc mbox command mem*/
        mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req),
                        sizeof(struct ocrdma_rdma_stats_resp));
@@ -91,13 +92,14 @@ static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev)
        return true;
 }
 
-static void ocrdma_release_stats_mem(struct ocrdma_dev *dev)
+void ocrdma_release_stats_resources(struct ocrdma_dev *dev)
 {
        struct stats_mem *mem = &dev->stats_mem;
 
        if (mem->va)
                dma_free_coherent(&dev->nic_info.pdev->dev, mem->size,
                                  mem->va, mem->pa);
+       mem->va = NULL;
        kfree(mem->debugfs_mem);
 }
 
@@ -838,15 +840,9 @@ void ocrdma_add_port_stats(struct ocrdma_dev *dev)
                                &dev->reset_stats, &ocrdma_dbg_ops))
                goto err;
 
-       /* Now create dma_mem for stats mbx command */
-       if (!ocrdma_alloc_stats_mem(dev))
-               goto err;
-
-       mutex_init(&dev->stats_lock);
 
        return;
 err:
-       ocrdma_release_stats_mem(dev);
        debugfs_remove_recursive(dev->dir);
        dev->dir = NULL;
 }
@@ -855,9 +851,7 @@ void ocrdma_rem_port_stats(struct ocrdma_dev *dev)
 {
        if (!dev->dir)
                return;
-       debugfs_remove(dev->dir);
-       mutex_destroy(&dev->stats_lock);
-       ocrdma_release_stats_mem(dev);
+       debugfs_remove_recursive(dev->dir);
 }
 
 void ocrdma_init_debugfs(void)
index c9e58d0..bba1fec 100644 (file)
@@ -65,6 +65,8 @@ enum OCRDMA_STATS_TYPE {
 
 void ocrdma_rem_debugfs(void);
 void ocrdma_init_debugfs(void);
+bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev);
+void ocrdma_release_stats_resources(struct ocrdma_dev *dev);
 void ocrdma_rem_port_stats(struct ocrdma_dev *dev);
 void ocrdma_add_port_stats(struct ocrdma_dev *dev);
 int ocrdma_pma_counters(struct ocrdma_dev *dev,
index d4c687b..12420e4 100644 (file)
@@ -125,8 +125,8 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
                                        IB_DEVICE_SYS_IMAGE_GUID |
                                        IB_DEVICE_LOCAL_DMA_LKEY |
                                        IB_DEVICE_MEM_MGT_EXTENSIONS;
-       attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge);
-       attr->max_sge_rd = 0;
+       attr->max_sge = dev->attr.max_send_sge;
+       attr->max_sge_rd = attr->max_sge;
        attr->max_cq = dev->attr.max_cq;
        attr->max_cqe = dev->attr.max_cqe;
        attr->max_mr = dev->attr.max_mr;
@@ -1094,7 +1094,6 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
        spin_lock_init(&cq->comp_handler_lock);
        INIT_LIST_HEAD(&cq->sq_head);
        INIT_LIST_HEAD(&cq->rq_head);
-       cq->first_arm = true;
 
        if (ib_ctx) {
                uctx = get_ocrdma_ucontext(ib_ctx);
@@ -2726,8 +2725,7 @@ static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe)
                OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT;
        ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) &
                                                OCRDMA_CQE_SRCQP_MASK;
-       ibwc->pkey_index = le32_to_cpu(cqe->ud.rxlen_pkey) &
-                                               OCRDMA_CQE_PKEY_MASK;
+       ibwc->pkey_index = 0;
        ibwc->wc_flags = IB_WC_GRH;
        ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >>
                                        OCRDMA_CQE_UD_XFER_LEN_SHIFT);
@@ -2911,12 +2909,9 @@ expand_cqe:
        }
 stop_cqe:
        cq->getp = cur_getp;
-       if (cq->deferred_arm || polled_hw_cqes) {
-               ocrdma_ring_cq_db(dev, cq->id, cq->deferred_arm,
-                                 cq->deferred_sol, polled_hw_cqes);
-               cq->deferred_arm = false;
-               cq->deferred_sol = false;
-       }
+
+       if (polled_hw_cqes)
+               ocrdma_ring_cq_db(dev, cq->id, false, false, polled_hw_cqes);
 
        return i;
 }
@@ -3000,13 +2995,7 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
        if (cq_flags & IB_CQ_SOLICITED)
                sol_needed = true;
 
-       if (cq->first_arm) {
-               ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0);
-               cq->first_arm = false;
-       }
-
-       cq->deferred_arm = true;
-       cq->deferred_sol = sol_needed;
+       ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0);
        spin_unlock_irqrestore(&cq->cq_lock, flags);
 
        return 0;
index 5ea0c14..fa9c42f 100644 (file)
@@ -245,8 +245,6 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
        skb_reset_mac_header(skb);
        skb_pull(skb, IPOIB_ENCAP_LEN);
 
-       skb->truesize = SKB_TRUESIZE(skb->len);
-
        ++dev->stats.rx_packets;
        dev->stats.rx_bytes += skb->len;
 
index 050dfa1..2588931 100644 (file)
@@ -456,7 +456,10 @@ out_locked:
        return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
+/*
+ * Caller must hold 'priv->lock'
+ */
+static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_sa_multicast *multicast;
@@ -466,6 +469,10 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
        ib_sa_comp_mask comp_mask;
        int ret = 0;
 
+       if (!priv->broadcast ||
+           !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+               return -EINVAL;
+
        ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
 
        rec.mgid     = mcast->mcmember.mgid;
@@ -525,20 +532,23 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
                        rec.join_state = 4;
 #endif
        }
+       spin_unlock_irq(&priv->lock);
 
        multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
                                         &rec, comp_mask, GFP_KERNEL,
                                         ipoib_mcast_join_complete, mcast);
+       spin_lock_irq(&priv->lock);
        if (IS_ERR(multicast)) {
                ret = PTR_ERR(multicast);
                ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
-               spin_lock_irq(&priv->lock);
                /* Requeue this join task with a backoff delay */
                __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
                clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
                spin_unlock_irq(&priv->lock);
                complete(&mcast->done);
+               spin_lock_irq(&priv->lock);
        }
+       return 0;
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
@@ -620,9 +630,10 @@ void ipoib_mcast_join_task(struct work_struct *work)
                                /* Found the next unjoined group */
                                init_completion(&mcast->done);
                                set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-                               spin_unlock_irq(&priv->lock);
-                               ipoib_mcast_join(dev, mcast);
-                               spin_lock_irq(&priv->lock);
+                               if (ipoib_mcast_join(dev, mcast)) {
+                                       spin_unlock_irq(&priv->lock);
+                                       return;
+                               }
                        } else if (!delay_until ||
                                 time_before(mcast->delay_until, delay_until))
                                delay_until = mcast->delay_until;
@@ -641,10 +652,9 @@ out:
        if (mcast) {
                init_completion(&mcast->done);
                set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+               ipoib_mcast_join(dev, mcast);
        }
        spin_unlock_irq(&priv->lock);
-       if (mcast)
-               ipoib_mcast_join(dev, mcast);
 }
 
 int ipoib_mcast_start_thread(struct net_device *dev)
index 6727954..e8a84d1 100644 (file)
@@ -1207,7 +1207,6 @@ static void xpad_led_disconnect(struct usb_xpad *xpad)
 #else
 static int xpad_led_probe(struct usb_xpad *xpad) { return 0; }
 static void xpad_led_disconnect(struct usb_xpad *xpad) { }
-static void xpad_identify_controller(struct usb_xpad *xpad) { }
 #endif
 
 static int xpad_start_input(struct usb_xpad *xpad)
index 4d446d5..c01a1d6 100644 (file)
@@ -235,7 +235,7 @@ struct adp5589_kpad {
        unsigned short gpimapsize;
        unsigned extend_cfg;
        bool is_adp5585;
-       bool adp5585_support_row5;
+       bool support_row5;
 #ifdef CONFIG_GPIOLIB
        unsigned char gpiomap[ADP5589_MAXGPIO];
        bool export_gpio;
@@ -485,7 +485,7 @@ static int adp5589_build_gpiomap(struct adp5589_kpad *kpad,
        if (kpad->extend_cfg & C4_EXTEND_CFG)
                pin_used[kpad->var->c4_extend_cfg] = true;
 
-       if (!kpad->adp5585_support_row5)
+       if (!kpad->support_row5)
                pin_used[5] = true;
 
        for (i = 0; i < kpad->var->maxgpio; i++)
@@ -884,12 +884,13 @@ static int adp5589_probe(struct i2c_client *client,
 
        switch (id->driver_data) {
        case ADP5585_02:
-               kpad->adp5585_support_row5 = true;
+               kpad->support_row5 = true;
        case ADP5585_01:
                kpad->is_adp5585 = true;
                kpad->var = &const_adp5585;
                break;
        case ADP5589:
+               kpad->support_row5 = true;
                kpad->var = &const_adp5589;
                break;
        }
index 378db10..4401be2 100644 (file)
@@ -304,8 +304,10 @@ static int cap11xx_init_leds(struct device *dev,
                led->cdev.brightness = LED_OFF;
 
                error = of_property_read_u32(child, "reg", &reg);
-               if (error != 0 || reg >= num_leds)
+               if (error != 0 || reg >= num_leds) {
+                       of_node_put(child);
                        return -EINVAL;
+               }
 
                led->reg = reg;
                led->priv = priv;
@@ -313,8 +315,10 @@ static int cap11xx_init_leds(struct device *dev,
                INIT_WORK(&led->work, cap11xx_led_work);
 
                error = devm_led_classdev_register(dev, &led->cdev);
-               if (error)
+               if (error) {
+                       of_node_put(child);
                        return error;
+               }
 
                priv->num_leds++;
                led++;
index d6d16fa..1f2337a 100644 (file)
@@ -733,7 +733,7 @@ config INPUT_XEN_KBDDEV_FRONTEND
          module will be called xen-kbdfront.
 
 config INPUT_SIRFSOC_ONKEY
-       bool "CSR SiRFSoC power on/off/suspend key support"
+       tristate "CSR SiRFSoC power on/off/suspend key support"
        depends on ARCH_SIRF && OF
        default y
        help
index 9d5b89b..ed7237f 100644 (file)
@@ -101,7 +101,7 @@ static void sirfsoc_pwrc_close(struct input_dev *input)
 static const struct of_device_id sirfsoc_pwrc_of_match[] = {
        { .compatible = "sirf,prima2-pwrc" },
        {},
-}
+};
 MODULE_DEVICE_TABLE(of, sirfsoc_pwrc_of_match);
 
 static int sirfsoc_pwrc_probe(struct platform_device *pdev)
index e272f06..a3f0f5a 100644 (file)
@@ -458,8 +458,6 @@ int vmmouse_init(struct psmouse *psmouse)
        priv->abs_dev = abs_dev;
        psmouse->private = priv;
 
-       input_set_capability(rel_dev, EV_REL, REL_WHEEL);
-
        /* Set up and register absolute device */
        snprintf(priv->phys, sizeof(priv->phys), "%s/input1",
                 psmouse->ps2dev.serio->phys);
@@ -475,10 +473,6 @@ int vmmouse_init(struct psmouse *psmouse)
        abs_dev->id.version = psmouse->model;
        abs_dev->dev.parent = &psmouse->ps2dev.serio->dev;
 
-       error = input_register_device(priv->abs_dev);
-       if (error)
-               goto init_fail;
-
        /* Set absolute device capabilities */
        input_set_capability(abs_dev, EV_KEY, BTN_LEFT);
        input_set_capability(abs_dev, EV_KEY, BTN_RIGHT);
@@ -488,6 +482,13 @@ int vmmouse_init(struct psmouse *psmouse)
        input_set_abs_params(abs_dev, ABS_X, 0, VMMOUSE_MAX_X, 0, 0);
        input_set_abs_params(abs_dev, ABS_Y, 0, VMMOUSE_MAX_Y, 0, 0);
 
+       error = input_register_device(priv->abs_dev);
+       if (error)
+               goto init_fail;
+
+       /* Add wheel capability to the relative device */
+       input_set_capability(rel_dev, EV_REL, REL_WHEEL);
+
        psmouse->protocol_handler = vmmouse_process_byte;
        psmouse->disconnect = vmmouse_disconnect;
        psmouse->reconnect = vmmouse_reconnect;
index 8f82897..1ca7f55 100644 (file)
@@ -134,7 +134,7 @@ static void serio_find_driver(struct serio *serio)
        int error;
 
        error = device_attach(&serio->dev);
-       if (error < 0)
+       if (error < 0 && error != -EPROBE_DEFER)
                dev_warn(&serio->dev,
                         "device_attach() failed for %s (%s), error: %d\n",
                         serio->phys, serio->name, error);
index 5d4903a..69828d0 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
index 0b0f8c1..23fbe38 100644 (file)
@@ -822,16 +822,22 @@ static void edt_ft5x06_ts_get_defaults(struct device *dev,
        int error;
 
        error = device_property_read_u32(dev, "threshold", &val);
-       if (!error)
-               reg_addr->reg_threshold = val;
+       if (!error) {
+               edt_ft5x06_register_write(tsdata, reg_addr->reg_threshold, val);
+               tsdata->threshold = val;
+       }
 
        error = device_property_read_u32(dev, "gain", &val);
-       if (!error)
-               reg_addr->reg_gain = val;
+       if (!error) {
+               edt_ft5x06_register_write(tsdata, reg_addr->reg_gain, val);
+               tsdata->gain = val;
+       }
 
        error = device_property_read_u32(dev, "offset", &val);
-       if (!error)
-               reg_addr->reg_offset = val;
+       if (!error) {
+               edt_ft5x06_register_write(tsdata, reg_addr->reg_offset, val);
+               tsdata->offset = val;
+       }
 }
 
 static void
index 539b0de..374c129 100644 (file)
@@ -114,6 +114,7 @@ struct kmem_cache *amd_iommu_irq_cache;
 
 static void update_domain(struct protection_domain *domain);
 static int protection_domain_init(struct protection_domain *domain);
+static void detach_device(struct device *dev);
 
 /*
  * For dynamic growth the aperture size is split into ranges of 128MB of
@@ -384,6 +385,9 @@ static void iommu_uninit_device(struct device *dev)
        if (!dev_data)
                return;
 
+       if (dev_data->domain)
+               detach_device(dev);
+
        iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
                            dev);
 
@@ -2049,7 +2053,7 @@ static void do_attach(struct iommu_dev_data *dev_data,
        /* Update device table */
        set_dte_entry(dev_data->devid, domain, ats);
        if (alias != dev_data->devid)
-               set_dte_entry(dev_data->devid, domain, ats);
+               set_dte_entry(alias, domain, ats);
 
        device_flush_dte(dev_data);
 }
index 013bdff..bf4959f 100644 (file)
@@ -228,6 +228,10 @@ static int amd_iommu_enable_interrupts(void);
 static int __init iommu_go_to_state(enum iommu_init_state state);
 static void init_device_table_dma(void);
 
+static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
+                                   u8 bank, u8 cntr, u8 fxn,
+                                   u64 *value, bool is_write);
+
 static inline void update_last_devid(u16 devid)
 {
        if (devid > amd_iommu_last_bdf)
@@ -1015,6 +1019,34 @@ static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu)
        pci_write_config_dword(iommu->dev, 0xf0, 0x90);
 }
 
+/*
+ * Family15h Model 30h-3fh (IOMMU Mishandles ATS Write Permission)
+ * Workaround:
+ *     BIOS should enable ATS write permission check by setting
+ *     L2_DEBUG_3[AtsIgnoreIWDis](D0F2xF4_x47[0]) = 1b
+ */
+static void amd_iommu_ats_write_check_workaround(struct amd_iommu *iommu)
+{
+       u32 value;
+
+       if ((boot_cpu_data.x86 != 0x15) ||
+           (boot_cpu_data.x86_model < 0x30) ||
+           (boot_cpu_data.x86_model > 0x3f))
+               return;
+
+       /* Test L2_DEBUG_3[AtsIgnoreIWDis] == 1 */
+       value = iommu_read_l2(iommu, 0x47);
+
+       if (value & BIT(0))
+               return;
+
+       /* Set L2_DEBUG_3[AtsIgnoreIWDis] = 1 */
+       iommu_write_l2(iommu, 0x47, value | BIT(0));
+
+       pr_info("AMD-Vi: Applying ATS write check workaround for IOMMU at %s\n",
+               dev_name(&iommu->dev->dev));
+}
+
 /*
  * This function clues the initialization function for one IOMMU
  * together and also allocates the command buffer and programs the
@@ -1142,8 +1174,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu)
        amd_iommu_pc_present = true;
 
        /* Check if the performance counters can be written to */
-       if ((0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val, true)) ||
-           (0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val2, false)) ||
+       if ((0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) ||
+           (0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) ||
            (val != val2)) {
                pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
                amd_iommu_pc_present = false;
@@ -1284,6 +1316,7 @@ static int iommu_init_pci(struct amd_iommu *iommu)
        }
 
        amd_iommu_erratum_746_workaround(iommu);
+       amd_iommu_ats_write_check_workaround(iommu);
 
        iommu->iommu_dev = iommu_device_create(&iommu->dev->dev, iommu,
                                               amd_iommu_groups, "ivhd%d",
@@ -2283,22 +2316,15 @@ u8 amd_iommu_pc_get_max_counters(u16 devid)
 }
 EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
 
-int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
+static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
+                                   u8 bank, u8 cntr, u8 fxn,
                                    u64 *value, bool is_write)
 {
-       struct amd_iommu *iommu;
        u32 offset;
        u32 max_offset_lim;
 
-       /* Make sure the IOMMU PC resource is available */
-       if (!amd_iommu_pc_present)
-               return -ENODEV;
-
-       /* Locate the iommu associated with the device ID */
-       iommu = amd_iommu_rlookup_table[devid];
-
        /* Check for valid iommu and pc register indexing */
-       if (WARN_ON((iommu == NULL) || (fxn > 0x28) || (fxn & 7)))
+       if (WARN_ON((fxn > 0x28) || (fxn & 7)))
                return -ENODEV;
 
        offset = (u32)(((0x40|bank) << 12) | (cntr << 8) | fxn);
@@ -2322,3 +2348,16 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
        return 0;
 }
 EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val);
+
+int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
+                                   u64 *value, bool is_write)
+{
+       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+       /* Make sure the IOMMU PC resource is available */
+       if (!amd_iommu_pc_present || iommu == NULL)
+               return -ENODEV;
+
+       return iommu_pc_get_set_reg_val(iommu, bank, cntr, fxn,
+                                       value, is_write);
+}
index 62a400c..8ffd756 100644 (file)
@@ -329,7 +329,8 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb,
        /* Only care about add/remove events for physical functions */
        if (pdev->is_virtfn)
                return NOTIFY_DONE;
-       if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_DEL_DEVICE)
+       if (action != BUS_NOTIFY_ADD_DEVICE &&
+           action != BUS_NOTIFY_REMOVED_DEVICE)
                return NOTIFY_DONE;
 
        info = dmar_alloc_pci_notify_info(pdev, action);
@@ -339,7 +340,7 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb,
        down_write(&dmar_global_lock);
        if (action == BUS_NOTIFY_ADD_DEVICE)
                dmar_pci_bus_add_dev(info);
-       else if (action == BUS_NOTIFY_DEL_DEVICE)
+       else if (action == BUS_NOTIFY_REMOVED_DEVICE)
                dmar_pci_bus_del_dev(info);
        up_write(&dmar_global_lock);
 
@@ -1353,7 +1354,7 @@ void dmar_disable_qi(struct intel_iommu *iommu)
 
        raw_spin_lock_irqsave(&iommu->register_lock, flags);
 
-       sts =  dmar_readq(iommu->reg + DMAR_GSTS_REG);
+       sts =  readl(iommu->reg + DMAR_GSTS_REG);
        if (!(sts & DMA_GSTS_QIES))
                goto end;
 
index ac73876..a2e1b7f 100644 (file)
@@ -1489,7 +1489,7 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
-       if (dev_is_pci(info->dev))
+       if (!dev_is_pci(info->dev))
                return;
 
        pdev = to_pci_dev(info->dev);
@@ -4367,7 +4367,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
                                rmrru->devices_cnt);
                        if(ret < 0)
                                return ret;
-               } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
+               } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
                        dmar_remove_dev_scope(info, rmrr->segment,
                                rmrru->devices, rmrru->devices_cnt);
                }
@@ -4387,7 +4387,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
                                break;
                        else if(ret < 0)
                                return ret;
-               } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
+               } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
                        if (dmar_remove_dev_scope(info, atsr->segment,
                                        atsru->devices, atsru->devices_cnt))
                                break;
index 5046483..d9939fa 100644 (file)
@@ -249,12 +249,30 @@ static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *s
 static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+       struct intel_svm_dev *sdev;
 
+       /* This might end up being called from exit_mmap(), *before* the page
+        * tables are cleared. And __mmu_notifier_release() will delete us from
+        * the list of notifiers so that our invalidate_range() callback doesn't
+        * get called when the page tables are cleared. So we need to protect
+        * against hardware accessing those page tables.
+        *
+        * We do it by clearing the entry in the PASID table and then flushing
+        * the IOTLB and the PASID table caches. This might upset hardware;
+        * perhaps we'll want to point the PASID to a dummy PGD (like the zero
+        * page) so that we end up taking a fault that the hardware really
+        * *has* to handle gracefully without affecting other processes.
+        */
        svm->iommu->pasid_table[svm->pasid].val = 0;
+       wmb();
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(sdev, &svm->devs, list) {
+               intel_flush_pasid_dev(svm, sdev, svm->pasid);
+               intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
+       }
+       rcu_read_unlock();
 
-       /* There's no need to do any flush because we can't get here if there
-        * are any devices left anyway. */
-       WARN_ON(!list_empty(&svm->devs));
 }
 
 static const struct mmu_notifier_ops intel_mmuops = {
@@ -379,7 +397,6 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
                                goto out;
                        }
                        iommu->pasid_table[svm->pasid].val = (u64)__pa(mm->pgd) | 1;
-                       mm = NULL;
                } else
                        iommu->pasid_table[svm->pasid].val = (u64)__pa(init_mm.pgd) | 1 | (1ULL << 11);
                wmb();
@@ -442,11 +459,11 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
                                kfree_rcu(sdev, rcu);
 
                                if (list_empty(&svm->devs)) {
-                                       mmu_notifier_unregister(&svm->notifier, svm->mm);
 
                                        idr_remove(&svm->iommu->pasid_idr, svm->pasid);
                                        if (svm->mm)
-                                               mmput(svm->mm);
+                                               mmu_notifier_unregister(&svm->notifier, svm->mm);
+
                                        /* We mandate that no page faults may be outstanding
                                         * for the PASID when intel_svm_unbind_mm() is called.
                                         * If that is not obeyed, subtle errors will happen.
@@ -507,6 +524,10 @@ static irqreturn_t prq_event_thread(int irq, void *d)
        struct intel_svm *svm = NULL;
        int head, tail, handled = 0;
 
+       /* Clear PPR bit before reading head/tail registers, to
+        * ensure that we get a new interrupt if needed. */
+       writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
        while (head != tail) {
@@ -551,6 +572,9 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                 * any faults on kernel addresses. */
                if (!svm->mm)
                        goto bad_req;
+               /* If the mm is already defunct, don't handle faults. */
+               if (!atomic_inc_not_zero(&svm->mm->mm_users))
+                       goto bad_req;
                down_read(&svm->mm->mmap_sem);
                vma = find_extend_vma(svm->mm, address);
                if (!vma || address < vma->vm_start)
@@ -567,6 +591,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                result = QI_RESP_SUCCESS;
        invalid:
                up_read(&svm->mm->mmap_sem);
+               mmput(svm->mm);
        bad_req:
                /* Accounting for major/minor faults? */
                rcu_read_lock();
index c12ba45..ac59692 100644 (file)
@@ -629,7 +629,7 @@ static void iommu_disable_irq_remapping(struct intel_iommu *iommu)
 
        raw_spin_lock_irqsave(&iommu->register_lock, flags);
 
-       sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
+       sts = readl(iommu->reg + DMAR_GSTS_REG);
        if (!(sts & DMA_GSTS_IRES))
                goto end;
 
index 8bbcbfe..381ca5a 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/sizes.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/barrier.h>
 
index 715923d..7e8c441 100644 (file)
@@ -60,6 +60,17 @@ config ARM_VIC_NR
          The maximum number of VICs available in the system, for
          power management.
 
+config ARMADA_370_XP_IRQ
+       bool
+       select GENERIC_IRQ_CHIP
+       select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+
+config ALPINE_MSI
+       bool
+       depends on PCI && PCI_MSI
+       select GENERIC_IRQ_CHIP
+       select PCI_MSI_IRQ_DOMAIN
+
 config ATMEL_AIC_IRQ
        bool
        select GENERIC_IRQ_CHIP
@@ -78,6 +89,11 @@ config I8259
        bool
        select IRQ_DOMAIN
 
+config BCM6345_L1_IRQ
+       bool
+       select GENERIC_IRQ_CHIP
+       select IRQ_DOMAIN
+
 config BCM7038_L1_IRQ
        bool
        select GENERIC_IRQ_CHIP
@@ -151,6 +167,11 @@ config ST_IRQCHIP
        help
          Enables SysCfg Controlled IRQs on STi based platforms.
 
+config TANGO_IRQ
+       bool
+       select IRQ_DOMAIN
+       select GENERIC_IRQ_CHIP
+
 config TB10X_IRQC
        bool
        select IRQ_DOMAIN
@@ -159,6 +180,8 @@ config TB10X_IRQC
 config TS4800_IRQ
        tristate "TS-4800 IRQ controller"
        select IRQ_DOMAIN
+       depends on HAS_IOMEM
+       depends on SOC_IMX51 || COMPILE_TEST
        help
          Support for the TS-4800 FPGA IRQ controller
 
@@ -192,6 +215,8 @@ config KEYSTONE_IRQ
 
 config MIPS_GIC
        bool
+       select GENERIC_IRQ_IPI
+       select IRQ_DOMAIN_HIERARCHY
        select MIPS_CM
 
 config INGENIC_IRQ
@@ -217,3 +242,7 @@ config IRQ_MXS
        def_bool y if MACH_ASM9260 || ARCH_MXS
        select IRQ_DOMAIN
        select STMP_DEVICE
+
+config MVEBU_ODMI
+       bool
+       select GENERIC_MSI_IRQ_DOMAIN
index 18caacb..b03cfcb 100644 (file)
@@ -1,11 +1,13 @@
 obj-$(CONFIG_IRQCHIP)                  += irqchip.o
 
+obj-$(CONFIG_ALPINE_MSI)               += irq-alpine-msi.o
+obj-$(CONFIG_ATH79)                    += irq-ath79-cpu.o
+obj-$(CONFIG_ATH79)                    += irq-ath79-misc.o
 obj-$(CONFIG_ARCH_BCM2835)             += irq-bcm2835.o
 obj-$(CONFIG_ARCH_BCM2835)             += irq-bcm2836.o
 obj-$(CONFIG_ARCH_EXYNOS)              += exynos-combiner.o
 obj-$(CONFIG_ARCH_HIP04)               += irq-hip04.o
 obj-$(CONFIG_ARCH_MMP)                 += irq-mmp.o
-obj-$(CONFIG_ARCH_MVEBU)               += irq-armada-370-xp.o
 obj-$(CONFIG_IRQ_MXS)                  += irq-mxs.o
 obj-$(CONFIG_ARCH_TEGRA)               += irq-tegra.o
 obj-$(CONFIG_ARCH_S3C24XX)             += irq-s3c24xx.o
@@ -28,6 +30,7 @@ obj-$(CONFIG_ARM_GIC_V3_ITS)          += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-g
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)     += irq-mbigen.o
 obj-$(CONFIG_ARM_NVIC)                 += irq-nvic.o
 obj-$(CONFIG_ARM_VIC)                  += irq-vic.o
+obj-$(CONFIG_ARMADA_370_XP_IRQ)                += irq-armada-370-xp.o
 obj-$(CONFIG_ATMEL_AIC_IRQ)            += irq-atmel-aic-common.o irq-atmel-aic.o
 obj-$(CONFIG_ATMEL_AIC5_IRQ)   += irq-atmel-aic-common.o irq-atmel-aic5.o
 obj-$(CONFIG_I8259)                    += irq-i8259.o
@@ -40,12 +43,14 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ)    += irq-versatile-fpga.o
 obj-$(CONFIG_ARCH_NSPIRE)              += irq-zevio.o
 obj-$(CONFIG_ARCH_VT8500)              += irq-vt8500.o
 obj-$(CONFIG_ST_IRQCHIP)               += irq-st.o
+obj-$(CONFIG_TANGO_IRQ)                        += irq-tango.o
 obj-$(CONFIG_TB10X_IRQC)               += irq-tb10x.o
 obj-$(CONFIG_TS4800_IRQ)               += irq-ts4800.o
 obj-$(CONFIG_XTENSA)                   += irq-xtensa-pic.o
 obj-$(CONFIG_XTENSA_MX)                        += irq-xtensa-mx.o
 obj-$(CONFIG_IRQ_CROSSBAR)             += irq-crossbar.o
 obj-$(CONFIG_SOC_VF610)                        += irq-vf610-mscm-ir.o
+obj-$(CONFIG_BCM6345_L1_IRQ)           += irq-bcm6345-l1.o
 obj-$(CONFIG_BCM7038_L1_IRQ)           += irq-bcm7038-l1.o
 obj-$(CONFIG_BCM7120_L2_IRQ)           += irq-bcm7120-l2.o
 obj-$(CONFIG_BRCMSTB_L2_IRQ)           += irq-brcmstb-l2.o
@@ -59,3 +64,4 @@ obj-$(CONFIG_ARCH_SA1100)             += irq-sa11x0.o
 obj-$(CONFIG_INGENIC_IRQ)              += irq-ingenic.o
 obj-$(CONFIG_IMX_GPCV2)                        += irq-imx-gpcv2.o
 obj-$(CONFIG_PIC32_EVIC)               += irq-pic32-evic.o
+obj-$(CONFIG_MVEBU_ODMI)               += irq-mvebu-odmi.o
diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c
new file mode 100644 (file)
index 0000000..2538425
--- /dev/null
@@ -0,0 +1,293 @@
+/*
+ * Annapurna Labs MSIX support services
+ *
+ * Copyright (C) 2016, Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Antoine Tenart <antoine.tenart@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include <asm/irq.h>
+#include <asm-generic/msi.h>
+
+/* MSIX message address format: local GIC target */
+#define ALPINE_MSIX_SPI_TARGET_CLUSTER0                BIT(16)
+
+struct alpine_msix_data {
+       spinlock_t msi_map_lock;
+       phys_addr_t addr;
+       u32 spi_first;          /* The SGI number that MSIs start */
+       u32 num_spis;           /* The number of SGIs for MSIs */
+       unsigned long *msi_map;
+};
+
+static void alpine_msix_mask_msi_irq(struct irq_data *d)
+{
+       pci_msi_mask_irq(d);
+       irq_chip_mask_parent(d);
+}
+
+static void alpine_msix_unmask_msi_irq(struct irq_data *d)
+{
+       pci_msi_unmask_irq(d);
+       irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip alpine_msix_irq_chip = {
+       .name                   = "MSIx",
+       .irq_mask               = alpine_msix_mask_msi_irq,
+       .irq_unmask             = alpine_msix_unmask_msi_irq,
+       .irq_eoi                = irq_chip_eoi_parent,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
+};
+
+static int alpine_msix_allocate_sgi(struct alpine_msix_data *priv, int num_req)
+{
+       int first;
+
+       spin_lock(&priv->msi_map_lock);
+
+       first = bitmap_find_next_zero_area(priv->msi_map, priv->num_spis, 0,
+                                          num_req, 0);
+       if (first >= priv->num_spis) {
+               spin_unlock(&priv->msi_map_lock);
+               return -ENOSPC;
+       }
+
+       bitmap_set(priv->msi_map, first, num_req);
+
+       spin_unlock(&priv->msi_map_lock);
+
+       return priv->spi_first + first;
+}
+
+static void alpine_msix_free_sgi(struct alpine_msix_data *priv, unsigned sgi,
+                                int num_req)
+{
+       int first = sgi - priv->spi_first;
+
+       spin_lock(&priv->msi_map_lock);
+
+       bitmap_clear(priv->msi_map, first, num_req);
+
+       spin_unlock(&priv->msi_map_lock);
+}
+
+static void alpine_msix_compose_msi_msg(struct irq_data *data,
+                                       struct msi_msg *msg)
+{
+       struct alpine_msix_data *priv = irq_data_get_irq_chip_data(data);
+       phys_addr_t msg_addr = priv->addr;
+
+       msg_addr |= (data->hwirq << 3);
+
+       msg->address_hi = upper_32_bits(msg_addr);
+       msg->address_lo = lower_32_bits(msg_addr);
+       msg->data = 0;
+}
+
+static struct msi_domain_info alpine_msix_domain_info = {
+       .flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+                 MSI_FLAG_PCI_MSIX,
+       .chip   = &alpine_msix_irq_chip,
+};
+
+static struct irq_chip middle_irq_chip = {
+       .name                   = "alpine_msix_middle",
+       .irq_mask               = irq_chip_mask_parent,
+       .irq_unmask             = irq_chip_unmask_parent,
+       .irq_eoi                = irq_chip_eoi_parent,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
+       .irq_compose_msi_msg    = alpine_msix_compose_msi_msg,
+};
+
+static int alpine_msix_gic_domain_alloc(struct irq_domain *domain,
+                                       unsigned int virq, int sgi)
+{
+       struct irq_fwspec fwspec;
+       struct irq_data *d;
+       int ret;
+
+       if (!is_of_node(domain->parent->fwnode))
+               return -EINVAL;
+
+       fwspec.fwnode = domain->parent->fwnode;
+       fwspec.param_count = 3;
+       fwspec.param[0] = 0;
+       fwspec.param[1] = sgi;
+       fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
+
+       ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
+       if (ret)
+               return ret;
+
+       d = irq_domain_get_irq_data(domain->parent, virq);
+       d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
+
+       return 0;
+}
+
+static int alpine_msix_middle_domain_alloc(struct irq_domain *domain,
+                                          unsigned int virq,
+                                          unsigned int nr_irqs, void *args)
+{
+       struct alpine_msix_data *priv = domain->host_data;
+       int sgi, err, i;
+
+       sgi = alpine_msix_allocate_sgi(priv, nr_irqs);
+       if (sgi < 0)
+               return sgi;
+
+       for (i = 0; i < nr_irqs; i++) {
+               err = alpine_msix_gic_domain_alloc(domain, virq + i, sgi + i);
+               if (err)
+                       goto err_sgi;
+
+               irq_domain_set_hwirq_and_chip(domain, virq + i, sgi + i,
+                                             &middle_irq_chip, priv);
+       }
+
+       return 0;
+
+err_sgi:
+       while (--i >= 0)
+               irq_domain_free_irqs_parent(domain, virq, i);
+       alpine_msix_free_sgi(priv, sgi, nr_irqs);
+       return err;
+}
+
+static void alpine_msix_middle_domain_free(struct irq_domain *domain,
+                                          unsigned int virq,
+                                          unsigned int nr_irqs)
+{
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+       struct alpine_msix_data *priv = irq_data_get_irq_chip_data(d);
+
+       irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+       alpine_msix_free_sgi(priv, d->hwirq, nr_irqs);
+}
+
+static const struct irq_domain_ops alpine_msix_middle_domain_ops = {
+       .alloc  = alpine_msix_middle_domain_alloc,
+       .free   = alpine_msix_middle_domain_free,
+};
+
+static int alpine_msix_init_domains(struct alpine_msix_data *priv,
+                                   struct device_node *node)
+{
+       struct irq_domain *middle_domain, *msi_domain, *gic_domain;
+       struct device_node *gic_node;
+
+       gic_node = of_irq_find_parent(node);
+       if (!gic_node) {
+               pr_err("Failed to find the GIC node\n");
+               return -ENODEV;
+       }
+
+       gic_domain = irq_find_host(gic_node);
+       if (!gic_domain) {
+               pr_err("Failed to find the GIC domain\n");
+               return -ENXIO;
+       }
+
+       middle_domain = irq_domain_add_tree(NULL,
+                                           &alpine_msix_middle_domain_ops,
+                                           priv);
+       if (!middle_domain) {
+               pr_err("Failed to create the MSIX middle domain\n");
+               return -ENOMEM;
+       }
+
+       middle_domain->parent = gic_domain;
+
+       msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node),
+                                              &alpine_msix_domain_info,
+                                              middle_domain);
+       if (!msi_domain) {
+               pr_err("Failed to create MSI domain\n");
+               irq_domain_remove(middle_domain);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int alpine_msix_init(struct device_node *node,
+                           struct device_node *parent)
+{
+       struct alpine_msix_data *priv;
+       struct resource res;
+       int ret;
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       spin_lock_init(&priv->msi_map_lock);
+
+       ret = of_address_to_resource(node, 0, &res);
+       if (ret) {
+               pr_err("Failed to allocate resource\n");
+               goto err_priv;
+       }
+
+       /*
+        * The 20 least significant bits of addr provide direct information
+        * regarding the interrupt destination.
+        *
+        * To select the primary GIC as the target GIC, bits [18:17] must be set
+        * to 0x0. In this case, bit 16 (SPI_TARGET_CLUSTER0) must be set.
+        */
+       priv->addr = res.start & GENMASK_ULL(63,20);
+       priv->addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0;
+
+       if (of_property_read_u32(node, "al,msi-base-spi", &priv->spi_first)) {
+               pr_err("Unable to parse MSI base\n");
+               ret = -EINVAL;
+               goto err_priv;
+       }
+
+       if (of_property_read_u32(node, "al,msi-num-spis", &priv->num_spis)) {
+               pr_err("Unable to parse MSI numbers\n");
+               ret = -EINVAL;
+               goto err_priv;
+       }
+
+       priv->msi_map = kzalloc(sizeof(*priv->msi_map) * BITS_TO_LONGS(priv->num_spis),
+                               GFP_KERNEL);
+       if (!priv->msi_map) {
+               ret = -ENOMEM;
+               goto err_priv;
+       }
+
+       pr_debug("Registering %d msixs, starting at %d\n",
+                priv->num_spis, priv->spi_first);
+
+       ret = alpine_msix_init_domains(priv, node);
+       if (ret)
+               goto err_map;
+
+       return 0;
+
+err_map:
+       kfree(priv->msi_map);
+err_priv:
+       kfree(priv);
+       return ret;
+}
+IRQCHIP_DECLARE(alpine_msix, "al,alpine-msix", alpine_msix_init);
index 3f3a8c3..e7dc6cb 100644 (file)
@@ -71,6 +71,7 @@ static u32 doorbell_mask_reg;
 static int parent_irq;
 #ifdef CONFIG_PCI_MSI
 static struct irq_domain *armada_370_xp_msi_domain;
+static struct irq_domain *armada_370_xp_msi_inner_domain;
 static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR);
 static DEFINE_MUTEX(msi_used_lock);
 static phys_addr_t msi_doorbell_addr;
@@ -115,127 +116,102 @@ static void armada_370_xp_irq_unmask(struct irq_data *d)
 
 #ifdef CONFIG_PCI_MSI
 
-static int armada_370_xp_alloc_msi(void)
-{
-       int hwirq;
+static struct irq_chip armada_370_xp_msi_irq_chip = {
+       .name = "MPIC MSI",
+       .irq_mask = pci_msi_mask_irq,
+       .irq_unmask = pci_msi_unmask_irq,
+};
 
-       mutex_lock(&msi_used_lock);
-       hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR);
-       if (hwirq >= PCI_MSI_DOORBELL_NR)
-               hwirq = -ENOSPC;
-       else
-               set_bit(hwirq, msi_used);
-       mutex_unlock(&msi_used_lock);
+static struct msi_domain_info armada_370_xp_msi_domain_info = {
+       .flags  = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+                  MSI_FLAG_MULTI_PCI_MSI),
+       .chip   = &armada_370_xp_msi_irq_chip,
+};
 
-       return hwirq;
+static void armada_370_xp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       msg->address_lo = lower_32_bits(msi_doorbell_addr);
+       msg->address_hi = upper_32_bits(msi_doorbell_addr);
+       msg->data = 0xf00 | (data->hwirq + PCI_MSI_DOORBELL_START);
 }
 
-static void armada_370_xp_free_msi(int hwirq)
+static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data,
+                                         const struct cpumask *mask, bool force)
 {
-       mutex_lock(&msi_used_lock);
-       if (!test_bit(hwirq, msi_used))
-               pr_err("trying to free unused MSI#%d\n", hwirq);
-       else
-               clear_bit(hwirq, msi_used);
-       mutex_unlock(&msi_used_lock);
+        return -EINVAL;
 }
 
-static int armada_370_xp_setup_msi_irq(struct msi_controller *chip,
-                                      struct pci_dev *pdev,
-                                      struct msi_desc *desc)
-{
-       struct msi_msg msg;
-       int virq, hwirq;
+static struct irq_chip armada_370_xp_msi_bottom_irq_chip = {
+       .name                   = "MPIC MSI",
+       .irq_compose_msi_msg    = armada_370_xp_compose_msi_msg,
+       .irq_set_affinity       = armada_370_xp_msi_set_affinity,
+};
 
-       /* We support MSI, but not MSI-X */
-       if (desc->msi_attrib.is_msix)
-               return -EINVAL;
+static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq,
+                                  unsigned int nr_irqs, void *args)
+{
+       int hwirq, i;
 
-       hwirq = armada_370_xp_alloc_msi();
-       if (hwirq < 0)
-               return hwirq;
+       mutex_lock(&msi_used_lock);
 
-       virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq);
-       if (!virq) {
-               armada_370_xp_free_msi(hwirq);
-               return -EINVAL;
+       hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR,
+                                          0, nr_irqs, 0);
+       if (hwirq >= PCI_MSI_DOORBELL_NR) {
+               mutex_unlock(&msi_used_lock);
+               return -ENOSPC;
        }
 
-       irq_set_msi_desc(virq, desc);
-
-       msg.address_lo = msi_doorbell_addr;
-       msg.address_hi = 0;
-       msg.data = 0xf00 | (hwirq + 16);
-
-       pci_write_msi_msg(virq, &msg);
-       return 0;
-}
+       bitmap_set(msi_used, hwirq, nr_irqs);
+       mutex_unlock(&msi_used_lock);
 
-static void armada_370_xp_teardown_msi_irq(struct msi_controller *chip,
-                                          unsigned int irq)
-{
-       struct irq_data *d = irq_get_irq_data(irq);
-       unsigned long hwirq = d->hwirq;
+       for (i = 0; i < nr_irqs; i++) {
+               irq_domain_set_info(domain, virq + i, hwirq + i,
+                                   &armada_370_xp_msi_bottom_irq_chip,
+                                   domain->host_data, handle_simple_irq,
+                                   NULL, NULL);
+       }
 
-       irq_dispose_mapping(irq);
-       armada_370_xp_free_msi(hwirq);
+       return hwirq;
 }
 
-static struct irq_chip armada_370_xp_msi_irq_chip = {
-       .name = "armada_370_xp_msi_irq",
-       .irq_enable = pci_msi_unmask_irq,
-       .irq_disable = pci_msi_mask_irq,
-       .irq_mask = pci_msi_mask_irq,
-       .irq_unmask = pci_msi_unmask_irq,
-};
-
-static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq,
-                                irq_hw_number_t hw)
+static void armada_370_xp_msi_free(struct irq_domain *domain,
+                                  unsigned int virq, unsigned int nr_irqs)
 {
-       irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip,
-                                handle_simple_irq);
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
 
-       return 0;
+       mutex_lock(&msi_used_lock);
+       bitmap_clear(msi_used, d->hwirq, nr_irqs);
+       mutex_unlock(&msi_used_lock);
 }
 
-static const struct irq_domain_ops armada_370_xp_msi_irq_ops = {
-       .map = armada_370_xp_msi_map,
+static const struct irq_domain_ops armada_370_xp_msi_domain_ops = {
+       .alloc  = armada_370_xp_msi_alloc,
+       .free   = armada_370_xp_msi_free,
 };
 
 static int armada_370_xp_msi_init(struct device_node *node,
                                  phys_addr_t main_int_phys_base)
 {
-       struct msi_controller *msi_chip;
        u32 reg;
-       int ret;
 
        msi_doorbell_addr = main_int_phys_base +
                ARMADA_370_XP_SW_TRIG_INT_OFFS;
 
-       msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL);
-       if (!msi_chip)
+       armada_370_xp_msi_inner_domain =
+               irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
+                                     &armada_370_xp_msi_domain_ops, NULL);
+       if (!armada_370_xp_msi_inner_domain)
                return -ENOMEM;
 
-       msi_chip->setup_irq = armada_370_xp_setup_msi_irq;
-       msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq;
-       msi_chip->of_node = node;
-
        armada_370_xp_msi_domain =
-               irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
-                                     &armada_370_xp_msi_irq_ops,
-                                     NULL);
+               pci_msi_create_irq_domain(of_node_to_fwnode(node),
+                                         &armada_370_xp_msi_domain_info,
+                                         armada_370_xp_msi_inner_domain);
        if (!armada_370_xp_msi_domain) {
-               kfree(msi_chip);
+               irq_domain_remove(armada_370_xp_msi_inner_domain);
                return -ENOMEM;
        }
 
-       ret = of_pci_msi_chip_add(msi_chip);
-       if (ret < 0) {
-               irq_domain_remove(armada_370_xp_msi_domain);
-               kfree(msi_chip);
-               return ret;
-       }
-
        reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS)
                | PCI_MSI_DOORBELL_MASK;
 
@@ -280,7 +256,7 @@ static int armada_xp_set_affinity(struct irq_data *d,
 #endif
 
 static struct irq_chip armada_370_xp_irq_chip = {
-       .name           = "armada_370_xp_irq",
+       .name           = "MPIC",
        .irq_mask       = armada_370_xp_irq_mask,
        .irq_mask_ack   = armada_370_xp_irq_mask,
        .irq_unmask     = armada_370_xp_irq_unmask,
@@ -427,12 +403,12 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained)
                        continue;
 
                if (is_chained) {
-                       irq = irq_find_mapping(armada_370_xp_msi_domain,
-                                              msinr - 16);
+                       irq = irq_find_mapping(armada_370_xp_msi_inner_domain,
+                                              msinr - PCI_MSI_DOORBELL_START);
                        generic_handle_irq(irq);
                } else {
-                       irq = msinr - 16;
-                       handle_domain_irq(armada_370_xp_msi_domain,
+                       irq = msinr - PCI_MSI_DOORBELL_START;
+                       handle_domain_irq(armada_370_xp_msi_inner_domain,
                                          irq, regs);
                }
        }
@@ -604,8 +580,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node,
        armada_370_xp_mpic_domain =
                irq_domain_add_linear(node, nr_irqs,
                                &armada_370_xp_mpic_irq_ops, NULL);
-
        BUG_ON(!armada_370_xp_mpic_domain);
+       armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED;
 
        /* Setup for the boot CPU */
        armada_xp_mpic_perf_init();
diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c
new file mode 100644 (file)
index 0000000..befe93c
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ *  Atheros AR71xx/AR724x/AR913x specific interrupt handling
+ *
+ *  Copyright (C) 2015 Alban Bedel <albeu@free.fr>
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <jnarayanan@atheros.com>
+ *  Copyright (C) 2008-2011 Gabor Juhos <juhosg@openwrt.org>
+ *  Copyright (C) 2008 Imre Kaloz <kaloz@openwrt.org>
+ *
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irqchip.h>
+#include <linux/of.h>
+
+#include <asm/irq_cpu.h>
+#include <asm/mach-ath79/ath79.h>
+
+/*
+ * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for
+ * these devices typically allocate coherent DMA memory, however the
+ * DMA controller may still have some unsynchronized data in the FIFO.
+ * Issue a flush in the handlers to ensure that the driver sees
+ * the update.
+ *
+ * This array map the interrupt lines to the DDR write buffer channels.
+ */
+
+static unsigned irq_wb_chan[8] = {
+       -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+asmlinkage void plat_irq_dispatch(void)
+{
+       unsigned long pending;
+       int irq;
+
+       pending = read_c0_status() & read_c0_cause() & ST0_IM;
+
+       if (!pending) {
+               spurious_interrupt();
+               return;
+       }
+
+       pending >>= CAUSEB_IP;
+       while (pending) {
+               irq = fls(pending) - 1;
+               if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1)
+                       ath79_ddr_wb_flush(irq_wb_chan[irq]);
+               do_IRQ(MIPS_CPU_IRQ_BASE + irq);
+               pending &= ~BIT(irq);
+       }
+}
+
+static int __init ar79_cpu_intc_of_init(
+       struct device_node *node, struct device_node *parent)
+{
+       int err, i, count;
+
+       /* Fill the irq_wb_chan table */
+       count = of_count_phandle_with_args(
+               node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells");
+
+       for (i = 0; i < count; i++) {
+               struct of_phandle_args args;
+               u32 irq = i;
+
+               of_property_read_u32_index(
+                       node, "qca,ddr-wb-channel-interrupts", i, &irq);
+               if (irq >= ARRAY_SIZE(irq_wb_chan))
+                       continue;
+
+               err = of_parse_phandle_with_args(
+                       node, "qca,ddr-wb-channels",
+                       "#qca,ddr-wb-channel-cells",
+                       i, &args);
+               if (err)
+                       return err;
+
+               irq_wb_chan[irq] = args.args[0];
+       }
+
+       return mips_cpu_irq_of_init(node, parent);
+}
+IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc",
+               ar79_cpu_intc_of_init);
+
+void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3)
+{
+       irq_wb_chan[2] = irq_wb_chan2;
+       irq_wb_chan[3] = irq_wb_chan3;
+       mips_cpu_irq_init();
+}
diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c
new file mode 100644 (file)
index 0000000..aa72907
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ *  Atheros AR71xx/AR724x/AR913x MISC interrupt controller
+ *
+ *  Copyright (C) 2015 Alban Bedel <albeu@free.fr>
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <jnarayanan@atheros.com>
+ *  Copyright (C) 2008-2011 Gabor Juhos <juhosg@openwrt.org>
+ *  Copyright (C) 2008 Imre Kaloz <kaloz@openwrt.org>
+ *
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+
+#define AR71XX_RESET_REG_MISC_INT_STATUS       0
+#define AR71XX_RESET_REG_MISC_INT_ENABLE       4
+
+#define ATH79_MISC_IRQ_COUNT                   32
+
+static void ath79_misc_irq_handler(struct irq_desc *desc)
+{
+       struct irq_domain *domain = irq_desc_get_handler_data(desc);
+       struct irq_chip *chip = irq_desc_get_chip(desc);
+       void __iomem *base = domain->host_data;
+       u32 pending;
+
+       chained_irq_enter(chip, desc);
+
+       pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) &
+                 __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+
+       if (!pending) {
+               spurious_interrupt();
+               chained_irq_exit(chip, desc);
+               return;
+       }
+
+       while (pending) {
+               int bit = __ffs(pending);
+
+               generic_handle_irq(irq_linear_revmap(domain, bit));
+               pending &= ~BIT(bit);
+       }
+
+       chained_irq_exit(chip, desc);
+}
+
+static void ar71xx_misc_irq_unmask(struct irq_data *d)
+{
+       void __iomem *base = irq_data_get_irq_chip_data(d);
+       unsigned int irq = d->hwirq;
+       u32 t;
+
+       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+       __raw_writel(t | BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+
+       /* flush write */
+       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+}
+
+static void ar71xx_misc_irq_mask(struct irq_data *d)
+{
+       void __iomem *base = irq_data_get_irq_chip_data(d);
+       unsigned int irq = d->hwirq;
+       u32 t;
+
+       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+       __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+
+       /* flush write */
+       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+}
+
+static void ar724x_misc_irq_ack(struct irq_data *d)
+{
+       void __iomem *base = irq_data_get_irq_chip_data(d);
+       unsigned int irq = d->hwirq;
+       u32 t;
+
+       t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
+       __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_STATUS);
+
+       /* flush write */
+       __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS);
+}
+
+static struct irq_chip ath79_misc_irq_chip = {
+       .name           = "MISC",
+       .irq_unmask     = ar71xx_misc_irq_unmask,
+       .irq_mask       = ar71xx_misc_irq_mask,
+};
+
+static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
+{
+       irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq);
+       irq_set_chip_data(irq, d->host_data);
+       return 0;
+}
+
+static const struct irq_domain_ops misc_irq_domain_ops = {
+       .xlate = irq_domain_xlate_onecell,
+       .map = misc_map,
+};
+
+static void __init ath79_misc_intc_domain_init(
+       struct irq_domain *domain, int irq)
+{
+       void __iomem *base = domain->host_data;
+
+       /* Disable and clear all interrupts */
+       __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE);
+       __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS);
+
+       irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain);
+}
+
+static int __init ath79_misc_intc_of_init(
+       struct device_node *node, struct device_node *parent)
+{
+       struct irq_domain *domain;
+       void __iomem *base;
+       int irq;
+
+       irq = irq_of_parse_and_map(node, 0);
+       if (!irq) {
+               pr_err("Failed to get MISC IRQ\n");
+               return -EINVAL;
+       }
+
+       base = of_iomap(node, 0);
+       if (!base) {
+               pr_err("Failed to get MISC IRQ registers\n");
+               return -ENOMEM;
+       }
+
+       domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT,
+                               &misc_irq_domain_ops, base);
+       if (!domain) {
+               pr_err("Failed to add MISC irqdomain\n");
+               return -EINVAL;
+       }
+
+       ath79_misc_intc_domain_init(domain, irq);
+       return 0;
+}
+
+static int __init ar7100_misc_intc_of_init(
+       struct device_node *node, struct device_node *parent)
+{
+       ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
+       return ath79_misc_intc_of_init(node, parent);
+}
+
+IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc",
+               ar7100_misc_intc_of_init);
+
+static int __init ar7240_misc_intc_of_init(
+       struct device_node *node, struct device_node *parent)
+{
+       ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
+       return ath79_misc_intc_of_init(node, parent);
+}
+
+IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc",
+               ar7240_misc_intc_of_init);
+
+void __init ath79_misc_irq_init(void __iomem *regs, int irq,
+                               int irq_base, bool is_ar71xx)
+{
+       struct irq_domain *domain;
+
+       if (is_ar71xx)
+               ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
+       else
+               ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
+
+       domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT,
+                       irq_base, 0, &misc_irq_domain_ops, regs);
+       if (!domain)
+               panic("Failed to create MISC irqdomain");
+
+       ath79_misc_intc_domain_init(domain, irq);
+}
index b12a5d5..28b26c8 100644 (file)
@@ -80,16 +80,10 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val)
        return 0;
 }
 
-int aic_common_set_priority(int priority, unsigned *val)
+void aic_common_set_priority(int priority, unsigned *val)
 {
-       if (priority < AT91_AIC_IRQ_MIN_PRIORITY ||
-           priority > AT91_AIC_IRQ_MAX_PRIORITY)
-               return -EINVAL;
-
-       *val &= AT91_AIC_PRIOR;
+       *val &= ~AT91_AIC_PRIOR;
        *val |= priority;
-
-       return 0;
 }
 
 int aic_common_irq_domain_xlate(struct irq_domain *d,
@@ -193,7 +187,7 @@ void __init aic_common_rtt_irq_fixup(struct device_node *root)
        }
 }
 
-void __init aic_common_irq_fixup(const struct of_device_id *matches)
+static void __init aic_common_irq_fixup(const struct of_device_id *matches)
 {
        struct device_node *root = of_find_node_by_path("/");
        const struct of_device_id *match;
@@ -214,7 +208,8 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches)
 
 struct irq_domain *__init aic_common_of_init(struct device_node *node,
                                             const struct irq_domain_ops *ops,
-                                            const char *name, int nirqs)
+                                            const char *name, int nirqs,
+                                            const struct of_device_id *matches)
 {
        struct irq_chip_generic *gc;
        struct irq_domain *domain;
@@ -264,6 +259,7 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node,
        }
 
        aic_common_ext_irq_of_init(domain);
+       aic_common_irq_fixup(matches);
 
        return domain;
 
index 603f0a9..af60376 100644 (file)
@@ -19,7 +19,7 @@
 
 int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val);
 
-int aic_common_set_priority(int priority, unsigned *val);
+void aic_common_set_priority(int priority, unsigned *val);
 
 int aic_common_irq_domain_xlate(struct irq_domain *d,
                                struct device_node *ctrlr,
@@ -30,12 +30,11 @@ int aic_common_irq_domain_xlate(struct irq_domain *d,
 
 struct irq_domain *__init aic_common_of_init(struct device_node *node,
                                             const struct irq_domain_ops *ops,
-                                            const char *name, int nirqs);
+                                            const char *name, int nirqs,
+                                            const struct of_device_id *matches);
 
 void __init aic_common_rtc_irq_fixup(struct device_node *root);
 
 void __init aic_common_rtt_irq_fixup(struct device_node *root);
 
-void __init aic_common_irq_fixup(const struct of_device_id *matches);
-
 #endif /* __IRQ_ATMEL_AIC_COMMON_H */
index 8a0c7f2..112e17c 100644 (file)
@@ -196,9 +196,8 @@ static int aic_irq_domain_xlate(struct irq_domain *d,
 
        irq_gc_lock(gc);
        smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq));
-       ret = aic_common_set_priority(intspec[2], &smr);
-       if (!ret)
-               irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq));
+       aic_common_set_priority(intspec[2], &smr);
+       irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq));
        irq_gc_unlock(gc);
 
        return ret;
@@ -248,12 +247,10 @@ static int __init aic_of_init(struct device_node *node,
                return -EEXIST;
 
        domain = aic_common_of_init(node, &aic_irq_ops, "atmel-aic",
-                                   NR_AIC_IRQS);
+                                   NR_AIC_IRQS, aic_irq_fixups);
        if (IS_ERR(domain))
                return PTR_ERR(domain);
 
-       aic_common_irq_fixup(aic_irq_fixups);
-
        aic_domain = domain;
        gc = irq_get_domain_generic_chip(domain, 0);
 
index 62bb840..4f0d068 100644 (file)
@@ -272,9 +272,8 @@ static int aic5_irq_domain_xlate(struct irq_domain *d,
        irq_gc_lock(bgc);
        irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR);
        smr = irq_reg_readl(bgc, AT91_AIC5_SMR);
-       ret = aic_common_set_priority(intspec[2], &smr);
-       if (!ret)
-               irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR);
+       aic_common_set_priority(intspec[2], &smr);
+       irq_reg_writel(bgc, smr, AT91_AIC5_SMR);
        irq_gc_unlock(bgc);
 
        return ret;
@@ -312,12 +311,10 @@ static int __init aic5_of_init(struct device_node *node,
                return -EEXIST;
 
        domain = aic_common_of_init(node, &aic5_irq_ops, "atmel-aic5",
-                                   nirqs);
+                                   nirqs, aic5_irq_fixups);
        if (IS_ERR(domain))
                return PTR_ERR(domain);
 
-       aic_common_irq_fixup(aic5_irq_fixups);
-
        aic5_domain = domain;
        nchips = aic5_domain->revmap_size / 32;
        for (i = 0; i < nchips; i++) {
index 963065a..b6e950d 100644 (file)
@@ -229,7 +229,6 @@ int __init bcm2836_smp_boot_secondary(unsigned int cpu,
        unsigned long secondary_startup_phys =
                (unsigned long)virt_to_phys((void *)secondary_startup);
 
-       dsb();
        writel(secondary_startup_phys,
               intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu);
 
diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c
new file mode 100644 (file)
index 0000000..b844c89
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ * Broadcom BCM6345 style Level 1 interrupt controller driver
+ *
+ * Copyright (C) 2014 Broadcom Corporation
+ * Copyright 2015 Simon Arlott
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This is based on the BCM7038 (which supports SMP) but with a single
+ * enable register instead of separate mask/set/clear registers.
+ *
+ * The BCM3380 has a similar mask/status register layout, but each pair
+ * of words is at separate locations (and SMP is not supported).
+ *
+ * ENABLE/STATUS words are packed next to each other for each CPU:
+ *
+ * BCM6368:
+ *   0x1000_0020: CPU0_W0_ENABLE
+ *   0x1000_0024: CPU0_W1_ENABLE
+ *   0x1000_0028: CPU0_W0_STATUS               IRQs 31-63
+ *   0x1000_002c: CPU0_W1_STATUS               IRQs 0-31
+ *   0x1000_0030: CPU1_W0_ENABLE
+ *   0x1000_0034: CPU1_W1_ENABLE
+ *   0x1000_0038: CPU1_W0_STATUS               IRQs 31-63
+ *   0x1000_003c: CPU1_W1_STATUS               IRQs 0-31
+ *
+ * BCM63168:
+ *   0x1000_0020: CPU0_W0_ENABLE
+ *   0x1000_0024: CPU0_W1_ENABLE
+ *   0x1000_0028: CPU0_W2_ENABLE
+ *   0x1000_002c: CPU0_W3_ENABLE
+ *   0x1000_0030: CPU0_W0_STATUS       IRQs 96-127
+ *   0x1000_0034: CPU0_W1_STATUS       IRQs 64-95
+ *   0x1000_0038: CPU0_W2_STATUS       IRQs 32-63
+ *   0x1000_003c: CPU0_W3_STATUS       IRQs 0-31
+ *   0x1000_0040: CPU1_W0_ENABLE
+ *   0x1000_0044: CPU1_W1_ENABLE
+ *   0x1000_0048: CPU1_W2_ENABLE
+ *   0x1000_004c: CPU1_W3_ENABLE
+ *   0x1000_0050: CPU1_W0_STATUS       IRQs 96-127
+ *   0x1000_0054: CPU1_W1_STATUS       IRQs 64-95
+ *   0x1000_0058: CPU1_W2_STATUS       IRQs 32-63
+ *   0x1000_005c: CPU1_W3_STATUS       IRQs 0-31
+ *
+ * IRQs are numbered in CPU native endian order
+ * (which is big-endian in these examples)
+ */
+
+#define pr_fmt(fmt)    KBUILD_MODNAME  ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+
+#define IRQS_PER_WORD          32
+#define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 2)
+
+struct bcm6345_l1_cpu;
+
+struct bcm6345_l1_chip {
+       raw_spinlock_t          lock;
+       unsigned int            n_words;
+       struct irq_domain       *domain;
+       struct cpumask          cpumask;
+       struct bcm6345_l1_cpu   *cpus[NR_CPUS];
+};
+
+struct bcm6345_l1_cpu {
+       void __iomem            *map_base;
+       unsigned int            parent_irq;
+       u32                     enable_cache[];
+};
+
+static inline unsigned int reg_enable(struct bcm6345_l1_chip *intc,
+                                          unsigned int word)
+{
+#ifdef __BIG_ENDIAN
+       return (1 * intc->n_words - word - 1) * sizeof(u32);
+#else
+       return (0 * intc->n_words + word) * sizeof(u32);
+#endif
+}
+
+static inline unsigned int reg_status(struct bcm6345_l1_chip *intc,
+                                     unsigned int word)
+{
+#ifdef __BIG_ENDIAN
+       return (2 * intc->n_words - word - 1) * sizeof(u32);
+#else
+       return (1 * intc->n_words + word) * sizeof(u32);
+#endif
+}
+
+static inline unsigned int cpu_for_irq(struct bcm6345_l1_chip *intc,
+                                       struct irq_data *d)
+{
+       return cpumask_first_and(&intc->cpumask, irq_data_get_affinity_mask(d));
+}
+
+static void bcm6345_l1_irq_handle(struct irq_desc *desc)
+{
+       struct bcm6345_l1_chip *intc = irq_desc_get_handler_data(desc);
+       struct bcm6345_l1_cpu *cpu;
+       struct irq_chip *chip = irq_desc_get_chip(desc);
+       unsigned int idx;
+
+#ifdef CONFIG_SMP
+       cpu = intc->cpus[cpu_logical_map(smp_processor_id())];
+#else
+       cpu = intc->cpus[0];
+#endif
+
+       chained_irq_enter(chip, desc);
+
+       for (idx = 0; idx < intc->n_words; idx++) {
+               int base = idx * IRQS_PER_WORD;
+               unsigned long pending;
+               irq_hw_number_t hwirq;
+               unsigned int irq;
+
+               pending = __raw_readl(cpu->map_base + reg_status(intc, idx));
+               pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx));
+
+               for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) {
+                       irq = irq_linear_revmap(intc->domain, base + hwirq);
+                       if (irq)
+                               do_IRQ(irq);
+                       else
+                               spurious_interrupt();
+               }
+       }
+
+       chained_irq_exit(chip, desc);
+}
+
+static inline void __bcm6345_l1_unmask(struct irq_data *d)
+{
+       struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
+       u32 word = d->hwirq / IRQS_PER_WORD;
+       u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
+       unsigned int cpu_idx = cpu_for_irq(intc, d);
+
+       intc->cpus[cpu_idx]->enable_cache[word] |= mask;
+       __raw_writel(intc->cpus[cpu_idx]->enable_cache[word],
+               intc->cpus[cpu_idx]->map_base + reg_enable(intc, word));
+}
+
+static inline void __bcm6345_l1_mask(struct irq_data *d)
+{
+       struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
+       u32 word = d->hwirq / IRQS_PER_WORD;
+       u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
+       unsigned int cpu_idx = cpu_for_irq(intc, d);
+
+       intc->cpus[cpu_idx]->enable_cache[word] &= ~mask;
+       __raw_writel(intc->cpus[cpu_idx]->enable_cache[word],
+               intc->cpus[cpu_idx]->map_base + reg_enable(intc, word));
+}
+
+static void bcm6345_l1_unmask(struct irq_data *d)
+{
+       struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&intc->lock, flags);
+       __bcm6345_l1_unmask(d);
+       raw_spin_unlock_irqrestore(&intc->lock, flags);
+}
+
+static void bcm6345_l1_mask(struct irq_data *d)
+{
+       struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&intc->lock, flags);
+       __bcm6345_l1_mask(d);
+       raw_spin_unlock_irqrestore(&intc->lock, flags);
+}
+
+static int bcm6345_l1_set_affinity(struct irq_data *d,
+                                  const struct cpumask *dest,
+                                  bool force)
+{
+       struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d);
+       u32 word = d->hwirq / IRQS_PER_WORD;
+       u32 mask = BIT(d->hwirq % IRQS_PER_WORD);
+       unsigned int old_cpu = cpu_for_irq(intc, d);
+       unsigned int new_cpu;
+       struct cpumask valid;
+       unsigned long flags;
+       bool enabled;
+
+       if (!cpumask_and(&valid, &intc->cpumask, dest))
+               return -EINVAL;
+
+       new_cpu = cpumask_any_and(&valid, cpu_online_mask);
+       if (new_cpu >= nr_cpu_ids)
+               return -EINVAL;
+
+       dest = cpumask_of(new_cpu);
+
+       raw_spin_lock_irqsave(&intc->lock, flags);
+       if (old_cpu != new_cpu) {
+               enabled = intc->cpus[old_cpu]->enable_cache[word] & mask;
+               if (enabled)
+                       __bcm6345_l1_mask(d);
+               cpumask_copy(irq_data_get_affinity_mask(d), dest);
+               if (enabled)
+                       __bcm6345_l1_unmask(d);
+       } else {
+               cpumask_copy(irq_data_get_affinity_mask(d), dest);
+       }
+       raw_spin_unlock_irqrestore(&intc->lock, flags);
+
+       return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static int __init bcm6345_l1_init_one(struct device_node *dn,
+                                     unsigned int idx,
+                                     struct bcm6345_l1_chip *intc)
+{
+       struct resource res;
+       resource_size_t sz;
+       struct bcm6345_l1_cpu *cpu;
+       unsigned int i, n_words;
+
+       if (of_address_to_resource(dn, idx, &res))
+               return -EINVAL;
+       sz = resource_size(&res);
+       n_words = sz / REG_BYTES_PER_IRQ_WORD;
+
+       if (!intc->n_words)
+               intc->n_words = n_words;
+       else if (intc->n_words != n_words)
+               return -EINVAL;
+
+       cpu = intc->cpus[idx] = kzalloc(sizeof(*cpu) + n_words * sizeof(u32),
+                                       GFP_KERNEL);
+       if (!cpu)
+               return -ENOMEM;
+
+       cpu->map_base = ioremap(res.start, sz);
+       if (!cpu->map_base)
+               return -ENOMEM;
+
+       for (i = 0; i < n_words; i++) {
+               cpu->enable_cache[i] = 0;
+               __raw_writel(0, cpu->map_base + reg_enable(intc, i));
+       }
+
+       cpu->parent_irq = irq_of_parse_and_map(dn, idx);
+       if (!cpu->parent_irq) {
+               pr_err("failed to map parent interrupt %d\n", cpu->parent_irq);
+               return -EINVAL;
+       }
+       irq_set_chained_handler_and_data(cpu->parent_irq,
+                                               bcm6345_l1_irq_handle, intc);
+
+       return 0;
+}
+
+static struct irq_chip bcm6345_l1_irq_chip = {
+       .name                   = "bcm6345-l1",
+       .irq_mask               = bcm6345_l1_mask,
+       .irq_unmask             = bcm6345_l1_unmask,
+       .irq_set_affinity       = bcm6345_l1_set_affinity,
+};
+
+static int bcm6345_l1_map(struct irq_domain *d, unsigned int virq,
+                         irq_hw_number_t hw_irq)
+{
+       irq_set_chip_and_handler(virq,
+               &bcm6345_l1_irq_chip, handle_percpu_irq);
+       irq_set_chip_data(virq, d->host_data);
+       return 0;
+}
+
+static const struct irq_domain_ops bcm6345_l1_domain_ops = {
+       .xlate                  = irq_domain_xlate_onecell,
+       .map                    = bcm6345_l1_map,
+};
+
+static int __init bcm6345_l1_of_init(struct device_node *dn,
+                             struct device_node *parent)
+{
+       struct bcm6345_l1_chip *intc;
+       unsigned int idx;
+       int ret;
+
+       intc = kzalloc(sizeof(*intc), GFP_KERNEL);
+       if (!intc)
+               return -ENOMEM;
+
+       for_each_possible_cpu(idx) {
+               ret = bcm6345_l1_init_one(dn, idx, intc);
+               if (ret)
+                       pr_err("failed to init intc L1 for cpu %d: %d\n",
+                               idx, ret);
+               else
+                       cpumask_set_cpu(idx, &intc->cpumask);
+       }
+
+       if (!cpumask_weight(&intc->cpumask)) {
+               ret = -ENODEV;
+               goto out_free;
+       }
+
+       raw_spin_lock_init(&intc->lock);
+
+       intc->domain = irq_domain_add_linear(dn, IRQS_PER_WORD * intc->n_words,
+                                            &bcm6345_l1_domain_ops,
+                                            intc);
+       if (!intc->domain) {
+               ret = -ENOMEM;
+               goto out_unmap;
+       }
+
+       pr_info("registered BCM6345 L1 intc (IRQs: %d)\n",
+                       IRQS_PER_WORD * intc->n_words);
+       for_each_cpu(idx, &intc->cpumask) {
+               struct bcm6345_l1_cpu *cpu = intc->cpus[idx];
+
+               pr_info("  CPU%u at MMIO 0x%p (irq = %d)\n", idx,
+                               cpu->map_base, cpu->parent_irq);
+       }
+
+       return 0;
+
+out_unmap:
+       for_each_possible_cpu(idx) {
+               struct bcm6345_l1_cpu *cpu = intc->cpus[idx];
+
+               if (cpu) {
+                       if (cpu->map_base)
+                               iounmap(cpu->map_base);
+                       kfree(cpu);
+               }
+       }
+out_free:
+       kfree(intc);
+       return ret;
+}
+
+IRQCHIP_DECLARE(bcm6345_l1, "brcm,bcm6345-l1-intc", bcm6345_l1_of_init);
index aa46eb2..54c2964 100644 (file)
@@ -10,7 +10,8 @@
 #include <linux/irqchip/arm-gic.h>
 
 #define REALVIEW_SYS_LOCK_OFFSET       0x20
-#define REALVIEW_PB11MP_SYS_PLD_CTRL1  0x74
+#define REALVIEW_SYS_PLD_CTRL1         0x74
+#define REALVIEW_EB_REVB_SYS_PLD_CTRL1 0xD8
 #define VERSATILE_LOCK_VAL             0xA05F
 #define PLD_INTMODE_MASK               BIT(22)|BIT(23)|BIT(24)
 #define PLD_INTMODE_LEGACY             0x0
 #define PLD_INTMODE_NEW_NO_DCC         BIT(23)
 #define PLD_INTMODE_FIQ_ENABLE         BIT(24)
 
+/* For some reason RealView EB Rev B moved this register */
+static const struct of_device_id syscon_pldset_of_match[] = {
+       {
+               .compatible = "arm,realview-eb11mp-revb-syscon",
+               .data = (void *)REALVIEW_EB_REVB_SYS_PLD_CTRL1,
+       },
+       {
+               .compatible = "arm,realview-eb11mp-revc-syscon",
+               .data = (void *)REALVIEW_SYS_PLD_CTRL1,
+       },
+       {
+               .compatible = "arm,realview-eb-syscon",
+               .data = (void *)REALVIEW_SYS_PLD_CTRL1,
+       },
+       {
+               .compatible = "arm,realview-pb11mp-syscon",
+               .data = (void *)REALVIEW_SYS_PLD_CTRL1,
+       },
+       {},
+};
+
 static int __init
 realview_gic_of_init(struct device_node *node, struct device_node *parent)
 {
        static struct regmap *map;
+       struct device_node *np;
+       const struct of_device_id *gic_id;
+       u32 pld1_ctrl;
+
+       np = of_find_matching_node_and_match(NULL, syscon_pldset_of_match,
+                                            &gic_id);
+       if (!np)
+               return -ENODEV;
+       pld1_ctrl = (u32)gic_id->data;
 
        /* The PB11MPCore GIC needs to be configured in the syscon */
-       map = syscon_regmap_lookup_by_compatible("arm,realview-pb11mp-syscon");
+       map = syscon_node_to_regmap(np);
        if (!IS_ERR(map)) {
                /* new irq mode with no DCC */
                regmap_write(map, REALVIEW_SYS_LOCK_OFFSET,
                             VERSATILE_LOCK_VAL);
-               regmap_update_bits(map, REALVIEW_PB11MP_SYS_PLD_CTRL1,
+               regmap_update_bits(map, pld1_ctrl,
                                   PLD_INTMODE_NEW_NO_DCC,
                                   PLD_INTMODE_MASK);
                regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, 0x0000);
-               pr_info("TC11MP GIC: set up interrupt controller to NEW mode, no DCC\n");
+               pr_info("RealView GIC: set up interrupt controller to NEW mode, no DCC\n");
        } else {
-               pr_err("TC11MP GIC setup: could not find syscon\n");
-               return -ENXIO;
+               pr_err("RealView GIC setup: could not find syscon\n");
+               return -ENODEV;
        }
        return gic_of_init(node, parent);
 }
 IRQCHIP_DECLARE(armtc11mp_gic, "arm,tc11mp-gic", realview_gic_of_init);
+IRQCHIP_DECLARE(armeb11mp_gic, "arm,eb11mp-gic", realview_gic_of_init);
index c779f83..28f047c 100644 (file)
@@ -92,18 +92,6 @@ static struct msi_domain_info gicv2m_msi_domain_info = {
        .chip   = &gicv2m_msi_irq_chip,
 };
 
-static int gicv2m_set_affinity(struct irq_data *irq_data,
-                              const struct cpumask *mask, bool force)
-{
-       int ret;
-
-       ret = irq_chip_set_affinity_parent(irq_data, mask, force);
-       if (ret == IRQ_SET_MASK_OK)
-               ret = IRQ_SET_MASK_OK_DONE;
-
-       return ret;
-}
-
 static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
        struct v2m_data *v2m = irq_data_get_irq_chip_data(data);
@@ -122,7 +110,7 @@ static struct irq_chip gicv2m_irq_chip = {
        .irq_mask               = irq_chip_mask_parent,
        .irq_unmask             = irq_chip_unmask_parent,
        .irq_eoi                = irq_chip_eoi_parent,
-       .irq_set_affinity       = gicv2m_set_affinity,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
        .irq_compose_msi_msg    = gicv2m_compose_msi_msg,
 };
 
index e23d1d1..3926179 100644 (file)
@@ -66,7 +66,10 @@ struct its_node {
        unsigned long           phys_base;
        struct its_cmd_block    *cmd_base;
        struct its_cmd_block    *cmd_write;
-       void                    *tables[GITS_BASER_NR_REGS];
+       struct {
+               void            *base;
+               u32             order;
+       } tables[GITS_BASER_NR_REGS];
        struct its_collection   *collections;
        struct list_head        its_device_list;
        u64                     flags;
@@ -75,6 +78,9 @@ struct its_node {
 
 #define ITS_ITT_ALIGN          SZ_256
 
+/* Convert page order to size in bytes */
+#define PAGE_ORDER_TO_SIZE(o)  (PAGE_SIZE << (o))
+
 struct event_lpi_map {
        unsigned long           *lpi_map;
        u16                     *col_map;
@@ -97,7 +103,6 @@ struct its_device {
 
 static LIST_HEAD(its_nodes);
 static DEFINE_SPINLOCK(its_lock);
-static struct device_node *gic_root_node;
 static struct rdists *gic_rdists;
 
 #define gic_data_rdist()               (raw_cpu_ptr(gic_rdists->rdist))
@@ -597,11 +602,6 @@ static void its_unmask_irq(struct irq_data *d)
        lpi_set_config(d, true);
 }
 
-static void its_eoi_irq(struct irq_data *d)
-{
-       gic_write_eoir(d->hwirq);
-}
-
 static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
                            bool force)
 {
@@ -638,7 +638,7 @@ static struct irq_chip its_irq_chip = {
        .name                   = "ITS",
        .irq_mask               = its_mask_irq,
        .irq_unmask             = its_unmask_irq,
-       .irq_eoi                = its_eoi_irq,
+       .irq_eoi                = irq_chip_eoi_parent,
        .irq_set_affinity       = its_set_affinity,
        .irq_compose_msi_msg    = its_irq_compose_msi_msg,
 };
@@ -670,7 +670,7 @@ static int its_chunk_to_lpi(int chunk)
        return (chunk << IRQS_PER_CHUNK_SHIFT) + 8192;
 }
 
-static int its_lpi_init(u32 id_bits)
+static int __init its_lpi_init(u32 id_bits)
 {
        lpi_chunks = its_lpi_to_chunk(1UL << id_bits);
 
@@ -807,9 +807,10 @@ static void its_free_tables(struct its_node *its)
        int i;
 
        for (i = 0; i < GITS_BASER_NR_REGS; i++) {
-               if (its->tables[i]) {
-                       free_page((unsigned long)its->tables[i]);
-                       its->tables[i] = NULL;
+               if (its->tables[i].base) {
+                       free_pages((unsigned long)its->tables[i].base,
+                                  its->tables[i].order);
+                       its->tables[i].base = NULL;
                }
        }
 }
@@ -842,7 +843,6 @@ static int its_alloc_tables(const char *node_name, struct its_node *its)
                u64 type = GITS_BASER_TYPE(val);
                u64 entry_size = GITS_BASER_ENTRY_SIZE(val);
                int order = get_order(psz);
-               int alloc_size;
                int alloc_pages;
                u64 tmp;
                void *base;
@@ -874,8 +874,8 @@ static int its_alloc_tables(const char *node_name, struct its_node *its)
                        }
                }
 
-               alloc_size = (1 << order) * PAGE_SIZE;
-               alloc_pages = (alloc_size / psz);
+retry_alloc_baser:
+               alloc_pages = (PAGE_ORDER_TO_SIZE(order) / psz);
                if (alloc_pages > GITS_BASER_PAGES_MAX) {
                        alloc_pages = GITS_BASER_PAGES_MAX;
                        order = get_order(GITS_BASER_PAGES_MAX * psz);
@@ -889,7 +889,8 @@ static int its_alloc_tables(const char *node_name, struct its_node *its)
                        goto out_free;
                }
 
-               its->tables[i] = base;
+               its->tables[i].base = base;
+               its->tables[i].order = order;
 
 retry_baser:
                val = (virt_to_phys(base)                                |
@@ -927,7 +928,7 @@ retry_baser:
                        shr = tmp & GITS_BASER_SHAREABILITY_MASK;
                        if (!shr) {
                                cache = GITS_BASER_nC;
-                               __flush_dcache_area(base, alloc_size);
+                               __flush_dcache_area(base, PAGE_ORDER_TO_SIZE(order));
                        }
                        goto retry_baser;
                }
@@ -938,13 +939,16 @@ retry_baser:
                         * size and retry. If we reach 4K, then
                         * something is horribly wrong...
                         */
+                       free_pages((unsigned long)base, order);
+                       its->tables[i].base = NULL;
+
                        switch (psz) {
                        case SZ_16K:
                                psz = SZ_4K;
-                               goto retry_baser;
+                               goto retry_alloc_baser;
                        case SZ_64K:
                                psz = SZ_16K;
-                               goto retry_baser;
+                               goto retry_alloc_baser;
                        }
                }
 
@@ -957,7 +961,7 @@ retry_baser:
                }
 
                pr_info("ITS: allocated %d %s @%lx (psz %dK, shr %d)\n",
-                       (int)(alloc_size / entry_size),
+                       (int)(PAGE_ORDER_TO_SIZE(order) / entry_size),
                        its_base_type_string[type],
                        (unsigned long)virt_to_phys(base),
                        psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT);
@@ -1425,7 +1429,8 @@ static void its_enable_quirks(struct its_node *its)
        gic_enable_quirks(iidr, its_quirks, its);
 }
 
-static int its_probe(struct device_node *node, struct irq_domain *parent)
+static int __init its_probe(struct device_node *node,
+                           struct irq_domain *parent)
 {
        struct resource res;
        struct its_node *its;
@@ -1586,7 +1591,7 @@ static struct of_device_id its_device_id[] = {
        {},
 };
 
-int its_init(struct device_node *node, struct rdists *rdists,
+int __init its_init(struct device_node *node, struct rdists *rdists,
             struct irq_domain *parent_domain)
 {
        struct device_node *np;
@@ -1602,8 +1607,6 @@ int its_init(struct device_node *node, struct rdists *rdists,
        }
 
        gic_rdists = rdists;
-       gic_root_node = node;
-
        its_alloc_lpi_tables();
        its_lpi_init(rdists->id_bits);
 
index d7be6dd..5b7d3c2 100644 (file)
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -38,6 +40,7 @@
 struct redist_region {
        void __iomem            *redist_base;
        phys_addr_t             phys_base;
+       bool                    single_redist;
 };
 
 struct gic_chip_data {
@@ -434,6 +437,9 @@ static int gic_populate_rdist(void)
                                return 0;
                        }
 
+                       if (gic_data.redist_regions[i].single_redist)
+                               break;
+
                        if (gic_data.redist_stride) {
                                ptr += gic_data.redist_stride;
                        } else {
@@ -634,7 +640,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
        else
                gic_dist_wait_for_rwp();
 
-       return IRQ_SET_MASK_OK;
+       return IRQ_SET_MASK_OK_DONE;
 }
 #else
 #define gic_set_affinity       NULL
@@ -764,6 +770,15 @@ static int gic_irq_domain_translate(struct irq_domain *d,
                return 0;
        }
 
+       if (is_fwnode_irqchip(fwspec->fwnode)) {
+               if(fwspec->param_count != 2)
+                       return -EINVAL;
+
+               *hwirq = fwspec->param[0];
+               *type = fwspec->param[1];
+               return 0;
+       }
+
        return -EINVAL;
 }
 
@@ -811,17 +826,88 @@ static void gicv3_enable_quirks(void)
 #endif
 }
 
+static int __init gic_init_bases(void __iomem *dist_base,
+                                struct redist_region *rdist_regs,
+                                u32 nr_redist_regions,
+                                u64 redist_stride,
+                                struct fwnode_handle *handle)
+{
+       struct device_node *node;
+       u32 typer;
+       int gic_irqs;
+       int err;
+
+       if (!is_hyp_mode_available())
+               static_key_slow_dec(&supports_deactivate);
+
+       if (static_key_true(&supports_deactivate))
+               pr_info("GIC: Using split EOI/Deactivate mode\n");
+
+       gic_data.dist_base = dist_base;
+       gic_data.redist_regions = rdist_regs;
+       gic_data.nr_redist_regions = nr_redist_regions;
+       gic_data.redist_stride = redist_stride;
+
+       gicv3_enable_quirks();
+
+       /*
+        * Find out how many interrupts are supported.
+        * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI)
+        */
+       typer = readl_relaxed(gic_data.dist_base + GICD_TYPER);
+       gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer);
+       gic_irqs = GICD_TYPER_IRQS(typer);
+       if (gic_irqs > 1020)
+               gic_irqs = 1020;
+       gic_data.irq_nr = gic_irqs;
+
+       gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
+                                                &gic_data);
+       gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
+
+       if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
+               err = -ENOMEM;
+               goto out_free;
+       }
+
+       set_handle_irq(gic_handle_irq);
+
+       node = to_of_node(handle);
+       if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() &&
+           node) /* Temp hack to prevent ITS init for ACPI */
+               its_init(node, &gic_data.rdists, gic_data.domain);
+
+       gic_smp_init();
+       gic_dist_init();
+       gic_cpu_init();
+       gic_cpu_pm_init();
+
+       return 0;
+
+out_free:
+       if (gic_data.domain)
+               irq_domain_remove(gic_data.domain);
+       free_percpu(gic_data.rdists.rdist);
+       return err;
+}
+
+static int __init gic_validate_dist_version(void __iomem *dist_base)
+{
+       u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+
+       if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4)
+               return -ENODEV;
+
+       return 0;
+}
+
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
 {
        void __iomem *dist_base;
        struct redist_region *rdist_regs;
        u64 redist_stride;
        u32 nr_redist_regions;
-       u32 typer;
-       u32 reg;
-       int gic_irqs;
-       int err;
-       int i;
+       int err, i;
 
        dist_base = of_iomap(node, 0);
        if (!dist_base) {
@@ -830,11 +916,10 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
                return -ENXIO;
        }
 
-       reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
-       if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) {
+       err = gic_validate_dist_version(dist_base);
+       if (err) {
                pr_err("%s: no distributor detected, giving up\n",
                        node->full_name);
-               err = -ENODEV;
                goto out_unmap_dist;
        }
 
@@ -865,63 +950,229 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
        if (of_property_read_u64(node, "redistributor-stride", &redist_stride))
                redist_stride = 0;
 
-       if (!is_hyp_mode_available())
-               static_key_slow_dec(&supports_deactivate);
+       err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
+                            redist_stride, &node->fwnode);
+       if (!err)
+               return 0;
 
-       if (static_key_true(&supports_deactivate))
-               pr_info("GIC: Using split EOI/Deactivate mode\n");
+out_unmap_rdist:
+       for (i = 0; i < nr_redist_regions; i++)
+               if (rdist_regs[i].redist_base)
+                       iounmap(rdist_regs[i].redist_base);
+       kfree(rdist_regs);
+out_unmap_dist:
+       iounmap(dist_base);
+       return err;
+}
 
-       gic_data.dist_base = dist_base;
-       gic_data.redist_regions = rdist_regs;
-       gic_data.nr_redist_regions = nr_redist_regions;
-       gic_data.redist_stride = redist_stride;
+IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
 
-       gicv3_enable_quirks();
+#ifdef CONFIG_ACPI
+static void __iomem *dist_base;
+static struct redist_region *redist_regs __initdata;
+static u32 nr_redist_regions __initdata;
+static bool single_redist;
+
+static void __init
+gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
+{
+       static int count = 0;
+
+       redist_regs[count].phys_base = phys_base;
+       redist_regs[count].redist_base = redist_base;
+       redist_regs[count].single_redist = single_redist;
+       count++;
+}
+
+static int __init
+gic_acpi_parse_madt_redist(struct acpi_subtable_header *header,
+                          const unsigned long end)
+{
+       struct acpi_madt_generic_redistributor *redist =
+                       (struct acpi_madt_generic_redistributor *)header;
+       void __iomem *redist_base;
+
+       redist_base = ioremap(redist->base_address, redist->length);
+       if (!redist_base) {
+               pr_err("Couldn't map GICR region @%llx\n", redist->base_address);
+               return -ENOMEM;
+       }
+
+       gic_acpi_register_redist(redist->base_address, redist_base);
+       return 0;
+}
+
+static int __init
+gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header,
+                        const unsigned long end)
+{
+       struct acpi_madt_generic_interrupt *gicc =
+                               (struct acpi_madt_generic_interrupt *)header;
+       u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
+       u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
+       void __iomem *redist_base;
+
+       redist_base = ioremap(gicc->gicr_base_address, size);
+       if (!redist_base)
+               return -ENOMEM;
+
+       gic_acpi_register_redist(gicc->gicr_base_address, redist_base);
+       return 0;
+}
+
+static int __init gic_acpi_collect_gicr_base(void)
+{
+       acpi_tbl_entry_handler redist_parser;
+       enum acpi_madt_type type;
+
+       if (single_redist) {
+               type = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
+               redist_parser = gic_acpi_parse_madt_gicc;
+       } else {
+               type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR;
+               redist_parser = gic_acpi_parse_madt_redist;
+       }
+
+       /* Collect redistributor base addresses in GICR entries */
+       if (acpi_table_parse_madt(type, redist_parser, 0) > 0)
+               return 0;
+
+       pr_info("No valid GICR entries exist\n");
+       return -ENODEV;
+}
+
+static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header,
+                                 const unsigned long end)
+{
+       /* Subtable presence means that redist exists, that's it */
+       return 0;
+}
+
+static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header,
+                                     const unsigned long end)
+{
+       struct acpi_madt_generic_interrupt *gicc =
+                               (struct acpi_madt_generic_interrupt *)header;
 
        /*
-        * Find out how many interrupts are supported.
-        * The GIC only supports up to 1020 interrupt sources (SGI+PPI+SPI)
+        * If GICC is enabled and has valid gicr base address, then it means
+        * GICR base is presented via GICC
         */
-       typer = readl_relaxed(gic_data.dist_base + GICD_TYPER);
-       gic_data.rdists.id_bits = GICD_TYPER_ID_BITS(typer);
-       gic_irqs = GICD_TYPER_IRQS(typer);
-       if (gic_irqs > 1020)
-               gic_irqs = 1020;
-       gic_data.irq_nr = gic_irqs;
+       if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address)
+               return 0;
 
-       gic_data.domain = irq_domain_add_tree(node, &gic_irq_domain_ops,
-                                             &gic_data);
-       gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
+       return -ENODEV;
+}
 
-       if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
+static int __init gic_acpi_count_gicr_regions(void)
+{
+       int count;
+
+       /*
+        * Count how many redistributor regions we have. It is not allowed
+        * to mix redistributor description, GICR and GICC subtables have to be
+        * mutually exclusive.
+        */
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
+                                     gic_acpi_match_gicr, 0);
+       if (count > 0) {
+               single_redist = false;
+               return count;
+       }
+
+       count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+                                     gic_acpi_match_gicc, 0);
+       if (count > 0)
+               single_redist = true;
+
+       return count;
+}
+
+static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
+                                          struct acpi_probe_entry *ape)
+{
+       struct acpi_madt_generic_distributor *dist;
+       int count;
+
+       dist = (struct acpi_madt_generic_distributor *)header;
+       if (dist->version != ape->driver_data)
+               return false;
+
+       /* We need to do that exercise anyway, the sooner the better */
+       count = gic_acpi_count_gicr_regions();
+       if (count <= 0)
+               return false;
+
+       nr_redist_regions = count;
+       return true;
+}
+
+#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
+
+static int __init
+gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
+{
+       struct acpi_madt_generic_distributor *dist;
+       struct fwnode_handle *domain_handle;
+       int i, err;
+
+       /* Get distributor base address */
+       dist = (struct acpi_madt_generic_distributor *)header;
+       dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE);
+       if (!dist_base) {
+               pr_err("Unable to map GICD registers\n");
+               return -ENOMEM;
+       }
+
+       err = gic_validate_dist_version(dist_base);
+       if (err) {
+               pr_err("No distributor detected at @%p, giving up", dist_base);
+               goto out_dist_unmap;
+       }
+
+       redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions,
+                             GFP_KERNEL);
+       if (!redist_regs) {
                err = -ENOMEM;
-               goto out_free;
+               goto out_dist_unmap;
        }
 
-       set_handle_irq(gic_handle_irq);
+       err = gic_acpi_collect_gicr_base();
+       if (err)
+               goto out_redist_unmap;
 
-       if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis())
-               its_init(node, &gic_data.rdists, gic_data.domain);
+       domain_handle = irq_domain_alloc_fwnode(dist_base);
+       if (!domain_handle) {
+               err = -ENOMEM;
+               goto out_redist_unmap;
+       }
 
-       gic_smp_init();
-       gic_dist_init();
-       gic_cpu_init();
-       gic_cpu_pm_init();
+       err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0,
+                            domain_handle);
+       if (err)
+               goto out_fwhandle_free;
 
+       acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
        return 0;
 
-out_free:
-       if (gic_data.domain)
-               irq_domain_remove(gic_data.domain);
-       free_percpu(gic_data.rdists.rdist);
-out_unmap_rdist:
+out_fwhandle_free:
+       irq_domain_free_fwnode(domain_handle);
+out_redist_unmap:
        for (i = 0; i < nr_redist_regions; i++)
-               if (rdist_regs[i].redist_base)
-                       iounmap(rdist_regs[i].redist_base);
-       kfree(rdist_regs);
-out_unmap_dist:
+               if (redist_regs[i].redist_base)
+                       iounmap(redist_regs[i].redist_base);
+       kfree(redist_regs);
+out_dist_unmap:
        iounmap(dist_base);
        return err;
 }
-
-IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
+IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
+                    acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3,
+                    gic_acpi_init);
+IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
+                    acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4,
+                    gic_acpi_init);
+IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
+                    acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE,
+                    gic_acpi_init);
+#endif
index 911758c..282344b 100644 (file)
@@ -319,7 +319,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
        writel_relaxed(val | bit, reg);
        raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
 
-       return IRQ_SET_MASK_OK;
+       return IRQ_SET_MASK_OK_DONE;
 }
 #endif
 
@@ -384,9 +384,6 @@ static struct irq_chip gic_chip = {
        .irq_unmask             = gic_unmask_irq,
        .irq_eoi                = gic_eoi_irq,
        .irq_set_type           = gic_set_type,
-#ifdef CONFIG_SMP
-       .irq_set_affinity       = gic_set_affinity,
-#endif
        .irq_get_irqchip_state  = gic_irq_get_irqchip_state,
        .irq_set_irqchip_state  = gic_irq_set_irqchip_state,
        .flags                  = IRQCHIP_SET_TYPE_MASKED |
@@ -400,9 +397,6 @@ static struct irq_chip gic_eoimode1_chip = {
        .irq_unmask             = gic_unmask_irq,
        .irq_eoi                = gic_eoimode1_eoi_irq,
        .irq_set_type           = gic_set_type,
-#ifdef CONFIG_SMP
-       .irq_set_affinity       = gic_set_affinity,
-#endif
        .irq_get_irqchip_state  = gic_irq_get_irqchip_state,
        .irq_set_irqchip_state  = gic_irq_set_irqchip_state,
        .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
@@ -443,7 +437,7 @@ static void gic_cpu_if_up(struct gic_chip_data *gic)
        u32 bypass = 0;
        u32 mode = 0;
 
-       if (static_key_true(&supports_deactivate))
+       if (gic == &gic_data[0] && static_key_true(&supports_deactivate))
                mode = GIC_CPU_CTRL_EOImodeNS;
 
        /*
@@ -1039,6 +1033,11 @@ static void __init __gic_init_bases(unsigned int gic_nr, int irq_start,
                gic->chip.name = kasprintf(GFP_KERNEL, "GIC-%d", gic_nr);
        }
 
+#ifdef CONFIG_SMP
+       if (gic_nr == 0)
+               gic->chip.irq_set_affinity = gic_set_affinity;
+#endif
+
 #ifdef CONFIG_GIC_NON_BANKED
        if (percpu_offset) { /* Frankein-GIC without banked registers... */
                unsigned int cpu;
index 9e17ef2..94a30da 100644 (file)
@@ -29,16 +29,32 @@ struct gic_pcpu_mask {
        DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS);
 };
 
+struct gic_irq_spec {
+       enum {
+               GIC_DEVICE,
+               GIC_IPI
+       } type;
+
+       union {
+               struct cpumask *ipimask;
+               unsigned int hwirq;
+       };
+};
+
 static unsigned long __gic_base_addr;
+
 static void __iomem *gic_base;
 static struct gic_pcpu_mask pcpu_masks[NR_CPUS];
 static DEFINE_SPINLOCK(gic_lock);
 static struct irq_domain *gic_irq_domain;
+static struct irq_domain *gic_dev_domain;
+static struct irq_domain *gic_ipi_domain;
 static int gic_shared_intrs;
 static int gic_vpes;
 static unsigned int gic_cpu_pin;
 static unsigned int timer_cpu_pin;
 static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller;
+DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS);
 
 static void __gic_irq_dispatch(void);
 
@@ -264,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set)
                  GIC_VPE_EIC_SS(irq), set);
 }
 
-void gic_send_ipi(unsigned int intr)
+static void gic_send_ipi(struct irq_data *d, unsigned int cpu)
 {
-       gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr));
+       irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d));
+
+       gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq));
 }
 
 int gic_get_c0_compare_int(void)
@@ -449,7 +467,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask,
        gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp)));
 
        /* Update the pcpu_masks */
-       for (i = 0; i < NR_CPUS; i++)
+       for (i = 0; i < gic_vpes; i++)
                clear_bit(irq, pcpu_masks[i].pcpu_mask);
        set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask);
 
@@ -479,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = {
 #ifdef CONFIG_SMP
        .irq_set_affinity       =       gic_set_affinity,
 #endif
+       .ipi_send_single        =       gic_send_ipi,
 };
 
 static void gic_handle_local_int(bool chained)
@@ -572,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc)
        gic_handle_shared_int(true);
 }
 
-#ifdef CONFIG_MIPS_GIC_IPI
-static int gic_resched_int_base;
-static int gic_call_int_base;
-
-unsigned int plat_ipi_resched_int_xlate(unsigned int cpu)
-{
-       return gic_resched_int_base + cpu;
-}
-
-unsigned int plat_ipi_call_int_xlate(unsigned int cpu)
-{
-       return gic_call_int_base + cpu;
-}
-
-static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
-{
-       scheduler_ipi();
-
-       return IRQ_HANDLED;
-}
-
-static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
-{
-       generic_smp_call_function_interrupt();
-
-       return IRQ_HANDLED;
-}
-
-static struct irqaction irq_resched = {
-       .handler        = ipi_resched_interrupt,
-       .flags          = IRQF_PERCPU,
-       .name           = "IPI resched"
-};
-
-static struct irqaction irq_call = {
-       .handler        = ipi_call_interrupt,
-       .flags          = IRQF_PERCPU,
-       .name           = "IPI call"
-};
-
-static __init void gic_ipi_init_one(unsigned int intr, int cpu,
-                                   struct irqaction *action)
-{
-       int virq = irq_create_mapping(gic_irq_domain,
-                                     GIC_SHARED_TO_HWIRQ(intr));
-       int i;
-
-       gic_map_to_vpe(intr, mips_cm_vp_id(cpu));
-       for (i = 0; i < NR_CPUS; i++)
-               clear_bit(intr, pcpu_masks[i].pcpu_mask);
-       set_bit(intr, pcpu_masks[cpu].pcpu_mask);
-
-       irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING);
-
-       irq_set_handler(virq, handle_percpu_irq);
-       setup_irq(virq, action);
-}
-
-static __init void gic_ipi_init(void)
-{
-       int i;
-
-       /* Use last 2 * NR_CPUS interrupts as IPIs */
-       gic_resched_int_base = gic_shared_intrs - nr_cpu_ids;
-       gic_call_int_base = gic_resched_int_base - nr_cpu_ids;
-
-       for (i = 0; i < nr_cpu_ids; i++) {
-               gic_ipi_init_one(gic_call_int_base + i, i, &irq_call);
-               gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched);
-       }
-}
-#else
-static inline void gic_ipi_init(void)
-{
-}
-#endif
-
 static void __init gic_basic_init(void)
 {
        unsigned int i;
@@ -753,19 +695,21 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq,
 }
 
 static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq,
-                                    irq_hw_number_t hw)
+                                    irq_hw_number_t hw, unsigned int vpe)
 {
        int intr = GIC_HWIRQ_TO_SHARED(hw);
        unsigned long flags;
+       int i;
 
        irq_set_chip_and_handler(virq, &gic_level_irq_controller,
                                 handle_level_irq);
 
        spin_lock_irqsave(&gic_lock, flags);
        gic_map_to_pin(intr, gic_cpu_pin);
-       /* Map to VPE 0 by default */
-       gic_map_to_vpe(intr, 0);
-       set_bit(intr, pcpu_masks[0].pcpu_mask);
+       gic_map_to_vpe(intr, vpe);
+       for (i = 0; i < gic_vpes; i++)
+               clear_bit(intr, pcpu_masks[i].pcpu_mask);
+       set_bit(intr, pcpu_masks[vpe].pcpu_mask);
        spin_unlock_irqrestore(&gic_lock, flags);
 
        return 0;
@@ -776,10 +720,93 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq,
 {
        if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS)
                return gic_local_irq_domain_map(d, virq, hw);
-       return gic_shared_irq_domain_map(d, virq, hw);
+       return gic_shared_irq_domain_map(d, virq, hw, 0);
 }
 
-static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
+static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq,
+                               unsigned int nr_irqs, void *arg)
+{
+       struct gic_irq_spec *spec = arg;
+       irq_hw_number_t hwirq, base_hwirq;
+       int cpu, ret, i;
+
+       if (spec->type == GIC_DEVICE) {
+               /* verify that it doesn't conflict with an IPI irq */
+               if (test_bit(spec->hwirq, ipi_resrv))
+                       return -EBUSY;
+       } else {
+               base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs);
+               if (base_hwirq == gic_shared_intrs) {
+                       return -ENOMEM;
+               }
+
+               /* check that we have enough space */
+               for (i = base_hwirq; i < nr_irqs; i++) {
+                       if (!test_bit(i, ipi_resrv))
+                               return -EBUSY;
+               }
+               bitmap_clear(ipi_resrv, base_hwirq, nr_irqs);
+
+               /* map the hwirq for each cpu consecutively */
+               i = 0;
+               for_each_cpu(cpu, spec->ipimask) {
+                       hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i);
+
+                       ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq,
+                                                           &gic_edge_irq_controller,
+                                                           NULL);
+                       if (ret)
+                               goto error;
+
+                       ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu);
+                       if (ret)
+                               goto error;
+
+                       i++;
+               }
+
+               /*
+                * tell the parent about the base hwirq we allocated so it can
+                * set its own domain data
+                */
+               spec->hwirq = base_hwirq;
+       }
+
+       return 0;
+error:
+       bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
+       return ret;
+}
+
+void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
+                        unsigned int nr_irqs)
+{
+       irq_hw_number_t base_hwirq;
+       struct irq_data *data;
+
+       data = irq_get_irq_data(virq);
+       if (!data)
+               return;
+
+       base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data));
+       bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
+}
+
+int gic_irq_domain_match(struct irq_domain *d, struct device_node *node,
+                        enum irq_domain_bus_token bus_token)
+{
+       /* this domain should'nt be accessed directly */
+       return 0;
+}
+
+static const struct irq_domain_ops gic_irq_domain_ops = {
+       .map = gic_irq_domain_map,
+       .alloc = gic_irq_domain_alloc,
+       .free = gic_irq_domain_free,
+       .match = gic_irq_domain_match,
+};
+
+static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
                                const u32 *intspec, unsigned int intsize,
                                irq_hw_number_t *out_hwirq,
                                unsigned int *out_type)
@@ -798,9 +825,130 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
        return 0;
 }
 
-static const struct irq_domain_ops gic_irq_domain_ops = {
-       .map = gic_irq_domain_map,
-       .xlate = gic_irq_domain_xlate,
+static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq,
+                               unsigned int nr_irqs, void *arg)
+{
+       struct irq_fwspec *fwspec = arg;
+       struct gic_irq_spec spec = {
+               .type = GIC_DEVICE,
+               .hwirq = fwspec->param[1],
+       };
+       int i, ret;
+       bool is_shared = fwspec->param[0] == GIC_SHARED;
+
+       if (is_shared) {
+               ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
+               if (ret)
+                       return ret;
+       }
+
+       for (i = 0; i < nr_irqs; i++) {
+               irq_hw_number_t hwirq;
+
+               if (is_shared)
+                       hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i);
+               else
+                       hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq + i);
+
+               ret = irq_domain_set_hwirq_and_chip(d, virq + i,
+                                                   hwirq,
+                                                   &gic_level_irq_controller,
+                                                   NULL);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+void gic_dev_domain_free(struct irq_domain *d, unsigned int virq,
+                        unsigned int nr_irqs)
+{
+       /* no real allocation is done for dev irqs, so no need to free anything */
+       return;
+}
+
+static struct irq_domain_ops gic_dev_domain_ops = {
+       .xlate = gic_dev_domain_xlate,
+       .alloc = gic_dev_domain_alloc,
+       .free = gic_dev_domain_free,
+};
+
+static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
+                               const u32 *intspec, unsigned int intsize,
+                               irq_hw_number_t *out_hwirq,
+                               unsigned int *out_type)
+{
+       /*
+        * There's nothing to translate here. hwirq is dynamically allocated and
+        * the irq type is always edge triggered.
+        * */
+       *out_hwirq = 0;
+       *out_type = IRQ_TYPE_EDGE_RISING;
+
+       return 0;
+}
+
+static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq,
+                               unsigned int nr_irqs, void *arg)
+{
+       struct cpumask *ipimask = arg;
+       struct gic_irq_spec spec = {
+               .type = GIC_IPI,
+               .ipimask = ipimask
+       };
+       int ret, i;
+
+       ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
+       if (ret)
+               return ret;
+
+       /* the parent should have set spec.hwirq to the base_hwirq it allocated */
+       for (i = 0; i < nr_irqs; i++) {
+               ret = irq_domain_set_hwirq_and_chip(d, virq + i,
+                                                   GIC_SHARED_TO_HWIRQ(spec.hwirq + i),
+                                                   &gic_edge_irq_controller,
+                                                   NULL);
+               if (ret)
+                       goto error;
+
+               ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING);
+               if (ret)
+                       goto error;
+       }
+
+       return 0;
+error:
+       irq_domain_free_irqs_parent(d, virq, nr_irqs);
+       return ret;
+}
+
+void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq,
+                        unsigned int nr_irqs)
+{
+       irq_domain_free_irqs_parent(d, virq, nr_irqs);
+}
+
+int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node,
+                        enum irq_domain_bus_token bus_token)
+{
+       bool is_ipi;
+
+       switch (bus_token) {
+       case DOMAIN_BUS_IPI:
+               is_ipi = d->bus_token == bus_token;
+               return to_of_node(d->fwnode) == node && is_ipi;
+               break;
+       default:
+               return 0;
+       }
+}
+
+static struct irq_domain_ops gic_ipi_domain_ops = {
+       .xlate = gic_ipi_domain_xlate,
+       .alloc = gic_ipi_domain_alloc,
+       .free = gic_ipi_domain_free,
+       .match = gic_ipi_domain_match,
 };
 
 static void __init __gic_init(unsigned long gic_base_addr,
@@ -809,6 +957,7 @@ static void __init __gic_init(unsigned long gic_base_addr,
                              struct device_node *node)
 {
        unsigned int gicconfig;
+       unsigned int v[2];
 
        __gic_base_addr = gic_base_addr;
 
@@ -864,9 +1013,32 @@ static void __init __gic_init(unsigned long gic_base_addr,
        if (!gic_irq_domain)
                panic("Failed to add GIC IRQ domain");
 
-       gic_basic_init();
+       gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0,
+                                                 GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
+                                                 node, &gic_dev_domain_ops, NULL);
+       if (!gic_dev_domain)
+               panic("Failed to add GIC DEV domain");
+
+       gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain,
+                                                 IRQ_DOMAIN_FLAG_IPI_PER_CPU,
+                                                 GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
+                                                 node, &gic_ipi_domain_ops, NULL);
+       if (!gic_ipi_domain)
+               panic("Failed to add GIC IPI domain");
 
-       gic_ipi_init();
+       gic_ipi_domain->bus_token = DOMAIN_BUS_IPI;
+
+       if (node &&
+           !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) {
+               bitmap_set(ipi_resrv, v[0], v[1]);
+       } else {
+               /* Make the last 2 * gic_vpes available for IPIs */
+               bitmap_set(ipi_resrv,
+                          gic_shared_intrs - 2 * gic_vpes,
+                          2 * gic_vpes);
+       }
+
+       gic_basic_init();
 }
 
 void __init gic_init(unsigned long gic_base_addr,
diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c
new file mode 100644 (file)
index 0000000..b4d3678
--- /dev/null
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2016 Marvell
+ *
+ * Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#define pr_fmt(fmt) "GIC-ODMI: " fmt
+
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+#define GICP_ODMIN_SET                 0x40
+#define   GICP_ODMI_INT_NUM_SHIFT      12
+#define GICP_ODMIN_GM_EP_R0            0x110
+#define GICP_ODMIN_GM_EP_R1            0x114
+#define GICP_ODMIN_GM_EA_R0            0x108
+#define GICP_ODMIN_GM_EA_R1            0x118
+
+/*
+ * We don't support the group events, so we simply have 8 interrupts
+ * per frame.
+ */
+#define NODMIS_SHIFT           3
+#define NODMIS_PER_FRAME       (1 << NODMIS_SHIFT)
+#define NODMIS_MASK            (NODMIS_PER_FRAME - 1)
+
+struct odmi_data {
+       struct resource res;
+       void __iomem *base;
+       unsigned int spi_base;
+};
+
+static struct odmi_data *odmis;
+static unsigned long *odmis_bm;
+static unsigned int odmis_count;
+
+/* Protects odmis_bm */
+static DEFINE_SPINLOCK(odmis_bm_lock);
+
+static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
+{
+       struct odmi_data *odmi;
+       phys_addr_t addr;
+       unsigned int odmin;
+
+       if (WARN_ON(d->hwirq >= odmis_count * NODMIS_PER_FRAME))
+               return;
+
+       odmi = &odmis[d->hwirq >> NODMIS_SHIFT];
+       odmin = d->hwirq & NODMIS_MASK;
+
+       addr = odmi->res.start + GICP_ODMIN_SET;
+
+       msg->address_hi = upper_32_bits(addr);
+       msg->address_lo = lower_32_bits(addr);
+       msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT;
+}
+
+static struct irq_chip odmi_irq_chip = {
+       .name                   = "ODMI",
+       .irq_mask               = irq_chip_mask_parent,
+       .irq_unmask             = irq_chip_unmask_parent,
+       .irq_eoi                = irq_chip_eoi_parent,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
+       .irq_compose_msi_msg    = odmi_compose_msi_msg,
+};
+
+static int odmi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                unsigned int nr_irqs, void *args)
+{
+       struct odmi_data *odmi = NULL;
+       struct irq_fwspec fwspec;
+       struct irq_data *d;
+       unsigned int hwirq, odmin;
+       int ret;
+
+       spin_lock(&odmis_bm_lock);
+       hwirq = find_first_zero_bit(odmis_bm, NODMIS_PER_FRAME * odmis_count);
+       if (hwirq >= NODMIS_PER_FRAME * odmis_count) {
+               spin_unlock(&odmis_bm_lock);
+               return -ENOSPC;
+       }
+
+       __set_bit(hwirq, odmis_bm);
+       spin_unlock(&odmis_bm_lock);
+
+       odmi = &odmis[hwirq >> NODMIS_SHIFT];
+       odmin = hwirq & NODMIS_MASK;
+
+       fwspec.fwnode = domain->parent->fwnode;
+       fwspec.param_count = 3;
+       fwspec.param[0] = GIC_SPI;
+       fwspec.param[1] = odmi->spi_base - 32 + odmin;
+       fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
+
+       ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
+       if (ret) {
+               pr_err("Cannot allocate parent IRQ\n");
+               spin_lock(&odmis_bm_lock);
+               __clear_bit(odmin, odmis_bm);
+               spin_unlock(&odmis_bm_lock);
+               return ret;
+       }
+
+       /* Configure the interrupt line to be edge */
+       d = irq_domain_get_irq_data(domain->parent, virq);
+       d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
+
+       irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+                                     &odmi_irq_chip, NULL);
+
+       return 0;
+}
+
+static void odmi_irq_domain_free(struct irq_domain *domain,
+                                unsigned int virq, unsigned int nr_irqs)
+{
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+
+       if (d->hwirq >= odmis_count * NODMIS_PER_FRAME) {
+               pr_err("Failed to teardown msi. Invalid hwirq %lu\n", d->hwirq);
+               return;
+       }
+
+       irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+
+       /* Actually free the MSI */
+       spin_lock(&odmis_bm_lock);
+       __clear_bit(d->hwirq, odmis_bm);
+       spin_unlock(&odmis_bm_lock);
+}
+
+static const struct irq_domain_ops odmi_domain_ops = {
+       .alloc  = odmi_irq_domain_alloc,
+       .free   = odmi_irq_domain_free,
+};
+
+static struct irq_chip odmi_msi_irq_chip = {
+       .name   = "ODMI",
+};
+
+static struct msi_domain_ops odmi_msi_ops = {
+};
+
+static struct msi_domain_info odmi_msi_domain_info = {
+       .flags  = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS),
+       .ops    = &odmi_msi_ops,
+       .chip   = &odmi_msi_irq_chip,
+};
+
+static int __init mvebu_odmi_init(struct device_node *node,
+                                 struct device_node *parent)
+{
+       struct irq_domain *inner_domain, *plat_domain;
+       int ret, i;
+
+       if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count))
+               return -EINVAL;
+
+       odmis = kcalloc(odmis_count, sizeof(struct odmi_data), GFP_KERNEL);
+       if (!odmis)
+               return -ENOMEM;
+
+       odmis_bm = kcalloc(BITS_TO_LONGS(odmis_count * NODMIS_PER_FRAME),
+                          sizeof(long), GFP_KERNEL);
+       if (!odmis_bm) {
+               ret = -ENOMEM;
+               goto err_alloc;
+       }
+
+       for (i = 0; i < odmis_count; i++) {
+               struct odmi_data *odmi = &odmis[i];
+
+               ret = of_address_to_resource(node, i, &odmi->res);
+               if (ret)
+                       goto err_unmap;
+
+               odmi->base = of_io_request_and_map(node, i, "odmi");
+               if (IS_ERR(odmi->base)) {
+                       ret = PTR_ERR(odmi->base);
+                       goto err_unmap;
+               }
+
+               if (of_property_read_u32_index(node, "marvell,spi-base",
+                                              i, &odmi->spi_base)) {
+                       ret = -EINVAL;
+                       goto err_unmap;
+               }
+       }
+
+       inner_domain = irq_domain_create_linear(of_node_to_fwnode(node),
+                                               odmis_count * NODMIS_PER_FRAME,
+                                               &odmi_domain_ops, NULL);
+       if (!inner_domain) {
+               ret = -ENOMEM;
+               goto err_unmap;
+       }
+
+       inner_domain->parent = irq_find_host(parent);
+
+       plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node),
+                                                    &odmi_msi_domain_info,
+                                                    inner_domain);
+       if (!plat_domain) {
+               ret = -ENOMEM;
+               goto err_remove_inner;
+       }
+
+       return 0;
+
+err_remove_inner:
+       irq_domain_remove(inner_domain);
+err_unmap:
+       for (i = 0; i < odmis_count; i++) {
+               struct odmi_data *odmi = &odmis[i];
+
+               if (odmi->base && !IS_ERR(odmi->base))
+                       iounmap(odmis[i].base);
+       }
+       kfree(odmis_bm);
+err_alloc:
+       kfree(odmis);
+       return ret;
+}
+
+IRQCHIP_DECLARE(mvebu_odmi, "marvell,odmi-controller", mvebu_odmi_init);
index c22e2d4..1730470 100644 (file)
@@ -183,7 +183,7 @@ static void __iomem * __init icoll_init_iobase(struct device_node *np)
        void __iomem *icoll_base;
 
        icoll_base = of_io_request_and_map(np, 0, np->name);
-       if (!icoll_base)
+       if (IS_ERR(icoll_base))
                panic("%s: unable to map resource", np->full_name);
        return icoll_base;
 }
@@ -241,6 +241,7 @@ static int __init asm9260_of_init(struct device_node *np,
                writel(0, icoll_priv.intr + i);
 
        icoll_add_domain(np, ASM9260_NUM_IRQS);
+       set_handle_irq(icoll_handle_irq);
 
        return 0;
 }
index c71914e..5dc5a76 100644 (file)
@@ -605,7 +605,7 @@ err:
        return ERR_PTR(ret);
 }
 
-static struct s3c_irq_data init_eint[32] = {
+static struct s3c_irq_data __maybe_unused init_eint[32] = {
        { .type = S3C_IRQTYPE_NONE, }, /* reserved */
        { .type = S3C_IRQTYPE_NONE, }, /* reserved */
        { .type = S3C_IRQTYPE_NONE, }, /* reserved */
index 0704362..376b280 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/of_irq.h>
 
 #include <asm/exception.h>
-#include <asm/mach/irq.h>
 
 #define SUN4I_IRQ_VECTOR_REG           0x00
 #define SUN4I_IRQ_PROTECTION_REG       0x08
index 0820f67..668730c 100644 (file)
@@ -160,9 +160,9 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node,
 
        gc = irq_get_domain_generic_chip(domain, 0);
        gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node));
-       if (!gc->reg_base) {
+       if (IS_ERR(gc->reg_base)) {
                pr_err("unable to map resource\n");
-               ret = -ENOMEM;
+               ret = PTR_ERR(gc->reg_base);
                goto fail_irqd_remove;
        }
 
diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c
new file mode 100644 (file)
index 0000000..bdbb5c0
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2014 Mans Rullgard <mans@mansr.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/slab.h>
+
+#define IRQ0_CTL_BASE          0x0000
+#define IRQ1_CTL_BASE          0x0100
+#define EDGE_CTL_BASE          0x0200
+#define IRQ2_CTL_BASE          0x0300
+
+#define IRQ_CTL_HI             0x18
+#define EDGE_CTL_HI            0x20
+
+#define IRQ_STATUS             0x00
+#define IRQ_RAWSTAT            0x04
+#define IRQ_EN_SET             0x08
+#define IRQ_EN_CLR             0x0c
+#define IRQ_SOFT_SET           0x10
+#define IRQ_SOFT_CLR           0x14
+
+#define EDGE_STATUS            0x00
+#define EDGE_RAWSTAT           0x04
+#define EDGE_CFG_RISE          0x08
+#define EDGE_CFG_FALL          0x0c
+#define EDGE_CFG_RISE_SET      0x10
+#define EDGE_CFG_RISE_CLR      0x14
+#define EDGE_CFG_FALL_SET      0x18
+#define EDGE_CFG_FALL_CLR      0x1c
+
+struct tangox_irq_chip {
+       void __iomem *base;
+       unsigned long ctl;
+};
+
+static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg)
+{
+       return readl_relaxed(chip->base + reg);
+}
+
+static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val)
+{
+       writel_relaxed(val, chip->base + reg);
+}
+
+static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status,
+                                int base)
+{
+       unsigned int hwirq;
+       unsigned int virq;
+
+       while (status) {
+               hwirq = __ffs(status);
+               virq = irq_find_mapping(dom, base + hwirq);
+               if (virq)
+                       generic_handle_irq(virq);
+               status &= ~BIT(hwirq);
+       }
+}
+
+static void tangox_irq_handler(struct irq_desc *desc)
+{
+       struct irq_domain *dom = irq_desc_get_handler_data(desc);
+       struct irq_chip *host_chip = irq_desc_get_chip(desc);
+       struct tangox_irq_chip *chip = dom->host_data;
+       unsigned int status_lo, status_hi;
+
+       chained_irq_enter(host_chip, desc);
+
+       status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS);
+       status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS);
+
+       tangox_dispatch_irqs(dom, status_lo, 0);
+       tangox_dispatch_irqs(dom, status_hi, 32);
+
+       chained_irq_exit(host_chip, desc);
+}
+
+static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type)
+{
+       struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+       struct tangox_irq_chip *chip = gc->domain->host_data;
+       struct irq_chip_regs *regs = &gc->chip_types[0].regs;
+
+       switch (flow_type & IRQ_TYPE_SENSE_MASK) {
+       case IRQ_TYPE_EDGE_RISING:
+               intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
+               intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
+               break;
+
+       case IRQ_TYPE_EDGE_FALLING:
+               intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
+               intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
+               break;
+
+       case IRQ_TYPE_LEVEL_HIGH:
+               intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
+               intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
+               break;
+
+       case IRQ_TYPE_LEVEL_LOW:
+               intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
+               intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
+               break;
+
+       default:
+               pr_err("Invalid trigger mode %x for IRQ %d\n",
+                      flow_type, d->irq);
+               return -EINVAL;
+       }
+
+       return irq_setup_alt_chip(d, flow_type);
+}
+
+static void __init tangox_irq_init_chip(struct irq_chip_generic *gc,
+                                       unsigned long ctl_offs,
+                                       unsigned long edge_offs)
+{
+       struct tangox_irq_chip *chip = gc->domain->host_data;
+       struct irq_chip_type *ct = gc->chip_types;
+       unsigned long ctl_base = chip->ctl + ctl_offs;
+       unsigned long edge_base = EDGE_CTL_BASE + edge_offs;
+       int i;
+
+       gc->reg_base = chip->base;
+       gc->unused = 0;
+
+       for (i = 0; i < 2; i++) {
+               ct[i].chip.irq_ack = irq_gc_ack_set_bit;
+               ct[i].chip.irq_mask = irq_gc_mask_disable_reg;
+               ct[i].chip.irq_mask_ack = irq_gc_mask_disable_reg_and_ack;
+               ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg;
+               ct[i].chip.irq_set_type = tangox_irq_set_type;
+               ct[i].chip.name = gc->domain->name;
+
+               ct[i].regs.enable = ctl_base + IRQ_EN_SET;
+               ct[i].regs.disable = ctl_base + IRQ_EN_CLR;
+               ct[i].regs.ack = edge_base + EDGE_RAWSTAT;
+               ct[i].regs.type = edge_base;
+       }
+
+       ct[0].type = IRQ_TYPE_LEVEL_MASK;
+       ct[0].handler = handle_level_irq;
+
+       ct[1].type = IRQ_TYPE_EDGE_BOTH;
+       ct[1].handler = handle_edge_irq;
+
+       intc_writel(chip, ct->regs.disable, 0xffffffff);
+       intc_writel(chip, ct->regs.ack, 0xffffffff);
+}
+
+static void __init tangox_irq_domain_init(struct irq_domain *dom)
+{
+       struct irq_chip_generic *gc;
+       int i;
+
+       for (i = 0; i < 2; i++) {
+               gc = irq_get_domain_generic_chip(dom, i * 32);
+               tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI);
+       }
+}
+
+static int __init tangox_irq_init(void __iomem *base, struct resource *baseres,
+                                 struct device_node *node)
+{
+       struct tangox_irq_chip *chip;
+       struct irq_domain *dom;
+       struct resource res;
+       int irq;
+       int err;
+
+       irq = irq_of_parse_and_map(node, 0);
+       if (!irq)
+               panic("%s: failed to get IRQ", node->name);
+
+       err = of_address_to_resource(node, 0, &res);
+       if (err)
+               panic("%s: failed to get address", node->name);
+
+       chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+       chip->ctl = res.start - baseres->start;
+       chip->base = base;
+
+       dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip);
+       if (!dom)
+               panic("%s: failed to create irqdomain", node->name);
+
+       err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name,
+                                            handle_level_irq, 0, 0, 0);
+       if (err)
+               panic("%s: failed to allocate irqchip", node->name);
+
+       tangox_irq_domain_init(dom);
+
+       irq_set_chained_handler(irq, tangox_irq_handler);
+       irq_set_handler_data(irq, dom);
+
+       return 0;
+}
+
+static int __init tangox_of_irq_init(struct device_node *node,
+                                    struct device_node *parent)
+{
+       struct device_node *c;
+       struct resource res;
+       void __iomem *base;
+
+       base = of_iomap(node, 0);
+       if (!base)
+               panic("%s: of_iomap failed", node->name);
+
+       of_address_to_resource(node, 0, &res);
+
+       for_each_child_of_node(node, c)
+               tangox_irq_init(base, &res, c);
+
+       return 0;
+}
+IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init);
index 4192bdc..2325fb3 100644 (file)
@@ -59,7 +59,7 @@ static int ts4800_irqdomain_map(struct irq_domain *d, unsigned int irq,
        return 0;
 }
 
-struct irq_domain_ops ts4800_ic_ops = {
+static const struct irq_domain_ops ts4800_ic_ops = {
        .map = ts4800_irqdomain_map,
        .xlate = irq_domain_xlate_onecell,
 };
index 2a506fe..d1f8ab9 100644 (file)
@@ -373,13 +373,7 @@ static void gigaset_freecshw(struct cardstate *cs)
 
 static void gigaset_device_release(struct device *dev)
 {
-       struct cardstate *cs = dev_get_drvdata(dev);
-
-       if (!cs)
-               return;
-       dev_set_drvdata(dev, NULL);
-       kfree(cs->hw.ser);
-       cs->hw.ser = NULL;
+       kfree(container_of(dev, struct ser_cardstate, dev.dev));
 }
 
 /*
@@ -408,7 +402,6 @@ static int gigaset_initcshw(struct cardstate *cs)
                cs->hw.ser = NULL;
                return rc;
        }
-       dev_set_drvdata(&cs->hw.ser->dev.dev, cs);
 
        tasklet_init(&cs->write_tasklet,
                     gigaset_modem_fill, (unsigned long) cs);
index 8e29447..afde4ed 100644 (file)
@@ -392,7 +392,7 @@ read_dma(struct tiger_ch *bc, u32 idx, int cnt)
        }
        stat = bchannel_get_rxbuf(&bc->bch, cnt);
        /* only transparent use the count here, HDLC overun is detected later */
-       if (stat == ENOMEM) {
+       if (stat == -ENOMEM) {
                pr_warning("%s.B%d: No memory for %d bytes\n",
                           card->name, bc->bch.nr, cnt);
                return;
index 33224cb..9f6acd5 100644 (file)
@@ -572,11 +572,13 @@ int nvm_register(struct request_queue *q, char *disk_name,
                }
        }
 
-       ret = nvm_get_sysblock(dev, &dev->sb);
-       if (!ret)
-               pr_err("nvm: device not initialized.\n");
-       else if (ret < 0)
-               pr_err("nvm: err (%d) on device initialization\n", ret);
+       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
+               ret = nvm_get_sysblock(dev, &dev->sb);
+               if (!ret)
+                       pr_err("nvm: device not initialized.\n");
+               else if (ret < 0)
+                       pr_err("nvm: err (%d) on device initialization\n", ret);
+       }
 
        /* register device with a supported media manager */
        down_write(&nvm_lock);
@@ -1055,9 +1057,11 @@ static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init)
        strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN);
        info.fs_ppa.ppa = -1;
 
-       ret = nvm_init_sysblock(dev, &info);
-       if (ret)
-               return ret;
+       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
+               ret = nvm_init_sysblock(dev, &info);
+               if (ret)
+                       return ret;
+       }
 
        memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info));
 
@@ -1117,7 +1121,10 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
                dev->mt = NULL;
        }
 
-       return nvm_dev_factory(dev, fact.flags);
+       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT)
+               return nvm_dev_factory(dev, fact.flags);
+
+       return 0;
 }
 
 static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg)
index d8c7595..307db1e 100644 (file)
@@ -300,8 +300,10 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
        }
 
        page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
-       if (!page)
+       if (!page) {
+               bio_put(bio);
                return -ENOMEM;
+       }
 
        while ((slot = find_first_zero_bit(rblk->invalid_pages,
                                            nr_pgs_per_blk)) < nr_pgs_per_blk) {
index ef13ac7..f7b3733 100644 (file)
@@ -174,8 +174,7 @@ static inline sector_t rrpc_get_sector(sector_t laddr)
 static inline int request_intersects(struct rrpc_inflight_rq *r,
                                sector_t laddr_start, sector_t laddr_end)
 {
-       return (laddr_end >= r->l_start && laddr_end <= r->l_end) &&
-               (laddr_start >= r->l_start && laddr_start <= r->l_end);
+       return (laddr_end >= r->l_start) && (laddr_start <= r->l_end);
 }
 
 static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
@@ -184,6 +183,8 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
        sector_t laddr_end = laddr + pages - 1;
        struct rrpc_inflight_rq *rtmp;
 
+       WARN_ON(irqs_disabled());
+
        spin_lock_irq(&rrpc->inflights.lock);
        list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) {
                if (unlikely(request_intersects(rtmp, laddr, laddr_end))) {
index 546d05f..b2bbe86 100644 (file)
@@ -81,6 +81,7 @@ config STI_MBOX
 config MAILBOX_TEST
        tristate "Mailbox Test Client"
        depends on OF
+       depends on HAS_IOMEM
        help
          Test client to help with testing new Controller driver
          implementations.
index 45d85ae..8f779a1 100644 (file)
@@ -81,16 +81,10 @@ static struct mbox_controller pcc_mbox_ctrl = {};
  */
 static struct mbox_chan *get_pcc_channel(int id)
 {
-       struct mbox_chan *pcc_chan;
-
        if (id < 0 || id > pcc_mbox_ctrl.num_chans)
                return ERR_PTR(-ENOENT);
 
-       pcc_chan = (struct mbox_chan *)
-               (unsigned long) pcc_mbox_channels +
-               (id * sizeof(*pcc_chan));
-
-       return pcc_chan;
+       return &pcc_mbox_channels[id];
 }
 
 /**
index 4f22e91..d80cce4 100644 (file)
@@ -210,10 +210,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
        struct block_device *bdev;
        struct mddev *mddev = bitmap->mddev;
        struct bitmap_storage *store = &bitmap->storage;
-       int node_offset = 0;
-
-       if (mddev_is_clustered(bitmap->mddev))
-               node_offset = bitmap->cluster_slot * store->file_pages;
 
        while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
                int size = PAGE_SIZE;
index 5df4048..dd83492 100644 (file)
@@ -1191,6 +1191,8 @@ static void dm_unprep_request(struct request *rq)
 
        if (clone)
                free_rq_clone(clone);
+       else if (!tio->md->queue->mq_ops)
+               free_rq_tio(tio);
 }
 
 /*
index 4a8e150..685aa2d 100644 (file)
@@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
                conf->nfaults = n+1;
 }
 
-static void make_request(struct mddev *mddev, struct bio *bio)
+static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 {
        struct faulty_conf *conf = mddev->private;
        int failit = 0;
@@ -226,7 +226,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
        generic_make_request(bio);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void faulty_status(struct seq_file *seq, struct mddev *mddev)
 {
        struct faulty_conf *conf = mddev->private;
        int n;
@@ -259,7 +259,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 }
 
 
-static int reshape(struct mddev *mddev)
+static int faulty_reshape(struct mddev *mddev)
 {
        int mode = mddev->new_layout & ModeMask;
        int count = mddev->new_layout >> ModeShift;
@@ -299,7 +299,7 @@ static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disk
        return sectors;
 }
 
-static int run(struct mddev *mddev)
+static int faulty_run(struct mddev *mddev)
 {
        struct md_rdev *rdev;
        int i;
@@ -327,7 +327,7 @@ static int run(struct mddev *mddev)
        md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
        mddev->private = conf;
 
-       reshape(mddev);
+       faulty_reshape(mddev);
 
        return 0;
 }
@@ -344,11 +344,11 @@ static struct md_personality faulty_personality =
        .name           = "faulty",
        .level          = LEVEL_FAULTY,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = faulty_make_request,
+       .run            = faulty_run,
        .free           = faulty_free,
-       .status         = status,
-       .check_reshape  = reshape,
+       .status         = faulty_status,
+       .check_reshape  = faulty_reshape,
        .size           = faulty_size,
 };
 
index 0ded8e9..dd97d42 100644 (file)
@@ -293,6 +293,7 @@ static void recover_bitmaps(struct md_thread *thread)
 dlm_unlock:
                dlm_unlock_sync(bm_lockres);
 clear_bit:
+               lockres_free(bm_lockres);
                clear_bit(slot, &cinfo->recovery_map);
        }
 }
@@ -682,8 +683,10 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
                bm_lockres = lockres_init(mddev, str, NULL, 1);
                if (!bm_lockres)
                        return -ENOMEM;
-               if (i == (cinfo->slot_number - 1))
+               if (i == (cinfo->slot_number - 1)) {
+                       lockres_free(bm_lockres);
                        continue;
+               }
 
                bm_lockres->flags |= DLM_LKF_NOQUEUE;
                ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
@@ -858,6 +861,7 @@ static int leave(struct mddev *mddev)
        lockres_free(cinfo->token_lockres);
        lockres_free(cinfo->ack_lockres);
        lockres_free(cinfo->no_new_dev_lockres);
+       lockres_free(cinfo->resync_lockres);
        lockres_free(cinfo->bitmap_lockres);
        unlock_all_bitmaps(mddev);
        dlm_release_lockspace(cinfo->lockspace, 2);
index c4b9134..4e3843f 100644 (file)
@@ -1044,7 +1044,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
        kfree(plug);
 }
 
-static void make_request(struct mddev *mddev, struct bio * bio)
+static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 {
        struct r1conf *conf = mddev->private;
        struct raid1_info *mirror;
@@ -1422,7 +1422,7 @@ read_again:
        wake_up(&conf->wait_barrier);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid1_status(struct seq_file *seq, struct mddev *mddev)
 {
        struct r1conf *conf = mddev->private;
        int i;
@@ -1439,7 +1439,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
        seq_printf(seq, "]");
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 {
        char b[BDEVNAME_SIZE];
        struct r1conf *conf = mddev->private;
@@ -2472,7 +2472,8 @@ static int init_resync(struct r1conf *conf)
  * that can be installed to exclude normal IO requests.
  */
 
-static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
+                                  int *skipped)
 {
        struct r1conf *conf = mddev->private;
        struct r1bio *r1_bio;
@@ -2890,7 +2891,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 }
 
 static void raid1_free(struct mddev *mddev, void *priv);
-static int run(struct mddev *mddev)
+static int raid1_run(struct mddev *mddev)
 {
        struct r1conf *conf;
        int i;
@@ -3170,15 +3171,15 @@ static struct md_personality raid1_personality =
        .name           = "raid1",
        .level          = 1,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = raid1_make_request,
+       .run            = raid1_run,
        .free           = raid1_free,
-       .status         = status,
-       .error_handler  = error,
+       .status         = raid1_status,
+       .error_handler  = raid1_error,
        .hot_add_disk   = raid1_add_disk,
        .hot_remove_disk= raid1_remove_disk,
        .spare_active   = raid1_spare_active,
-       .sync_request   = sync_request,
+       .sync_request   = raid1_sync_request,
        .resize         = raid1_resize,
        .size           = raid1_size,
        .check_reshape  = raid1_reshape,
index ce959b4..1c1447d 100644 (file)
@@ -1442,7 +1442,7 @@ retry_write:
        one_write_done(r10_bio);
 }
 
-static void make_request(struct mddev *mddev, struct bio *bio)
+static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
        struct r10conf *conf = mddev->private;
        sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
@@ -1484,7 +1484,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
        wake_up(&conf->wait_barrier);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid10_status(struct seq_file *seq, struct mddev *mddev)
 {
        struct r10conf *conf = mddev->private;
        int i;
@@ -1562,7 +1562,7 @@ static int enough(struct r10conf *conf, int ignore)
                _enough(conf, 1, ignore);
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 {
        char b[BDEVNAME_SIZE];
        struct r10conf *conf = mddev->private;
@@ -2802,7 +2802,7 @@ static int init_resync(struct r10conf *conf)
  *
  */
 
-static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
+static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
                             int *skipped)
 {
        struct r10conf *conf = mddev->private;
@@ -3523,7 +3523,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
        return ERR_PTR(err);
 }
 
-static int run(struct mddev *mddev)
+static int raid10_run(struct mddev *mddev)
 {
        struct r10conf *conf;
        int i, disk_idx, chunk_size;
@@ -4617,15 +4617,15 @@ static struct md_personality raid10_personality =
        .name           = "raid10",
        .level          = 10,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = raid10_make_request,
+       .run            = raid10_run,
        .free           = raid10_free,
-       .status         = status,
-       .error_handler  = error,
+       .status         = raid10_status,
+       .error_handler  = raid10_error,
        .hot_add_disk   = raid10_add_disk,
        .hot_remove_disk= raid10_remove_disk,
        .spare_active   = raid10_spare_active,
-       .sync_request   = sync_request,
+       .sync_request   = raid10_sync_request,
        .quiesce        = raid10_quiesce,
        .size           = raid10_size,
        .resize         = raid10_resize,
index a086014..b4f02c9 100644 (file)
@@ -2496,7 +2496,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
        dev->sector = raid5_compute_blocknr(sh, i, previous);
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 {
        char b[BDEVNAME_SIZE];
        struct r5conf *conf = mddev->private;
@@ -2958,7 +2958,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
         * If several bio share a stripe. The bio bi_phys_segments acts as a
         * reference count to avoid race. The reference count should already be
         * increased before this function is called (for example, in
-        * make_request()), so other bio sharing this stripe will not free the
+        * raid5_make_request()), so other bio sharing this stripe will not free the
         * stripe. If a stripe is owned by one stripe, the stripe lock will
         * protect it.
         */
@@ -5135,7 +5135,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
        }
 }
 
-static void make_request(struct mddev *mddev, struct bio * bi)
+static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
        struct r5conf *conf = mddev->private;
        int dd_idx;
@@ -5225,7 +5225,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
                new_sector = raid5_compute_sector(conf, logical_sector,
                                                  previous,
                                                  &dd_idx, NULL);
-               pr_debug("raid456: make_request, sector %llu logical %llu\n",
+               pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
                        (unsigned long long)new_sector,
                        (unsigned long long)logical_sector);
 
@@ -5575,7 +5575,8 @@ ret:
        return retn;
 }
 
-static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
+                                         int *skipped)
 {
        struct r5conf *conf = mddev->private;
        struct stripe_head *sh;
@@ -6674,7 +6675,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
        return 0;
 }
 
-static int run(struct mddev *mddev)
+static int raid5_run(struct mddev *mddev)
 {
        struct r5conf *conf;
        int working_disks = 0;
@@ -7048,7 +7049,7 @@ static void raid5_free(struct mddev *mddev, void *priv)
        mddev->to_remove = &raid5_attrs_group;
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid5_status(struct seq_file *seq, struct mddev *mddev)
 {
        struct r5conf *conf = mddev->private;
        int i;
@@ -7864,15 +7865,15 @@ static struct md_personality raid6_personality =
        .name           = "raid6",
        .level          = 6,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = raid5_make_request,
+       .run            = raid5_run,
        .free           = raid5_free,
-       .status         = status,
-       .error_handler  = error,
+       .status         = raid5_status,
+       .error_handler  = raid5_error,
        .hot_add_disk   = raid5_add_disk,
        .hot_remove_disk= raid5_remove_disk,
        .spare_active   = raid5_spare_active,
-       .sync_request   = sync_request,
+       .sync_request   = raid5_sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
        .check_reshape  = raid6_check_reshape,
@@ -7887,15 +7888,15 @@ static struct md_personality raid5_personality =
        .name           = "raid5",
        .level          = 5,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = raid5_make_request,
+       .run            = raid5_run,
        .free           = raid5_free,
-       .status         = status,
-       .error_handler  = error,
+       .status         = raid5_status,
+       .error_handler  = raid5_error,
        .hot_add_disk   = raid5_add_disk,
        .hot_remove_disk= raid5_remove_disk,
        .spare_active   = raid5_spare_active,
-       .sync_request   = sync_request,
+       .sync_request   = raid5_sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
        .check_reshape  = raid5_check_reshape,
@@ -7911,15 +7912,15 @@ static struct md_personality raid4_personality =
        .name           = "raid4",
        .level          = 4,
        .owner          = THIS_MODULE,
-       .make_request   = make_request,
-       .run            = run,
+       .make_request   = raid5_make_request,
+       .run            = raid5_run,
        .free           = raid5_free,
-       .status         = status,
-       .error_handler  = error,
+       .status         = raid5_status,
+       .error_handler  = raid5_error,
        .hot_add_disk   = raid5_add_disk,
        .hot_remove_disk= raid5_remove_disk,
        .spare_active   = raid5_spare_active,
-       .sync_request   = sync_request,
+       .sync_request   = raid5_sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
        .check_reshape  = raid5_check_reshape,
index 0e209b5..c6abeb4 100644 (file)
@@ -903,9 +903,18 @@ static int tda1004x_get_fe(struct dvb_frontend *fe)
 {
        struct dtv_frontend_properties *fe_params = &fe->dtv_property_cache;
        struct tda1004x_state* state = fe->demodulator_priv;
+       int status;
 
        dprintk("%s\n", __func__);
 
+       status = tda1004x_read_byte(state, TDA1004X_STATUS_CD);
+       if (status == -1)
+               return -EIO;
+
+       /* Only update the properties cache if device is locked */
+       if (!(status & 8))
+               return 0;
+
        // inversion status
        fe_params->inversion = INVERSION_OFF;
        if (tda1004x_read_byte(state, TDA1004X_CONFC1) & 0x20)
index 7e9cbf7..fb7ed73 100644 (file)
@@ -497,7 +497,7 @@ static int adp1653_probe(struct i2c_client *client,
                if (!client->dev.platform_data) {
                        dev_err(&client->dev,
                                "Neither DT not platform data provided\n");
-                       return EINVAL;
+                       return -EINVAL;
                }
                flash->platform_data = client->dev.platform_data;
        }
index f8dd750..e1719ff 100644 (file)
@@ -1960,10 +1960,9 @@ static int adv76xx_isr(struct v4l2_subdev *sd, u32 status, bool *handled)
        }
 
        /* tx 5v detect */
-       tx_5v = io_read(sd, 0x70) & info->cable_det_mask;
+       tx_5v = irq_reg_0x70 & info->cable_det_mask;
        if (tx_5v) {
                v4l2_dbg(1, debug, sd, "%s: tx_5v: 0x%x\n", __func__, tx_5v);
-               io_write(sd, 0x71, tx_5v);
                adv76xx_s_detect_tx_5v_ctrl(sd);
                if (handled)
                        *handled = true;
index 8304919..bf82726 100644 (file)
@@ -478,7 +478,6 @@ static const struct i2c_device_id ir_kbd_id[] = {
        { "ir_rx_z8f0811_hdpvr", 0 },
        { }
 };
-MODULE_DEVICE_TABLE(i2c, ir_kbd_id);
 
 static struct i2c_driver ir_kbd_driver = {
        .driver = {
index b9e43ff..cbe4711 100644 (file)
@@ -144,8 +144,7 @@ static int s5k6a3_set_fmt(struct v4l2_subdev *sd,
        mf = __s5k6a3_get_format(sensor, cfg, fmt->pad, fmt->which);
        if (mf) {
                mutex_lock(&sensor->lock);
-               if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE)
-                       *mf = fmt->format;
+               *mf = fmt->format;
                mutex_unlock(&sensor->lock);
        }
        return 0;
index 7dae0ac..e9219f5 100644 (file)
@@ -20,6 +20,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+/* We need to access legacy defines from linux/media.h */
+#define __NEED_MEDIA_LEGACY_API
+
 #include <linux/compat.h>
 #include <linux/export.h>
 #include <linux/idr.h>
@@ -115,6 +118,26 @@ static long media_device_enum_entities(struct media_device *mdev,
        u_ent.group_id = 0;             /* Unused */
        u_ent.pads = ent->num_pads;
        u_ent.links = ent->num_links - ent->num_backlinks;
+
+       /*
+        * Workaround for a bug at media-ctl <= v1.10 that makes it to
+        * do the wrong thing if the entity function doesn't belong to
+        * either MEDIA_ENT_F_OLD_BASE or MEDIA_ENT_F_OLD_SUBDEV_BASE
+        * Ranges.
+        *
+        * Non-subdevices are expected to be at the MEDIA_ENT_F_OLD_BASE,
+        * or, otherwise, will be silently ignored by media-ctl when
+        * printing the graphviz diagram. So, map them into the devnode
+        * old range.
+        */
+       if (ent->function < MEDIA_ENT_F_OLD_BASE ||
+           ent->function > MEDIA_ENT_T_DEVNODE_UNKNOWN) {
+               if (is_media_entity_v4l2_subdev(ent))
+                       u_ent.type = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN;
+               else if (ent->function != MEDIA_ENT_F_IO_V4L)
+                       u_ent.type = MEDIA_ENT_T_DEVNODE_UNKNOWN;
+       }
+
        memcpy(&u_ent.raw, &ent->info, sizeof(ent->info));
        if (copy_to_user(uent, &u_ent, sizeof(u_ent)))
                return -EFAULT;
index 1d2c310..94f8162 100644 (file)
@@ -1211,6 +1211,8 @@ static int alsa_device_init(struct saa7134_dev *dev)
 
 static int alsa_device_exit(struct saa7134_dev *dev)
 {
+       if (!snd_saa7134_cards[dev->nr])
+               return 1;
 
        snd_card_free(snd_saa7134_cards[dev->nr]);
        snd_saa7134_cards[dev->nr] = NULL;
@@ -1260,7 +1262,8 @@ static void saa7134_alsa_exit(void)
        int idx;
 
        for (idx = 0; idx < SNDRV_CARDS; idx++) {
-               snd_card_free(snd_saa7134_cards[idx]);
+               if (snd_saa7134_cards[idx])
+                       snd_card_free(snd_saa7134_cards[idx]);
        }
 
        saa7134_dmasound_init = NULL;
index 5263594..8b89ebe 100644 (file)
@@ -215,6 +215,7 @@ config VIDEO_SAMSUNG_EXYNOS_GSC
 config VIDEO_STI_BDISP
        tristate "STMicroelectronics BDISP 2D blitter driver"
        depends on VIDEO_DEV && VIDEO_V4L2
+       depends on HAS_DMA
        depends on ARCH_STI || COMPILE_TEST
        select VIDEOBUF2_DMA_CONTIG
        select V4L2_MEM2MEM_DEV
index 7d28899..38aacc7 100644 (file)
@@ -1455,9 +1455,9 @@ static int coda_alloc_bitstream_buffer(struct coda_ctx *ctx,
                return 0;
 
        ctx->bitstream.size = roundup_pow_of_two(q_data->sizeimage * 2);
-       ctx->bitstream.vaddr = dma_alloc_writecombine(
-                       &ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                       &ctx->bitstream.paddr, GFP_KERNEL);
+       ctx->bitstream.vaddr = dma_alloc_wc(&ctx->dev->plat_dev->dev,
+                                           ctx->bitstream.size,
+                                           &ctx->bitstream.paddr, GFP_KERNEL);
        if (!ctx->bitstream.vaddr) {
                v4l2_err(&ctx->dev->v4l2_dev,
                         "failed to allocate bitstream ringbuffer");
@@ -1474,8 +1474,8 @@ static void coda_free_bitstream_buffer(struct coda_ctx *ctx)
        if (ctx->bitstream.vaddr == NULL)
                return;
 
-       dma_free_writecombine(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
-                             ctx->bitstream.vaddr, ctx->bitstream.paddr);
+       dma_free_wc(&ctx->dev->plat_dev->dev, ctx->bitstream.size,
+                   ctx->bitstream.vaddr, ctx->bitstream.paddr);
        ctx->bitstream.vaddr = NULL;
        kfifo_init(&ctx->bitstream_fifo, NULL, 0);
 }
index 40423c6..57d42c6 100644 (file)
@@ -1,6 +1,6 @@
 
 config VIDEO_SAMSUNG_EXYNOS4_IS
-       bool "Samsung S5P/EXYNOS4 SoC series Camera Subsystem driver"
+       tristate "Samsung S5P/EXYNOS4 SoC series Camera Subsystem driver"
        depends on VIDEO_V4L2 && VIDEO_V4L2_SUBDEV_API
        depends on ARCH_S5PV210 || ARCH_EXYNOS || COMPILE_TEST
        depends on OF && COMMON_CLK
index 49658ca..979c388 100644 (file)
@@ -631,6 +631,12 @@ static int fimc_is_hw_open_sensor(struct fimc_is *is,
 
        fimc_is_mem_barrier();
 
+       /*
+        * Some user space use cases hang up here without this
+        * empirically chosen delay.
+        */
+       udelay(100);
+
        mcuctl_write(HIC_OPEN_SENSOR, is, MCUCTL_REG_ISSR(0));
        mcuctl_write(is->sensor_index, is, MCUCTL_REG_ISSR(1));
        mcuctl_write(sensor->drvdata->id, is, MCUCTL_REG_ISSR(2));
index bf9261e..c081672 100644 (file)
@@ -218,8 +218,8 @@ static void isp_video_capture_buffer_queue(struct vb2_buffer *vb)
                                                        ivb->dma_addr[i];
 
                        isp_dbg(2, &video->ve.vdev,
-                               "dma_buf %pad (%d/%d/%d) addr: %pad\n",
-                               &buf_index, ivb->index, i, vb->index,
+                               "dma_buf %d (%d/%d/%d) addr: %pad\n",
+                               buf_index, ivb->index, i, vb->index,
                                &ivb->dma_addr[i]);
                }
 
index f3b2dd3..e79ddbb 100644 (file)
@@ -185,6 +185,37 @@ error:
        return ret;
 }
 
+/**
+ * __fimc_pipeline_enable - enable power of all pipeline subdevs
+ *                         and the sensor clock
+ * @ep: video pipeline structure
+ * @fmd: fimc media device
+ *
+ * Called with the graph mutex held.
+ */
+static int __fimc_pipeline_enable(struct exynos_media_pipeline *ep,
+                                 struct fimc_md *fmd)
+{
+       struct fimc_pipeline *p = to_fimc_pipeline(ep);
+       int ret;
+
+       /* Enable PXLASYNC clock if this pipeline includes FIMC-IS */
+       if (!IS_ERR(fmd->wbclk[CLK_IDX_WB_B]) && p->subdevs[IDX_IS_ISP]) {
+               ret = clk_prepare_enable(fmd->wbclk[CLK_IDX_WB_B]);
+               if (ret < 0)
+                       return ret;
+       }
+
+       ret = fimc_pipeline_s_power(p, 1);
+       if (!ret)
+               return 0;
+
+       if (!IS_ERR(fmd->wbclk[CLK_IDX_WB_B]) && p->subdevs[IDX_IS_ISP])
+               clk_disable_unprepare(fmd->wbclk[CLK_IDX_WB_B]);
+
+       return ret;
+}
+
 /**
  * __fimc_pipeline_open - update the pipeline information, enable power
  *                        of all pipeline subdevs and the sensor clock
@@ -199,7 +230,6 @@ static int __fimc_pipeline_open(struct exynos_media_pipeline *ep,
        struct fimc_md *fmd = entity_to_fimc_mdev(me);
        struct fimc_pipeline *p = to_fimc_pipeline(ep);
        struct v4l2_subdev *sd;
-       int ret;
 
        if (WARN_ON(p == NULL || me == NULL))
                return -EINVAL;
@@ -208,24 +238,16 @@ static int __fimc_pipeline_open(struct exynos_media_pipeline *ep,
                fimc_pipeline_prepare(p, me);
 
        sd = p->subdevs[IDX_SENSOR];
-       if (sd == NULL)
-               return -EINVAL;
-
-       /* Disable PXLASYNC clock if this pipeline includes FIMC-IS */
-       if (!IS_ERR(fmd->wbclk[CLK_IDX_WB_B]) && p->subdevs[IDX_IS_ISP]) {
-               ret = clk_prepare_enable(fmd->wbclk[CLK_IDX_WB_B]);
-               if (ret < 0)
-                       return ret;
-       }
-
-       ret = fimc_pipeline_s_power(p, 1);
-       if (!ret)
+       if (sd == NULL) {
+               pr_warn("%s(): No sensor subdev\n", __func__);
+               /*
+                * Pipeline open cannot fail so as to make it possible
+                * for the user space to configure the pipeline.
+                */
                return 0;
+       }
 
-       if (!IS_ERR(fmd->wbclk[CLK_IDX_WB_B]) && p->subdevs[IDX_IS_ISP])
-               clk_disable_unprepare(fmd->wbclk[CLK_IDX_WB_B]);
-
-       return ret;
+       return __fimc_pipeline_enable(ep, fmd);
 }
 
 /**
@@ -269,10 +291,43 @@ static int __fimc_pipeline_s_stream(struct exynos_media_pipeline *ep, bool on)
                { IDX_CSIS, IDX_FLITE, IDX_FIMC, IDX_SENSOR, IDX_IS_ISP },
        };
        struct fimc_pipeline *p = to_fimc_pipeline(ep);
+       struct fimc_md *fmd = entity_to_fimc_mdev(&p->subdevs[IDX_CSIS]->entity);
+       enum fimc_subdev_index sd_id;
        int i, ret = 0;
 
-       if (p->subdevs[IDX_SENSOR] == NULL)
-               return -ENODEV;
+       if (p->subdevs[IDX_SENSOR] == NULL) {
+               if (!fmd->user_subdev_api) {
+                       /*
+                        * Sensor must be already discovered if we
+                        * aren't in the user_subdev_api mode
+                        */
+                       return -ENODEV;
+               }
+
+               /* Get pipeline sink entity */
+               if (p->subdevs[IDX_FIMC])
+                       sd_id = IDX_FIMC;
+               else if (p->subdevs[IDX_IS_ISP])
+                       sd_id = IDX_IS_ISP;
+               else if (p->subdevs[IDX_FLITE])
+                       sd_id = IDX_FLITE;
+               else
+                       return -ENODEV;
+
+               /*
+                * Sensor could have been linked between open and STREAMON -
+                * check if this is the case.
+                */
+               fimc_pipeline_prepare(p, &p->subdevs[sd_id]->entity);
+
+               if (p->subdevs[IDX_SENSOR] == NULL)
+                       return -ENODEV;
+
+               ret = __fimc_pipeline_enable(ep, fmd);
+               if (ret < 0)
+                       return ret;
+
+       }
 
        for (i = 0; i < IDX_MAX; i++) {
                unsigned int idx = seq[on][i];
@@ -282,8 +337,10 @@ static int __fimc_pipeline_s_stream(struct exynos_media_pipeline *ep, bool on)
                if (ret < 0 && ret != -ENOIOCTLCMD && ret != -ENODEV)
                        goto error;
        }
+
        return 0;
 error:
+       fimc_pipeline_s_power(p, !on);
        for (; i >= 0; i--) {
                unsigned int idx = seq[on][i];
                v4l2_subdev_call(p->subdevs[idx], video, s_stream, !on);
index c398b28..1af779e 100644 (file)
@@ -795,7 +795,7 @@ static int isi_camera_get_formats(struct soc_camera_device *icd,
                        xlate->host_fmt = &isi_camera_formats[i];
                        xlate->code     = code.code;
                        dev_dbg(icd->parent, "Providing format %s using code %d\n",
-                               isi_camera_formats[0].name, code.code);
+                               xlate->host_fmt->name, xlate->code);
                }
                break;
        default:
index cc84c6d..46c7186 100644 (file)
@@ -1493,6 +1493,8 @@ static void soc_camera_async_unbind(struct v4l2_async_notifier *notifier,
                                        struct soc_camera_async_client, notifier);
        struct soc_camera_device *icd = platform_get_drvdata(sasc->pdev);
 
+       icd->control = NULL;
+
        if (icd->clk) {
                v4l2_clk_unregister(icd->clk);
                icd->clk = NULL;
index 42dff9d..533bc79 100644 (file)
@@ -256,7 +256,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1)
 
        /* Create links. */
        list_for_each_entry(entity, &vsp1->entities, list_dev) {
-               if (entity->type == VSP1_ENTITY_LIF) {
+               if (entity->type == VSP1_ENTITY_WPF) {
                        ret = vsp1_wpf_create_links(vsp1, entity);
                        if (ret < 0)
                                goto done;
@@ -264,7 +264,10 @@ static int vsp1_create_entities(struct vsp1_device *vsp1)
                        ret = vsp1_rpf_create_links(vsp1, entity);
                        if (ret < 0)
                                goto done;
-               } else {
+               }
+
+               if (entity->type != VSP1_ENTITY_LIF &&
+                   entity->type != VSP1_ENTITY_RPF) {
                        ret = vsp1_create_links(vsp1, entity);
                        if (ret < 0)
                                goto done;
index 637d0d6..b4dca57 100644 (file)
@@ -515,7 +515,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
        bool stopped;
 
        spin_lock_irqsave(&pipe->irqlock, flags);
-       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
+       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
        spin_unlock_irqrestore(&pipe->irqlock, flags);
 
        return stopped;
index 8c54fd2..a136257 100644 (file)
@@ -1843,8 +1843,7 @@ static void au0828_analog_create_entities(struct au0828_dev *dev)
                        ent->function = MEDIA_ENT_F_CONN_RF;
                        break;
                default: /* AU0828_VMUX_DEBUG */
-                       ent->function = MEDIA_ENT_F_CONN_TEST;
-                       break;
+                       continue;
                }
 
                ret = media_entity_pads_init(ent, 1, &dev->input_pad[i]);
index c5d49d7..ff8953a 100644 (file)
@@ -1063,8 +1063,11 @@ EXPORT_SYMBOL_GPL(vb2_discard_done);
  */
 static int __qbuf_mmap(struct vb2_buffer *vb, const void *pb)
 {
-       int ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
-                       vb, pb, vb->planes);
+       int ret = 0;
+
+       if (pb)
+               ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
+                                vb, pb, vb->planes);
        return ret ? ret : call_vb_qop(vb, buf_prepare, vb);
 }
 
@@ -1077,14 +1080,16 @@ static int __qbuf_userptr(struct vb2_buffer *vb, const void *pb)
        struct vb2_queue *q = vb->vb2_queue;
        void *mem_priv;
        unsigned int plane;
-       int ret;
+       int ret = 0;
        enum dma_data_direction dma_dir =
                q->is_output ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
        bool reacquired = vb->planes[0].mem_priv == NULL;
 
        memset(planes, 0, sizeof(planes[0]) * vb->num_planes);
        /* Copy relevant information provided by the userspace */
-       ret = call_bufop(vb->vb2_queue, fill_vb2_buffer, vb, pb, planes);
+       if (pb)
+               ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
+                                vb, pb, planes);
        if (ret)
                return ret;
 
@@ -1192,14 +1197,16 @@ static int __qbuf_dmabuf(struct vb2_buffer *vb, const void *pb)
        struct vb2_queue *q = vb->vb2_queue;
        void *mem_priv;
        unsigned int plane;
-       int ret;
+       int ret = 0;
        enum dma_data_direction dma_dir =
                q->is_output ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
        bool reacquired = vb->planes[0].mem_priv == NULL;
 
        memset(planes, 0, sizeof(planes[0]) * vb->num_planes);
        /* Copy relevant information provided by the userspace */
-       ret = call_bufop(vb->vb2_queue, fill_vb2_buffer, vb, pb, planes);
+       if (pb)
+               ret = call_bufop(vb->vb2_queue, fill_vb2_buffer,
+                                vb, pb, planes);
        if (ret)
                return ret;
 
@@ -1520,7 +1527,8 @@ int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb)
        q->waiting_for_buffers = false;
        vb->state = VB2_BUF_STATE_QUEUED;
 
-       call_void_bufop(q, copy_timestamp, vb, pb);
+       if (pb)
+               call_void_bufop(q, copy_timestamp, vb, pb);
 
        trace_vb2_qbuf(q, vb);
 
@@ -1532,7 +1540,8 @@ int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb)
                __enqueue_in_driver(vb);
 
        /* Fill buffer information for the userspace */
-       call_void_bufop(q, fill_user_buffer, vb, pb);
+       if (pb)
+               call_void_bufop(q, fill_user_buffer, vb, pb);
 
        /*
         * If streamon has been called, and we haven't yet called
@@ -1731,7 +1740,8 @@ static void __vb2_dqbuf(struct vb2_buffer *vb)
  * The return values from this function are intended to be directly returned
  * from vidioc_dqbuf handler in driver.
  */
-int vb2_core_dqbuf(struct vb2_queue *q, void *pb, bool nonblocking)
+int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb,
+                  bool nonblocking)
 {
        struct vb2_buffer *vb = NULL;
        int ret;
@@ -1754,8 +1764,12 @@ int vb2_core_dqbuf(struct vb2_queue *q, void *pb, bool nonblocking)
 
        call_void_vb_qop(vb, buf_finish, vb);
 
+       if (pindex)
+               *pindex = vb->index;
+
        /* Fill buffer information for the userspace */
-       call_void_bufop(q, fill_user_buffer, vb, pb);
+       if (pb)
+               call_void_bufop(q, fill_user_buffer, vb, pb);
 
        /* Remove from videobuf queue */
        list_del(&vb->queued_entry);
@@ -1828,7 +1842,7 @@ static void __vb2_queue_cancel(struct vb2_queue *q)
         * that's done in dqbuf, but that's not going to happen when we
         * cancel the whole queue. Note: this code belongs here, not in
         * __vb2_dqbuf() since in vb2_internal_dqbuf() there is a critical
-        * call to __fill_v4l2_buffer() after buf_finish(). That order can't
+        * call to __fill_user_buffer() after buf_finish(). That order can't
         * be changed, so we can't move the buf_finish() to __vb2_dqbuf().
         */
        for (i = 0; i < q->num_buffers; ++i) {
@@ -2357,7 +2371,6 @@ struct vb2_fileio_data {
        unsigned int count;
        unsigned int type;
        unsigned int memory;
-       struct vb2_buffer *b;
        struct vb2_fileio_buf bufs[VB2_MAX_FRAME];
        unsigned int cur_index;
        unsigned int initial_index;
@@ -2410,12 +2423,6 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read)
        if (fileio == NULL)
                return -ENOMEM;
 
-       fileio->b = kzalloc(q->buf_struct_size, GFP_KERNEL);
-       if (fileio->b == NULL) {
-               kfree(fileio);
-               return -ENOMEM;
-       }
-
        fileio->read_once = q->fileio_read_once;
        fileio->write_immediately = q->fileio_write_immediately;
 
@@ -2460,13 +2467,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read)
                 * Queue all buffers.
                 */
                for (i = 0; i < q->num_buffers; i++) {
-                       struct vb2_buffer *b = fileio->b;
-
-                       memset(b, 0, q->buf_struct_size);
-                       b->type = q->type;
-                       b->memory = q->memory;
-                       b->index = i;
-                       ret = vb2_core_qbuf(q, i, b);
+                       ret = vb2_core_qbuf(q, i, NULL);
                        if (ret)
                                goto err_reqbufs;
                        fileio->bufs[i].queued = 1;
@@ -2511,7 +2512,6 @@ static int __vb2_cleanup_fileio(struct vb2_queue *q)
                q->fileio = NULL;
                fileio->count = 0;
                vb2_core_reqbufs(q, fileio->memory, &fileio->count);
-               kfree(fileio->b);
                kfree(fileio);
                dprintk(3, "file io emulator closed\n");
        }
@@ -2539,7 +2539,8 @@ static size_t __vb2_perform_fileio(struct vb2_queue *q, char __user *data, size_
         * else is able to provide this information with the write() operation.
         */
        bool copy_timestamp = !read && q->copy_timestamp;
-       int ret, index;
+       unsigned index;
+       int ret;
 
        dprintk(3, "mode %s, offset %ld, count %zd, %sblocking\n",
                read ? "read" : "write", (long)*ppos, count,
@@ -2564,22 +2565,20 @@ static size_t __vb2_perform_fileio(struct vb2_queue *q, char __user *data, size_
         */
        index = fileio->cur_index;
        if (index >= q->num_buffers) {
-               struct vb2_buffer *b = fileio->b;
+               struct vb2_buffer *b;
 
                /*
                 * Call vb2_dqbuf to get buffer back.
                 */
-               memset(b, 0, q->buf_struct_size);
-               b->type = q->type;
-               b->memory = q->memory;
-               ret = vb2_core_dqbuf(q, b, nonblock);
+               ret = vb2_core_dqbuf(q, &index, NULL, nonblock);
                dprintk(5, "vb2_dqbuf result: %d\n", ret);
                if (ret)
                        return ret;
                fileio->dq_count += 1;
 
-               fileio->cur_index = index = b->index;
+               fileio->cur_index = index;
                buf = &fileio->bufs[index];
+               b = q->bufs[index];
 
                /*
                 * Get number of bytes filled by the driver
@@ -2630,7 +2629,7 @@ static size_t __vb2_perform_fileio(struct vb2_queue *q, char __user *data, size_
         * Queue next buffer if required.
         */
        if (buf->pos == buf->size || (!read && fileio->write_immediately)) {
-               struct vb2_buffer *b = fileio->b;
+               struct vb2_buffer *b = q->bufs[index];
 
                /*
                 * Check if this is the last buffer to read.
@@ -2643,15 +2642,11 @@ static size_t __vb2_perform_fileio(struct vb2_queue *q, char __user *data, size_
                /*
                 * Call vb2_qbuf and give buffer to the driver.
                 */
-               memset(b, 0, q->buf_struct_size);
-               b->type = q->type;
-               b->memory = q->memory;
-               b->index = index;
                b->planes[0].bytesused = buf->pos;
 
                if (copy_timestamp)
                        b->timestamp = ktime_get_ns();
-               ret = vb2_core_qbuf(q, index, b);
+               ret = vb2_core_qbuf(q, index, NULL);
                dprintk(5, "vb2_dbuf result: %d\n", ret);
                if (ret)
                        return ret;
@@ -2713,10 +2708,9 @@ static int vb2_thread(void *data)
 {
        struct vb2_queue *q = data;
        struct vb2_threadio_data *threadio = q->threadio;
-       struct vb2_fileio_data *fileio = q->fileio;
        bool copy_timestamp = false;
-       int prequeue = 0;
-       int index = 0;
+       unsigned prequeue = 0;
+       unsigned index = 0;
        int ret = 0;
 
        if (q->is_output) {
@@ -2728,37 +2722,34 @@ static int vb2_thread(void *data)
 
        for (;;) {
                struct vb2_buffer *vb;
-               struct vb2_buffer *b = fileio->b;
 
                /*
                 * Call vb2_dqbuf to get buffer back.
                 */
-               memset(b, 0, q->buf_struct_size);
-               b->type = q->type;
-               b->memory = q->memory;
                if (prequeue) {
-                       b->index = index++;
+                       vb = q->bufs[index++];
                        prequeue--;
                } else {
                        call_void_qop(q, wait_finish, q);
                        if (!threadio->stop)
-                               ret = vb2_core_dqbuf(q, b, 0);
+                               ret = vb2_core_dqbuf(q, &index, NULL, 0);
                        call_void_qop(q, wait_prepare, q);
                        dprintk(5, "file io: vb2_dqbuf result: %d\n", ret);
+                       if (!ret)
+                               vb = q->bufs[index];
                }
                if (ret || threadio->stop)
                        break;
                try_to_freeze();
 
-               vb = q->bufs[b->index];
-               if (b->state == VB2_BUF_STATE_DONE)
+               if (vb->state != VB2_BUF_STATE_ERROR)
                        if (threadio->fnc(vb, threadio->priv))
                                break;
                call_void_qop(q, wait_finish, q);
                if (copy_timestamp)
-                       b->timestamp = ktime_get_ns();;
+                       vb->timestamp = ktime_get_ns();;
                if (!threadio->stop)
-                       ret = vb2_core_qbuf(q, b->index, b);
+                       ret = vb2_core_qbuf(q, vb->index, NULL);
                call_void_qop(q, wait_prepare, q);
                if (ret || threadio->stop)
                        break;
index c9a2860..91f5521 100644 (file)
@@ -625,7 +625,7 @@ static int vb2_internal_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b,
                return -EINVAL;
        }
 
-       ret = vb2_core_dqbuf(q, b, nonblocking);
+       ret = vb2_core_dqbuf(q, NULL, b, nonblocking);
 
        return ret;
 }
index e6e4bac..12099b0 100644 (file)
@@ -2048,6 +2048,7 @@ int db8500_prcmu_config_hotmon(u8 low, u8 high)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(db8500_prcmu_config_hotmon);
 
 static int config_hot_period(u16 val)
 {
@@ -2074,11 +2075,13 @@ int db8500_prcmu_start_temp_sense(u16 cycles32k)
 
        return config_hot_period(cycles32k);
 }
+EXPORT_SYMBOL_GPL(db8500_prcmu_start_temp_sense);
 
 int db8500_prcmu_stop_temp_sense(void)
 {
        return config_hot_period(0xFFFF);
 }
+EXPORT_SYMBOL_GPL(db8500_prcmu_stop_temp_sense);
 
 static int prcmu_a9wdog(u8 cmd, u8 d0, u8 d1, u8 d2, u8 d3)
 {
index 4c1903f..0c6c17a 100644 (file)
@@ -415,7 +415,7 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
                delta = mftb() - psl_tb;
                if (delta < 0)
                        delta = -delta;
-       } while (cputime_to_usecs(delta) > 16);
+       } while (tb_to_ns(delta) > 16000);
 
        return 0;
 }
index 11fdadc..2a6eaf1 100644 (file)
@@ -103,6 +103,7 @@ enum ctype {
        CT_EXEC_USERSPACE,
        CT_ACCESS_USERSPACE,
        CT_WRITE_RO,
+       CT_WRITE_RO_AFTER_INIT,
        CT_WRITE_KERN,
 };
 
@@ -140,6 +141,7 @@ static char* cp_type[] = {
        "EXEC_USERSPACE",
        "ACCESS_USERSPACE",
        "WRITE_RO",
+       "WRITE_RO_AFTER_INIT",
        "WRITE_KERN",
 };
 
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
 static u8 data_area[EXEC_SIZE];
 
 static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
 
 module_param(recur_count, int, 0644);
 MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which)
                break;
        }
        case CT_WRITE_RO: {
-               unsigned long *ptr;
+               /* Explicitly cast away "const" for the test. */
+               unsigned long *ptr = (unsigned long *)&rodata;
 
-               ptr = (unsigned long *)&rodata;
+               pr_info("attempting bad rodata write at %p\n", ptr);
+               *ptr ^= 0xabcd1234;
 
-               pr_info("attempting bad write at %p\n", ptr);
+               break;
+       }
+       case CT_WRITE_RO_AFTER_INIT: {
+               unsigned long *ptr = &ro_after_init;
+
+               /*
+                * Verify we were written to during init. Since an Oops
+                * is considered a "success", a failure is to just skip the
+                * real test.
+                */
+               if ((*ptr & 0xAA) != 0xAA) {
+                       pr_info("%p was NOT written during init!?\n", ptr);
+                       break;
+               }
+
+               pr_info("attempting bad ro_after_init write at %p\n", ptr);
                *ptr ^= 0xabcd1234;
 
                break;
@@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void)
        int n_debugfs_entries = 1; /* Assume only the direct entry */
        int i;
 
+       /* Make sure we can write to __ro_after_init values during __init */
+       ro_after_init |= 0xAA;
+
        /* Register debugfs interface */
        lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
        if (!lkdtm_debugfs_root) {
index 677d036..80f9afc 100644 (file)
@@ -458,7 +458,11 @@ static int mei_ioctl_client_notify_request(struct file *file, u32 request)
 {
        struct mei_cl *cl = file->private_data;
 
-       return mei_cl_notify_request(cl, file, request);
+       if (request != MEI_HBM_NOTIFICATION_START &&
+           request != MEI_HBM_NOTIFICATION_STOP)
+               return -EINVAL;
+
+       return mei_cl_notify_request(cl, file, (u8)request);
 }
 
 /**
index 5914263..fe207e5 100644 (file)
 #include "queue.h"
 
 MODULE_ALIAS("mmc:block");
-
-#ifdef KERNEL
 #ifdef MODULE_PARAM_PREFIX
 #undef MODULE_PARAM_PREFIX
 #endif
 #define MODULE_PARAM_PREFIX "mmcblk."
-#endif
 
 #define INAND_CMD38_ARG_EXT_CSD  113
 #define INAND_CMD38_ARG_ERASE    0x00
@@ -655,8 +652,10 @@ static int mmc_blk_ioctl_multi_cmd(struct block_device *bdev,
        }
 
        md = mmc_blk_get(bdev->bd_disk);
-       if (!md)
+       if (!md) {
+               err = -EINVAL;
                goto cmd_err;
+       }
 
        card = md->queue.card;
        if (IS_ERR(card)) {
index 1c1b45e..3446097 100644 (file)
@@ -925,6 +925,10 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
 
                        dma_addr = dma_map_page(dma_dev, sg_page(sg), 0,
                                                PAGE_SIZE, dir);
+                       if (dma_mapping_error(dma_dev, dma_addr)) {
+                               data->error = -EFAULT;
+                               break;
+                       }
                        if (direction == DMA_TO_DEVICE)
                                t->tx_dma = dma_addr + sg->offset;
                        else
@@ -1393,10 +1397,12 @@ static int mmc_spi_probe(struct spi_device *spi)
                host->dma_dev = dev;
                host->ones_dma = dma_map_single(dev, ones,
                                MMC_SPI_BLOCKSIZE, DMA_TO_DEVICE);
+               if (dma_mapping_error(dev, host->ones_dma))
+                       goto fail_ones_dma;
                host->data_dma = dma_map_single(dev, host->data,
                                sizeof(*host->data), DMA_BIDIRECTIONAL);
-
-               /* REVISIT in theory those map operations can fail... */
+               if (dma_mapping_error(dev, host->data_dma))
+                       goto fail_data_dma;
 
                dma_sync_single_for_cpu(host->dma_dev,
                                host->data_dma, sizeof(*host->data),
@@ -1462,6 +1468,11 @@ fail_glue_init:
        if (host->dma_dev)
                dma_unmap_single(host->dma_dev, host->data_dma,
                                sizeof(*host->data), DMA_BIDIRECTIONAL);
+fail_data_dma:
+       if (host->dma_dev)
+               dma_unmap_single(host->dma_dev, host->ones_dma,
+                               MMC_SPI_BLOCKSIZE, DMA_TO_DEVICE);
+fail_ones_dma:
        kfree(host->data);
 
 fail_nobuf1:
index b6639ea..f6e4d97 100644 (file)
@@ -2232,6 +2232,7 @@ err_irq:
                dma_release_channel(host->tx_chan);
        if (host->rx_chan)
                dma_release_channel(host->rx_chan);
+       pm_runtime_dont_use_autosuspend(host->dev);
        pm_runtime_put_sync(host->dev);
        pm_runtime_disable(host->dev);
        if (host->dbclk)
@@ -2253,6 +2254,7 @@ static int omap_hsmmc_remove(struct platform_device *pdev)
        dma_release_channel(host->tx_chan);
        dma_release_channel(host->rx_chan);
 
+       pm_runtime_dont_use_autosuspend(host->dev);
        pm_runtime_put_sync(host->dev);
        pm_runtime_disable(host->dev);
        device_init_wakeup(&pdev->dev, false);
index ce08896..da82477 100644 (file)
@@ -86,7 +86,7 @@ struct pxamci_host {
 static inline void pxamci_init_ocr(struct pxamci_host *host)
 {
 #ifdef CONFIG_REGULATOR
-       host->vcc = regulator_get_optional(mmc_dev(host->mmc), "vmmc");
+       host->vcc = devm_regulator_get_optional(mmc_dev(host->mmc), "vmmc");
 
        if (IS_ERR(host->vcc))
                host->vcc = NULL;
@@ -654,12 +654,8 @@ static int pxamci_probe(struct platform_device *pdev)
 
        r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        irq = platform_get_irq(pdev, 0);
-       if (!r || irq < 0)
-               return -ENXIO;
-
-       r = request_mem_region(r->start, SZ_4K, DRIVER_NAME);
-       if (!r)
-               return -EBUSY;
+       if (irq < 0)
+               return irq;
 
        mmc = mmc_alloc_host(sizeof(struct pxamci_host), &pdev->dev);
        if (!mmc) {
@@ -695,7 +691,7 @@ static int pxamci_probe(struct platform_device *pdev)
        host->pdata = pdev->dev.platform_data;
        host->clkrt = CLKRT_OFF;
 
-       host->clk = clk_get(&pdev->dev, NULL);
+       host->clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(host->clk)) {
                ret = PTR_ERR(host->clk);
                host->clk = NULL;
@@ -727,9 +723,9 @@ static int pxamci_probe(struct platform_device *pdev)
        host->irq = irq;
        host->imask = MMC_I_MASK_ALL;
 
-       host->base = ioremap(r->start, SZ_4K);
-       if (!host->base) {
-               ret = -ENOMEM;
+       host->base = devm_ioremap_resource(&pdev->dev, r);
+       if (IS_ERR(host->base)) {
+               ret = PTR_ERR(host->base);
                goto out;
        }
 
@@ -742,7 +738,8 @@ static int pxamci_probe(struct platform_device *pdev)
        writel(64, host->base + MMC_RESTO);
        writel(host->imask, host->base + MMC_I_MASK);
 
-       ret = request_irq(host->irq, pxamci_irq, 0, DRIVER_NAME, host);
+       ret = devm_request_irq(&pdev->dev, host->irq, pxamci_irq, 0,
+                              DRIVER_NAME, host);
        if (ret)
                goto out;
 
@@ -804,7 +801,7 @@ static int pxamci_probe(struct platform_device *pdev)
                dev_err(&pdev->dev, "Failed requesting gpio_ro %d\n", gpio_ro);
                goto out;
        } else {
-               mmc->caps |= host->pdata->gpio_card_ro_invert ?
+               mmc->caps2 |= host->pdata->gpio_card_ro_invert ?
                        0 : MMC_CAP2_RO_ACTIVE_HIGH;
        }
 
@@ -833,14 +830,9 @@ out:
                        dma_release_channel(host->dma_chan_rx);
                if (host->dma_chan_tx)
                        dma_release_channel(host->dma_chan_tx);
-               if (host->base)
-                       iounmap(host->base);
-               if (host->clk)
-                       clk_put(host->clk);
        }
        if (mmc)
                mmc_free_host(mmc);
-       release_resource(r);
        return ret;
 }
 
@@ -859,9 +851,6 @@ static int pxamci_remove(struct platform_device *pdev)
                        gpio_ro = host->pdata->gpio_card_ro;
                        gpio_power = host->pdata->gpio_power;
                }
-               if (host->vcc)
-                       regulator_put(host->vcc);
-
                if (host->pdata && host->pdata->exit)
                        host->pdata->exit(&pdev->dev, mmc);
 
@@ -870,16 +859,10 @@ static int pxamci_remove(struct platform_device *pdev)
                       END_CMD_RES|PRG_DONE|DATA_TRAN_DONE,
                       host->base + MMC_I_MASK);
 
-               free_irq(host->irq, host);
                dmaengine_terminate_all(host->dma_chan_rx);
                dmaengine_terminate_all(host->dma_chan_tx);
                dma_release_channel(host->dma_chan_rx);
                dma_release_channel(host->dma_chan_tx);
-               iounmap(host->base);
-
-               clk_put(host->clk);
-
-               release_resource(host->res);
 
                mmc_free_host(mmc);
        }
index f6047fc..a5cda92 100644 (file)
@@ -146,6 +146,33 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
        .ops = &sdhci_acpi_ops_int,
 };
 
+static int bxt_get_cd(struct mmc_host *mmc)
+{
+       int gpio_cd = mmc_gpio_get_cd(mmc);
+       struct sdhci_host *host = mmc_priv(mmc);
+       unsigned long flags;
+       int ret = 0;
+
+       if (!gpio_cd)
+               return 0;
+
+       pm_runtime_get_sync(mmc->parent);
+
+       spin_lock_irqsave(&host->lock, flags);
+
+       if (host->flags & SDHCI_DEVICE_DEAD)
+               goto out;
+
+       ret = !!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT);
+out:
+       spin_unlock_irqrestore(&host->lock, flags);
+
+       pm_runtime_mark_last_busy(mmc->parent);
+       pm_runtime_put_autosuspend(mmc->parent);
+
+       return ret;
+}
+
 static int sdhci_acpi_emmc_probe_slot(struct platform_device *pdev,
                                      const char *hid, const char *uid)
 {
@@ -196,6 +223,9 @@ static int sdhci_acpi_sd_probe_slot(struct platform_device *pdev,
 
        /* Platform specific code during sd probe slot goes here */
 
+       if (hid && !strcmp(hid, "80865ACA"))
+               host->mmc_host_ops.get_cd = bxt_get_cd;
+
        return 0;
 }
 
index 7e7d8f0..9cb86fb 100644 (file)
@@ -217,6 +217,7 @@ static int sdhci_at91_probe(struct platform_device *pdev)
 pm_runtime_disable:
        pm_runtime_disable(&pdev->dev);
        pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_put_noidle(&pdev->dev);
 clocks_disable_unprepare:
        clk_disable_unprepare(priv->gck);
        clk_disable_unprepare(priv->mainck);
index cc851b0..df3b8ec 100644 (file)
@@ -330,6 +330,33 @@ static void spt_read_drive_strength(struct sdhci_host *host)
        sdhci_pci_spt_drive_strength = 0x10 | ((val >> 12) & 0xf);
 }
 
+static int bxt_get_cd(struct mmc_host *mmc)
+{
+       int gpio_cd = mmc_gpio_get_cd(mmc);
+       struct sdhci_host *host = mmc_priv(mmc);
+       unsigned long flags;
+       int ret = 0;
+
+       if (!gpio_cd)
+               return 0;
+
+       pm_runtime_get_sync(mmc->parent);
+
+       spin_lock_irqsave(&host->lock, flags);
+
+       if (host->flags & SDHCI_DEVICE_DEAD)
+               goto out;
+
+       ret = !!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT);
+out:
+       spin_unlock_irqrestore(&host->lock, flags);
+
+       pm_runtime_mark_last_busy(mmc->parent);
+       pm_runtime_put_autosuspend(mmc->parent);
+
+       return ret;
+}
+
 static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
@@ -362,6 +389,10 @@ static int byt_sd_probe_slot(struct sdhci_pci_slot *slot)
        slot->cd_con_id = NULL;
        slot->cd_idx = 0;
        slot->cd_override_level = true;
+       if (slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_BXT_SD ||
+           slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_APL_SD)
+               slot->host->mmc_host_ops.get_cd = bxt_get_cd;
+
        return 0;
 }
 
index d622435..add9fdf 100644 (file)
@@ -1360,7 +1360,7 @@ static void sdhci_request(struct mmc_host *mmc, struct mmc_request *mrq)
        sdhci_runtime_pm_get(host);
 
        /* Firstly check card presence */
-       present = sdhci_do_get_cd(host);
+       present = mmc->ops->get_cd(mmc);
 
        spin_lock_irqsave(&host->lock, flags);
 
@@ -2849,6 +2849,8 @@ struct sdhci_host *sdhci_alloc_host(struct device *dev,
 
        host = mmc_priv(mmc);
        host->mmc = mmc;
+       host->mmc_host_ops = sdhci_ops;
+       mmc->ops = &host->mmc_host_ops;
 
        return host;
 }
@@ -3037,7 +3039,6 @@ int sdhci_add_host(struct sdhci_host *host)
        /*
         * Set host parameters.
         */
-       mmc->ops = &sdhci_ops;
        max_clk = host->max_clk;
 
        if (host->ops->get_min_clock)
index 7654ae5..0115e99 100644 (file)
@@ -430,6 +430,7 @@ struct sdhci_host {
 
        /* Internal data */
        struct mmc_host *mmc;   /* MMC structure */
+       struct mmc_host_ops mmc_host_ops;       /* MMC host ops */
        u64 dma_mask;           /* custom DMA mask */
 
 #if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
index 1ca8a13..6234eab 100644 (file)
@@ -445,7 +445,7 @@ static void sh_mmcif_request_dma(struct sh_mmcif_host *host)
                                                        pdata->slave_id_rx);
        } else {
                host->chan_tx = dma_request_slave_channel(dev, "tx");
-               host->chan_tx = dma_request_slave_channel(dev, "rx");
+               host->chan_rx = dma_request_slave_channel(dev, "rx");
        }
        dev_dbg(dev, "%s: got channel TX %p RX %p\n", __func__, host->chan_tx,
                host->chan_rx);
index 7931615..88b6c81 100644 (file)
@@ -187,7 +187,7 @@ static int double_bit_error_detect(void *error_data, void *error_ecc,
        __nand_calculate_ecc(error_data, size, calc_ecc);
        ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size);
 
-       return (ret == -1) ? 0 : -EINVAL;
+       return (ret == -EBADMSG) ? 0 : -EINVAL;
 }
 
 static const struct nand_ecc_test nand_ecc_test[] = {
index 2a1b6e0..0134ba3 100644 (file)
@@ -193,7 +193,7 @@ int ubi_start_leb_change(struct ubi_device *ubi, struct ubi_volume *vol,
        vol->changing_leb = 1;
        vol->ch_lnum = req->lnum;
 
-       vol->upd_buf = vmalloc(req->bytes);
+       vol->upd_buf = vmalloc(ALIGN((int)req->bytes, ubi->min_io_size));
        if (!vol->upd_buf)
                return -ENOMEM;
 
index 56b5605..b7f1a99 100644 (file)
@@ -214,6 +214,8 @@ static void bond_uninit(struct net_device *bond_dev);
 static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
                                                struct rtnl_link_stats64 *stats);
 static void bond_slave_arr_handler(struct work_struct *work);
+static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
+                                 int mod);
 
 /*---------------------------- General routines -----------------------------*/
 
@@ -2127,6 +2129,7 @@ static void bond_miimon_commit(struct bonding *bond)
                        continue;
 
                case BOND_LINK_UP:
+                       bond_update_speed_duplex(slave);
                        bond_set_slave_link_state(slave, BOND_LINK_UP,
                                                  BOND_SLAVE_NOTIFY_NOW);
                        slave->last_link_up = jiffies;
@@ -2459,7 +2462,7 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
                 struct slave *slave)
 {
        struct arphdr *arp = (struct arphdr *)skb->data;
-       struct slave *curr_active_slave;
+       struct slave *curr_active_slave, *curr_arp_slave;
        unsigned char *arp_ptr;
        __be32 sip, tip;
        int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
@@ -2506,26 +2509,41 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
                     &sip, &tip);
 
        curr_active_slave = rcu_dereference(bond->curr_active_slave);
+       curr_arp_slave = rcu_dereference(bond->current_arp_slave);
 
-       /* Backup slaves won't see the ARP reply, but do come through
-        * here for each ARP probe (so we swap the sip/tip to validate
-        * the probe).  In a "redundant switch, common router" type of
-        * configuration, the ARP probe will (hopefully) travel from
-        * the active, through one switch, the router, then the other
-        * switch before reaching the backup.
+       /* We 'trust' the received ARP enough to validate it if:
+        *
+        * (a) the slave receiving the ARP is active (which includes the
+        * current ARP slave, if any), or
+        *
+        * (b) the receiving slave isn't active, but there is a currently
+        * active slave and it received valid arp reply(s) after it became
+        * the currently active slave, or
+        *
+        * (c) there is an ARP slave that sent an ARP during the prior ARP
+        * interval, and we receive an ARP reply on any slave.  We accept
+        * these because switch FDB update delays may deliver the ARP
+        * reply to a slave other than the sender of the ARP request.
         *
-        * We 'trust' the arp requests if there is an active slave and
-        * it received valid arp reply(s) after it became active. This
-        * is done to avoid endless looping when we can't reach the
+        * Note: for (b), backup slaves are receiving the broadcast ARP
+        * request, not a reply.  This request passes from the sending
+        * slave through the L2 switch(es) to the receiving slave.  Since
+        * this is checking the request, sip/tip are swapped for
+        * validation.
+        *
+        * This is done to avoid endless looping when we can't reach the
         * arp_ip_target and fool ourselves with our own arp requests.
         */
-
        if (bond_is_active_slave(slave))
                bond_validate_arp(bond, slave, sip, tip);
        else if (curr_active_slave &&
                 time_after(slave_last_rx(bond, curr_active_slave),
                            curr_active_slave->last_link_up))
                bond_validate_arp(bond, slave, tip, sip);
+       else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
+                bond_time_in_interval(bond,
+                                      dev_trans_start(curr_arp_slave->dev), 1))
+               bond_validate_arp(bond, slave, sip, tip);
 
 out_unlock:
        if (arp != (struct arphdr *)skb->data)
index 575790e..74a7dfe 100644 (file)
@@ -843,7 +843,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
                if (clear_intf)
                        mcp251x_write_bits(spi, CANINTF, clear_intf, 0x00);
 
-               if (eflag)
+               if (eflag & (EFLG_RX0OVR | EFLG_RX1OVR))
                        mcp251x_write_bits(spi, EFLG, eflag, 0x00);
 
                /* Update can state */
index fc5b756..eb7192f 100644 (file)
@@ -117,6 +117,9 @@ MODULE_LICENSE("GPL v2");
  */
 #define EMS_USB_ARM7_CLOCK 8000000
 
+#define CPC_TX_QUEUE_TRIGGER_LOW       25
+#define CPC_TX_QUEUE_TRIGGER_HIGH      35
+
 /*
  * CAN-Message representation in a CPC_MSG. Message object type is
  * CPC_MSG_TYPE_CAN_FRAME or CPC_MSG_TYPE_RTR_FRAME or
@@ -278,6 +281,11 @@ static void ems_usb_read_interrupt_callback(struct urb *urb)
        switch (urb->status) {
        case 0:
                dev->free_slots = dev->intr_in_buffer[1];
+               if(dev->free_slots > CPC_TX_QUEUE_TRIGGER_HIGH){
+                       if (netif_queue_stopped(netdev)){
+                               netif_wake_queue(netdev);
+                       }
+               }
                break;
 
        case -ECONNRESET: /* unlink */
@@ -526,8 +534,6 @@ static void ems_usb_write_bulk_callback(struct urb *urb)
        /* Release context */
        context->echo_index = MAX_TX_URBS;
 
-       if (netif_queue_stopped(netdev))
-               netif_wake_queue(netdev);
 }
 
 /*
@@ -587,7 +593,7 @@ static int ems_usb_start(struct ems_usb *dev)
        int err, i;
 
        dev->intr_in_buffer[0] = 0;
-       dev->free_slots = 15; /* initial size */
+       dev->free_slots = 50; /* initial size */
 
        for (i = 0; i < MAX_RX_URBS; i++) {
                struct urb *urb = NULL;
@@ -835,7 +841,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne
 
                /* Slow down tx path */
                if (atomic_read(&dev->active_tx_urbs) >= MAX_TX_URBS ||
-                   dev->free_slots < 5) {
+                   dev->free_slots < CPC_TX_QUEUE_TRIGGER_LOW) {
                        netif_stop_queue(netdev);
                }
        }
index 5eee62b..cbc99d5 100644 (file)
@@ -826,9 +826,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, struct usb_interface
 static void gs_destroy_candev(struct gs_can *dev)
 {
        unregister_candev(dev->netdev);
-       free_candev(dev->netdev);
        usb_kill_anchored_urbs(&dev->tx_submitted);
-       kfree(dev);
+       free_candev(dev->netdev);
 }
 
 static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *id)
@@ -913,12 +912,15 @@ static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *
        for (i = 0; i < icount; i++) {
                dev->canch[i] = gs_make_candev(i, intf);
                if (IS_ERR_OR_NULL(dev->canch[i])) {
+                       /* save error code to return later */
+                       rc = PTR_ERR(dev->canch[i]);
+
                        /* on failure destroy previously created candevs */
                        icount = i;
-                       for (i = 0; i < icount; i++) {
+                       for (i = 0; i < icount; i++)
                                gs_destroy_candev(dev->canch[i]);
-                               dev->canch[i] = NULL;
-                       }
+
+                       usb_kill_anchored_urbs(&dev->rx_submitted);
                        kfree(dev);
                        return rc;
                }
@@ -939,16 +941,12 @@ static void gs_usb_disconnect(struct usb_interface *intf)
                return;
        }
 
-       for (i = 0; i < GS_MAX_INTF; i++) {
-               struct gs_can *can = dev->canch[i];
-
-               if (!can)
-                       continue;
-
-               gs_destroy_candev(can);
-       }
+       for (i = 0; i < GS_MAX_INTF; i++)
+               if (dev->canch[i])
+                       gs_destroy_candev(dev->canch[i]);
 
        usb_kill_anchored_urbs(&dev->rx_submitted);
+       kfree(dev);
 }
 
 static const struct usb_device_id gs_usb_table[] = {
index cc6c545..a47f52f 100644 (file)
@@ -25,6 +25,7 @@
 static const struct mv88e6xxx_switch_id mv88e6352_table[] = {
        { PORT_SWITCH_ID_6172, "Marvell 88E6172" },
        { PORT_SWITCH_ID_6176, "Marvell 88E6176" },
+       { PORT_SWITCH_ID_6240, "Marvell 88E6240" },
        { PORT_SWITCH_ID_6320, "Marvell 88E6320" },
        { PORT_SWITCH_ID_6320_A1, "Marvell 88E6320 (A1)" },
        { PORT_SWITCH_ID_6320_A2, "Marvell 88e6320 (A2)" },
index 9fe33fc..512c8c0 100644 (file)
@@ -1532,7 +1532,7 @@ int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
 
        /* no PVID with ranges, otherwise it's a bug */
        if (pvid)
-               err = _mv88e6xxx_port_pvid_set(ds, port, vid);
+               err = _mv88e6xxx_port_pvid_set(ds, port, vlan->vid_end);
 unlock:
        mutex_unlock(&ps->smi_mutex);
 
@@ -1555,7 +1555,7 @@ static int _mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, u16 vid)
 
        if (vlan.vid != vid || !vlan.valid ||
            vlan.data[port] == GLOBAL_VTU_DATA_MEMBER_TAG_NON_MEMBER)
-               return -ENOENT;
+               return -EOPNOTSUPP;
 
        vlan.data[port] = GLOBAL_VTU_DATA_MEMBER_TAG_NON_MEMBER;
 
@@ -1582,6 +1582,7 @@ int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port,
                            const struct switchdev_obj_port_vlan *vlan)
 {
        struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+       const u16 defpvid = 4000 + ds->index * DSA_MAX_PORTS + port;
        u16 pvid, vid;
        int err = 0;
 
@@ -1597,7 +1598,8 @@ int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port,
                        goto unlock;
 
                if (vid == pvid) {
-                       err = _mv88e6xxx_port_pvid_set(ds, port, 0);
+                       /* restore reserved VLAN ID */
+                       err = _mv88e6xxx_port_pvid_set(ds, port, defpvid);
                        if (err)
                                goto unlock;
                }
@@ -1889,26 +1891,20 @@ unlock:
 
 int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members)
 {
-       struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-       const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
-       int err;
-
-       /* The port joined a bridge, so leave its reserved VLAN */
-       mutex_lock(&ps->smi_mutex);
-       err = _mv88e6xxx_port_vlan_del(ds, port, pvid);
-       if (!err)
-               err = _mv88e6xxx_port_pvid_set(ds, port, 0);
-       mutex_unlock(&ps->smi_mutex);
-       return err;
+       return 0;
 }
 
 int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members)
+{
+       return 0;
+}
+
+static int mv88e6xxx_setup_port_default_vlan(struct dsa_switch *ds, int port)
 {
        struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
        const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
        int err;
 
-       /* The port left the bridge, so join its reserved VLAN */
        mutex_lock(&ps->smi_mutex);
        err = _mv88e6xxx_port_vlan_add(ds, port, pvid, true);
        if (!err)
@@ -2163,7 +2159,8 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
         * database, and allow every port to egress frames on all other ports.
         */
        reg = BIT(ps->num_ports) - 1; /* all ports */
-       ret = _mv88e6xxx_port_vlan_map_set(ds, port, reg & ~port);
+       reg &= ~BIT(port); /* except itself */
+       ret = _mv88e6xxx_port_vlan_map_set(ds, port, reg);
        if (ret)
                goto abort;
 
@@ -2191,8 +2188,7 @@ int mv88e6xxx_setup_ports(struct dsa_switch *ds)
                if (dsa_is_cpu_port(ds, i) || dsa_is_dsa_port(ds, i))
                        continue;
 
-               /* setup the unbridged state */
-               ret = mv88e6xxx_port_bridge_leave(ds, i, 0);
+               ret = mv88e6xxx_setup_port_default_vlan(ds, i);
                if (ret < 0)
                        return ret;
        }
index 79e1a02..17b2126 100644 (file)
@@ -2461,7 +2461,7 @@ boomerang_interrupt(int irq, void *dev_id)
                                        int i;
                                        pci_unmap_single(VORTEX_PCI(vp),
                                                        le32_to_cpu(vp->tx_ring[entry].frag[0].addr),
-                                                       le32_to_cpu(vp->tx_ring[entry].frag[0].length),
+                                                       le32_to_cpu(vp->tx_ring[entry].frag[0].length)&0xFFF,
                                                        PCI_DMA_TODEVICE);
 
                                        for (i=1; i<=skb_shinfo(skb)->nr_frags; i++)
index 2777289..2f79d29 100644 (file)
@@ -1501,6 +1501,7 @@ static const struct pcmcia_device_id pcnet_ids[] = {
        PCMCIA_DEVICE_MANF_CARD(0x026f, 0x030a),
        PCMCIA_DEVICE_MANF_CARD(0x0274, 0x1103),
        PCMCIA_DEVICE_MANF_CARD(0x0274, 0x1121),
+       PCMCIA_DEVICE_MANF_CARD(0xc001, 0x0009),
        PCMCIA_DEVICE_PROD_ID12("2408LAN", "Ethernet", 0x352fff7f, 0x00b2e941),
        PCMCIA_DEVICE_PROD_ID1234("Socket", "CF 10/100 Ethernet Card", "Revision B", "05/11/06", 0xb38bcc2e, 0x4de88352, 0xeaca6c8d, 0x7e57c22e),
        PCMCIA_DEVICE_PROD_ID123("Cardwell", "PCMCIA", "ETHERNET", 0x9533672e, 0x281f1c5d, 0x3ff7175b),
index 3f3bcbe..0907ab6 100644 (file)
@@ -2380,7 +2380,7 @@ static int et131x_tx_dma_memory_alloc(struct et131x_adapter *adapter)
                                                    sizeof(u32),
                                                    &tx_ring->tx_status_pa,
                                                    GFP_KERNEL);
-       if (!tx_ring->tx_status_pa) {
+       if (!tx_ring->tx_status) {
                dev_err(&adapter->pdev->dev,
                        "Cannot alloc memory for Tx status block\n");
                return -ENOMEM;
index 1747285..f749e4d 100644 (file)
@@ -193,7 +193,6 @@ static void altera_tse_mdio_destroy(struct net_device *dev)
                            priv->mdio->id);
 
        mdiobus_unregister(priv->mdio);
-       kfree(priv->mdio->irq);
        mdiobus_free(priv->mdio);
        priv->mdio = NULL;
 }
index 87e727b..fcdf5dd 100644 (file)
@@ -50,8 +50,8 @@ static const char version[] =
 static void write_rreg(u_long base, u_int reg, u_int val)
 {
        asm volatile(
-       "str%?h %1, [%2]        @ NET_RAP\n\t"
-       "str%?h %0, [%2, #-4]   @ NET_RDP"
+       "strh   %1, [%2]        @ NET_RAP\n\t"
+       "strh   %0, [%2, #-4]   @ NET_RDP"
        :
        : "r" (val), "r" (reg), "r" (ISAIO_BASE + 0x0464));
 }
@@ -60,8 +60,8 @@ static inline unsigned short read_rreg(u_long base_addr, u_int reg)
 {
        unsigned short v;
        asm volatile(
-       "str%?h %1, [%2]        @ NET_RAP\n\t"
-       "ldr%?h %0, [%2, #-4]   @ NET_RDP"
+       "strh   %1, [%2]        @ NET_RAP\n\t"
+       "ldrh   %0, [%2, #-4]   @ NET_RDP"
        : "=r" (v)
        : "r" (reg), "r" (ISAIO_BASE + 0x0464));
        return v;
@@ -70,8 +70,8 @@ static inline unsigned short read_rreg(u_long base_addr, u_int reg)
 static inline void write_ireg(u_long base, u_int reg, u_int val)
 {
        asm volatile(
-       "str%?h %1, [%2]        @ NET_RAP\n\t"
-       "str%?h %0, [%2, #8]    @ NET_IDP"
+       "strh   %1, [%2]        @ NET_RAP\n\t"
+       "strh   %0, [%2, #8]    @ NET_IDP"
        :
        : "r" (val), "r" (reg), "r" (ISAIO_BASE + 0x0464));
 }
@@ -80,8 +80,8 @@ static inline unsigned short read_ireg(u_long base_addr, u_int reg)
 {
        u_short v;
        asm volatile(
-       "str%?h %1, [%2]        @ NAT_RAP\n\t"
-       "ldr%?h %0, [%2, #8]    @ NET_IDP\n\t"
+       "strh   %1, [%2]        @ NAT_RAP\n\t"
+       "ldrh   %0, [%2, #8]    @ NET_IDP\n\t"
        : "=r" (v)
        : "r" (reg), "r" (ISAIO_BASE + 0x0464));
        return v;
@@ -96,7 +96,7 @@ am_writebuffer(struct net_device *dev, u_int offset, unsigned char *buf, unsigne
        offset = ISAMEM_BASE + (offset << 1);
        length = (length + 1) & ~1;
        if ((int)buf & 2) {
-               asm volatile("str%?h    %2, [%0], #4"
+               asm volatile("strh      %2, [%0], #4"
                 : "=&r" (offset) : "0" (offset), "r" (buf[0] | (buf[1] << 8)));
                buf += 2;
                length -= 2;
@@ -104,20 +104,20 @@ am_writebuffer(struct net_device *dev, u_int offset, unsigned char *buf, unsigne
        while (length > 8) {
                register unsigned int tmp asm("r2"), tmp2 asm("r3");
                asm volatile(
-                       "ldm%?ia        %0!, {%1, %2}"
+                       "ldmia  %0!, {%1, %2}"
                        : "+r" (buf), "=&r" (tmp), "=&r" (tmp2));
                length -= 8;
                asm volatile(
-                       "str%?h %1, [%0], #4\n\t"
-                       "mov%?  %1, %1, lsr #16\n\t"
-                       "str%?h %1, [%0], #4\n\t"
-                       "str%?h %2, [%0], #4\n\t"
-                       "mov%?  %2, %2, lsr #16\n\t"
-                       "str%?h %2, [%0], #4"
+                       "strh   %1, [%0], #4\n\t"
+                       "mov    %1, %1, lsr #16\n\t"
+                       "strh   %1, [%0], #4\n\t"
+                       "strh   %2, [%0], #4\n\t"
+                       "mov    %2, %2, lsr #16\n\t"
+                       "strh   %2, [%0], #4"
                : "+r" (offset), "=&r" (tmp), "=&r" (tmp2));
        }
        while (length > 0) {
-               asm volatile("str%?h    %2, [%0], #4"
+               asm volatile("strh      %2, [%0], #4"
                 : "=&r" (offset) : "0" (offset), "r" (buf[0] | (buf[1] << 8)));
                buf += 2;
                length -= 2;
@@ -132,23 +132,23 @@ am_readbuffer(struct net_device *dev, u_int offset, unsigned char *buf, unsigned
        if ((int)buf & 2) {
                unsigned int tmp;
                asm volatile(
-                       "ldr%?h %2, [%0], #4\n\t"
-                       "str%?b %2, [%1], #1\n\t"
-                       "mov%?  %2, %2, lsr #8\n\t"
-                       "str%?b %2, [%1], #1"
+                       "ldrh   %2, [%0], #4\n\t"
+                       "strb   %2, [%1], #1\n\t"
+                       "mov    %2, %2, lsr #8\n\t"
+                       "strb   %2, [%1], #1"
                : "=&r" (offset), "=&r" (buf), "=r" (tmp): "0" (offset), "1" (buf));
                length -= 2;
        }
        while (length > 8) {
                register unsigned int tmp asm("r2"), tmp2 asm("r3"), tmp3;
                asm volatile(
-                       "ldr%?h %2, [%0], #4\n\t"
-                       "ldr%?h %4, [%0], #4\n\t"
-                       "ldr%?h %3, [%0], #4\n\t"
-                       "orr%?  %2, %2, %4, lsl #16\n\t"
-                       "ldr%?h %4, [%0], #4\n\t"
-                       "orr%?  %3, %3, %4, lsl #16\n\t"
-                       "stm%?ia        %1!, {%2, %3}"
+                       "ldrh   %2, [%0], #4\n\t"
+                       "ldrh   %4, [%0], #4\n\t"
+                       "ldrh   %3, [%0], #4\n\t"
+                       "orr    %2, %2, %4, lsl #16\n\t"
+                       "ldrh   %4, [%0], #4\n\t"
+                       "orr    %3, %3, %4, lsl #16\n\t"
+                       "stmia  %1!, {%2, %3}"
                : "=&r" (offset), "=&r" (buf), "=r" (tmp), "=r" (tmp2), "=r" (tmp3)
                : "0" (offset), "1" (buf));
                length -= 8;
@@ -156,10 +156,10 @@ am_readbuffer(struct net_device *dev, u_int offset, unsigned char *buf, unsigned
        while (length > 0) {
                unsigned int tmp;
                asm volatile(
-                       "ldr%?h %2, [%0], #4\n\t"
-                       "str%?b %2, [%1], #1\n\t"
-                       "mov%?  %2, %2, lsr #8\n\t"
-                       "str%?b %2, [%1], #1"
+                       "ldrh   %2, [%0], #4\n\t"
+                       "strb   %2, [%1], #1\n\t"
+                       "mov    %2, %2, lsr #8\n\t"
+                       "strb   %2, [%1], #1"
                : "=&r" (offset), "=&r" (buf), "=r" (tmp) : "0" (offset), "1" (buf));
                length -= 2;
        }
index 256f590..3a7ebfd 100644 (file)
@@ -547,8 +547,8 @@ static int __init lance_probe1(struct net_device *dev, int ioaddr, int irq, int
        /* Make certain the data structures used by the LANCE are aligned and DMAble. */
 
        lp = kzalloc(sizeof(*lp), GFP_DMA | GFP_KERNEL);
-       if(lp==NULL)
-               return -ENODEV;
+       if (!lp)
+               return -ENOMEM;
        if (lance_debug > 6) printk(" (#0x%05lx)", (unsigned long)lp);
        dev->ml_priv = lp;
        lp->name = chipname;
index a4799c1..5eb9b20 100644 (file)
@@ -628,6 +628,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
        int ret;
 
        ring = pdata->rx_ring;
+       irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
        ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
                               IRQF_SHARED, ring->irq_name, ring);
        if (ret)
@@ -635,6 +636,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
 
        if (pdata->cq_cnt) {
                ring = pdata->tx_ring->cp_ring;
+               irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
                ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
                                       IRQF_SHARED, ring->irq_name, ring);
                if (ret) {
@@ -649,15 +651,19 @@ static int xgene_enet_register_irq(struct net_device *ndev)
 static void xgene_enet_free_irq(struct net_device *ndev)
 {
        struct xgene_enet_pdata *pdata;
+       struct xgene_enet_desc_ring *ring;
        struct device *dev;
 
        pdata = netdev_priv(ndev);
        dev = ndev_to_dev(ndev);
-       devm_free_irq(dev, pdata->rx_ring->irq, pdata->rx_ring);
+       ring = pdata->rx_ring;
+       irq_clear_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
+       devm_free_irq(dev, ring->irq, ring);
 
        if (pdata->cq_cnt) {
-               devm_free_irq(dev, pdata->tx_ring->cp_ring->irq,
-                             pdata->tx_ring->cp_ring);
+               ring = pdata->tx_ring->cp_ring;
+               irq_clear_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
+               devm_free_irq(dev, ring->irq, ring);
        }
 }
 
index 70d5b62..248dfc4 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/acpi.h>
 #include <linux/clk.h>
 #include <linux/efi.h>
+#include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/of_platform.h>
 #include <linux/of_net.h>
index abe1eab..6446af1 100644 (file)
@@ -163,7 +163,7 @@ static void arc_emac_tx_clean(struct net_device *ndev)
                struct sk_buff *skb = tx_buff->skb;
                unsigned int info = le32_to_cpu(txbd->info);
 
-               if ((info & FOR_EMAC) || !txbd->data)
+               if ((info & FOR_EMAC) || !txbd->data || !skb)
                        break;
 
                if (unlikely(info & (DROP | DEFR | LTCL | UFLO))) {
@@ -191,6 +191,7 @@ static void arc_emac_tx_clean(struct net_device *ndev)
 
                txbd->data = 0;
                txbd->info = 0;
+               tx_buff->skb = NULL;
 
                *txbd_dirty = (*txbd_dirty + 1) % TX_BD_NUM;
        }
@@ -446,6 +447,9 @@ static int arc_emac_open(struct net_device *ndev)
                *last_rx_bd = (*last_rx_bd + 1) % RX_BD_NUM;
        }
 
+       priv->txbd_curr = 0;
+       priv->txbd_dirty = 0;
+
        /* Clean Tx BD's */
        memset(priv->txbd, 0, TX_RING_SZ);
 
@@ -513,6 +517,64 @@ static void arc_emac_set_rx_mode(struct net_device *ndev)
        }
 }
 
+/**
+ * arc_free_tx_queue - free skb from tx queue
+ * @ndev:      Pointer to the network device.
+ *
+ * This function must be called while EMAC disable
+ */
+static void arc_free_tx_queue(struct net_device *ndev)
+{
+       struct arc_emac_priv *priv = netdev_priv(ndev);
+       unsigned int i;
+
+       for (i = 0; i < TX_BD_NUM; i++) {
+               struct arc_emac_bd *txbd = &priv->txbd[i];
+               struct buffer_state *tx_buff = &priv->tx_buff[i];
+
+               if (tx_buff->skb) {
+                       dma_unmap_single(&ndev->dev, dma_unmap_addr(tx_buff, addr),
+                                        dma_unmap_len(tx_buff, len), DMA_TO_DEVICE);
+
+                       /* return the sk_buff to system */
+                       dev_kfree_skb_irq(tx_buff->skb);
+               }
+
+               txbd->info = 0;
+               txbd->data = 0;
+               tx_buff->skb = NULL;
+       }
+}
+
+/**
+ * arc_free_rx_queue - free skb from rx queue
+ * @ndev:      Pointer to the network device.
+ *
+ * This function must be called while EMAC disable
+ */
+static void arc_free_rx_queue(struct net_device *ndev)
+{
+       struct arc_emac_priv *priv = netdev_priv(ndev);
+       unsigned int i;
+
+       for (i = 0; i < RX_BD_NUM; i++) {
+               struct arc_emac_bd *rxbd = &priv->rxbd[i];
+               struct buffer_state *rx_buff = &priv->rx_buff[i];
+
+               if (rx_buff->skb) {
+                       dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr),
+                                       dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE);
+
+                       /* return the sk_buff to system */
+                       dev_kfree_skb_irq(rx_buff->skb);
+               }
+
+               rxbd->info = 0;
+               rxbd->data = 0;
+               rx_buff->skb = NULL;
+       }
+}
+
 /**
  * arc_emac_stop - Close the network device.
  * @ndev:      Pointer to the network device.
@@ -534,6 +596,10 @@ static int arc_emac_stop(struct net_device *ndev)
        /* Disable EMAC */
        arc_reg_clr(priv, R_CTRL, EN_MASK);
 
+       /* Return the sk_buff to system */
+       arc_free_tx_queue(ndev);
+       arc_free_rx_queue(ndev);
+
        return 0;
 }
 
@@ -610,7 +676,6 @@ static int arc_emac_tx(struct sk_buff *skb, struct net_device *ndev)
        dma_unmap_addr_set(&priv->tx_buff[*txbd_curr], addr, addr);
        dma_unmap_len_set(&priv->tx_buff[*txbd_curr], len, len);
 
-       priv->tx_buff[*txbd_curr].skb = skb;
        priv->txbd[*txbd_curr].data = cpu_to_le32(addr);
 
        /* Make sure pointer to data buffer is set */
@@ -620,6 +685,11 @@ static int arc_emac_tx(struct sk_buff *skb, struct net_device *ndev)
 
        *info = cpu_to_le32(FOR_EMAC | FIRST_OR_LAST_MASK | len);
 
+       /* Make sure info word is set */
+       wmb();
+
+       priv->tx_buff[*txbd_curr].skb = skb;
+
        /* Increment index to point to the next BD */
        *txbd_curr = (*txbd_curr + 1) % TX_BD_NUM;
 
index ecc4a33..08a23e6 100644 (file)
@@ -302,7 +302,7 @@ static int nb8800_poll(struct napi_struct *napi, int budget)
        nb8800_tx_done(dev);
 
 again:
-       while (work < budget) {
+       do {
                struct nb8800_rx_buf *rxb;
                unsigned int len;
 
@@ -330,7 +330,7 @@ again:
                rxd->report = 0;
                last = next;
                work++;
-       }
+       } while (work < budget);
 
        if (work) {
                priv->rx_descs[last].desc.config |= DESC_EOC;
@@ -1460,7 +1460,19 @@ static int nb8800_probe(struct platform_device *pdev)
                goto err_disable_clk;
        }
 
-       priv->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
+       if (of_phy_is_fixed_link(pdev->dev.of_node)) {
+               ret = of_phy_register_fixed_link(pdev->dev.of_node);
+               if (ret < 0) {
+                       dev_err(&pdev->dev, "bad fixed-link spec\n");
+                       goto err_free_bus;
+               }
+               priv->phy_node = of_node_get(pdev->dev.of_node);
+       }
+
+       if (!priv->phy_node)
+               priv->phy_node = of_parse_phandle(pdev->dev.of_node,
+                                                 "phy-handle", 0);
+
        if (!priv->phy_node) {
                dev_err(&pdev->dev, "no PHY specified\n");
                ret = -ENODEV;
index 8550df1..19f7cd0 100644 (file)
@@ -151,8 +151,11 @@ config BNX2X_VXLAN
 
 config BGMAC
        tristate "BCMA bus GBit core support"
-       depends on BCMA_HOST_SOC && HAS_DMA && (BCM47XX || ARCH_BCM_5301X)
+       depends on BCMA && BCMA_HOST_SOC
+       depends on HAS_DMA
+       depends on BCM47XX || ARCH_BCM_5301X || COMPILE_TEST
        select PHYLIB
+       select FIXED_PHY
        ---help---
          This driver supports GBit MAC and BCM4706 GBit MAC cores on BCMA bus.
          They can be found on BCM47xx SoCs and provide gigabit ethernet.
index 27aa080..91874d2 100644 (file)
@@ -4896,9 +4896,9 @@ struct c2s_pri_trans_table_entry {
  * cfc delete event data
  */
 struct cfc_del_event_data {
-       u32 cid;
-       u32 reserved0;
-       u32 reserved1;
+       __le32 cid;
+       __le32 reserved0;
+       __le32 reserved1;
 };
 
 
@@ -5114,15 +5114,9 @@ struct vf_pf_channel_zone_trigger {
  * zone that triggers the in-bound interrupt
  */
 struct trigger_vf_zone {
-#if defined(__BIG_ENDIAN)
-       u16 reserved1;
-       u8 reserved0;
-       struct vf_pf_channel_zone_trigger vf_pf_channel;
-#elif defined(__LITTLE_ENDIAN)
        struct vf_pf_channel_zone_trigger vf_pf_channel;
        u8 reserved0;
        u16 reserved1;
-#endif
        u32 reserved2;
 };
 
@@ -5207,9 +5201,9 @@ struct e2_integ_data {
  * set mac event data
  */
 struct eth_event_data {
-       u32 echo;
-       u32 reserved0;
-       u32 reserved1;
+       __le32 echo;
+       __le32 reserved0;
+       __le32 reserved1;
 };
 
 
@@ -5219,9 +5213,9 @@ struct eth_event_data {
 struct vf_pf_event_data {
        u8 vf_id;
        u8 reserved0;
-       u16 reserved1;
-       u32 msg_addr_lo;
-       u32 msg_addr_hi;
+       __le16 reserved1;
+       __le32 msg_addr_lo;
+       __le32 msg_addr_hi;
 };
 
 /*
@@ -5230,9 +5224,9 @@ struct vf_pf_event_data {
 struct vf_flr_event_data {
        u8 vf_id;
        u8 reserved0;
-       u16 reserved1;
-       u32 reserved2;
-       u32 reserved3;
+       __le16 reserved1;
+       __le32 reserved2;
+       __le32 reserved3;
 };
 
 /*
@@ -5241,9 +5235,9 @@ struct vf_flr_event_data {
 struct malicious_vf_event_data {
        u8 vf_id;
        u8 err_id;
-       u16 reserved1;
-       u32 reserved2;
-       u32 reserved3;
+       __le16 reserved1;
+       __le32 reserved2;
+       __le32 reserved3;
 };
 
 /*
index d946bba..1fb8010 100644 (file)
@@ -6185,26 +6185,80 @@ static int bnx2x_format_ver(u32 num, u8 *str, u16 *len)
                shift -= 4;
                digit = ((num & mask) >> shift);
                if (digit == 0 && remove_leading_zeros) {
-                       mask = mask >> 4;
-                       continue;
-               } else if (digit < 0xa)
-                       *str_ptr = digit + '0';
-               else
-                       *str_ptr = digit - 0xa + 'a';
-               remove_leading_zeros = 0;
-               str_ptr++;
-               (*len)--;
+                       *str_ptr = '0';
+               } else {
+                       if (digit < 0xa)
+                               *str_ptr = digit + '0';
+                       else
+                               *str_ptr = digit - 0xa + 'a';
+
+                       remove_leading_zeros = 0;
+                       str_ptr++;
+                       (*len)--;
+               }
                mask = mask >> 4;
                if (shift == 4*4) {
+                       if (remove_leading_zeros) {
+                               str_ptr++;
+                               (*len)--;
+                       }
                        *str_ptr = '.';
                        str_ptr++;
                        (*len)--;
                        remove_leading_zeros = 1;
                }
        }
+       if (remove_leading_zeros)
+               (*len)--;
        return 0;
 }
 
+static int bnx2x_3_seq_format_ver(u32 num, u8 *str, u16 *len)
+{
+       u8 *str_ptr = str;
+       u32 mask = 0x00f00000;
+       u8 shift = 8*3;
+       u8 digit;
+       u8 remove_leading_zeros = 1;
+
+       if (*len < 10) {
+               /* Need more than 10chars for this format */
+               *str_ptr = '\0';
+               (*len)--;
+               return -EINVAL;
+       }
+
+       while (shift > 0) {
+               shift -= 4;
+               digit = ((num & mask) >> shift);
+               if (digit == 0 && remove_leading_zeros) {
+                       *str_ptr = '0';
+               } else {
+                       if (digit < 0xa)
+                               *str_ptr = digit + '0';
+                       else
+                               *str_ptr = digit - 0xa + 'a';
+
+                       remove_leading_zeros = 0;
+                       str_ptr++;
+                       (*len)--;
+               }
+               mask = mask >> 4;
+               if ((shift == 4*4) || (shift == 4*2)) {
+                       if (remove_leading_zeros) {
+                               str_ptr++;
+                               (*len)--;
+                       }
+                       *str_ptr = '.';
+                       str_ptr++;
+                       (*len)--;
+                       remove_leading_zeros = 1;
+               }
+       }
+       if (remove_leading_zeros)
+               (*len)--;
+       return 0;
+}
 
 static int bnx2x_null_format_ver(u32 spirom_ver, u8 *str, u16 *len)
 {
@@ -9677,8 +9731,9 @@ static void bnx2x_save_848xx_spirom_version(struct bnx2x_phy *phy,
 
        if (bnx2x_is_8483x_8485x(phy)) {
                bnx2x_cl45_read(bp, phy, MDIO_CTL_DEVAD, 0x400f, &fw_ver1);
-               bnx2x_save_spirom_version(bp, port, fw_ver1 & 0xfff,
-                               phy->ver_addr);
+               if (phy->type != PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858)
+                       fw_ver1 &= 0xfff;
+               bnx2x_save_spirom_version(bp, port, fw_ver1, phy->ver_addr);
        } else {
                /* For 32-bit registers in 848xx, access via MDIO2ARM i/f. */
                /* (1) set reg 0xc200_0014(SPI_BRIDGE_CTRL_2) to 0x03000000 */
@@ -9732,16 +9787,32 @@ static void bnx2x_save_848xx_spirom_version(struct bnx2x_phy *phy,
 static void bnx2x_848xx_set_led(struct bnx2x *bp,
                                struct bnx2x_phy *phy)
 {
-       u16 val, offset, i;
+       u16 val, led3_blink_rate, offset, i;
        static struct bnx2x_reg_set reg_set[] = {
                {MDIO_PMA_DEVAD, MDIO_PMA_REG_8481_LED1_MASK, 0x0080},
                {MDIO_PMA_DEVAD, MDIO_PMA_REG_8481_LED2_MASK, 0x0018},
                {MDIO_PMA_DEVAD, MDIO_PMA_REG_8481_LED3_MASK, 0x0006},
-               {MDIO_PMA_DEVAD, MDIO_PMA_REG_8481_LED3_BLINK, 0x0000},
                {MDIO_PMA_DEVAD, MDIO_PMA_REG_84823_CTL_SLOW_CLK_CNT_HIGH,
                        MDIO_PMA_REG_84823_BLINK_RATE_VAL_15P9HZ},
                {MDIO_AN_DEVAD, 0xFFFB, 0xFFFD}
        };
+
+       if (phy->type == PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858) {
+               /* Set LED5 source */
+               bnx2x_cl45_write(bp, phy,
+                                MDIO_PMA_DEVAD,
+                                MDIO_PMA_REG_8481_LED5_MASK,
+                                0x90);
+               led3_blink_rate = 0x000f;
+       } else {
+               led3_blink_rate = 0x0000;
+       }
+       /* Set LED3 BLINK */
+       bnx2x_cl45_write(bp, phy,
+                        MDIO_PMA_DEVAD,
+                        MDIO_PMA_REG_8481_LED3_BLINK,
+                        led3_blink_rate);
+
        /* PHYC_CTL_LED_CTL */
        bnx2x_cl45_read(bp, phy,
                        MDIO_PMA_DEVAD,
@@ -9749,6 +9820,9 @@ static void bnx2x_848xx_set_led(struct bnx2x *bp,
        val &= 0xFE00;
        val |= 0x0092;
 
+       if (phy->type == PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858)
+               val |= 2 << 12; /* LED5 ON based on source */
+
        bnx2x_cl45_write(bp, phy,
                         MDIO_PMA_DEVAD,
                         MDIO_PMA_REG_8481_LINK_SIGNAL, val);
@@ -9762,10 +9836,17 @@ static void bnx2x_848xx_set_led(struct bnx2x *bp,
        else
                offset = MDIO_PMA_REG_84823_CTL_LED_CTL_1;
 
-       /* stretch_en for LED3*/
+       if (phy->type == PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858)
+               val = MDIO_PMA_REG_84858_ALLOW_GPHY_ACT |
+                     MDIO_PMA_REG_84823_LED3_STRETCH_EN;
+       else
+               val = MDIO_PMA_REG_84823_LED3_STRETCH_EN;
+
+       /* stretch_en for LEDs */
        bnx2x_cl45_read_or_write(bp, phy,
-                                MDIO_PMA_DEVAD, offset,
-                                MDIO_PMA_REG_84823_LED3_STRETCH_EN);
+                                MDIO_PMA_DEVAD,
+                                offset,
+                                val);
 }
 
 static void bnx2x_848xx_specific_func(struct bnx2x_phy *phy,
@@ -9775,7 +9856,7 @@ static void bnx2x_848xx_specific_func(struct bnx2x_phy *phy,
        struct bnx2x *bp = params->bp;
        switch (action) {
        case PHY_INIT:
-               if (!bnx2x_is_8483x_8485x(phy)) {
+               if (bnx2x_is_8483x_8485x(phy)) {
                        /* Save spirom version */
                        bnx2x_save_848xx_spirom_version(phy, bp, params->port);
                }
@@ -10036,15 +10117,20 @@ static int bnx2x_84858_cmd_hdlr(struct bnx2x_phy *phy,
 
 static int bnx2x_84833_cmd_hdlr(struct bnx2x_phy *phy,
                                struct link_params *params, u16 fw_cmd,
-                               u16 cmd_args[], int argc)
+                               u16 cmd_args[], int argc, int process)
 {
        int idx;
        u16 val;
        struct bnx2x *bp = params->bp;
-       /* Write CMD_OPEN_OVERRIDE to STATUS reg */
-       bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
-                       MDIO_848xx_CMD_HDLR_STATUS,
-                       PHY84833_STATUS_CMD_OPEN_OVERRIDE);
+       int rc = 0;
+
+       if (process == PHY84833_MB_PROCESS2) {
+               /* Write CMD_OPEN_OVERRIDE to STATUS reg */
+               bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
+                                MDIO_848xx_CMD_HDLR_STATUS,
+                                PHY84833_STATUS_CMD_OPEN_OVERRIDE);
+       }
+
        for (idx = 0; idx < PHY848xx_CMDHDLR_WAIT; idx++) {
                bnx2x_cl45_read(bp, phy, MDIO_CTL_DEVAD,
                                MDIO_848xx_CMD_HDLR_STATUS, &val);
@@ -10054,15 +10140,27 @@ static int bnx2x_84833_cmd_hdlr(struct bnx2x_phy *phy,
        }
        if (idx >= PHY848xx_CMDHDLR_WAIT) {
                DP(NETIF_MSG_LINK, "FW cmd: FW not ready.\n");
+               /* if the status is CMD_COMPLETE_PASS or CMD_COMPLETE_ERROR
+                * clear the status to CMD_CLEAR_COMPLETE
+                */
+               if (val == PHY84833_STATUS_CMD_COMPLETE_PASS ||
+                   val == PHY84833_STATUS_CMD_COMPLETE_ERROR) {
+                       bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
+                                        MDIO_848xx_CMD_HDLR_STATUS,
+                                        PHY84833_STATUS_CMD_CLEAR_COMPLETE);
+               }
                return -EINVAL;
        }
-
-       /* Prepare argument(s) and issue command */
-       for (idx = 0; idx < argc; idx++) {
-               bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
-                               MDIO_848xx_CMD_HDLR_DATA1 + idx,
-                               cmd_args[idx]);
+       if (process == PHY84833_MB_PROCESS1 ||
+           process == PHY84833_MB_PROCESS2) {
+               /* Prepare argument(s) */
+               for (idx = 0; idx < argc; idx++) {
+                       bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
+                                        MDIO_848xx_CMD_HDLR_DATA1 + idx,
+                                        cmd_args[idx]);
+               }
        }
+
        bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
                        MDIO_848xx_CMD_HDLR_COMMAND, fw_cmd);
        for (idx = 0; idx < PHY848xx_CMDHDLR_WAIT; idx++) {
@@ -10076,24 +10174,30 @@ static int bnx2x_84833_cmd_hdlr(struct bnx2x_phy *phy,
        if ((idx >= PHY848xx_CMDHDLR_WAIT) ||
            (val == PHY84833_STATUS_CMD_COMPLETE_ERROR)) {
                DP(NETIF_MSG_LINK, "FW cmd failed.\n");
-               return -EINVAL;
+               rc = -EINVAL;
        }
-       /* Gather returning data */
-       for (idx = 0; idx < argc; idx++) {
-               bnx2x_cl45_read(bp, phy, MDIO_CTL_DEVAD,
-                               MDIO_848xx_CMD_HDLR_DATA1 + idx,
-                               &cmd_args[idx]);
+       if (process == PHY84833_MB_PROCESS3 && rc == 0) {
+               /* Gather returning data */
+               for (idx = 0; idx < argc; idx++) {
+                       bnx2x_cl45_read(bp, phy, MDIO_CTL_DEVAD,
+                                       MDIO_848xx_CMD_HDLR_DATA1 + idx,
+                                       &cmd_args[idx]);
+               }
        }
-       bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
-                       MDIO_848xx_CMD_HDLR_STATUS,
-                       PHY84833_STATUS_CMD_CLEAR_COMPLETE);
-       return 0;
+       if (val == PHY84833_STATUS_CMD_COMPLETE_ERROR ||
+           val == PHY84833_STATUS_CMD_COMPLETE_PASS) {
+               bnx2x_cl45_write(bp, phy, MDIO_CTL_DEVAD,
+                                MDIO_848xx_CMD_HDLR_STATUS,
+                                PHY84833_STATUS_CMD_CLEAR_COMPLETE);
+       }
+       return rc;
 }
 
 static int bnx2x_848xx_cmd_hdlr(struct bnx2x_phy *phy,
                                struct link_params *params,
                                u16 fw_cmd,
-                               u16 cmd_args[], int argc)
+                                          u16 cmd_args[], int argc,
+                                          int process)
 {
        struct bnx2x *bp = params->bp;
 
@@ -10106,7 +10210,7 @@ static int bnx2x_848xx_cmd_hdlr(struct bnx2x_phy *phy,
                                            argc);
        } else {
                return bnx2x_84833_cmd_hdlr(phy, params, fw_cmd, cmd_args,
-                                           argc);
+                                           argc, process);
        }
 }
 
@@ -10133,7 +10237,7 @@ static int bnx2x_848xx_pair_swap_cfg(struct bnx2x_phy *phy,
 
        status = bnx2x_848xx_cmd_hdlr(phy, params,
                                      PHY848xx_CMD_SET_PAIR_SWAP, data,
-                                     PHY848xx_CMDHDLR_MAX_ARGS);
+                                     2, PHY84833_MB_PROCESS2);
        if (status == 0)
                DP(NETIF_MSG_LINK, "Pairswap OK, val=0x%x\n", data[1]);
 
@@ -10222,8 +10326,8 @@ static int bnx2x_8483x_disable_eee(struct bnx2x_phy *phy,
        DP(NETIF_MSG_LINK, "Don't Advertise 10GBase-T EEE\n");
 
        /* Prevent Phy from working in EEE and advertising it */
-       rc = bnx2x_848xx_cmd_hdlr(phy, params,
-                                 PHY848xx_CMD_SET_EEE_MODE, &cmd_args, 1);
+       rc = bnx2x_848xx_cmd_hdlr(phy, params, PHY848xx_CMD_SET_EEE_MODE,
+                                 &cmd_args, 1, PHY84833_MB_PROCESS1);
        if (rc) {
                DP(NETIF_MSG_LINK, "EEE disable failed.\n");
                return rc;
@@ -10240,8 +10344,8 @@ static int bnx2x_8483x_enable_eee(struct bnx2x_phy *phy,
        struct bnx2x *bp = params->bp;
        u16 cmd_args = 1;
 
-       rc = bnx2x_848xx_cmd_hdlr(phy, params,
-                                 PHY848xx_CMD_SET_EEE_MODE, &cmd_args, 1);
+       rc = bnx2x_848xx_cmd_hdlr(phy, params, PHY848xx_CMD_SET_EEE_MODE,
+                                 &cmd_args, 1, PHY84833_MB_PROCESS1);
        if (rc) {
                DP(NETIF_MSG_LINK, "EEE enable failed.\n");
                return rc;
@@ -10362,7 +10466,7 @@ static int bnx2x_848x3_config_init(struct bnx2x_phy *phy,
                cmd_args[3] = PHY84833_CONSTANT_LATENCY;
                rc = bnx2x_848xx_cmd_hdlr(phy, params,
                                          PHY848xx_CMD_SET_EEE_MODE, cmd_args,
-                                         PHY848xx_CMDHDLR_MAX_ARGS);
+                                         4, PHY84833_MB_PROCESS1);
                if (rc)
                        DP(NETIF_MSG_LINK, "Cfg AutogrEEEn failed.\n");
        }
@@ -10416,6 +10520,32 @@ static int bnx2x_848x3_config_init(struct bnx2x_phy *phy,
                vars->eee_status &= ~SHMEM_EEE_SUPPORTED_MASK;
        }
 
+       if (phy->type == PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84833) {
+               /* Additional settings for jumbo packets in 1000BASE-T mode */
+               /* Allow rx extended length */
+               bnx2x_cl45_read(bp, phy, MDIO_AN_DEVAD,
+                               MDIO_AN_REG_8481_AUX_CTRL, &val);
+               val |= 0x4000;
+               bnx2x_cl45_write(bp, phy, MDIO_AN_DEVAD,
+                                MDIO_AN_REG_8481_AUX_CTRL, val);
+               /* TX FIFO Elasticity LSB */
+               bnx2x_cl45_read(bp, phy, MDIO_AN_DEVAD,
+                               MDIO_AN_REG_8481_1G_100T_EXT_CTRL, &val);
+               val |= 0x1;
+               bnx2x_cl45_write(bp, phy, MDIO_AN_DEVAD,
+                                MDIO_AN_REG_8481_1G_100T_EXT_CTRL, val);
+               /* TX FIFO Elasticity MSB */
+               /* Enable expansion register 0x46 (Pattern Generator status) */
+               bnx2x_cl45_write(bp, phy, MDIO_AN_DEVAD,
+                                MDIO_AN_REG_8481_EXPANSION_REG_ACCESS, 0xf46);
+
+               bnx2x_cl45_read(bp, phy, MDIO_AN_DEVAD,
+                               MDIO_AN_REG_8481_EXPANSION_REG_RD_RW, &val);
+               val |= 0x4000;
+               bnx2x_cl45_write(bp, phy, MDIO_AN_DEVAD,
+                                MDIO_AN_REG_8481_EXPANSION_REG_RD_RW, val);
+       }
+
        if (bnx2x_is_8483x_8485x(phy)) {
                /* Bring PHY out of super isolate mode as the final step. */
                bnx2x_cl45_read_and_write(bp, phy,
@@ -10555,6 +10685,17 @@ static u8 bnx2x_848xx_read_status(struct bnx2x_phy *phy,
        return link_up;
 }
 
+static int bnx2x_8485x_format_ver(u32 raw_ver, u8 *str, u16 *len)
+{
+       int status = 0;
+       u32 num;
+
+       num = ((raw_ver & 0xF80) >> 7) << 16 | ((raw_ver & 0x7F) << 8) |
+             ((raw_ver & 0xF000) >> 12);
+       status = bnx2x_3_seq_format_ver(num, str, len);
+       return status;
+}
+
 static int bnx2x_848xx_format_ver(u32 raw_ver, u8 *str, u16 *len)
 {
        int status = 0;
@@ -10651,10 +10792,25 @@ static void bnx2x_848xx_set_link_led(struct bnx2x_phy *phy,
                                        0x0);
 
                } else {
+                       /* LED 1 OFF */
                        bnx2x_cl45_write(bp, phy,
                                         MDIO_PMA_DEVAD,
                                         MDIO_PMA_REG_8481_LED1_MASK,
                                         0x0);
+
+                       if (phy->type ==
+                               PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858) {
+                               /* LED 2 OFF */
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED2_MASK,
+                                                0x0);
+                               /* LED 3 OFF */
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED3_MASK,
+                                                0x0);
+                       }
                }
                break;
        case LED_MODE_FRONT_PANEL_OFF:
@@ -10713,6 +10869,19 @@ static void bnx2x_848xx_set_link_led(struct bnx2x_phy *phy,
                                                 MDIO_PMA_REG_8481_SIGNAL_MASK,
                                                 0x0);
                        }
+                       if (phy->type ==
+                               PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858) {
+                               /* LED 2 OFF */
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED2_MASK,
+                                                0x0);
+                               /* LED 3 OFF */
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED3_MASK,
+                                                0x0);
+                       }
                }
                break;
        case LED_MODE_ON:
@@ -10776,6 +10945,25 @@ static void bnx2x_848xx_set_link_led(struct bnx2x_phy *phy,
                                                params->port*4,
                                                NIG_MASK_MI_INT);
                                }
+                       }
+                       if (phy->type ==
+                           PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858) {
+                               /* Tell LED3 to constant on */
+                               bnx2x_cl45_read(bp, phy,
+                                               MDIO_PMA_DEVAD,
+                                               MDIO_PMA_REG_8481_LINK_SIGNAL,
+                                               &val);
+                               val &= ~(7<<6);
+                               val |= (2<<6);  /* A83B[8:6]= 2 */
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LINK_SIGNAL,
+                                                val);
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED3_MASK,
+                                                0x20);
+                       } else {
                                bnx2x_cl45_write(bp, phy,
                                                 MDIO_PMA_DEVAD,
                                                 MDIO_PMA_REG_8481_SIGNAL_MASK,
@@ -10853,6 +11041,17 @@ static void bnx2x_848xx_set_link_led(struct bnx2x_phy *phy,
                                         MDIO_PMA_DEVAD,
                                         MDIO_PMA_REG_8481_LINK_SIGNAL,
                                         val);
+                       if (phy->type ==
+                           PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84858) {
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED2_MASK,
+                                                0x18);
+                               bnx2x_cl45_write(bp, phy,
+                                                MDIO_PMA_DEVAD,
+                                                MDIO_PMA_REG_8481_LED3_MASK,
+                                                0x06);
+                       }
                        if (phy->type ==
                            PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84834) {
                                /* Restore LED4 source to external link,
@@ -11982,7 +12181,7 @@ static const struct bnx2x_phy phy_84858 = {
        .read_status    = (read_status_t)bnx2x_848xx_read_status,
        .link_reset     = (link_reset_t)bnx2x_848x3_link_reset,
        .config_loopback = (config_loopback_t)NULL,
-       .format_fw_ver  = (format_fw_ver_t)bnx2x_848xx_format_ver,
+       .format_fw_ver  = (format_fw_ver_t)bnx2x_8485x_format_ver,
        .hw_reset       = (hw_reset_t)bnx2x_84833_hw_reset_phy,
        .set_link_led   = (set_link_led_t)bnx2x_848xx_set_link_led,
        .phy_specific_func = (phy_specific_func_t)bnx2x_848xx_specific_func
@@ -13807,8 +14006,10 @@ void bnx2x_period_func(struct link_params *params, struct link_vars *vars)
        if (CHIP_IS_E3(bp)) {
                struct bnx2x_phy *phy = &params->phy[INT_PHY];
                bnx2x_set_aer_mmd(params, phy);
-               if ((phy->supported & SUPPORTED_20000baseKR2_Full) &&
-                   (phy->speed_cap_mask & PORT_HW_CFG_SPEED_CAPABILITY_D0_20G))
+               if (((phy->req_line_speed == SPEED_AUTO_NEG) &&
+                    (phy->speed_cap_mask &
+                     PORT_HW_CFG_SPEED_CAPABILITY_D0_20G)) ||
+                   (phy->req_line_speed == SPEED_20000))
                        bnx2x_check_kr2_wa(params, vars, phy);
                bnx2x_check_over_curr(params, vars);
                if (vars->rx_tx_asic_rst)
index 6c4e3a6..2bf9c87 100644 (file)
@@ -5280,14 +5280,14 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp,
 {
        unsigned long ramrod_flags = 0;
        int rc = 0;
-       u32 cid = elem->message.data.eth_event.echo & BNX2X_SWCID_MASK;
+       u32 echo = le32_to_cpu(elem->message.data.eth_event.echo);
+       u32 cid = echo & BNX2X_SWCID_MASK;
        struct bnx2x_vlan_mac_obj *vlan_mac_obj;
 
        /* Always push next commands out, don't wait here */
        __set_bit(RAMROD_CONT, &ramrod_flags);
 
-       switch (le32_to_cpu((__force __le32)elem->message.data.eth_event.echo)
-                           >> BNX2X_SWCID_SHIFT) {
+       switch (echo >> BNX2X_SWCID_SHIFT) {
        case BNX2X_FILTER_MAC_PENDING:
                DP(BNX2X_MSG_SP, "Got SETUP_MAC completions\n");
                if (CNIC_LOADED(bp) && (cid == BNX2X_ISCSI_ETH_CID(bp)))
@@ -5308,8 +5308,7 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp,
                bnx2x_handle_mcast_eqe(bp);
                return;
        default:
-               BNX2X_ERR("Unsupported classification command: %d\n",
-                         elem->message.data.eth_event.echo);
+               BNX2X_ERR("Unsupported classification command: 0x%x\n", echo);
                return;
        }
 
@@ -5478,9 +5477,6 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                        goto next_spqe;
                }
 
-               /* elem CID originates from FW; actually LE */
-               cid = SW_CID((__force __le32)
-                            elem->message.data.cfc_del_event.cid);
                opcode = elem->message.opcode;
 
                /* handle eq element */
@@ -5503,6 +5499,10 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                         * we may want to verify here that the bp state is
                         * HALTING
                         */
+
+                       /* elem CID originates from FW; actually LE */
+                       cid = SW_CID(elem->message.data.cfc_del_event.cid);
+
                        DP(BNX2X_MSG_SP,
                           "got delete ramrod for MULTI[%d]\n", cid);
 
@@ -5596,10 +5596,8 @@ static void bnx2x_eq_int(struct bnx2x *bp)
                      BNX2X_STATE_OPENING_WAIT4_PORT):
                case (EVENT_RING_OPCODE_RSS_UPDATE_RULES |
                      BNX2X_STATE_CLOSING_WAIT4_HALT):
-                       cid = elem->message.data.eth_event.echo &
-                               BNX2X_SWCID_MASK;
                        DP(BNX2X_MSG_SP, "got RSS_UPDATE ramrod. CID %d\n",
-                          cid);
+                          SW_CID(elem->message.data.eth_event.echo));
                        rss_raw->clear_pending(rss_raw);
                        break;
 
@@ -5684,7 +5682,7 @@ static void bnx2x_sp_task(struct work_struct *work)
                if (status & BNX2X_DEF_SB_IDX) {
                        struct bnx2x_fastpath *fp = bnx2x_fcoe_fp(bp);
 
-               if (FCOE_INIT(bp) &&
+                       if (FCOE_INIT(bp) &&
                            (bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) {
                                /* Prevent local bottom-halves from running as
                                 * we are going to change the local NAPI list.
index 4dead49..a43dea2 100644 (file)
@@ -7296,6 +7296,8 @@ Theotherbitsarereservedandshouldbezero*/
 #define MDIO_PMA_REG_84823_CTL_LED_CTL_1                       0xa8e3
 #define MDIO_PMA_REG_84833_CTL_LED_CTL_1                       0xa8ec
 #define MDIO_PMA_REG_84823_LED3_STRETCH_EN                     0x0080
+/* BCM84858 only */
+#define MDIO_PMA_REG_84858_ALLOW_GPHY_ACT                      0x8000
 
 /* BCM84833 only */
 #define MDIO_84833_TOP_CFG_FW_REV                      0x400f
@@ -7337,6 +7339,10 @@ Theotherbitsarereservedandshouldbezero*/
 #define PHY84833_STATUS_CMD_NOT_OPEN_FOR_CMDS          0x0040
 #define PHY84833_STATUS_CMD_CLEAR_COMPLETE             0x0080
 #define PHY84833_STATUS_CMD_OPEN_OVERRIDE              0xa5a5
+/* Mailbox Process */
+#define PHY84833_MB_PROCESS1                           1
+#define PHY84833_MB_PROCESS2                           2
+#define PHY84833_MB_PROCESS3                           3
 
 /* Mailbox status set used by 84858 only */
 #define PHY84858_STATUS_CMD_RECEIVED                   0x0001
index 9d02734..632daff 100644 (file)
@@ -1672,11 +1672,12 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp,
 {
        unsigned long ramrod_flags = 0;
        int rc = 0;
+       u32 echo = le32_to_cpu(elem->message.data.eth_event.echo);
 
        /* Always push next commands out, don't wait here */
        set_bit(RAMROD_CONT, &ramrod_flags);
 
-       switch (elem->message.data.eth_event.echo >> BNX2X_SWCID_SHIFT) {
+       switch (echo >> BNX2X_SWCID_SHIFT) {
        case BNX2X_FILTER_MAC_PENDING:
                rc = vfq->mac_obj.complete(bp, &vfq->mac_obj, elem,
                                           &ramrod_flags);
@@ -1686,8 +1687,7 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp,
                                            &ramrod_flags);
                break;
        default:
-               BNX2X_ERR("Unsupported classification command: %d\n",
-                         elem->message.data.eth_event.echo);
+               BNX2X_ERR("Unsupported classification command: 0x%x\n", echo);
                return;
        }
        if (rc < 0)
@@ -1747,16 +1747,14 @@ int bnx2x_iov_eq_sp_event(struct bnx2x *bp, union event_ring_elem *elem)
 
        switch (opcode) {
        case EVENT_RING_OPCODE_CFC_DEL:
-               cid = SW_CID((__force __le32)
-                            elem->message.data.cfc_del_event.cid);
+               cid = SW_CID(elem->message.data.cfc_del_event.cid);
                DP(BNX2X_MSG_IOV, "checking cfc-del comp cid=%d\n", cid);
                break;
        case EVENT_RING_OPCODE_CLASSIFICATION_RULES:
        case EVENT_RING_OPCODE_MULTICAST_RULES:
        case EVENT_RING_OPCODE_FILTERS_RULES:
        case EVENT_RING_OPCODE_RSS_UPDATE_RULES:
-               cid = (elem->message.data.eth_event.echo &
-                      BNX2X_SWCID_MASK);
+               cid = SW_CID(elem->message.data.eth_event.echo);
                DP(BNX2X_MSG_IOV, "checking filtering comp cid=%d\n", cid);
                break;
        case EVENT_RING_OPCODE_VF_FLR:
index 1374e53..bfae300 100644 (file)
@@ -2187,8 +2187,10 @@ void bnx2x_vf_mbx_schedule(struct bnx2x *bp,
 
        /* Update VFDB with current message and schedule its handling */
        mutex_lock(&BP_VFDB(bp)->event_mutex);
-       BP_VF_MBX(bp, vf_idx)->vf_addr_hi = vfpf_event->msg_addr_hi;
-       BP_VF_MBX(bp, vf_idx)->vf_addr_lo = vfpf_event->msg_addr_lo;
+       BP_VF_MBX(bp, vf_idx)->vf_addr_hi =
+               le32_to_cpu(vfpf_event->msg_addr_hi);
+       BP_VF_MBX(bp, vf_idx)->vf_addr_lo =
+               le32_to_cpu(vfpf_event->msg_addr_lo);
        BP_VFDB(bp)->event_occur |= (1ULL << vf_idx);
        mutex_unlock(&BP_VFDB(bp)->event_mutex);
 
index df835f5..82f1913 100644 (file)
@@ -69,7 +69,7 @@ MODULE_VERSION(DRV_MODULE_VERSION);
 #define BNXT_RX_DMA_OFFSET NET_SKB_PAD
 #define BNXT_RX_COPY_THRESH 256
 
-#define BNXT_TX_PUSH_THRESH 92
+#define BNXT_TX_PUSH_THRESH 164
 
 enum board_idx {
        BCM57301,
@@ -223,11 +223,12 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
        }
 
        if (free_size == bp->tx_ring_size && length <= bp->tx_push_thresh) {
-               struct tx_push_bd *push = txr->tx_push;
-               struct tx_bd *tx_push = &push->txbd1;
-               struct tx_bd_ext *tx_push1 = &push->txbd2;
-               void *pdata = tx_push1 + 1;
-               int j;
+               struct tx_push_buffer *tx_push_buf = txr->tx_push;
+               struct tx_push_bd *tx_push = &tx_push_buf->push_bd;
+               struct tx_bd_ext *tx_push1 = &tx_push->txbd2;
+               void *pdata = tx_push_buf->data;
+               u64 *end;
+               int j, push_len;
 
                /* Set COAL_NOW to be ready quickly for the next push */
                tx_push->tx_bd_len_flags_type =
@@ -247,6 +248,10 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
                tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
                tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action);
 
+               end = pdata + length;
+               end = PTR_ALIGN(end, 8) - 1;
+               *end = 0;
+
                skb_copy_from_linear_data(skb, pdata, len);
                pdata += len;
                for (j = 0; j < last_frag; j++) {
@@ -261,22 +266,29 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
                        pdata += skb_frag_size(frag);
                }
 
-               memcpy(txbd, tx_push, sizeof(*txbd));
+               txbd->tx_bd_len_flags_type = tx_push->tx_bd_len_flags_type;
+               txbd->tx_bd_haddr = txr->data_mapping;
                prod = NEXT_TX(prod);
                txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)];
                memcpy(txbd, tx_push1, sizeof(*txbd));
                prod = NEXT_TX(prod);
-               push->doorbell =
+               tx_push->doorbell =
                        cpu_to_le32(DB_KEY_TX_PUSH | DB_LONG_TX_PUSH | prod);
                txr->tx_prod = prod;
 
                netdev_tx_sent_queue(txq, skb->len);
 
-               __iowrite64_copy(txr->tx_doorbell, push,
-                                (length + sizeof(*push) + 8) / 8);
+               push_len = (length + sizeof(*tx_push) + 7) / 8;
+               if (push_len > 16) {
+                       __iowrite64_copy(txr->tx_doorbell, tx_push_buf, 16);
+                       __iowrite64_copy(txr->tx_doorbell + 4, tx_push_buf + 1,
+                                        push_len - 16);
+               } else {
+                       __iowrite64_copy(txr->tx_doorbell, tx_push_buf,
+                                        push_len);
+               }
 
                tx_buf->is_push = 1;
-
                goto tx_done;
        }
 
@@ -1490,10 +1502,11 @@ static void bnxt_free_tx_skbs(struct bnxt *bp)
 
                        last = tx_buf->nr_frags;
                        j += 2;
-                       for (k = 0; k < last; k++, j = NEXT_TX(j)) {
+                       for (k = 0; k < last; k++, j++) {
+                               int ring_idx = j & bp->tx_ring_mask;
                                skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
 
-                               tx_buf = &txr->tx_buf_ring[j];
+                               tx_buf = &txr->tx_buf_ring[ring_idx];
                                dma_unmap_page(
                                        &pdev->dev,
                                        dma_unmap_addr(tx_buf, mapping),
@@ -1752,7 +1765,7 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
                push_size  = L1_CACHE_ALIGN(sizeof(struct tx_push_bd) +
                                        bp->tx_push_thresh);
 
-               if (push_size > 128) {
+               if (push_size > 256) {
                        push_size = 0;
                        bp->tx_push_thresh = 0;
                }
@@ -1771,7 +1784,6 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
                        return rc;
 
                if (bp->tx_push_size) {
-                       struct tx_bd *txbd;
                        dma_addr_t mapping;
 
                        /* One pre-allocated DMA buffer to backup
@@ -1785,13 +1797,11 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
                        if (!txr->tx_push)
                                return -ENOMEM;
 
-                       txbd = &txr->tx_push->txbd1;
-
                        mapping = txr->tx_push_mapping +
                                sizeof(struct tx_push_bd);
-                       txbd->tx_bd_haddr = cpu_to_le64(mapping);
+                       txr->data_mapping = cpu_to_le64(mapping);
 
-                       memset(txbd + 1, 0, sizeof(struct tx_bd_ext));
+                       memset(txr->tx_push, 0, sizeof(struct tx_push_bd));
                }
                ring->queue_id = bp->q_info[j].queue_id;
                if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1))
@@ -3406,7 +3416,7 @@ static int hwrm_ring_free_send_msg(struct bnxt *bp,
        struct hwrm_ring_free_output *resp = bp->hwrm_cmd_resp_addr;
        u16 error_code;
 
-       bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_FREE, -1, -1);
+       bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_FREE, cmpl_ring_id, -1);
        req.ring_type = ring_type;
        req.ring_id = cpu_to_le16(ring->fw_ring_id);
 
@@ -4545,20 +4555,18 @@ static int bnxt_update_phy_setting(struct bnxt *bp)
        if (!(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) &&
            link_info->force_pause_setting != link_info->req_flow_ctrl)
                update_pause = true;
-       if (link_info->req_duplex != link_info->duplex_setting)
-               update_link = true;
        if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) {
                if (BNXT_AUTO_MODE(link_info->auto_mode))
                        update_link = true;
                if (link_info->req_link_speed != link_info->force_link_speed)
                        update_link = true;
+               if (link_info->req_duplex != link_info->duplex_setting)
+                       update_link = true;
        } else {
                if (link_info->auto_mode == BNXT_LINK_AUTO_NONE)
                        update_link = true;
                if (link_info->advertising != link_info->auto_link_speeds)
                        update_link = true;
-               if (link_info->req_link_speed != link_info->auto_link_speed)
-                       update_link = true;
        }
 
        if (update_link)
@@ -4635,7 +4643,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
        if (link_re_init) {
                rc = bnxt_update_phy_setting(bp);
                if (rc)
-                       goto open_err;
+                       netdev_warn(bp->dev, "failed to update phy settings\n");
        }
 
        if (irq_re_init) {
@@ -4653,6 +4661,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
        /* Enable TX queues */
        bnxt_tx_enable(bp);
        mod_timer(&bp->timer, jiffies + bp->current_interval);
+       bnxt_update_link(bp, true);
 
        return 0;
 
@@ -4819,8 +4828,6 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 
                stats->multicast += le64_to_cpu(hw_stats->rx_mcast_pkts);
 
-               stats->rx_dropped += le64_to_cpu(hw_stats->rx_drop_pkts);
-
                stats->tx_dropped += le64_to_cpu(hw_stats->tx_drop_pkts);
        }
 
@@ -5671,22 +5678,16 @@ static int bnxt_probe_phy(struct bnxt *bp)
        }
 
        /*initialize the ethool setting copy with NVM settings */
-       if (BNXT_AUTO_MODE(link_info->auto_mode))
-               link_info->autoneg |= BNXT_AUTONEG_SPEED;
-
-       if (link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH) {
-               if (link_info->auto_pause_setting == BNXT_LINK_PAUSE_BOTH)
-                       link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+       if (BNXT_AUTO_MODE(link_info->auto_mode)) {
+               link_info->autoneg = BNXT_AUTONEG_SPEED |
+                                    BNXT_AUTONEG_FLOW_CTRL;
+               link_info->advertising = link_info->auto_link_speeds;
                link_info->req_flow_ctrl = link_info->auto_pause_setting;
-       } else if (link_info->force_pause_setting & BNXT_LINK_PAUSE_BOTH) {
+       } else {
+               link_info->req_link_speed = link_info->force_link_speed;
+               link_info->req_duplex = link_info->duplex_setting;
                link_info->req_flow_ctrl = link_info->force_pause_setting;
        }
-       link_info->req_duplex = link_info->duplex_setting;
-       if (link_info->autoneg & BNXT_AUTONEG_SPEED)
-               link_info->req_link_speed = link_info->auto_link_speed;
-       else
-               link_info->req_link_speed = link_info->force_link_speed;
-       link_info->advertising = link_info->auto_link_speeds;
        snprintf(phy_ver, PHY_VER_STR_LEN, " ph %d.%d.%d",
                 link_info->phy_ver[0],
                 link_info->phy_ver[1],
index 8af3ca8..2be51b3 100644 (file)
@@ -411,8 +411,8 @@ struct rx_tpa_end_cmp_ext {
 
 #define BNXT_NUM_TESTS(bp)     0
 
-#define BNXT_DEFAULT_RX_RING_SIZE      1023
-#define BNXT_DEFAULT_TX_RING_SIZE      512
+#define BNXT_DEFAULT_RX_RING_SIZE      511
+#define BNXT_DEFAULT_TX_RING_SIZE      511
 
 #define MAX_TPA                64
 
@@ -523,10 +523,16 @@ struct bnxt_ring_struct {
 
 struct tx_push_bd {
        __le32                  doorbell;
-       struct tx_bd            txbd1;
+       __le32                  tx_bd_len_flags_type;
+       u32                     tx_bd_opaque;
        struct tx_bd_ext        txbd2;
 };
 
+struct tx_push_buffer {
+       struct tx_push_bd       push_bd;
+       u32                     data[25];
+};
+
 struct bnxt_tx_ring_info {
        struct bnxt_napi        *bnapi;
        u16                     tx_prod;
@@ -538,8 +544,9 @@ struct bnxt_tx_ring_info {
 
        dma_addr_t              tx_desc_mapping[MAX_TX_PAGES];
 
-       struct tx_push_bd       *tx_push;
+       struct tx_push_buffer   *tx_push;
        dma_addr_t              tx_push_mapping;
+       __le64                  data_mapping;
 
 #define BNXT_DEV_STATE_CLOSING 0x1
        u32                     dev_state;
index 922b898..3238817 100644 (file)
@@ -486,15 +486,8 @@ static u32 bnxt_fw_to_ethtool_support_spds(struct bnxt_link_info *link_info)
                speed_mask |= SUPPORTED_2500baseX_Full;
        if (fw_speeds & BNXT_LINK_SPEED_MSK_10GB)
                speed_mask |= SUPPORTED_10000baseT_Full;
-       /* TODO: support 25GB, 50GB with different cable type */
-       if (fw_speeds & BNXT_LINK_SPEED_MSK_20GB)
-               speed_mask |= SUPPORTED_20000baseMLD2_Full |
-                       SUPPORTED_20000baseKR2_Full;
        if (fw_speeds & BNXT_LINK_SPEED_MSK_40GB)
-               speed_mask |= SUPPORTED_40000baseKR4_Full |
-                       SUPPORTED_40000baseCR4_Full |
-                       SUPPORTED_40000baseSR4_Full |
-                       SUPPORTED_40000baseLR4_Full;
+               speed_mask |= SUPPORTED_40000baseCR4_Full;
 
        return speed_mask;
 }
@@ -514,15 +507,8 @@ static u32 bnxt_fw_to_ethtool_advertised_spds(struct bnxt_link_info *link_info)
                speed_mask |= ADVERTISED_2500baseX_Full;
        if (fw_speeds & BNXT_LINK_SPEED_MSK_10GB)
                speed_mask |= ADVERTISED_10000baseT_Full;
-       /* TODO: how to advertise 20, 25, 40, 50GB with different cable type ?*/
-       if (fw_speeds & BNXT_LINK_SPEED_MSK_20GB)
-               speed_mask |= ADVERTISED_20000baseMLD2_Full |
-                             ADVERTISED_20000baseKR2_Full;
        if (fw_speeds & BNXT_LINK_SPEED_MSK_40GB)
-               speed_mask |= ADVERTISED_40000baseKR4_Full |
-                             ADVERTISED_40000baseCR4_Full |
-                             ADVERTISED_40000baseSR4_Full |
-                             ADVERTISED_40000baseLR4_Full;
+               speed_mask |= ADVERTISED_40000baseCR4_Full;
        return speed_mask;
 }
 
@@ -557,11 +543,12 @@ static int bnxt_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
        u16 ethtool_speed;
 
        cmd->supported = bnxt_fw_to_ethtool_support_spds(link_info);
+       cmd->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
 
        if (link_info->auto_link_speeds)
                cmd->supported |= SUPPORTED_Autoneg;
 
-       if (BNXT_AUTO_MODE(link_info->auto_mode)) {
+       if (link_info->autoneg) {
                cmd->advertising =
                        bnxt_fw_to_ethtool_advertised_spds(link_info);
                cmd->advertising |= ADVERTISED_Autoneg;
@@ -570,28 +557,16 @@ static int bnxt_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
                cmd->autoneg = AUTONEG_DISABLE;
                cmd->advertising = 0;
        }
-       if (link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH) {
+       if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) {
                if ((link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH) ==
                    BNXT_LINK_PAUSE_BOTH) {
                        cmd->advertising |= ADVERTISED_Pause;
-                       cmd->supported |= SUPPORTED_Pause;
                } else {
                        cmd->advertising |= ADVERTISED_Asym_Pause;
-                       cmd->supported |= SUPPORTED_Asym_Pause;
                        if (link_info->auto_pause_setting &
                            BNXT_LINK_PAUSE_RX)
                                cmd->advertising |= ADVERTISED_Pause;
                }
-       } else if (link_info->force_pause_setting & BNXT_LINK_PAUSE_BOTH) {
-               if ((link_info->force_pause_setting & BNXT_LINK_PAUSE_BOTH) ==
-                   BNXT_LINK_PAUSE_BOTH) {
-                       cmd->supported |= SUPPORTED_Pause;
-               } else {
-                       cmd->supported |= SUPPORTED_Asym_Pause;
-                       if (link_info->force_pause_setting &
-                           BNXT_LINK_PAUSE_RX)
-                               cmd->supported |= SUPPORTED_Pause;
-               }
        }
 
        cmd->port = PORT_NONE;
@@ -670,6 +645,9 @@ static u16 bnxt_get_fw_auto_link_speeds(u32 advertising)
        if (advertising & ADVERTISED_10000baseT_Full)
                fw_speed_mask |= BNXT_LINK_SPEED_MSK_10GB;
 
+       if (advertising & ADVERTISED_40000baseCR4_Full)
+               fw_speed_mask |= BNXT_LINK_SPEED_MSK_40GB;
+
        return fw_speed_mask;
 }
 
@@ -729,7 +707,7 @@ static int bnxt_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
                speed = ethtool_cmd_speed(cmd);
                link_info->req_link_speed = bnxt_get_fw_speed(dev, speed);
                link_info->req_duplex = BNXT_LINK_DUPLEX_FULL;
-               link_info->autoneg &= ~BNXT_AUTONEG_SPEED;
+               link_info->autoneg = 0;
                link_info->advertising = 0;
        }
 
@@ -748,8 +726,7 @@ static void bnxt_get_pauseparam(struct net_device *dev,
 
        if (BNXT_VF(bp))
                return;
-       epause->autoneg = !!(link_info->auto_pause_setting &
-                            BNXT_LINK_PAUSE_BOTH);
+       epause->autoneg = !!(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL);
        epause->rx_pause = ((link_info->pause & BNXT_LINK_PAUSE_RX) != 0);
        epause->tx_pause = ((link_info->pause & BNXT_LINK_PAUSE_TX) != 0);
 }
@@ -765,6 +742,9 @@ static int bnxt_set_pauseparam(struct net_device *dev,
                return rc;
 
        if (epause->autoneg) {
+               if (!(link_info->autoneg & BNXT_AUTONEG_SPEED))
+                       return -EINVAL;
+
                link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
                link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_BOTH;
        } else {
index b15a60d..d7e01a7 100644 (file)
@@ -2445,8 +2445,7 @@ static void bcmgenet_irq_task(struct work_struct *work)
        }
 
        /* Link UP/DOWN event */
-       if ((priv->hw_params->flags & GENET_HAS_MDIO_INTR) &&
-           (priv->irq0_stat & UMAC_IRQ_LINK_EVENT)) {
+       if (priv->irq0_stat & UMAC_IRQ_LINK_EVENT) {
                phy_mac_interrupt(priv->phydev,
                                  !!(priv->irq0_stat & UMAC_IRQ_LINK_UP));
                priv->irq0_stat &= ~UMAC_IRQ_LINK_EVENT;
index 0d77596..457c3bc 100644 (file)
@@ -401,7 +401,7 @@ int bcmgenet_mii_probe(struct net_device *dev)
         * Ethernet MAC ISRs
         */
        if (priv->internal_phy)
-               priv->mii_bus->irq[phydev->mdio.addr] = PHY_IGNORE_INTERRUPT;
+               priv->phydev->irq = PHY_IGNORE_INTERRUPT;
 
        return 0;
 }
index 9293675..3010080 100644 (file)
@@ -7831,6 +7831,14 @@ static int tigon3_dma_hwbug_workaround(struct tg3_napi *tnapi,
        return ret;
 }
 
+static bool tg3_tso_bug_gso_check(struct tg3_napi *tnapi, struct sk_buff *skb)
+{
+       /* Check if we will never have enough descriptors,
+        * as gso_segs can be more than current ring size
+        */
+       return skb_shinfo(skb)->gso_segs < tnapi->tx_pending / 3;
+}
+
 static netdev_tx_t tg3_start_xmit(struct sk_buff *, struct net_device *);
 
 /* Use GSO to workaround all TSO packets that meet HW bug conditions
@@ -7934,14 +7942,19 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
                 * vlan encapsulated.
                 */
                if (skb->protocol == htons(ETH_P_8021Q) ||
-                   skb->protocol == htons(ETH_P_8021AD))
-                       return tg3_tso_bug(tp, tnapi, txq, skb);
+                   skb->protocol == htons(ETH_P_8021AD)) {
+                       if (tg3_tso_bug_gso_check(tnapi, skb))
+                               return tg3_tso_bug(tp, tnapi, txq, skb);
+                       goto drop;
+               }
 
                if (!skb_is_gso_v6(skb)) {
                        if (unlikely((ETH_HLEN + hdr_len) > 80) &&
-                           tg3_flag(tp, TSO_BUG))
-                               return tg3_tso_bug(tp, tnapi, txq, skb);
-
+                           tg3_flag(tp, TSO_BUG)) {
+                               if (tg3_tso_bug_gso_check(tnapi, skb))
+                                       return tg3_tso_bug(tp, tnapi, txq, skb);
+                               goto drop;
+                       }
                        ip_csum = iph->check;
                        ip_tot_len = iph->tot_len;
                        iph->check = 0;
@@ -8073,7 +8086,7 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
        if (would_hit_hwbug) {
                tg3_tx_skb_unmap(tnapi, tnapi->tx_prod, i);
 
-               if (mss) {
+               if (mss && tg3_tso_bug_gso_check(tnapi, skb)) {
                        /* If it's a TSO packet, do GSO instead of
                         * allocating and copying to a large linear SKB
                         */
@@ -12016,7 +12029,7 @@ static int tg3_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
        int ret;
        u32 offset, len, b_offset, odd_len;
        u8 *buf;
-       __be32 start, end;
+       __be32 start = 0, end;
 
        if (tg3_flag(tp, NO_NVRAM) ||
            eeprom->magic != TG3_EEPROM_MAGIC)
index 04b0d16..95bc470 100644 (file)
@@ -987,7 +987,7 @@ bna_rxf_ucast_cfg_apply(struct bna_rxf *rxf)
        if (!list_empty(&rxf->ucast_pending_add_q)) {
                mac = list_first_entry(&rxf->ucast_pending_add_q,
                                       struct bna_mac, qe);
-               list_add_tail(&mac->qe, &rxf->ucast_active_q);
+               list_move_tail(&mac->qe, &rxf->ucast_active_q);
                bna_bfi_ucast_req(rxf, mac, BFI_ENET_H2I_MAC_UCAST_ADD_REQ);
                return 1;
        }
index 9d9984a..50c9410 100644 (file)
@@ -2823,7 +2823,7 @@ static int macb_probe(struct platform_device *pdev)
        struct device_node *np = pdev->dev.of_node;
        struct device_node *phy_node;
        const struct macb_config *macb_config = NULL;
-       struct clk *pclk, *hclk, *tx_clk;
+       struct clk *pclk, *hclk = NULL, *tx_clk = NULL;
        unsigned int queue_mask, num_queues;
        struct macb_platform_data *pdata;
        bool native_io;
index b895044..34d269c 100644 (file)
@@ -1526,7 +1526,6 @@ static int liquidio_ptp_gettime(struct ptp_clock_info *ptp,
                                struct timespec64 *ts)
 {
        u64 ns;
-       u32 remainder;
        unsigned long flags;
        struct lio *lio = container_of(ptp, struct lio, ptp_info);
        struct octeon_device *oct = (struct octeon_device *)lio->oct_dev;
@@ -1536,8 +1535,7 @@ static int liquidio_ptp_gettime(struct ptp_clock_info *ptp,
        ns += lio->ptp_adjust;
        spin_unlock_irqrestore(&lio->ptp_lock, flags);
 
-       ts->tv_sec = div_u64_rem(ns, 1000000000ULL, &remainder);
-       ts->tv_nsec = remainder;
+       *ts = ns_to_timespec64(ns);
 
        return 0;
 }
@@ -1685,7 +1683,7 @@ static int octeon_setup_droq(struct octeon_device *oct, int q_no, int num_descs,
        dev_dbg(&oct->pci_dev->dev, "Creating Droq: %d\n", q_no);
        /* droq creation and local register settings. */
        ret_val = octeon_create_droq(oct, q_no, num_descs, desc_size, app_ctx);
-       if (ret_val == -1)
+       if (ret_val < 0)
                return ret_val;
 
        if (ret_val == 1) {
@@ -2526,7 +2524,7 @@ static void handle_timestamp(struct octeon_device *oct,
 
        octeon_swap_8B_data(&resp->timestamp, 1);
 
-       if (unlikely((skb_shinfo(skb)->tx_flags | SKBTX_IN_PROGRESS) != 0)) {
+       if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) != 0)) {
                struct skb_shared_hwtstamps ts;
                u64 ns = resp->timestamp;
 
index 4dba86e..174072b 100644 (file)
@@ -983,5 +983,5 @@ int octeon_create_droq(struct octeon_device *oct,
 
 create_droq_fail:
        octeon_delete_droq(oct, q_no);
-       return -1;
+       return -ENOMEM;
 }
index 6888288..34e9ace 100644 (file)
 #define NIC_PF_INTR_ID_MBOX0           8
 #define NIC_PF_INTR_ID_MBOX1           9
 
+/* Minimum FIFO level before all packets for the CQ are dropped
+ *
+ * This value ensures that once a packet has been "accepted"
+ * for reception it will not get dropped due to non-availability
+ * of CQ descriptor. An errata in HW mandates this value to be
+ * atleast 0x100.
+ */
+#define NICPF_CQM_MIN_DROP_LEVEL       0x100
+
 /* Global timer for CQ timer thresh interrupts
  * Calculated for SCLK of 700Mhz
  * value written should be a 1/16th of what is expected
index 4dded90..95f17f8 100644 (file)
@@ -304,6 +304,7 @@ static void nic_set_lmac_vf_mapping(struct nicpf *nic)
 static void nic_init_hw(struct nicpf *nic)
 {
        int i;
+       u64 cqm_cfg;
 
        /* Enable NIC HW block */
        nic_reg_write(nic, NIC_PF_CFG, 0x3);
@@ -340,6 +341,11 @@ static void nic_init_hw(struct nicpf *nic)
        /* Enable VLAN ethertype matching and stripping */
        nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7,
                      (2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETH_P_8021Q);
+
+       /* Check if HW expected value is higher (could be in future chips) */
+       cqm_cfg = nic_reg_read(nic, NIC_PF_CQM_CFG);
+       if (cqm_cfg < NICPF_CQM_MIN_DROP_LEVEL)
+               nic_reg_write(nic, NIC_PF_CQM_CFG, NICPF_CQM_MIN_DROP_LEVEL);
 }
 
 /* Channel parse index configuration */
index dd536be..afb10e3 100644 (file)
@@ -21,7 +21,7 @@
 #define   NIC_PF_TCP_TIMER                     (0x0060)
 #define   NIC_PF_BP_CFG                                (0x0080)
 #define   NIC_PF_RRM_CFG                       (0x0088)
-#define   NIC_PF_CQM_CF                                (0x00A0)
+#define   NIC_PF_CQM_CFG                       (0x00A0)
 #define   NIC_PF_CNM_CF                                (0x00A8)
 #define   NIC_PF_CNM_STATUS                    (0x00B0)
 #define   NIC_PF_CQ_AVG_CFG                    (0x00C0)
index c24cb2a..a009bc3 100644 (file)
@@ -574,8 +574,7 @@ static inline void nicvf_set_rxhash(struct net_device *netdev,
 
 static void nicvf_rcv_pkt_handler(struct net_device *netdev,
                                  struct napi_struct *napi,
-                                 struct cmp_queue *cq,
-                                 struct cqe_rx_t *cqe_rx, int cqe_type)
+                                 struct cqe_rx_t *cqe_rx)
 {
        struct sk_buff *skb;
        struct nicvf *nic = netdev_priv(netdev);
@@ -591,7 +590,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
        }
 
        /* Check for errors */
-       err = nicvf_check_cqe_rx_errs(nic, cq, cqe_rx);
+       err = nicvf_check_cqe_rx_errs(nic, cqe_rx);
        if (err && !cqe_rx->rb_cnt)
                return;
 
@@ -682,8 +681,7 @@ loop:
                           cq_idx, cq_desc->cqe_type);
                switch (cq_desc->cqe_type) {
                case CQE_TYPE_RX:
-                       nicvf_rcv_pkt_handler(netdev, napi, cq,
-                                             cq_desc, CQE_TYPE_RX);
+                       nicvf_rcv_pkt_handler(netdev, napi, cq_desc);
                        work_done++;
                break;
                case CQE_TYPE_SEND:
@@ -1125,7 +1123,6 @@ int nicvf_stop(struct net_device *netdev)
 
        /* Clear multiqset info */
        nic->pnicvf = nic;
-       nic->sqs_count = 0;
 
        return 0;
 }
@@ -1354,6 +1351,9 @@ void nicvf_update_stats(struct nicvf *nic)
        drv_stats->tx_frames_ok = stats->tx_ucast_frames_ok +
                                  stats->tx_bcast_frames_ok +
                                  stats->tx_mcast_frames_ok;
+       drv_stats->rx_frames_ok = stats->rx_ucast_frames +
+                                 stats->rx_bcast_frames +
+                                 stats->rx_mcast_frames;
        drv_stats->rx_drops = stats->rx_drop_red +
                              stats->rx_drop_overrun;
        drv_stats->tx_drops = stats->tx_drops;
@@ -1538,6 +1538,9 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        nicvf_send_vf_struct(nic);
 
+       if (!pass1_silicon(nic->pdev))
+               nic->hw_tso = true;
+
        /* Check if this VF is in QS only mode */
        if (nic->sqs_mode)
                return 0;
@@ -1557,9 +1560,6 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        netdev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 
-       if (!pass1_silicon(nic->pdev))
-               nic->hw_tso = true;
-
        netdev->netdev_ops = &nicvf_netdev_ops;
        netdev->watchdog_timeo = NICVF_TX_TIMEOUT;
 
index d0d1b54..767347b 100644 (file)
@@ -1329,16 +1329,12 @@ void nicvf_update_sq_stats(struct nicvf *nic, int sq_idx)
 }
 
 /* Check for errors in the receive cmp.queue entry */
-int nicvf_check_cqe_rx_errs(struct nicvf *nic,
-                           struct cmp_queue *cq, struct cqe_rx_t *cqe_rx)
+int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx)
 {
        struct nicvf_hw_stats *stats = &nic->hw_stats;
-       struct nicvf_drv_stats *drv_stats = &nic->drv_stats;
 
-       if (!cqe_rx->err_level && !cqe_rx->err_opcode) {
-               drv_stats->rx_frames_ok++;
+       if (!cqe_rx->err_level && !cqe_rx->err_opcode)
                return 0;
-       }
 
        if (netif_msg_rx_err(nic))
                netdev_err(nic->netdev,
index c5030a7..6673e11 100644 (file)
@@ -338,8 +338,7 @@ u64  nicvf_queue_reg_read(struct nicvf *nic,
 /* Stats */
 void nicvf_update_rq_stats(struct nicvf *nic, int rq_idx);
 void nicvf_update_sq_stats(struct nicvf *nic, int sq_idx);
-int nicvf_check_cqe_rx_errs(struct nicvf *nic,
-                           struct cmp_queue *cq, struct cqe_rx_t *cqe_rx);
+int nicvf_check_cqe_rx_errs(struct nicvf *nic, struct cqe_rx_t *cqe_rx);
 int nicvf_check_cqe_tx_errs(struct nicvf *nic,
                            struct cmp_queue *cq, struct cqe_send_t *cqe_tx);
 #endif /* NICVF_QUEUES_H */
index ee04caa..a89721f 100644 (file)
@@ -681,6 +681,24 @@ int t3_seeprom_wp(struct adapter *adapter, int enable)
        return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0);
 }
 
+static int vpdstrtouint(char *s, int len, unsigned int base, unsigned int *val)
+{
+       char tok[len + 1];
+
+       memcpy(tok, s, len);
+       tok[len] = 0;
+       return kstrtouint(strim(tok), base, val);
+}
+
+static int vpdstrtou16(char *s, int len, unsigned int base, u16 *val)
+{
+       char tok[len + 1];
+
+       memcpy(tok, s, len);
+       tok[len] = 0;
+       return kstrtou16(strim(tok), base, val);
+}
+
 /**
  *     get_vpd_params - read VPD parameters from VPD EEPROM
  *     @adapter: adapter to read
@@ -709,19 +727,19 @@ static int get_vpd_params(struct adapter *adapter, struct vpd_params *p)
                        return ret;
        }
 
-       ret = kstrtouint(vpd.cclk_data, 10, &p->cclk);
+       ret = vpdstrtouint(vpd.cclk_data, vpd.cclk_len, 10, &p->cclk);
        if (ret)
                return ret;
-       ret = kstrtouint(vpd.mclk_data, 10, &p->mclk);
+       ret = vpdstrtouint(vpd.mclk_data, vpd.mclk_len, 10, &p->mclk);
        if (ret)
                return ret;
-       ret = kstrtouint(vpd.uclk_data, 10, &p->uclk);
+       ret = vpdstrtouint(vpd.uclk_data, vpd.uclk_len, 10, &p->uclk);
        if (ret)
                return ret;
-       ret = kstrtouint(vpd.mdc_data, 10, &p->mdc);
+       ret = vpdstrtouint(vpd.mdc_data, vpd.mdc_len, 10, &p->mdc);
        if (ret)
                return ret;
-       ret = kstrtouint(vpd.mt_data, 10, &p->mem_timing);
+       ret = vpdstrtouint(vpd.mt_data, vpd.mt_len, 10, &p->mem_timing);
        if (ret)
                return ret;
        memcpy(p->sn, vpd.sn_data, SERNUM_LEN);
@@ -733,10 +751,12 @@ static int get_vpd_params(struct adapter *adapter, struct vpd_params *p)
        } else {
                p->port_type[0] = hex_to_bin(vpd.port0_data[0]);
                p->port_type[1] = hex_to_bin(vpd.port1_data[0]);
-               ret = kstrtou16(vpd.xaui0cfg_data, 16, &p->xauicfg[0]);
+               ret = vpdstrtou16(vpd.xaui0cfg_data, vpd.xaui0cfg_len, 16,
+                                 &p->xauicfg[0]);
                if (ret)
                        return ret;
-               ret = kstrtou16(vpd.xaui1cfg_data, 16, &p->xauicfg[1]);
+               ret = vpdstrtou16(vpd.xaui1cfg_data, vpd.xaui1cfg_len, 16,
+                                 &p->xauicfg[1]);
                if (ret)
                        return ret;
        }
index a8dda63..06bc2d2 100644 (file)
@@ -165,6 +165,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
        CH_PCI_ID_TABLE_FENTRY(0x5098), /* Custom 2x40G QSFP */
        CH_PCI_ID_TABLE_FENTRY(0x5099), /* Custom 2x40G QSFP */
        CH_PCI_ID_TABLE_FENTRY(0x509a), /* Custom T520-CR */
+       CH_PCI_ID_TABLE_FENTRY(0x509b), /* Custom T540-CR LOM */
 
        /* T6 adapters:
         */
index 1671fa3..7ba6d53 100644 (file)
@@ -33,7 +33,7 @@
 
 #define DRV_NAME               "enic"
 #define DRV_DESCRIPTION                "Cisco VIC Ethernet NIC Driver"
-#define DRV_VERSION            "2.3.0.12"
+#define DRV_VERSION            "2.3.0.20"
 #define DRV_COPYRIGHT          "Copyright 2008-2013 Cisco Systems, Inc"
 
 #define ENIC_BARS_MAX          6
index 1ffd105..1fdf5fe 100644 (file)
@@ -298,7 +298,8 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
                          int wait)
 {
        struct devcmd2_controller *dc2c = vdev->devcmd2;
-       struct devcmd2_result *result = dc2c->result + dc2c->next_result;
+       struct devcmd2_result *result;
+       u8 color;
        unsigned int i;
        int delay, err;
        u32 fetch_index, new_posted;
@@ -336,13 +337,17 @@ static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd,
        if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT)
                return 0;
 
+       result = dc2c->result + dc2c->next_result;
+       color = dc2c->color;
+
+       dc2c->next_result++;
+       if (dc2c->next_result == dc2c->result_size) {
+               dc2c->next_result = 0;
+               dc2c->color = dc2c->color ? 0 : 1;
+       }
+
        for (delay = 0; delay < wait; delay++) {
-               if (result->color == dc2c->color) {
-                       dc2c->next_result++;
-                       if (dc2c->next_result == dc2c->result_size) {
-                               dc2c->next_result = 0;
-                               dc2c->color = dc2c->color ? 0 : 1;
-                       }
+               if (result->color == color) {
                        if (result->error) {
                                err = result->error;
                                if (err != ERR_ECMDUNKNOWN ||
index cf94b72..48d9194 100644 (file)
@@ -128,7 +128,6 @@ struct board_info {
        struct resource *data_res;
        struct resource *addr_req;   /* resources requested */
        struct resource *data_req;
-       struct resource *irq_res;
 
        int              irq_wake;
 
@@ -1300,22 +1299,16 @@ static int
 dm9000_open(struct net_device *dev)
 {
        struct board_info *db = netdev_priv(dev);
-       unsigned long irqflags = db->irq_res->flags & IRQF_TRIGGER_MASK;
 
        if (netif_msg_ifup(db))
                dev_dbg(db->dev, "enabling %s\n", dev->name);
 
-       /* If there is no IRQ type specified, default to something that
-        * may work, and tell the user that this is a problem */
-
-       if (irqflags == IRQF_TRIGGER_NONE)
-               irqflags = irq_get_trigger_type(dev->irq);
-
-       if (irqflags == IRQF_TRIGGER_NONE)
+       /* If there is no IRQ type specified, tell the user that this is a
+        * problem
+        */
+       if (irq_get_trigger_type(dev->irq) == IRQF_TRIGGER_NONE)
                dev_warn(db->dev, "WARNING: no IRQ resource flags set.\n");
 
-       irqflags |= IRQF_SHARED;
-
        /* GPIO0 on pre-activate PHY, Reg 1F is not set by reset */
        iow(db, DM9000_GPR, 0); /* REG_1F bit0 activate phyxcer */
        mdelay(1); /* delay needs by DM9000B */
@@ -1323,7 +1316,8 @@ dm9000_open(struct net_device *dev)
        /* Initialize DM9000 board */
        dm9000_init_dm9000(dev);
 
-       if (request_irq(dev->irq, dm9000_interrupt, irqflags, dev->name, dev))
+       if (request_irq(dev->irq, dm9000_interrupt, IRQF_SHARED,
+                       dev->name, dev))
                return -EAGAIN;
        /* Now that we have an interrupt handler hooked up we can unmask
         * our interrupts
@@ -1500,15 +1494,22 @@ dm9000_probe(struct platform_device *pdev)
 
        db->addr_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        db->data_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-       db->irq_res  = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
 
-       if (db->addr_res == NULL || db->data_res == NULL ||
-           db->irq_res == NULL) {
-               dev_err(db->dev, "insufficient resources\n");
+       if (!db->addr_res || !db->data_res) {
+               dev_err(db->dev, "insufficient resources addr=%p data=%p\n",
+                       db->addr_res, db->data_res);
                ret = -ENOENT;
                goto out;
        }
 
+       ndev->irq = platform_get_irq(pdev, 0);
+       if (ndev->irq < 0) {
+               dev_err(db->dev, "interrupt resource unavailable: %d\n",
+                       ndev->irq);
+               ret = ndev->irq;
+               goto out;
+       }
+
        db->irq_wake = platform_get_irq(pdev, 1);
        if (db->irq_wake >= 0) {
                dev_dbg(db->dev, "wakeup irq %d\n", db->irq_wake);
@@ -1570,7 +1571,6 @@ dm9000_probe(struct platform_device *pdev)
 
        /* fill in parameters for net-dev structure */
        ndev->base_addr = (unsigned long)db->io_addr;
-       ndev->irq       = db->irq_res->start;
 
        /* ensure at least we have a default set of IO routines */
        dm9000_set_io(db, iosize);
index cf83783..f975129 100644 (file)
@@ -531,6 +531,7 @@ struct be_adapter {
 
        struct delayed_work be_err_detection_work;
        u8 err_flags;
+       bool pcicfg_mapped;     /* pcicfg obtained via pci_iomap() */
        u32 flags;
        u32 cmd_privileges;
        /* Ethtool knobs and info */
index 241819b..6d9a8d7 100644 (file)
@@ -622,10 +622,13 @@ enum be_if_flags {
                                         BE_IF_FLAGS_VLAN_PROMISCUOUS |\
                                         BE_IF_FLAGS_MCAST_PROMISCUOUS)
 
-#define BE_IF_EN_FLAGS (BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |\
-                       BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_UNTAGGED)
+#define BE_IF_FILT_FLAGS_BASIC (BE_IF_FLAGS_BROADCAST | \
+                               BE_IF_FLAGS_PASS_L3L4_ERRORS | \
+                               BE_IF_FLAGS_UNTAGGED)
 
-#define BE_IF_ALL_FILT_FLAGS   (BE_IF_EN_FLAGS | BE_IF_FLAGS_ALL_PROMISCUOUS)
+#define BE_IF_ALL_FILT_FLAGS   (BE_IF_FILT_FLAGS_BASIC | \
+                                BE_IF_FLAGS_MULTICAST | \
+                                BE_IF_FLAGS_ALL_PROMISCUOUS)
 
 /* An RX interface is an object with one or more MAC addresses and
  * filtering capabilities. */
index f99de36..d1cf127 100644 (file)
@@ -125,6 +125,11 @@ static const char * const ue_status_hi_desc[] = {
        "Unknown"
 };
 
+#define BE_VF_IF_EN_FLAGS      (BE_IF_FLAGS_UNTAGGED | \
+                                BE_IF_FLAGS_BROADCAST | \
+                                BE_IF_FLAGS_MULTICAST | \
+                                BE_IF_FLAGS_PASS_L3L4_ERRORS)
+
 static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q)
 {
        struct be_dma_mem *mem = &q->dma_mem;
@@ -3537,7 +3542,7 @@ static int be_enable_if_filters(struct be_adapter *adapter)
 {
        int status;
 
-       status = be_cmd_rx_filter(adapter, BE_IF_EN_FLAGS, ON);
+       status = be_cmd_rx_filter(adapter, BE_IF_FILT_FLAGS_BASIC, ON);
        if (status)
                return status;
 
@@ -3857,8 +3862,7 @@ static int be_vfs_if_create(struct be_adapter *adapter)
        int status;
 
        /* If a FW profile exists, then cap_flags are updated */
-       cap_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
-                   BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS;
+       cap_flags = BE_VF_IF_EN_FLAGS;
 
        for_all_vfs(adapter, vf_cfg, vf) {
                if (!BE3_chip(adapter)) {
@@ -3874,10 +3878,8 @@ static int be_vfs_if_create(struct be_adapter *adapter)
                        }
                }
 
-               en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED |
-                                       BE_IF_FLAGS_BROADCAST |
-                                       BE_IF_FLAGS_MULTICAST |
-                                       BE_IF_FLAGS_PASS_L3L4_ERRORS);
+               /* PF should enable IF flags during proxy if_create call */
+               en_flags = cap_flags & BE_VF_IF_EN_FLAGS;
                status = be_cmd_if_create(adapter, cap_flags, en_flags,
                                          &vf_cfg->if_handle, vf + 1);
                if (status)
@@ -4968,6 +4970,8 @@ static void be_unmap_pci_bars(struct be_adapter *adapter)
                pci_iounmap(adapter->pdev, adapter->csr);
        if (adapter->db)
                pci_iounmap(adapter->pdev, adapter->db);
+       if (adapter->pcicfg && adapter->pcicfg_mapped)
+               pci_iounmap(adapter->pdev, adapter->pcicfg);
 }
 
 static int db_bar(struct be_adapter *adapter)
@@ -5019,8 +5023,10 @@ static int be_map_pci_bars(struct be_adapter *adapter)
                        if (!addr)
                                goto pci_map_err;
                        adapter->pcicfg = addr;
+                       adapter->pcicfg_mapped = true;
                } else {
                        adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET;
+                       adapter->pcicfg_mapped = false;
                }
        }
 
index 62fa136..41b0106 100644 (file)
@@ -1265,7 +1265,6 @@ static int ethoc_remove(struct platform_device *pdev)
 
                if (priv->mdio) {
                        mdiobus_unregister(priv->mdio);
-                       kfree(priv->mdio->irq);
                        mdiobus_free(priv->mdio);
                }
                if (priv->clk)
index 48ecbc8..b423ad3 100644 (file)
@@ -18,6 +18,7 @@ if NET_VENDOR_EZCHIP
 config EZCHIP_NPS_MANAGEMENT_ENET
        tristate "EZchip NPS management enet support"
        depends on OF_IRQ && OF_NET
+       depends on HAS_IOMEM
        ---help---
          Simple LAN device for debug or management purposes.
          Device supports interrupts for RX and TX(completion).
index 4097c58..cbe21dc 100644 (file)
@@ -4,6 +4,9 @@
 
 obj-$(CONFIG_FEC) += fec.o
 fec-objs :=fec_main.o fec_ptp.o
+CFLAGS_fec_main.o := -D__CHECK_ENDIAN__
+CFLAGS_fec_ptp.o := -D__CHECK_ENDIAN__
+
 obj-$(CONFIG_FEC_MPC52xx) += fec_mpc52xx.o
 ifeq ($(CONFIG_FEC_MPC52xx_MDIO),y)
        obj-$(CONFIG_FEC_MPC52xx) += fec_mpc52xx_phy.o
index 99d33e2..2106d72 100644 (file)
@@ -19,8 +19,7 @@
 #include <linux/timecounter.h>
 
 #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
-    defined(CONFIG_M520x) || defined(CONFIG_M532x) || \
-    defined(CONFIG_ARCH_MXC) || defined(CONFIG_SOC_IMX28)
+    defined(CONFIG_M520x) || defined(CONFIG_M532x) || defined(CONFIG_ARM)
 /*
  *     Just figures, Motorola would have to change the offsets for
  *     registers in the same peripheral device on different models
 
 /*
  *     Define the buffer descriptor structure.
+ *
+ *     Evidently, ARM SoCs have the FEC block generated in a
+ *     little endian mode so adjust endianness accordingly.
  */
-#if defined(CONFIG_ARCH_MXC) || defined(CONFIG_SOC_IMX28)
+#if defined(CONFIG_ARM)
+#define fec32_to_cpu le32_to_cpu
+#define fec16_to_cpu le16_to_cpu
+#define cpu_to_fec32 cpu_to_le32
+#define cpu_to_fec16 cpu_to_le16
+#define __fec32 __le32
+#define __fec16 __le16
+
 struct bufdesc {
-       unsigned short cbd_datlen;      /* Data length */
-       unsigned short cbd_sc;  /* Control and status info */
-       unsigned long cbd_bufaddr;      /* Buffer address */
+       __fec16 cbd_datlen;     /* Data length */
+       __fec16 cbd_sc;         /* Control and status info */
+       __fec32 cbd_bufaddr;    /* Buffer address */
 };
 #else
+#define fec32_to_cpu be32_to_cpu
+#define fec16_to_cpu be16_to_cpu
+#define cpu_to_fec32 cpu_to_be32
+#define cpu_to_fec16 cpu_to_be16
+#define __fec32 __be32
+#define __fec16 __be16
+
 struct bufdesc {
-       unsigned short  cbd_sc;                 /* Control and status info */
-       unsigned short  cbd_datlen;             /* Data length */
-       unsigned long   cbd_bufaddr;            /* Buffer address */
+       __fec16 cbd_sc;         /* Control and status info */
+       __fec16 cbd_datlen;     /* Data length */
+       __fec32 cbd_bufaddr;    /* Buffer address */
 };
 #endif
 
 struct bufdesc_ex {
        struct bufdesc desc;
-       unsigned long cbd_esc;
-       unsigned long cbd_prot;
-       unsigned long cbd_bdu;
-       unsigned long ts;
-       unsigned short res0[4];
+       __fec32 cbd_esc;
+       __fec32 cbd_prot;
+       __fec32 cbd_bdu;
+       __fec32 ts;
+       __fec16 res0[4];
 };
 
 /*
index 502da6f..41c81f6 100644 (file)
@@ -332,11 +332,13 @@ static void fec_dump(struct net_device *ndev)
        bdp = txq->tx_bd_base;
 
        do {
-               pr_info("%3u %c%c 0x%04x 0x%08lx %4u %p\n",
+               pr_info("%3u %c%c 0x%04x 0x%08x %4u %p\n",
                        index,
                        bdp == txq->cur_tx ? 'S' : ' ',
                        bdp == txq->dirty_tx ? 'H' : ' ',
-                       bdp->cbd_sc, bdp->cbd_bufaddr, bdp->cbd_datlen,
+                       fec16_to_cpu(bdp->cbd_sc),
+                       fec32_to_cpu(bdp->cbd_bufaddr),
+                       fec16_to_cpu(bdp->cbd_datlen),
                        txq->tx_skbuff[index]);
                bdp = fec_enet_get_nextdesc(bdp, fep, 0);
                index++;
@@ -389,7 +391,7 @@ fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
                bdp = fec_enet_get_nextdesc(bdp, fep, queue);
                ebdp = (struct bufdesc_ex *)bdp;
 
-               status = bdp->cbd_sc;
+               status = fec16_to_cpu(bdp->cbd_sc);
                status &= ~BD_ENET_TX_STATS;
                status |= (BD_ENET_TX_TC | BD_ENET_TX_READY);
                frag_len = skb_shinfo(skb)->frags[frag].size;
@@ -411,7 +413,7 @@ fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
                        if (skb->ip_summed == CHECKSUM_PARTIAL)
                                estatus |= BD_ENET_TX_PINS | BD_ENET_TX_IINS;
                        ebdp->cbd_bdu = 0;
-                       ebdp->cbd_esc = estatus;
+                       ebdp->cbd_esc = cpu_to_fec32(estatus);
                }
 
                bufaddr = page_address(this_frag->page.p) + this_frag->page_offset;
@@ -435,9 +437,9 @@ fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
                        goto dma_mapping_error;
                }
 
-               bdp->cbd_bufaddr = addr;
-               bdp->cbd_datlen = frag_len;
-               bdp->cbd_sc = status;
+               bdp->cbd_bufaddr = cpu_to_fec32(addr);
+               bdp->cbd_datlen = cpu_to_fec16(frag_len);
+               bdp->cbd_sc = cpu_to_fec16(status);
        }
 
        return bdp;
@@ -445,8 +447,8 @@ dma_mapping_error:
        bdp = txq->cur_tx;
        for (i = 0; i < frag; i++) {
                bdp = fec_enet_get_nextdesc(bdp, fep, queue);
-               dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
-                               bdp->cbd_datlen, DMA_TO_DEVICE);
+               dma_unmap_single(&fep->pdev->dev, fec32_to_cpu(bdp->cbd_bufaddr),
+                                fec16_to_cpu(bdp->cbd_datlen), DMA_TO_DEVICE);
        }
        return ERR_PTR(-ENOMEM);
 }
@@ -483,7 +485,7 @@ static int fec_enet_txq_submit_skb(struct fec_enet_priv_tx_q *txq,
        /* Fill in a Tx ring entry */
        bdp = txq->cur_tx;
        last_bdp = bdp;
-       status = bdp->cbd_sc;
+       status = fec16_to_cpu(bdp->cbd_sc);
        status &= ~BD_ENET_TX_STATS;
 
        /* Set buffer length and buffer pointer */
@@ -539,21 +541,21 @@ static int fec_enet_txq_submit_skb(struct fec_enet_priv_tx_q *txq,
                        estatus |= BD_ENET_TX_PINS | BD_ENET_TX_IINS;
 
                ebdp->cbd_bdu = 0;
-               ebdp->cbd_esc = estatus;
+               ebdp->cbd_esc = cpu_to_fec32(estatus);
        }
 
        index = fec_enet_get_bd_index(txq->tx_bd_base, last_bdp, fep);
        /* Save skb pointer */
        txq->tx_skbuff[index] = skb;
 
-       bdp->cbd_datlen = buflen;
-       bdp->cbd_bufaddr = addr;
+       bdp->cbd_datlen = cpu_to_fec16(buflen);
+       bdp->cbd_bufaddr = cpu_to_fec32(addr);
 
        /* Send it on its way.  Tell FEC it's ready, interrupt when done,
         * it's the last BD of the frame, and to put the CRC on the end.
         */
        status |= (BD_ENET_TX_READY | BD_ENET_TX_TC);
-       bdp->cbd_sc = status;
+       bdp->cbd_sc = cpu_to_fec16(status);
 
        /* If this was the last BD in the ring, start at the beginning again. */
        bdp = fec_enet_get_nextdesc(last_bdp, fep, queue);
@@ -585,7 +587,7 @@ fec_enet_txq_put_data_tso(struct fec_enet_priv_tx_q *txq, struct sk_buff *skb,
        unsigned int estatus = 0;
        dma_addr_t addr;
 
-       status = bdp->cbd_sc;
+       status = fec16_to_cpu(bdp->cbd_sc);
        status &= ~BD_ENET_TX_STATS;
 
        status |= (BD_ENET_TX_TC | BD_ENET_TX_READY);
@@ -607,8 +609,8 @@ fec_enet_txq_put_data_tso(struct fec_enet_priv_tx_q *txq, struct sk_buff *skb,
                return NETDEV_TX_BUSY;
        }
 
-       bdp->cbd_datlen = size;
-       bdp->cbd_bufaddr = addr;
+       bdp->cbd_datlen = cpu_to_fec16(size);
+       bdp->cbd_bufaddr = cpu_to_fec32(addr);
 
        if (fep->bufdesc_ex) {
                if (fep->quirks & FEC_QUIRK_HAS_AVB)
@@ -616,7 +618,7 @@ fec_enet_txq_put_data_tso(struct fec_enet_priv_tx_q *txq, struct sk_buff *skb,
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        estatus |= BD_ENET_TX_PINS | BD_ENET_TX_IINS;
                ebdp->cbd_bdu = 0;
-               ebdp->cbd_esc = estatus;
+               ebdp->cbd_esc = cpu_to_fec32(estatus);
        }
 
        /* Handle the last BD specially */
@@ -625,10 +627,10 @@ fec_enet_txq_put_data_tso(struct fec_enet_priv_tx_q *txq, struct sk_buff *skb,
        if (is_last) {
                status |= BD_ENET_TX_INTR;
                if (fep->bufdesc_ex)
-                       ebdp->cbd_esc |= BD_ENET_TX_INT;
+                       ebdp->cbd_esc |= cpu_to_fec32(BD_ENET_TX_INT);
        }
 
-       bdp->cbd_sc = status;
+       bdp->cbd_sc = cpu_to_fec16(status);
 
        return 0;
 }
@@ -647,7 +649,7 @@ fec_enet_txq_put_hdr_tso(struct fec_enet_priv_tx_q *txq,
        unsigned short status;
        unsigned int estatus = 0;
 
-       status = bdp->cbd_sc;
+       status = fec16_to_cpu(bdp->cbd_sc);
        status &= ~BD_ENET_TX_STATS;
        status |= (BD_ENET_TX_TC | BD_ENET_TX_READY);
 
@@ -671,8 +673,8 @@ fec_enet_txq_put_hdr_tso(struct fec_enet_priv_tx_q *txq,
                }
        }
 
-       bdp->cbd_bufaddr = dmabuf;
-       bdp->cbd_datlen = hdr_len;
+       bdp->cbd_bufaddr = cpu_to_fec32(dmabuf);
+       bdp->cbd_datlen = cpu_to_fec16(hdr_len);
 
        if (fep->bufdesc_ex) {
                if (fep->quirks & FEC_QUIRK_HAS_AVB)
@@ -680,10 +682,10 @@ fec_enet_txq_put_hdr_tso(struct fec_enet_priv_tx_q *txq,
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        estatus |= BD_ENET_TX_PINS | BD_ENET_TX_IINS;
                ebdp->cbd_bdu = 0;
-               ebdp->cbd_esc = estatus;
+               ebdp->cbd_esc = cpu_to_fec32(estatus);
        }
 
-       bdp->cbd_sc = status;
+       bdp->cbd_sc = cpu_to_fec16(status);
 
        return 0;
 }
@@ -823,15 +825,15 @@ static void fec_enet_bd_init(struct net_device *dev)
 
                        /* Initialize the BD for every fragment in the page. */
                        if (bdp->cbd_bufaddr)
-                               bdp->cbd_sc = BD_ENET_RX_EMPTY;
+                               bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
                        else
-                               bdp->cbd_sc = 0;
+                               bdp->cbd_sc = cpu_to_fec16(0);
                        bdp = fec_enet_get_nextdesc(bdp, fep, q);
                }
 
                /* Set the last buffer to wrap */
                bdp = fec_enet_get_prevdesc(bdp, fep, q);
-               bdp->cbd_sc |= BD_SC_WRAP;
+               bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
 
                rxq->cur_rx = rxq->rx_bd_base;
        }
@@ -844,18 +846,18 @@ static void fec_enet_bd_init(struct net_device *dev)
 
                for (i = 0; i < txq->tx_ring_size; i++) {
                        /* Initialize the BD for every fragment in the page. */
-                       bdp->cbd_sc = 0;
+                       bdp->cbd_sc = cpu_to_fec16(0);
                        if (txq->tx_skbuff[i]) {
                                dev_kfree_skb_any(txq->tx_skbuff[i]);
                                txq->tx_skbuff[i] = NULL;
                        }
-                       bdp->cbd_bufaddr = 0;
+                       bdp->cbd_bufaddr = cpu_to_fec32(0);
                        bdp = fec_enet_get_nextdesc(bdp, fep, q);
                }
 
                /* Set the last buffer to wrap */
                bdp = fec_enet_get_prevdesc(bdp, fep, q);
-               bdp->cbd_sc |= BD_SC_WRAP;
+               bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
                txq->dirty_tx = bdp;
        }
 }
@@ -947,8 +949,10 @@ fec_restart(struct net_device *ndev)
         */
        if (fep->quirks & FEC_QUIRK_ENET_MAC) {
                memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN);
-               writel(cpu_to_be32(temp_mac[0]), fep->hwp + FEC_ADDR_LOW);
-               writel(cpu_to_be32(temp_mac[1]), fep->hwp + FEC_ADDR_HIGH);
+               writel((__force u32)cpu_to_be32(temp_mac[0]),
+                      fep->hwp + FEC_ADDR_LOW);
+               writel((__force u32)cpu_to_be32(temp_mac[1]),
+                      fep->hwp + FEC_ADDR_HIGH);
        }
 
        /* Clear any outstanding interrupt. */
@@ -1222,7 +1226,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
        while (bdp != READ_ONCE(txq->cur_tx)) {
                /* Order the load of cur_tx and cbd_sc */
                rmb();
-               status = READ_ONCE(bdp->cbd_sc);
+               status = fec16_to_cpu(READ_ONCE(bdp->cbd_sc));
                if (status & BD_ENET_TX_READY)
                        break;
 
@@ -1230,10 +1234,12 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
 
                skb = txq->tx_skbuff[index];
                txq->tx_skbuff[index] = NULL;
-               if (!IS_TSO_HEADER(txq, bdp->cbd_bufaddr))
-                       dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
-                                       bdp->cbd_datlen, DMA_TO_DEVICE);
-               bdp->cbd_bufaddr = 0;
+               if (!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
+                       dma_unmap_single(&fep->pdev->dev,
+                                        fec32_to_cpu(bdp->cbd_bufaddr),
+                                        fec16_to_cpu(bdp->cbd_datlen),
+                                        DMA_TO_DEVICE);
+               bdp->cbd_bufaddr = cpu_to_fec32(0);
                if (!skb) {
                        bdp = fec_enet_get_nextdesc(bdp, fep, queue_id);
                        continue;
@@ -1264,7 +1270,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
                        struct skb_shared_hwtstamps shhwtstamps;
                        struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
 
-                       fec_enet_hwtstamp(fep, ebdp->ts, &shhwtstamps);
+                       fec_enet_hwtstamp(fep, fec32_to_cpu(ebdp->ts), &shhwtstamps);
                        skb_tstamp_tx(skb, &shhwtstamps);
                }
 
@@ -1324,10 +1330,8 @@ fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff
        if (off)
                skb_reserve(skb, fep->rx_align + 1 - off);
 
-       bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, skb->data,
-                                         FEC_ENET_RX_FRSIZE - fep->rx_align,
-                                         DMA_FROM_DEVICE);
-       if (dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr)) {
+       bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));
+       if (dma_mapping_error(&fep->pdev->dev, fec32_to_cpu(bdp->cbd_bufaddr))) {
                if (net_ratelimit())
                        netdev_err(ndev, "Rx DMA memory map failed\n");
                return -ENOMEM;
@@ -1349,7 +1353,8 @@ static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
        if (!new_skb)
                return false;
 
-       dma_sync_single_for_cpu(&fep->pdev->dev, bdp->cbd_bufaddr,
+       dma_sync_single_for_cpu(&fep->pdev->dev,
+                               fec32_to_cpu(bdp->cbd_bufaddr),
                                FEC_ENET_RX_FRSIZE - fep->rx_align,
                                DMA_FROM_DEVICE);
        if (!swap)
@@ -1396,7 +1401,7 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
         */
        bdp = rxq->cur_rx;
 
-       while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
+       while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) {
 
                if (pkt_received >= budget)
                        break;
@@ -1438,7 +1443,7 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 
                /* Process the incoming frame. */
                ndev->stats.rx_packets++;
-               pkt_len = bdp->cbd_datlen;
+               pkt_len = fec16_to_cpu(bdp->cbd_datlen);
                ndev->stats.rx_bytes += pkt_len;
 
                index = fec_enet_get_bd_index(rxq->rx_bd_base, bdp, fep);
@@ -1456,7 +1461,8 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
                                ndev->stats.rx_dropped++;
                                goto rx_processing_done;
                        }
-                       dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
+                       dma_unmap_single(&fep->pdev->dev,
+                                        fec32_to_cpu(bdp->cbd_bufaddr),
                                         FEC_ENET_RX_FRSIZE - fep->rx_align,
                                         DMA_FROM_DEVICE);
                }
@@ -1475,7 +1481,8 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
                /* If this is a VLAN packet remove the VLAN Tag */
                vlan_packet_rcvd = false;
                if ((ndev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
-                       fep->bufdesc_ex && (ebdp->cbd_esc & BD_ENET_RX_VLAN)) {
+                   fep->bufdesc_ex &&
+                   (ebdp->cbd_esc & cpu_to_fec32(BD_ENET_RX_VLAN))) {
                        /* Push and remove the vlan tag */
                        struct vlan_hdr *vlan_header =
                                        (struct vlan_hdr *) (data + ETH_HLEN);
@@ -1491,12 +1498,12 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 
                /* Get receive timestamp from the skb */
                if (fep->hwts_rx_en && fep->bufdesc_ex)
-                       fec_enet_hwtstamp(fep, ebdp->ts,
+                       fec_enet_hwtstamp(fep, fec32_to_cpu(ebdp->ts),
                                          skb_hwtstamps(skb));
 
                if (fep->bufdesc_ex &&
                    (fep->csum_flags & FLAG_RX_CSUM_ENABLED)) {
-                       if (!(ebdp->cbd_esc & FLAG_RX_CSUM_ERROR)) {
+                       if (!(ebdp->cbd_esc & cpu_to_fec32(FLAG_RX_CSUM_ERROR))) {
                                /* don't check it */
                                skb->ip_summed = CHECKSUM_UNNECESSARY;
                        } else {
@@ -1513,7 +1520,8 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
                napi_gro_receive(&fep->napi, skb);
 
                if (is_copybreak) {
-                       dma_sync_single_for_device(&fep->pdev->dev, bdp->cbd_bufaddr,
+                       dma_sync_single_for_device(&fep->pdev->dev,
+                                                  fec32_to_cpu(bdp->cbd_bufaddr),
                                                   FEC_ENET_RX_FRSIZE - fep->rx_align,
                                                   DMA_FROM_DEVICE);
                } else {
@@ -1527,12 +1535,12 @@ rx_processing_done:
 
                /* Mark the buffer empty */
                status |= BD_ENET_RX_EMPTY;
-               bdp->cbd_sc = status;
+               bdp->cbd_sc = cpu_to_fec16(status);
 
                if (fep->bufdesc_ex) {
                        struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
 
-                       ebdp->cbd_esc = BD_ENET_RX_INT;
+                       ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
                        ebdp->cbd_prot = 0;
                        ebdp->cbd_bdu = 0;
                }
@@ -2145,8 +2153,7 @@ static int fec_enet_get_regs_len(struct net_device *ndev)
 
 /* List of registers that can be safety be read to dump them with ethtool */
 #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
-       defined(CONFIG_M520x) || defined(CONFIG_M532x) ||               \
-       defined(CONFIG_ARCH_MXC) || defined(CONFIG_SOC_IMX28)
+       defined(CONFIG_M520x) || defined(CONFIG_M532x) || defined(CONFIG_ARM)
 static u32 fec_enet_register_offset[] = {
        FEC_IEVENT, FEC_IMASK, FEC_R_DES_ACTIVE_0, FEC_X_DES_ACTIVE_0,
        FEC_ECNTRL, FEC_MII_DATA, FEC_MII_SPEED, FEC_MIB_CTRLSTAT, FEC_R_CNTRL,
@@ -2662,7 +2669,7 @@ static void fec_enet_free_buffers(struct net_device *ndev)
                        rxq->rx_skbuff[i] = NULL;
                        if (skb) {
                                dma_unmap_single(&fep->pdev->dev,
-                                                bdp->cbd_bufaddr,
+                                                fec32_to_cpu(bdp->cbd_bufaddr),
                                                 FEC_ENET_RX_FRSIZE - fep->rx_align,
                                                 DMA_FROM_DEVICE);
                                dev_kfree_skb(skb);
@@ -2777,11 +2784,11 @@ fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
                }
 
                rxq->rx_skbuff[i] = skb;
-               bdp->cbd_sc = BD_ENET_RX_EMPTY;
+               bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
 
                if (fep->bufdesc_ex) {
                        struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
-                       ebdp->cbd_esc = BD_ENET_RX_INT;
+                       ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
                }
 
                bdp = fec_enet_get_nextdesc(bdp, fep, queue);
@@ -2789,7 +2796,7 @@ fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
 
        /* Set the last buffer to wrap. */
        bdp = fec_enet_get_prevdesc(bdp, fep, queue);
-       bdp->cbd_sc |= BD_SC_WRAP;
+       bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
        return 0;
 
  err_alloc:
@@ -2812,12 +2819,12 @@ fec_enet_alloc_txq_buffers(struct net_device *ndev, unsigned int queue)
                if (!txq->tx_bounce[i])
                        goto err_alloc;
 
-               bdp->cbd_sc = 0;
-               bdp->cbd_bufaddr = 0;
+               bdp->cbd_sc = cpu_to_fec16(0);
+               bdp->cbd_bufaddr = cpu_to_fec32(0);
 
                if (fep->bufdesc_ex) {
                        struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
-                       ebdp->cbd_esc = BD_ENET_TX_INT;
+                       ebdp->cbd_esc = cpu_to_fec32(BD_ENET_TX_INT);
                }
 
                bdp = fec_enet_get_nextdesc(bdp, fep, queue);
@@ -2825,7 +2832,7 @@ fec_enet_alloc_txq_buffers(struct net_device *ndev, unsigned int queue)
 
        /* Set the last buffer to wrap. */
        bdp = fec_enet_get_prevdesc(bdp, fep, queue);
-       bdp->cbd_sc |= BD_SC_WRAP;
+       bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
 
        return 0;
 
index 623aa1c..79a210a 100644 (file)
@@ -2791,6 +2791,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
                goto fman_free;
        }
 
+       fman->dev = &of_dev->dev;
+
        return fman;
 
 fman_node_put:
@@ -2845,8 +2847,6 @@ static int fman_probe(struct platform_device *of_dev)
 
        dev_set_drvdata(dev, fman);
 
-       fman->dev = dev;
-
        dev_dbg(dev, "FMan%d probed\n", fman->dts_params.id);
 
        return 0;
index 52e0091..1ba359f 100644 (file)
@@ -552,7 +552,7 @@ static void tx_restart(struct net_device *dev)
        cbd_t __iomem *prev_bd;
        cbd_t __iomem *last_tx_bd;
 
-       last_tx_bd = fep->tx_bd_base + ((fpi->tx_ring - 1) * sizeof(cbd_t));
+       last_tx_bd = fep->tx_bd_base + (fpi->tx_ring - 1);
 
        /* get the current bd held in TBPTR  and scan back from this point */
        recheck_bd = curr_tbptr = (cbd_t __iomem *)
index 2aa7b40..b9ecf19 100644 (file)
@@ -1111,8 +1111,10 @@ static void __gfar_detect_errata_85xx(struct gfar_private *priv)
 
        if ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) == 0x20))
                priv->errata |= GFAR_ERRATA_12;
+       /* P2020/P1010 Rev 1; MPC8548 Rev 2 */
        if (((SVR_SOC_VER(svr) == SVR_P2020) && (SVR_REV(svr) < 0x20)) ||
-           ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)))
+           ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)) ||
+           ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) < 0x31)))
                priv->errata |= GFAR_ERRATA_76; /* aka eTSEC 20 */
 }
 #endif
index a7139f5..678f501 100644 (file)
@@ -469,8 +469,8 @@ static int fmvj18x_config(struct pcmcia_device *link)
                    goto failed;
            }
            /* Read MACID from CIS */
-           for (i = 5; i < 11; i++)
-                   dev->dev_addr[i] = buf[i];
+           for (i = 0; i < 6; i++)
+                   dev->dev_addr[i] = buf[i + 5];
            kfree(buf);
        } else {
            if (pcmcia_get_mac_from_cis(link, dev))
index 74beb18..4ccc032 100644 (file)
@@ -25,6 +25,7 @@ config HIX5HD2_GMAC
 
 config HIP04_ETH
        tristate "HISILICON P04 Ethernet support"
+       depends on HAS_IOMEM    # For MFD_SYSCON
        select MARVELL_PHY
        select MFD_SYSCON
        select HNS_MDIO
index b364529..3bfe36f 100644 (file)
@@ -95,21 +95,17 @@ static struct hnae_buf_ops hnae_bops = {
 static int __ae_match(struct device *dev, const void *data)
 {
        struct hnae_ae_dev *hdev = cls_to_ae_dev(dev);
-       const char *ae_id = data;
 
-       if (!strncmp(ae_id, hdev->name, AE_NAME_SIZE))
-               return 1;
-
-       return 0;
+       return hdev->dev->of_node == data;
 }
 
-static struct hnae_ae_dev *find_ae(const char *ae_id)
+static struct hnae_ae_dev *find_ae(const struct device_node *ae_node)
 {
        struct device *dev;
 
-       WARN_ON(!ae_id);
+       WARN_ON(!ae_node);
 
-       dev = class_find_device(hnae_class, NULL, ae_id, __ae_match);
+       dev = class_find_device(hnae_class, NULL, ae_node, __ae_match);
 
        return dev ? cls_to_ae_dev(dev) : NULL;
 }
@@ -316,7 +312,8 @@ EXPORT_SYMBOL(hnae_reinit_handle);
  * return handle ptr or ERR_PTR
  */
 struct hnae_handle *hnae_get_handle(struct device *owner_dev,
-                                   const char *ae_id, u32 port_id,
+                                   const struct device_node *ae_node,
+                                   u32 port_id,
                                    struct hnae_buf_ops *bops)
 {
        struct hnae_ae_dev *dev;
@@ -324,7 +321,7 @@ struct hnae_handle *hnae_get_handle(struct device *owner_dev,
        int i, j;
        int ret;
 
-       dev = find_ae(ae_id);
+       dev = find_ae(ae_node);
        if (!dev)
                return ERR_PTR(-ENODEV);
 
index 6ca94dc..1cbcb9f 100644 (file)
@@ -524,8 +524,11 @@ struct hnae_handle {
 
 #define ring_to_dev(ring) ((ring)->q->dev->dev)
 
-struct hnae_handle *hnae_get_handle(struct device *owner_dev, const char *ae_id,
-                                   u32 port_id, struct hnae_buf_ops *bops);
+struct hnae_handle *hnae_get_handle(struct device *owner_dev,
+                                   const struct device_node *ae_node,
+                                   u32 port_id,
+                                   struct hnae_buf_ops *bops);
+
 void hnae_put_handle(struct hnae_handle *handle);
 int hnae_ae_register(struct hnae_ae_dev *dev, struct module *owner);
 void hnae_ae_unregister(struct hnae_ae_dev *dev);
index 522b264..d4f92ed 100644 (file)
@@ -675,8 +675,12 @@ static int hns_ae_config_loopback(struct hnae_handle *handle,
 {
        int ret;
        struct hnae_vf_cb *vf_cb = hns_ae_get_vf_cb(handle);
+       struct hns_mac_cb *mac_cb = hns_get_mac_cb(handle);
 
        switch (loop) {
+       case MAC_INTERNALLOOP_PHY:
+               ret = 0;
+               break;
        case MAC_INTERNALLOOP_SERDES:
                ret = hns_mac_config_sds_loopback(vf_cb->mac_cb, en);
                break;
@@ -686,6 +690,10 @@ static int hns_ae_config_loopback(struct hnae_handle *handle,
        default:
                ret = -EINVAL;
        }
+
+       if (!ret)
+               hns_dsaf_set_inner_lb(mac_cb->dsaf_dev, mac_cb->mac_id, en);
+
        return ret;
 }
 
@@ -847,6 +855,7 @@ static struct hnae_ae_ops hns_dsaf_ops = {
 int hns_dsaf_ae_init(struct dsaf_device *dsaf_dev)
 {
        struct hnae_ae_dev *ae_dev = &dsaf_dev->ae_dev;
+       static atomic_t id = ATOMIC_INIT(-1);
 
        switch (dsaf_dev->dsaf_ver) {
        case AE_VERSION_1:
@@ -858,6 +867,9 @@ int hns_dsaf_ae_init(struct dsaf_device *dsaf_dev)
        default:
                break;
        }
+
+       snprintf(ae_dev->name, AE_NAME_SIZE, "%s%d", DSAF_DEVICE_NAME,
+                (int)atomic_inc_return(&id));
        ae_dev->ops = &hns_dsaf_ops;
        ae_dev->dev = dsaf_dev->dev;
 
index 1c33bd0..38fc5be 100644 (file)
@@ -35,7 +35,7 @@ int hns_dsaf_get_cfg(struct dsaf_device *dsaf_dev)
        int ret, i;
        u32 desc_num;
        u32 buf_size;
-       const char *name, *mode_str;
+       const char *mode_str;
        struct device_node *np = dsaf_dev->dev->of_node;
 
        if (of_device_is_compatible(np, "hisilicon,hns-dsaf-v1"))
@@ -43,14 +43,6 @@ int hns_dsaf_get_cfg(struct dsaf_device *dsaf_dev)
        else
                dsaf_dev->dsaf_ver = AE_VERSION_2;
 
-       ret = of_property_read_string(np, "dsa_name", &name);
-       if (ret) {
-               dev_err(dsaf_dev->dev, "get dsaf name fail, ret=%d!\n", ret);
-               return ret;
-       }
-       strncpy(dsaf_dev->ae_dev.name, name, AE_NAME_SIZE);
-       dsaf_dev->ae_dev.name[AE_NAME_SIZE - 1] = '\0';
-
        ret = of_property_read_string(np, "mode", &mode_str);
        if (ret) {
                dev_err(dsaf_dev->dev, "get dsaf mode fail, ret=%d!\n", ret);
@@ -238,6 +230,30 @@ static void hns_dsaf_mix_def_qid_cfg(struct dsaf_device *dsaf_dev)
        }
 }
 
+static void hns_dsaf_inner_qid_cfg(struct dsaf_device *dsaf_dev)
+{
+       u16 max_q_per_vf, max_vfn;
+       u32 q_id, q_num_per_port;
+       u32 mac_id;
+
+       if (AE_IS_VER1(dsaf_dev->dsaf_ver))
+               return;
+
+       hns_rcb_get_queue_mode(dsaf_dev->dsaf_mode,
+                              HNS_DSAF_COMM_SERVICE_NW_IDX,
+                              &max_vfn, &max_q_per_vf);
+       q_num_per_port = max_vfn * max_q_per_vf;
+
+       for (mac_id = 0, q_id = 0; mac_id < DSAF_SERVICE_NW_NUM; mac_id++) {
+               dsaf_set_dev_field(dsaf_dev,
+                                  DSAFV2_SERDES_LBK_0_REG + 4 * mac_id,
+                                  DSAFV2_SERDES_LBK_QID_M,
+                                  DSAFV2_SERDES_LBK_QID_S,
+                                  q_id);
+               q_id += q_num_per_port;
+       }
+}
+
 /**
  * hns_dsaf_sw_port_type_cfg - cfg sw type
  * @dsaf_id: dsa fabric id
@@ -699,6 +715,16 @@ void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en)
        dsaf_set_dev_bit(dsaf_dev, DSAF_CFG_0_REG, DSAF_CFG_MIX_MODE_S, !!en);
 }
 
+void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en)
+{
+       if (AE_IS_VER1(dsaf_dev->dsaf_ver) ||
+           dsaf_dev->mac_cb[mac_id].mac_type == HNAE_PORT_DEBUG)
+               return;
+
+       dsaf_set_dev_bit(dsaf_dev, DSAFV2_SERDES_LBK_0_REG + 4 * mac_id,
+                        DSAFV2_SERDES_LBK_EN_B, !!en);
+}
+
 /**
  * hns_dsaf_tbl_stat_en - tbl
  * @dsaf_id: dsa fabric id
@@ -1030,6 +1056,9 @@ static void hns_dsaf_comm_init(struct dsaf_device *dsaf_dev)
        /* set promisc def queue id */
        hns_dsaf_mix_def_qid_cfg(dsaf_dev);
 
+       /* set inner loopback queue id */
+       hns_dsaf_inner_qid_cfg(dsaf_dev);
+
        /* in non switch mode, set all port to access mode */
        hns_dsaf_sw_port_type_cfg(dsaf_dev, DSAF_SW_PORT_TYPE_NON_VLAN);
 
index 31c312f..5fea226 100644 (file)
@@ -18,6 +18,7 @@ struct hns_mac_cb;
 
 #define DSAF_DRV_NAME "hns_dsaf"
 #define DSAF_MOD_VERSION "v1.0"
+#define DSAF_DEVICE_NAME "dsaf"
 
 #define HNS_DSAF_DEBUG_NW_REG_OFFSET 0x100000
 
@@ -416,5 +417,6 @@ void hns_dsaf_get_strings(int stringset, u8 *data, int port);
 void hns_dsaf_get_regs(struct dsaf_device *ddev, u32 port, void *data);
 int hns_dsaf_get_regs_count(void);
 void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en);
+void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en);
 
 #endif /* __HNS_DSAF_MAIN_H__ */
index f0c4f9b..60d695d 100644 (file)
 #define DSAF_XGE_INT_STS_0_REG         0x1C0
 #define DSAF_PPE_INT_STS_0_REG         0x1E0
 #define DSAF_ROCEE_INT_STS_0_REG       0x200
+#define DSAFV2_SERDES_LBK_0_REG         0x220
 #define DSAF_PPE_QID_CFG_0_REG         0x300
 #define DSAF_SW_PORT_TYPE_0_REG                0x320
 #define DSAF_STP_PORT_TYPE_0_REG       0x340
 #define PPEV2_CFG_RSS_TBL_4N3_S        24
 #define PPEV2_CFG_RSS_TBL_4N3_M        (((1UL << 5) - 1) << PPEV2_CFG_RSS_TBL_4N3_S)
 
+#define DSAFV2_SERDES_LBK_EN_B  8
+#define DSAFV2_SERDES_LBK_QID_S 0
+#define DSAFV2_SERDES_LBK_QID_M        (((1UL << 8) - 1) << DSAFV2_SERDES_LBK_QID_S)
+
 #define PPE_CNT_CLR_CE_B       0
 #define PPE_CNT_CLR_SNAP_EN_B  1
 
index 0e30846..3f77ff7 100644 (file)
@@ -1802,7 +1802,7 @@ static int hns_nic_try_get_ae(struct net_device *ndev)
        int ret;
 
        h = hnae_get_handle(&priv->netdev->dev,
-                           priv->ae_name, priv->port_id, NULL);
+                           priv->ae_node, priv->port_id, NULL);
        if (IS_ERR_OR_NULL(h)) {
                ret = PTR_ERR(h);
                dev_dbg(priv->dev, "has not handle, register notifier!\n");
@@ -1880,13 +1880,16 @@ static int hns_nic_dev_probe(struct platform_device *pdev)
        else
                priv->enet_ver = AE_VERSION_2;
 
-       ret = of_property_read_string(node, "ae-name", &priv->ae_name);
-       if (ret)
-               goto out_read_string_fail;
+       priv->ae_node = (void *)of_parse_phandle(node, "ae-handle", 0);
+       if (IS_ERR_OR_NULL(priv->ae_node)) {
+               ret = PTR_ERR(priv->ae_node);
+               dev_err(dev, "not find ae-handle\n");
+               goto out_read_prop_fail;
+       }
 
        ret = of_property_read_u32(node, "port-id", &priv->port_id);
        if (ret)
-               goto out_read_string_fail;
+               goto out_read_prop_fail;
 
        hns_init_mac_addr(ndev);
 
@@ -1945,7 +1948,7 @@ static int hns_nic_dev_probe(struct platform_device *pdev)
 
 out_notify_fail:
        (void)cancel_work_sync(&priv->service_task);
-out_read_string_fail:
+out_read_prop_fail:
        free_netdev(ndev);
        return ret;
 }
index 4b75270..c68ab3d 100644 (file)
@@ -51,7 +51,7 @@ struct hns_nic_ops {
 };
 
 struct hns_nic_priv {
-       const char *ae_name;
+       const struct device_node *ae_node;
        u32 enet_ver;
        u32 port_id;
        int phy_mode;
index 3df2284..3c4a3bc 100644 (file)
@@ -295,8 +295,10 @@ static int __lb_setup(struct net_device *ndev,
 
        switch (loop) {
        case MAC_INTERNALLOOP_PHY:
-               if ((phy_dev) && (!phy_dev->is_c45))
+               if ((phy_dev) && (!phy_dev->is_c45)) {
                        ret = hns_nic_config_phy_loopback(phy_dev, 0x1);
+                       ret |= h->dev->ops->set_loopback(h, loop, 0x1);
+               }
                break;
        case MAC_INTERNALLOOP_MAC:
                if ((h->dev->ops->set_loopback) &&
@@ -376,6 +378,7 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data,
                               struct sk_buff *skb)
 {
        struct net_device *ndev;
+       struct hns_nic_priv *priv;
        struct hnae_ring *ring;
        struct netdev_queue *dev_queue;
        struct sk_buff *new_skb;
@@ -385,8 +388,17 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data,
        char buff[33]; /* 32B data and the last character '\0' */
 
        if (!ring_data) { /* Just for doing create frame*/
+               ndev = skb->dev;
+               priv = netdev_priv(ndev);
+
                frame_size = skb->len;
                memset(skb->data, 0xFF, frame_size);
+               if ((!AE_IS_VER1(priv->enet_ver)) &&
+                   (priv->ae_handle->port_type == HNAE_PORT_SERVICE)) {
+                       memcpy(skb->data, ndev->dev_addr, 6);
+                       skb->data[5] += 0x1f;
+               }
+
                frame_size &= ~1ul;
                memset(&skb->data[frame_size / 2], 0xAA, frame_size / 2 - 1);
                memset(&skb->data[frame_size / 2 + 10], 0xBE,
@@ -486,6 +498,7 @@ static int __lb_run_test(struct net_device *ndev,
 
        /* place data into test skb */
        (void)skb_put(skb, size);
+       skb->dev = ndev;
        __lb_other_process(NULL, skb);
        skb->queue_mapping = NIC_LB_TEST_RING_ID;
 
index 1d5c3e1..3daf2d4 100644 (file)
@@ -194,7 +194,6 @@ static const char *hp100_isa_tbl[] = {
 };
 #endif
 
-#ifdef CONFIG_EISA
 static struct eisa_device_id hp100_eisa_tbl[] = {
        { "HWPF180" }, /* HP J2577 rev A */
        { "HWP1920" }, /* HP 27248B */
@@ -205,9 +204,7 @@ static struct eisa_device_id hp100_eisa_tbl[] = {
        { "" }         /* Mandatory final entry ! */
 };
 MODULE_DEVICE_TABLE(eisa, hp100_eisa_tbl);
-#endif
 
-#ifdef CONFIG_PCI
 static const struct pci_device_id hp100_pci_tbl[] = {
        {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_J2585A, PCI_ANY_ID, PCI_ANY_ID,},
        {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_J2585B, PCI_ANY_ID, PCI_ANY_ID,},
@@ -219,7 +216,6 @@ static const struct pci_device_id hp100_pci_tbl[] = {
        {}                      /* Terminating entry */
 };
 MODULE_DEVICE_TABLE(pci, hp100_pci_tbl);
-#endif
 
 static int hp100_rx_ratio = HP100_DEFAULT_RX_RATIO;
 static int hp100_priority_tx = HP100_DEFAULT_PRIORITY_TX;
@@ -2842,7 +2838,6 @@ static void cleanup_dev(struct net_device *d)
        free_netdev(d);
 }
 
-#ifdef CONFIG_EISA
 static int hp100_eisa_probe(struct device *gendev)
 {
        struct net_device *dev = alloc_etherdev(sizeof(struct hp100_private));
@@ -2884,9 +2879,7 @@ static struct eisa_driver hp100_eisa_driver = {
                .remove  = hp100_eisa_remove,
         }
 };
-#endif
 
-#ifdef CONFIG_PCI
 static int hp100_pci_probe(struct pci_dev *pdev,
                           const struct pci_device_id *ent)
 {
@@ -2955,7 +2948,6 @@ static struct pci_driver hp100_pci_driver = {
        .probe          = hp100_pci_probe,
        .remove         = hp100_pci_remove,
 };
-#endif
 
 /*
  *  module section
@@ -3032,23 +3024,17 @@ static int __init hp100_module_init(void)
        err = hp100_isa_init();
        if (err && err != -ENODEV)
                goto out;
-#ifdef CONFIG_EISA
        err = eisa_driver_register(&hp100_eisa_driver);
        if (err && err != -ENODEV)
                goto out2;
-#endif
-#ifdef CONFIG_PCI
        err = pci_register_driver(&hp100_pci_driver);
        if (err && err != -ENODEV)
                goto out3;
-#endif
  out:
        return err;
  out3:
-#ifdef CONFIG_EISA
        eisa_driver_unregister (&hp100_eisa_driver);
  out2:
-#endif
        hp100_isa_cleanup();
        goto out;
 }
@@ -3057,12 +3043,8 @@ static int __init hp100_module_init(void)
 static void __exit hp100_module_exit(void)
 {
        hp100_isa_cleanup();
-#ifdef CONFIG_EISA
        eisa_driver_unregister (&hp100_eisa_driver);
-#endif
-#ifdef CONFIG_PCI
        pci_unregister_driver (&hp100_pci_driver);
-#endif
 }
 
 module_init(hp100_module_init)
index 335417b..ebe6071 100644 (file)
@@ -1166,7 +1166,10 @@ map_failed:
        if (!firmware_has_feature(FW_FEATURE_CMO))
                netdev_err(netdev, "tx: unable to map xmit buffer\n");
        adapter->tx_map_failed++;
-       skb_linearize(skb);
+       if (skb_linearize(skb)) {
+               netdev->stats.tx_dropped++;
+               goto out;
+       }
        force_bounce = 1;
        goto retry_bounce;
 }
index 7d65708..6e9e16e 100644 (file)
@@ -1348,44 +1348,44 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry)
        crq.request_capability.cmd = REQUEST_CAPABILITY;
 
        crq.request_capability.capability = cpu_to_be16(REQ_TX_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_tx_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_tx_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_RX_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_rx_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_rx_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_QUEUES);
-       crq.request_capability.number = cpu_to_be32(adapter->req_rx_add_queues);
+       crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_queues);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability =
            cpu_to_be16(REQ_TX_ENTRIES_PER_SUBCRQ);
        crq.request_capability.number =
-           cpu_to_be32(adapter->req_tx_entries_per_subcrq);
+           cpu_to_be64(adapter->req_tx_entries_per_subcrq);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability =
            cpu_to_be16(REQ_RX_ADD_ENTRIES_PER_SUBCRQ);
        crq.request_capability.number =
-           cpu_to_be32(adapter->req_rx_add_entries_per_subcrq);
+           cpu_to_be64(adapter->req_rx_add_entries_per_subcrq);
        ibmvnic_send_crq(adapter, &crq);
 
        crq.request_capability.capability = cpu_to_be16(REQ_MTU);
-       crq.request_capability.number = cpu_to_be32(adapter->req_mtu);
+       crq.request_capability.number = cpu_to_be64(adapter->req_mtu);
        ibmvnic_send_crq(adapter, &crq);
 
        if (adapter->netdev->flags & IFF_PROMISC) {
                if (adapter->promisc_supported) {
                        crq.request_capability.capability =
                            cpu_to_be16(PROMISC_REQUESTED);
-                       crq.request_capability.number = cpu_to_be32(1);
+                       crq.request_capability.number = cpu_to_be64(1);
                        ibmvnic_send_crq(adapter, &crq);
                }
        } else {
                crq.request_capability.capability =
                    cpu_to_be16(PROMISC_REQUESTED);
-               crq.request_capability.number = cpu_to_be32(0);
+               crq.request_capability.number = cpu_to_be64(0);
                ibmvnic_send_crq(adapter, &crq);
        }
 
@@ -2312,93 +2312,93 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
        switch (be16_to_cpu(crq->query_capability.capability)) {
        case MIN_TX_QUEUES:
                adapter->min_tx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_tx_queues = %lld\n",
                           adapter->min_tx_queues);
                break;
        case MIN_RX_QUEUES:
                adapter->min_rx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_queues = %lld\n",
                           adapter->min_rx_queues);
                break;
        case MIN_RX_ADD_QUEUES:
                adapter->min_rx_add_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_add_queues = %lld\n",
                           adapter->min_rx_add_queues);
                break;
        case MAX_TX_QUEUES:
                adapter->max_tx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_queues = %lld\n",
                           adapter->max_tx_queues);
                break;
        case MAX_RX_QUEUES:
                adapter->max_rx_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_queues = %lld\n",
                           adapter->max_rx_queues);
                break;
        case MAX_RX_ADD_QUEUES:
                adapter->max_rx_add_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_add_queues = %lld\n",
                           adapter->max_rx_add_queues);
                break;
        case MIN_TX_ENTRIES_PER_SUBCRQ:
                adapter->min_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_tx_entries_per_subcrq = %lld\n",
                           adapter->min_tx_entries_per_subcrq);
                break;
        case MIN_RX_ADD_ENTRIES_PER_SUBCRQ:
                adapter->min_rx_add_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_rx_add_entrs_per_subcrq = %lld\n",
                           adapter->min_rx_add_entries_per_subcrq);
                break;
        case MAX_TX_ENTRIES_PER_SUBCRQ:
                adapter->max_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_entries_per_subcrq = %lld\n",
                           adapter->max_tx_entries_per_subcrq);
                break;
        case MAX_RX_ADD_ENTRIES_PER_SUBCRQ:
                adapter->max_rx_add_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_rx_add_entrs_per_subcrq = %lld\n",
                           adapter->max_rx_add_entries_per_subcrq);
                break;
        case TCP_IP_OFFLOAD:
                adapter->tcp_ip_offload =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "tcp_ip_offload = %lld\n",
                           adapter->tcp_ip_offload);
                break;
        case PROMISC_SUPPORTED:
                adapter->promisc_supported =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "promisc_supported = %lld\n",
                           adapter->promisc_supported);
                break;
        case MIN_MTU:
-               adapter->min_mtu = be32_to_cpu(crq->query_capability.number);
+               adapter->min_mtu = be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu);
                break;
        case MAX_MTU:
-               adapter->max_mtu = be32_to_cpu(crq->query_capability.number);
+               adapter->max_mtu = be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu);
                break;
        case MAX_MULTICAST_FILTERS:
                adapter->max_multicast_filters =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_multicast_filters = %lld\n",
                           adapter->max_multicast_filters);
                break;
        case VLAN_HEADER_INSERTION:
                adapter->vlan_header_insertion =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                if (adapter->vlan_header_insertion)
                        netdev->features |= NETIF_F_HW_VLAN_STAG_TX;
                netdev_dbg(netdev, "vlan_header_insertion = %lld\n",
@@ -2406,43 +2406,43 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq,
                break;
        case MAX_TX_SG_ENTRIES:
                adapter->max_tx_sg_entries =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "max_tx_sg_entries = %lld\n",
                           adapter->max_tx_sg_entries);
                break;
        case RX_SG_SUPPORTED:
                adapter->rx_sg_supported =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "rx_sg_supported = %lld\n",
                           adapter->rx_sg_supported);
                break;
        case OPT_TX_COMP_SUB_QUEUES:
                adapter->opt_tx_comp_sub_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_tx_comp_sub_queues = %lld\n",
                           adapter->opt_tx_comp_sub_queues);
                break;
        case OPT_RX_COMP_QUEUES:
                adapter->opt_rx_comp_queues =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rx_comp_queues = %lld\n",
                           adapter->opt_rx_comp_queues);
                break;
        case OPT_RX_BUFADD_Q_PER_RX_COMP_Q:
                adapter->opt_rx_bufadd_q_per_rx_comp_q =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rx_bufadd_q_per_rx_comp_q = %lld\n",
                           adapter->opt_rx_bufadd_q_per_rx_comp_q);
                break;
        case OPT_TX_ENTRIES_PER_SUBCRQ:
                adapter->opt_tx_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_tx_entries_per_subcrq = %lld\n",
                           adapter->opt_tx_entries_per_subcrq);
                break;
        case OPT_RXBA_ENTRIES_PER_SUBCRQ:
                adapter->opt_rxba_entries_per_subcrq =
-                   be32_to_cpu(crq->query_capability.number);
+                   be64_to_cpu(crq->query_capability.number);
                netdev_dbg(netdev, "opt_rxba_entries_per_subcrq = %lld\n",
                           adapter->opt_rxba_entries_per_subcrq);
                break;
index 1242925..1a9993c 100644 (file)
@@ -319,10 +319,8 @@ struct ibmvnic_capability {
        u8 first;
        u8 cmd;
        __be16 capability; /* one of ibmvnic_capabilities */
+       __be64 number;
        struct ibmvnic_rc rc;
-       __be32 number; /*FIX: should be __be64, but I'm getting the least
-                       * significant word first
-                       */
 } __packed __aligned(8);
 
 struct ibmvnic_login {
index fa593dd..3772f3a 100644 (file)
@@ -83,6 +83,15 @@ config E1000E
          To compile this driver as a module, choose M here. The module
          will be called e1000e.
 
+config E1000E_HWTS
+       bool "Support HW cross-timestamp on PCH devices"
+       default y
+       depends on E1000E && X86
+       ---help---
+        Say Y to enable hardware supported cross-timestamping on PCH
+        devices. The cross-timestamp is available through the PTP clock
+        driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE).
+
 config IGB
        tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support"
        depends on PCI
index f7c7804..0641c00 100644 (file)
 #define E1000_RXCW_C          0x20000000        /* Receive config */
 #define E1000_RXCW_SYNCH      0x40000000        /* Receive config synch */
 
+/* HH Time Sync */
+#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK  0x0000F000 /* max delay */
+#define E1000_TSYNCTXCTL_SYNC_COMP             0x40000000 /* sync complete */
+#define E1000_TSYNCTXCTL_START_SYNC            0x80000000 /* initiate sync */
+
 #define E1000_TSYNCTXCTL_VALID         0x00000001 /* Tx timestamp valid */
 #define E1000_TSYNCTXCTL_ENABLED       0x00000010 /* enable Tx timestamping */
 
index 25a0ad5..e2ff3ef 100644 (file)
 
 #include "e1000.h"
 
+#ifdef CONFIG_E1000E_HWTS
+#include <linux/clocksource.h>
+#include <linux/ktime.h>
+#include <asm/tsc.h>
+#endif
+
 /**
  * e1000e_phc_adjfreq - adjust the frequency of the hardware clock
  * @ptp: ptp clock structure
@@ -98,6 +104,78 @@ static int e1000e_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
        return 0;
 }
 
+#ifdef CONFIG_E1000E_HWTS
+#define MAX_HW_WAIT_COUNT (3)
+
+/**
+ * e1000e_phc_get_syncdevicetime - Callback given to timekeeping code reads system/device registers
+ * @device: current device time
+ * @system: system counter value read synchronously with device time
+ * @ctx: context provided by timekeeping code
+ *
+ * Read device and system (ART) clock simultaneously and return the corrected
+ * clock values in ns.
+ **/
+static int e1000e_phc_get_syncdevicetime(ktime_t *device,
+                                        struct system_counterval_t *system,
+                                        void *ctx)
+{
+       struct e1000_adapter *adapter = (struct e1000_adapter *)ctx;
+       struct e1000_hw *hw = &adapter->hw;
+       unsigned long flags;
+       int i;
+       u32 tsync_ctrl;
+       cycle_t dev_cycles;
+       cycle_t sys_cycles;
+
+       tsync_ctrl = er32(TSYNCTXCTL);
+       tsync_ctrl |= E1000_TSYNCTXCTL_START_SYNC |
+               E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK;
+       ew32(TSYNCTXCTL, tsync_ctrl);
+       for (i = 0; i < MAX_HW_WAIT_COUNT; ++i) {
+               udelay(1);
+               tsync_ctrl = er32(TSYNCTXCTL);
+               if (tsync_ctrl & E1000_TSYNCTXCTL_SYNC_COMP)
+                       break;
+       }
+
+       if (i == MAX_HW_WAIT_COUNT)
+               return -ETIMEDOUT;
+
+       dev_cycles = er32(SYSSTMPH);
+       dev_cycles <<= 32;
+       dev_cycles |= er32(SYSSTMPL);
+       spin_lock_irqsave(&adapter->systim_lock, flags);
+       *device = ns_to_ktime(timecounter_cyc2time(&adapter->tc, dev_cycles));
+       spin_unlock_irqrestore(&adapter->systim_lock, flags);
+
+       sys_cycles = er32(PLTSTMPH);
+       sys_cycles <<= 32;
+       sys_cycles |= er32(PLTSTMPL);
+       *system = convert_art_to_tsc(sys_cycles);
+
+       return 0;
+}
+
+/**
+ * e1000e_phc_getsynctime - Reads the current system/device cross timestamp
+ * @ptp: ptp clock structure
+ * @cts: structure containing timestamp
+ *
+ * Read device and system (ART) clock simultaneously and return the scaled
+ * clock values in ns.
+ **/
+static int e1000e_phc_getcrosststamp(struct ptp_clock_info *ptp,
+                                    struct system_device_crosststamp *xtstamp)
+{
+       struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter,
+                                                    ptp_clock_info);
+
+       return get_device_system_crosststamp(e1000e_phc_get_syncdevicetime,
+                                               adapter, NULL, xtstamp);
+}
+#endif/*CONFIG_E1000E_HWTS*/
+
 /**
  * e1000e_phc_gettime - Reads the current time from the hardware clock
  * @ptp: ptp clock structure
@@ -236,6 +314,13 @@ void e1000e_ptp_init(struct e1000_adapter *adapter)
                break;
        }
 
+#ifdef CONFIG_E1000E_HWTS
+       /* CPU must have ART and GBe must be from Sunrise Point or greater */
+       if (hw->mac.type >= e1000_pch_spt && boot_cpu_has(X86_FEATURE_ART))
+               adapter->ptp_clock_info.getcrosststamp =
+                       e1000e_phc_getcrosststamp;
+#endif/*CONFIG_E1000E_HWTS*/
+
        INIT_DELAYED_WORK(&adapter->systim_overflow_work,
                          e1000e_systim_overflow_work);
 
index 1d5e0b7..0cb4d36 100644 (file)
 #define E1000_SYSTIML  0x0B600 /* System time register Low - RO */
 #define E1000_SYSTIMH  0x0B604 /* System time register High - RO */
 #define E1000_TIMINCA  0x0B608 /* Increment attributes register - RW */
+#define E1000_SYSSTMPL  0x0B648 /* HH Timesync system stamp low register */
+#define E1000_SYSSTMPH  0x0B64C /* HH Timesync system stamp hi register */
+#define E1000_PLTSTMPL  0x0B640 /* HH Timesync platform stamp low register */
+#define E1000_PLTSTMPH  0x0B644 /* HH Timesync platform stamp hi register */
 #define E1000_RXMTRL   0x0B634 /* Time sync Rx EtherType and Msg Type - RW */
 #define E1000_RXUDP    0x0B638 /* Time Sync Rx UDP Port - RW */
 
index bb4612c..8f3b53e 100644 (file)
@@ -7117,9 +7117,7 @@ static void i40e_service_task(struct work_struct *work)
        i40e_watchdog_subtask(pf);
        i40e_fdir_reinit_subtask(pf);
        i40e_sync_filters_subtask(pf);
-#if IS_ENABLED(CONFIG_VXLAN) || IS_ENABLED(CONFIG_GENEVE)
        i40e_sync_udp_filters_subtask(pf);
-#endif
        i40e_clean_adminq_subtask(pf);
 
        i40e_service_event_complete(pf);
@@ -8515,6 +8513,8 @@ static u8 i40e_get_udp_port_idx(struct i40e_pf *pf, __be16 port)
 }
 
 #endif
+
+#if IS_ENABLED(CONFIG_VXLAN)
 /**
  * i40e_add_vxlan_port - Get notifications about VXLAN ports that come up
  * @netdev: This physical port's netdev
@@ -8524,7 +8524,6 @@ static u8 i40e_get_udp_port_idx(struct i40e_pf *pf, __be16 port)
 static void i40e_add_vxlan_port(struct net_device *netdev,
                                sa_family_t sa_family, __be16 port)
 {
-#if IS_ENABLED(CONFIG_VXLAN)
        struct i40e_netdev_priv *np = netdev_priv(netdev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
@@ -8557,7 +8556,6 @@ static void i40e_add_vxlan_port(struct net_device *netdev,
        pf->udp_ports[next_idx].type = I40E_AQC_TUNNEL_TYPE_VXLAN;
        pf->pending_udp_bitmap |= BIT_ULL(next_idx);
        pf->flags |= I40E_FLAG_UDP_FILTER_SYNC;
-#endif
 }
 
 /**
@@ -8569,7 +8567,6 @@ static void i40e_add_vxlan_port(struct net_device *netdev,
 static void i40e_del_vxlan_port(struct net_device *netdev,
                                sa_family_t sa_family, __be16 port)
 {
-#if IS_ENABLED(CONFIG_VXLAN)
        struct i40e_netdev_priv *np = netdev_priv(netdev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
@@ -8592,9 +8589,10 @@ static void i40e_del_vxlan_port(struct net_device *netdev,
                netdev_warn(netdev, "vxlan port %d was not found, not deleting\n",
                            ntohs(port));
        }
-#endif
 }
+#endif
 
+#if IS_ENABLED(CONFIG_GENEVE)
 /**
  * i40e_add_geneve_port - Get notifications about GENEVE ports that come up
  * @netdev: This physical port's netdev
@@ -8604,7 +8602,6 @@ static void i40e_del_vxlan_port(struct net_device *netdev,
 static void i40e_add_geneve_port(struct net_device *netdev,
                                 sa_family_t sa_family, __be16 port)
 {
-#if IS_ENABLED(CONFIG_GENEVE)
        struct i40e_netdev_priv *np = netdev_priv(netdev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
@@ -8639,7 +8636,6 @@ static void i40e_add_geneve_port(struct net_device *netdev,
        pf->flags |= I40E_FLAG_UDP_FILTER_SYNC;
 
        dev_info(&pf->pdev->dev, "adding geneve port %d\n", ntohs(port));
-#endif
 }
 
 /**
@@ -8651,7 +8647,6 @@ static void i40e_add_geneve_port(struct net_device *netdev,
 static void i40e_del_geneve_port(struct net_device *netdev,
                                 sa_family_t sa_family, __be16 port)
 {
-#if IS_ENABLED(CONFIG_GENEVE)
        struct i40e_netdev_priv *np = netdev_priv(netdev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
@@ -8677,8 +8672,8 @@ static void i40e_del_geneve_port(struct net_device *netdev,
                netdev_warn(netdev, "geneve port %d was not found, not deleting\n",
                            ntohs(port));
        }
-#endif
 }
+#endif
 
 static int i40e_get_phys_port_id(struct net_device *netdev,
                                 struct netdev_phys_item_id *ppid)
index 720516b..47bd8b3 100644 (file)
@@ -2313,8 +2313,8 @@ static void i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
        struct iphdr *this_ip_hdr;
        u32 network_hdr_len;
        u8 l4_hdr = 0;
-       struct udphdr *oudph;
-       struct iphdr *oiph;
+       struct udphdr *oudph = NULL;
+       struct iphdr *oiph = NULL;
        u32 l4_tunnel = 0;
 
        if (skb->encapsulation) {
index b1de7af..3ddf657 100644 (file)
@@ -270,11 +270,17 @@ jme_reset_mac_processor(struct jme_adapter *jme)
 }
 
 static inline void
-jme_clear_pm(struct jme_adapter *jme)
+jme_clear_pm_enable_wol(struct jme_adapter *jme)
 {
        jwrite32(jme, JME_PMCS, PMCS_STMASK | jme->reg_pmcs);
 }
 
+static inline void
+jme_clear_pm_disable_wol(struct jme_adapter *jme)
+{
+       jwrite32(jme, JME_PMCS, PMCS_STMASK);
+}
+
 static int
 jme_reload_eeprom(struct jme_adapter *jme)
 {
@@ -1853,7 +1859,7 @@ jme_open(struct net_device *netdev)
        struct jme_adapter *jme = netdev_priv(netdev);
        int rc;
 
-       jme_clear_pm(jme);
+       jme_clear_pm_disable_wol(jme);
        JME_NAPI_ENABLE(jme);
 
        tasklet_init(&jme->linkch_task, jme_link_change_tasklet,
@@ -1925,11 +1931,11 @@ jme_wait_link(struct jme_adapter *jme)
 static void
 jme_powersave_phy(struct jme_adapter *jme)
 {
-       if (jme->reg_pmcs) {
+       if (jme->reg_pmcs && device_may_wakeup(&jme->pdev->dev)) {
                jme_set_100m_half(jme);
                if (jme->reg_pmcs & (PMCS_LFEN | PMCS_LREN))
                        jme_wait_link(jme);
-               jme_clear_pm(jme);
+               jme_clear_pm_enable_wol(jme);
        } else {
                jme_phy_off(jme);
        }
@@ -2646,9 +2652,6 @@ jme_set_wol(struct net_device *netdev,
        if (wol->wolopts & WAKE_MAGIC)
                jme->reg_pmcs |= PMCS_MFEN;
 
-       jwrite32(jme, JME_PMCS, jme->reg_pmcs);
-       device_set_wakeup_enable(&jme->pdev->dev, !!(jme->reg_pmcs));
-
        return 0;
 }
 
@@ -3172,8 +3175,8 @@ jme_init_one(struct pci_dev *pdev,
        jme->mii_if.mdio_read = jme_mdio_read;
        jme->mii_if.mdio_write = jme_mdio_write;
 
-       jme_clear_pm(jme);
-       device_set_wakeup_enable(&pdev->dev, true);
+       jme_clear_pm_disable_wol(jme);
+       device_init_wakeup(&pdev->dev, true);
 
        jme_set_phyfifo_5level(jme);
        jme->pcirev = pdev->revision;
@@ -3304,7 +3307,7 @@ jme_resume(struct device *dev)
        if (!netif_running(netdev))
                return 0;
 
-       jme_clear_pm(jme);
+       jme_clear_pm_disable_wol(jme);
        jme_phy_on(jme);
        if (test_bit(JME_FLAG_SSET, &jme->flags))
                jme_set_settings(netdev, &jme->old_ecmd);
@@ -3312,13 +3315,14 @@ jme_resume(struct device *dev)
                jme_reset_phy_processor(jme);
        jme_phy_calibration(jme);
        jme_phy_setEA(jme);
-       jme_start_irq(jme);
        netif_device_attach(netdev);
 
        atomic_inc(&jme->link_changing);
 
        jme_reset_link(jme);
 
+       jme_start_irq(jme);
+
        return 0;
 }
 
index a0c0383..5583118 100644 (file)
@@ -762,10 +762,10 @@ txq_put_data_tso(struct net_device *dev, struct tx_queue *txq,
 
        if (length <= 8 && (uintptr_t)data & 0x7) {
                /* Copy unaligned small data fragment to TSO header data area */
-               memcpy(txq->tso_hdrs + txq->tx_curr_desc * TSO_HEADER_SIZE,
+               memcpy(txq->tso_hdrs + tx_index * TSO_HEADER_SIZE,
                       data, length);
                desc->buf_ptr = txq->tso_hdrs_dma
-                       + txq->tx_curr_desc * TSO_HEADER_SIZE;
+                       + tx_index * TSO_HEADER_SIZE;
        } else {
                /* Alignment is okay, map buffer and hand off to hardware */
                txq->tx_desc_mapping[tx_index] = DESC_DMA_MAP_SINGLE;
index fabc8df..b0ae69f 100644 (file)
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/kernel.h>
-#include <linux/netdevice.h>
+#include <linux/clk.h>
+#include <linux/cpu.h>
 #include <linux/etherdevice.h>
-#include <linux/platform_device.h>
-#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
 #include <linux/inetdevice.h>
-#include <linux/mbus.h>
-#include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/if_vlan.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
 #include <linux/io.h>
-#include <net/tso.h>
+#include <linux/kernel.h>
+#include <linux/mbus.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
-#include <linux/of_address.h>
 #include <linux/phy.h>
-#include <linux/clk.h>
-#include <linux/cpu.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tso.h>
 
 /* Registers */
 #define MVNETA_RXQ_CONFIG_REG(q)                (0x1400 + ((q) << 2))
@@ -370,9 +370,16 @@ struct mvneta_port {
        struct net_device *dev;
        struct notifier_block cpu_notifier;
        int rxq_def;
+       /* Protect the access to the percpu interrupt registers,
+        * ensuring that the configuration remains coherent.
+        */
+       spinlock_t lock;
+       bool is_stopped;
 
        /* Core clock */
        struct clk *clk;
+       /* AXI clock */
+       struct clk *clk_bus;
        u8 mcast_count[256];
        u16 tx_ring_size;
        u16 rx_ring_size;
@@ -1036,6 +1043,43 @@ static void mvneta_set_autoneg(struct mvneta_port *pp, int enable)
        }
 }
 
+static void mvneta_percpu_unmask_interrupt(void *arg)
+{
+       struct mvneta_port *pp = arg;
+
+       /* All the queue are unmasked, but actually only the ones
+        * mapped to this CPU will be unmasked
+        */
+       mvreg_write(pp, MVNETA_INTR_NEW_MASK,
+                   MVNETA_RX_INTR_MASK_ALL |
+                   MVNETA_TX_INTR_MASK_ALL |
+                   MVNETA_MISCINTR_INTR_MASK);
+}
+
+static void mvneta_percpu_mask_interrupt(void *arg)
+{
+       struct mvneta_port *pp = arg;
+
+       /* All the queue are masked, but actually only the ones
+        * mapped to this CPU will be masked
+        */
+       mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
+       mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
+       mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
+}
+
+static void mvneta_percpu_clear_intr_cause(void *arg)
+{
+       struct mvneta_port *pp = arg;
+
+       /* All the queue are cleared, but actually only the ones
+        * mapped to this CPU will be cleared
+        */
+       mvreg_write(pp, MVNETA_INTR_NEW_CAUSE, 0);
+       mvreg_write(pp, MVNETA_INTR_MISC_CAUSE, 0);
+       mvreg_write(pp, MVNETA_INTR_OLD_CAUSE, 0);
+}
+
 /* This method sets defaults to the NETA port:
  *     Clears interrupt Cause and Mask registers.
  *     Clears all MAC tables.
@@ -1053,14 +1097,10 @@ static void mvneta_defaults_set(struct mvneta_port *pp)
        int max_cpu = num_present_cpus();
 
        /* Clear all Cause registers */
-       mvreg_write(pp, MVNETA_INTR_NEW_CAUSE, 0);
-       mvreg_write(pp, MVNETA_INTR_OLD_CAUSE, 0);
-       mvreg_write(pp, MVNETA_INTR_MISC_CAUSE, 0);
+       on_each_cpu(mvneta_percpu_clear_intr_cause, pp, true);
 
        /* Mask all interrupts */
-       mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
+       on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
        mvreg_write(pp, MVNETA_INTR_ENABLE, 0);
 
        /* Enable MBUS Retry bit16 */
@@ -2526,34 +2566,9 @@ static int mvneta_setup_txqs(struct mvneta_port *pp)
        return 0;
 }
 
-static void mvneta_percpu_unmask_interrupt(void *arg)
-{
-       struct mvneta_port *pp = arg;
-
-       /* All the queue are unmasked, but actually only the ones
-        * maped to this CPU will be unmasked
-        */
-       mvreg_write(pp, MVNETA_INTR_NEW_MASK,
-                   MVNETA_RX_INTR_MASK_ALL |
-                   MVNETA_TX_INTR_MASK_ALL |
-                   MVNETA_MISCINTR_INTR_MASK);
-}
-
-static void mvneta_percpu_mask_interrupt(void *arg)
-{
-       struct mvneta_port *pp = arg;
-
-       /* All the queue are masked, but actually only the ones
-        * maped to this CPU will be masked
-        */
-       mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
-}
-
 static void mvneta_start_dev(struct mvneta_port *pp)
 {
-       unsigned int cpu;
+       int cpu;
 
        mvneta_max_rx_size_set(pp, pp->pkt_size);
        mvneta_txq_max_tx_size_set(pp, pp->pkt_size);
@@ -2562,16 +2577,15 @@ static void mvneta_start_dev(struct mvneta_port *pp)
        mvneta_port_enable(pp);
 
        /* Enable polling on the port */
-       for_each_present_cpu(cpu) {
+       for_each_online_cpu(cpu) {
                struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu);
 
                napi_enable(&port->napi);
        }
 
        /* Unmask interrupts. It has to be done from each CPU */
-       for_each_online_cpu(cpu)
-               smp_call_function_single(cpu, mvneta_percpu_unmask_interrupt,
-                                        pp, true);
+       on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
+
        mvreg_write(pp, MVNETA_INTR_MISC_MASK,
                    MVNETA_CAUSE_PHY_STATUS_CHANGE |
                    MVNETA_CAUSE_LINK_CHANGE |
@@ -2587,7 +2601,7 @@ static void mvneta_stop_dev(struct mvneta_port *pp)
 
        phy_stop(pp->phy_dev);
 
-       for_each_present_cpu(cpu) {
+       for_each_online_cpu(cpu) {
                struct mvneta_pcpu_port *port = per_cpu_ptr(pp->ports, cpu);
 
                napi_disable(&port->napi);
@@ -2602,13 +2616,10 @@ static void mvneta_stop_dev(struct mvneta_port *pp)
        mvneta_port_disable(pp);
 
        /* Clear all ethernet port interrupts */
-       mvreg_write(pp, MVNETA_INTR_MISC_CAUSE, 0);
-       mvreg_write(pp, MVNETA_INTR_OLD_CAUSE, 0);
+       on_each_cpu(mvneta_percpu_clear_intr_cause, pp, true);
 
        /* Mask all ethernet port interrupts */
-       mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
-       mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
+       on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
 
        mvneta_tx_reset(pp);
        mvneta_rx_reset(pp);
@@ -2845,11 +2856,20 @@ static void mvneta_percpu_disable(void *arg)
        disable_percpu_irq(pp->dev->irq);
 }
 
+/* Electing a CPU must be done in an atomic way: it should be done
+ * after or before the removal/insertion of a CPU and this function is
+ * not reentrant.
+ */
 static void mvneta_percpu_elect(struct mvneta_port *pp)
 {
-       int online_cpu_idx, max_cpu, cpu, i = 0;
+       int elected_cpu = 0, max_cpu, cpu, i = 0;
+
+       /* Use the cpu associated to the rxq when it is online, in all
+        * the other cases, use the cpu 0 which can't be offline.
+        */
+       if (cpu_online(pp->rxq_def))
+               elected_cpu = pp->rxq_def;
 
-       online_cpu_idx = pp->rxq_def % num_online_cpus();
        max_cpu = num_present_cpus();
 
        for_each_online_cpu(cpu) {
@@ -2860,7 +2880,7 @@ static void mvneta_percpu_elect(struct mvneta_port *pp)
                        if ((rxq % max_cpu) == cpu)
                                rxq_map |= MVNETA_CPU_RXQ_ACCESS(rxq);
 
-               if (i == online_cpu_idx)
+               if (cpu == elected_cpu)
                        /* Map the default receive queue queue to the
                         * elected CPU
                         */
@@ -2871,7 +2891,7 @@ static void mvneta_percpu_elect(struct mvneta_port *pp)
                 * the CPU bound to the default RX queue
                 */
                if (txq_number == 1)
-                       txq_map = (i == online_cpu_idx) ?
+                       txq_map = (cpu == elected_cpu) ?
                                MVNETA_CPU_TXQ_ACCESS(1) : 0;
                else
                        txq_map = mvreg_read(pp, MVNETA_CPU_MAP(cpu)) &
@@ -2900,6 +2920,14 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb,
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
+               spin_lock(&pp->lock);
+               /* Configuring the driver for a new CPU while the
+                * driver is stopping is racy, so just avoid it.
+                */
+               if (pp->is_stopped) {
+                       spin_unlock(&pp->lock);
+                       break;
+               }
                netif_tx_stop_all_queues(pp->dev);
 
                /* We have to synchronise on tha napi of each CPU
@@ -2915,9 +2943,7 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb,
                }
 
                /* Mask all ethernet port interrupts */
-               mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
-               mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
-               mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
+               on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
                napi_enable(&port->napi);
 
 
@@ -2932,27 +2958,25 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb,
                 */
                mvneta_percpu_elect(pp);
 
-               /* Unmask all ethernet port interrupts, as this
-                * notifier is called for each CPU then the CPU to
-                * Queue mapping is applied
-                */
-               mvreg_write(pp, MVNETA_INTR_NEW_MASK,
-                       MVNETA_RX_INTR_MASK(rxq_number) |
-                       MVNETA_TX_INTR_MASK(txq_number) |
-                       MVNETA_MISCINTR_INTR_MASK);
+               /* Unmask all ethernet port interrupts */
+               on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
                mvreg_write(pp, MVNETA_INTR_MISC_MASK,
                        MVNETA_CAUSE_PHY_STATUS_CHANGE |
                        MVNETA_CAUSE_LINK_CHANGE |
                        MVNETA_CAUSE_PSC_SYNC_CHANGE);
                netif_tx_start_all_queues(pp->dev);
+               spin_unlock(&pp->lock);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                netif_tx_stop_all_queues(pp->dev);
+               /* Thanks to this lock we are sure that any pending
+                * cpu election is done
+                */
+               spin_lock(&pp->lock);
                /* Mask all ethernet port interrupts */
-               mvreg_write(pp, MVNETA_INTR_NEW_MASK, 0);
-               mvreg_write(pp, MVNETA_INTR_OLD_MASK, 0);
-               mvreg_write(pp, MVNETA_INTR_MISC_MASK, 0);
+               on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
+               spin_unlock(&pp->lock);
 
                napi_synchronize(&port->napi);
                napi_disable(&port->napi);
@@ -2966,12 +2990,11 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb,
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /* Check if a new CPU must be elected now this on is down */
+               spin_lock(&pp->lock);
                mvneta_percpu_elect(pp);
+               spin_unlock(&pp->lock);
                /* Unmask all ethernet port interrupts */
-               mvreg_write(pp, MVNETA_INTR_NEW_MASK,
-                       MVNETA_RX_INTR_MASK(rxq_number) |
-                       MVNETA_TX_INTR_MASK(txq_number) |
-                       MVNETA_MISCINTR_INTR_MASK);
+               on_each_cpu(mvneta_percpu_unmask_interrupt, pp, true);
                mvreg_write(pp, MVNETA_INTR_MISC_MASK,
                        MVNETA_CAUSE_PHY_STATUS_CHANGE |
                        MVNETA_CAUSE_LINK_CHANGE |
@@ -2986,7 +3009,7 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb,
 static int mvneta_open(struct net_device *dev)
 {
        struct mvneta_port *pp = netdev_priv(dev);
-       int ret, cpu;
+       int ret;
 
        pp->pkt_size = MVNETA_RX_PKT_SIZE(pp->dev->mtu);
        pp->frag_size = SKB_DATA_ALIGN(MVNETA_RX_BUF_SIZE(pp->pkt_size)) +
@@ -3008,22 +3031,12 @@ static int mvneta_open(struct net_device *dev)
                goto err_cleanup_txqs;
        }
 
-       /* Even though the documentation says that request_percpu_irq
-        * doesn't enable the interrupts automatically, it actually
-        * does so on the local CPU.
-        *
-        * Make sure it's disabled.
-        */
-       mvneta_percpu_disable(pp);
-
        /* Enable per-CPU interrupt on all the CPU to handle our RX
         * queue interrupts
         */
-       for_each_online_cpu(cpu)
-               smp_call_function_single(cpu, mvneta_percpu_enable,
-                                        pp, true);
-
+       on_each_cpu(mvneta_percpu_enable, pp, true);
 
+       pp->is_stopped = false;
        /* Register a CPU notifier to handle the case where our CPU
         * might be taken offline.
         */
@@ -3055,13 +3068,20 @@ err_cleanup_rxqs:
 static int mvneta_stop(struct net_device *dev)
 {
        struct mvneta_port *pp = netdev_priv(dev);
-       int cpu;
 
+       /* Inform that we are stopping so we don't want to setup the
+        * driver for new CPUs in the notifiers
+        */
+       spin_lock(&pp->lock);
+       pp->is_stopped = true;
        mvneta_stop_dev(pp);
        mvneta_mdio_remove(pp);
        unregister_cpu_notifier(&pp->cpu_notifier);
-       for_each_present_cpu(cpu)
-               smp_call_function_single(cpu, mvneta_percpu_disable, pp, true);
+       /* Now that the notifier are unregistered, we can release le
+        * lock
+        */
+       spin_unlock(&pp->lock);
+       on_each_cpu(mvneta_percpu_disable, pp, true);
        free_percpu_irq(dev->irq, pp->ports);
        mvneta_cleanup_rxqs(pp);
        mvneta_cleanup_txqs(pp);
@@ -3242,26 +3262,25 @@ static void mvneta_ethtool_update_stats(struct mvneta_port *pp)
        const struct mvneta_statistic *s;
        void __iomem *base = pp->base;
        u32 high, low, val;
+       u64 val64;
        int i;
 
        for (i = 0, s = mvneta_statistics;
             s < mvneta_statistics + ARRAY_SIZE(mvneta_statistics);
             s++, i++) {
-               val = 0;
-
                switch (s->type) {
                case T_REG_32:
                        val = readl_relaxed(base + s->offset);
+                       pp->ethtool_stats[i] += val;
                        break;
                case T_REG_64:
                        /* Docs say to read low 32-bit then high */
                        low = readl_relaxed(base + s->offset);
                        high = readl_relaxed(base + s->offset + 4);
-                       val = (u64)high << 32 | low;
+                       val64 = (u64)high << 32 | low;
+                       pp->ethtool_stats[i] += val64;
                        break;
                }
-
-               pp->ethtool_stats[i] += val;
        }
 }
 
@@ -3311,9 +3330,7 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 
        netif_tx_stop_all_queues(pp->dev);
 
-       for_each_online_cpu(cpu)
-               smp_call_function_single(cpu, mvneta_percpu_mask_interrupt,
-                                        pp, true);
+       on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
 
        /* We have to synchronise on the napi of each CPU */
        for_each_online_cpu(cpu) {
@@ -3334,7 +3351,9 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
        mvreg_write(pp, MVNETA_PORT_CONFIG, val);
 
        /* Update the elected CPU matching the new rxq_def */
+       spin_lock(&pp->lock);
        mvneta_percpu_elect(pp);
+       spin_unlock(&pp->lock);
 
        /* We have to synchronise on the napi of each CPU */
        for_each_online_cpu(cpu) {
@@ -3605,7 +3624,9 @@ static int mvneta_probe(struct platform_device *pdev)
 
        pp->indir[0] = rxq_def;
 
-       pp->clk = devm_clk_get(&pdev->dev, NULL);
+       pp->clk = devm_clk_get(&pdev->dev, "core");
+       if (IS_ERR(pp->clk))
+               pp->clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(pp->clk)) {
                err = PTR_ERR(pp->clk);
                goto err_put_phy_node;
@@ -3613,6 +3634,10 @@ static int mvneta_probe(struct platform_device *pdev)
 
        clk_prepare_enable(pp->clk);
 
+       pp->clk_bus = devm_clk_get(&pdev->dev, "bus");
+       if (!IS_ERR(pp->clk_bus))
+               clk_prepare_enable(pp->clk_bus);
+
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        pp->base = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(pp->base)) {
@@ -3724,6 +3749,7 @@ err_free_stats:
 err_free_ports:
        free_percpu(pp->ports);
 err_clk:
+       clk_disable_unprepare(pp->clk_bus);
        clk_disable_unprepare(pp->clk);
 err_put_phy_node:
        of_node_put(phy_node);
@@ -3741,6 +3767,7 @@ static int mvneta_remove(struct platform_device *pdev)
        struct mvneta_port *pp = netdev_priv(dev);
 
        unregister_netdev(dev);
+       clk_disable_unprepare(pp->clk_bus);
        clk_disable_unprepare(pp->clk);
        free_percpu(pp->ports);
        free_percpu(pp->stats);
index a4beccf..c797971 100644 (file)
@@ -3061,7 +3061,7 @@ static int mvpp2_prs_mac_da_accept(struct mvpp2 *priv, int port,
 
                pe = kzalloc(sizeof(*pe), GFP_KERNEL);
                if (!pe)
-                       return -1;
+                       return -ENOMEM;
                mvpp2_prs_tcam_lu_set(pe, MVPP2_PRS_LU_MAC);
                pe->index = tid;
 
@@ -3077,7 +3077,7 @@ static int mvpp2_prs_mac_da_accept(struct mvpp2 *priv, int port,
        if (pmap == 0) {
                if (add) {
                        kfree(pe);
-                       return -1;
+                       return -EINVAL;
                }
                mvpp2_prs_hw_inv(priv, pe->index);
                priv->prs_shadow[pe->index].valid = false;
index 715de8a..c7e9399 100644 (file)
@@ -182,10 +182,17 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
                err = mlx4_reset_slave(dev);
        else
                err = mlx4_reset_master(dev);
-       BUG_ON(err != 0);
 
+       if (!err) {
+               mlx4_err(dev, "device was reset successfully\n");
+       } else {
+               /* EEH could have disabled the PCI channel during reset. That's
+                * recoverable and the PCI error flow will handle it.
+                */
+               if (!pci_channel_offline(dev->persist->pdev))
+                       BUG_ON(1);
+       }
        dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
-       mlx4_err(dev, "device was reset successfully\n");
        mutex_unlock(&persist->device_state_mutex);
 
        /* At that step HW was already reset, now notify clients */
index d48d579..e94ca1c 100644 (file)
@@ -2429,7 +2429,7 @@ err_thread:
        flush_workqueue(priv->mfunc.master.comm_wq);
        destroy_workqueue(priv->mfunc.master.comm_wq);
 err_slaves:
-       while (--i) {
+       while (i--) {
                for (port = 1; port <= MLX4_MAX_PORTS; port++)
                        kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
        }
index 3348e64..a849da9 100644 (file)
@@ -318,7 +318,9 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
        if (timestamp_en)
                cq_context->flags  |= cpu_to_be32(1 << 19);
 
-       cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+       cq_context->logsize_usrpage =
+               cpu_to_be32((ilog2(nent) << 24) |
+                           mlx4_to_hw_uar_index(dev, uar->index));
        cq_context->comp_eqn        = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn;
        cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 
index 038f9ce..1494997 100644 (file)
@@ -236,6 +236,24 @@ static const struct ptp_clock_info mlx4_en_ptp_clock_info = {
        .enable         = mlx4_en_phc_enable,
 };
 
+#define MLX4_EN_WRAP_AROUND_SEC        10ULL
+
+/* This function calculates the max shift that enables the user range
+ * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register.
+ */
+static u32 freq_to_shift(u16 freq)
+{
+       u32 freq_khz = freq * 1000;
+       u64 max_val_cycles = freq_khz * 1000 * MLX4_EN_WRAP_AROUND_SEC;
+       u64 max_val_cycles_rounded = is_power_of_2(max_val_cycles + 1) ?
+               max_val_cycles : roundup_pow_of_two(max_val_cycles) - 1;
+       /* calculate max possible multiplier in order to fit in 64bit */
+       u64 max_mul = div_u64(0xffffffffffffffffULL, max_val_cycles_rounded);
+
+       /* This comes from the reverse of clocksource_khz2mult */
+       return ilog2(div_u64(max_mul * freq_khz, 1000000));
+}
+
 void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
 {
        struct mlx4_dev *dev = mdev->dev;
@@ -254,12 +272,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
        memset(&mdev->cycles, 0, sizeof(mdev->cycles));
        mdev->cycles.read = mlx4_en_read_clock;
        mdev->cycles.mask = CLOCKSOURCE_MASK(48);
-       /* Using shift to make calculation more accurate. Since current HW
-        * clock frequency is 427 MHz, and cycles are given using a 48 bits
-        * register, the biggest shift when calculating using u64, is 14
-        * (max_cycles * multiplier < 2^64)
-        */
-       mdev->cycles.shift = 14;
+       mdev->cycles.shift = freq_to_shift(dev->caps.hca_core_clock);
        mdev->cycles.mult =
                clocksource_khz2mult(1000 * dev->caps.hca_core_clock, mdev->cycles.shift);
        mdev->nominal_c_mult = mdev->cycles.mult;
index 0c7e3f6..21e2c09 100644 (file)
@@ -2245,7 +2245,7 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
        struct mlx4_en_dev *mdev = en_priv->mdev;
        u64 mac_u64 = mlx4_mac_to_u64(mac);
 
-       if (!is_valid_ether_addr(mac))
+       if (is_multicast_ether_addr(mac))
                return -EINVAL;
 
        return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64);
@@ -2344,8 +2344,6 @@ out:
        /* set offloads */
        priv->dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
                                      NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL;
-       priv->dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL;
-       priv->dev->features    |= NETIF_F_GSO_UDP_TUNNEL;
 }
 
 static void mlx4_en_del_vxlan_offloads(struct work_struct *work)
@@ -2356,8 +2354,6 @@ static void mlx4_en_del_vxlan_offloads(struct work_struct *work)
        /* unset offloads */
        priv->dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
                                      NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL);
-       priv->dev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL;
-       priv->dev->features    &= ~NETIF_F_GSO_UDP_TUNNEL;
 
        ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port,
                                  VXLAN_STEER_BY_OUTER_MAC, 0);
@@ -2980,6 +2976,11 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
                priv->rss_hash_fn = ETH_RSS_HASH_TOP;
        }
 
+       if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+               dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL;
+               dev->features    |= NETIF_F_GSO_UDP_TUNNEL;
+       }
+
        mdev->pndev[port] = dev;
        mdev->upper[port] = NULL;
 
index ee99e67..3904b5f 100644 (file)
@@ -238,11 +238,11 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
        stats->collisions = 0;
        stats->rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP);
        stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
-       stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+       stats->rx_over_errors = 0;
        stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
-       stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+       stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
index 12aab5a..02e925d 100644 (file)
@@ -58,7 +58,8 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
        } else {
                context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
        }
-       context->usr_page = cpu_to_be32(mdev->priv_uar.index);
+       context->usr_page = cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
+                                       mdev->priv_uar.index));
        context->local_qpn = cpu_to_be32(qpn);
        context->pri_path.ackto = 1 & 0x07;
        context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
index 4421bf5..e0946ab 100644 (file)
@@ -213,7 +213,9 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
        mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
                                ring->cqn, user_prio, &ring->context);
        if (ring->bf_alloced)
-               ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
+               ring->context.usr_page =
+                       cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
+                                                        ring->bf.uar->index));
 
        err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
                               &ring->qp, &ring->qp_state);
index 4696053..f613977 100644 (file)
@@ -940,9 +940,10 @@ static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
 
        if (!priv->eq_table.uar_map[index]) {
                priv->eq_table.uar_map[index] =
-                       ioremap(pci_resource_start(dev->persist->pdev, 2) +
-                               ((eq->eqn / 4) << PAGE_SHIFT),
-                               PAGE_SIZE);
+                       ioremap(
+                               pci_resource_start(dev->persist->pdev, 2) +
+                               ((eq->eqn / 4) << (dev->uar_page_shift)),
+                               (1 << (dev->uar_page_shift)));
                if (!priv->eq_table.uar_map[index]) {
                        mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
                                 eq->eqn);
index f1b6d21..f8674ae 100644 (file)
@@ -168,6 +168,20 @@ struct mlx4_port_config {
 
 static atomic_t pf_loading = ATOMIC_INIT(0);
 
+static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev,
+                                             struct mlx4_dev_cap *dev_cap)
+{
+       /* The reserved_uars is calculated by system page size unit.
+        * Therefore, adjustment is added when the uar page size is less
+        * than the system page size
+        */
+       dev->caps.reserved_uars =
+               max_t(int,
+                     mlx4_get_num_reserved_uar(dev),
+                     dev_cap->reserved_uars /
+                       (1 << (PAGE_SHIFT - dev->uar_page_shift)));
+}
+
 int mlx4_check_port_params(struct mlx4_dev *dev,
                           enum mlx4_port_type *port_type)
 {
@@ -386,8 +400,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
        dev->caps.reserved_mtts      = dev_cap->reserved_mtts;
        dev->caps.reserved_mrws      = dev_cap->reserved_mrws;
 
-       /* The first 128 UARs are used for EQ doorbells */
-       dev->caps.reserved_uars      = max_t(int, 128, dev_cap->reserved_uars);
        dev->caps.reserved_pds       = dev_cap->reserved_pds;
        dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
                                        dev_cap->reserved_xrcds : 0;
@@ -405,6 +417,15 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
        dev->caps.max_gso_sz         = dev_cap->max_gso_sz;
        dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
+       /* Save uar page shift */
+       if (!mlx4_is_slave(dev)) {
+               /* Virtual PCI function needs to determine UAR page size from
+                * firmware. Only master PCI function can set the uar page size
+                */
+               dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
+               mlx4_set_num_reserved_uars(dev, dev_cap);
+       }
+
        if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) {
                struct mlx4_init_hca_param hca_param;
 
@@ -815,16 +836,25 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
                return -ENODEV;
        }
 
-       /* slave gets uar page size from QUERY_HCA fw command */
-       dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12);
+       /* Set uar_page_shift for VF */
+       dev->uar_page_shift = hca_param.uar_page_sz + 12;
 
-       /* TODO: relax this assumption */
-       if (dev->caps.uar_page_size != PAGE_SIZE) {
-               mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %ld\n",
-                        dev->caps.uar_page_size, PAGE_SIZE);
-               return -ENODEV;
+       /* Make sure the master uar page size is valid */
+       if (dev->uar_page_shift > PAGE_SHIFT) {
+               mlx4_err(dev,
+                        "Invalid configuration: uar page size is larger than system page size\n");
+               return  -ENODEV;
        }
 
+       /* Set reserved_uars based on the uar_page_shift */
+       mlx4_set_num_reserved_uars(dev, &dev_cap);
+
+       /* Although uar page size in FW differs from system page size,
+        * upper software layers (mlx4_ib, mlx4_en and part of mlx4_core)
+        * still works with assumption that uar page size == system page size
+        */
+       dev->caps.uar_page_size = PAGE_SIZE;
+
        memset(&func_cap, 0, sizeof(func_cap));
        err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap);
        if (err) {
@@ -1226,6 +1256,7 @@ err_set_port:
 static int mlx4_mf_bond(struct mlx4_dev *dev)
 {
        int err = 0;
+       int nvfs;
        struct mlx4_slaves_pport slaves_port1;
        struct mlx4_slaves_pport slaves_port2;
        DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX);
@@ -1242,11 +1273,18 @@ static int mlx4_mf_bond(struct mlx4_dev *dev)
                return -EINVAL;
        }
 
+       /* number of virtual functions is number of total functions minus one
+        * physical function for each port.
+        */
+       nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) +
+               bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2;
+
        /* limit on maximum allowed VFs */
-       if ((bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) +
-           bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1)) >
-           MAX_MF_BOND_ALLOWED_SLAVES)
+       if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) {
+               mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n",
+                         nvfs, MAX_MF_BOND_ALLOWED_SLAVES);
                return -EINVAL;
+       }
 
        if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
                mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n");
@@ -2179,8 +2217,12 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
                dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
 
-               init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
-               init_hca.uar_page_sz = PAGE_SHIFT - 12;
+               /* Always set UAR page size 4KB, set log_uar_sz accordingly */
+               init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
+                                     PAGE_SHIFT -
+                                     DEFAULT_UAR_PAGE_SHIFT;
+               init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
+
                init_hca.mw_enabled = 0;
                if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
                    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)
index 609c59d..b3cc3ab 100644 (file)
@@ -269,9 +269,15 @@ EXPORT_SYMBOL_GPL(mlx4_bf_free);
 
 int mlx4_init_uar_table(struct mlx4_dev *dev)
 {
-       if (dev->caps.num_uars <= 128) {
-               mlx4_err(dev, "Only %d UAR pages (need more than 128)\n",
-                        dev->caps.num_uars);
+       int num_reserved_uar = mlx4_get_num_reserved_uar(dev);
+
+       mlx4_dbg(dev, "uar_page_shift = %d", dev->uar_page_shift);
+       mlx4_dbg(dev, "Effective reserved_uars=%d", dev->caps.reserved_uars);
+
+       if (dev->caps.num_uars <= num_reserved_uar) {
+               mlx4_err(
+                       dev, "Only %d UAR pages (need more than %d)\n",
+                       dev->caps.num_uars, num_reserved_uar);
                mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
                return -ENODEV;
        }
index 787b7bb..211c650 100644 (file)
@@ -193,10 +193,10 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
        if (need_mf_bond) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -389,10 +389,10 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -479,10 +479,10 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -588,10 +588,10 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
        if (need_mf_bond) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
@@ -764,10 +764,10 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
        if (dup) {
                if (port == 1) {
                        mutex_lock(&table->mutex);
-                       mutex_lock(&dup_table->mutex);
+                       mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
                } else {
                        mutex_lock(&dup_table->mutex);
-                       mutex_lock(&table->mutex);
+                       mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
                }
        } else {
                mutex_lock(&table->mutex);
index b46dbe2..25ce1b0 100644 (file)
@@ -915,11 +915,13 @@ static int handle_existing_counter(struct mlx4_dev *dev, u8 slave, int port,
 
        spin_lock_irq(mlx4_tlock(dev));
        r = find_res(dev, counter_index, RES_COUNTER);
-       if (!r || r->owner != slave)
+       if (!r || r->owner != slave) {
                ret = -EINVAL;
-       counter = container_of(r, struct res_counter, com);
-       if (!counter->port)
-               counter->port = port;
+       } else {
+               counter = container_of(r, struct res_counter, com);
+               if (!counter->port)
+                       counter->port = port;
+       }
 
        spin_unlock_irq(mlx4_tlock(dev));
        return ret;
index aac071a..5b17532 100644 (file)
@@ -223,6 +223,7 @@ struct mlx5e_pport_stats {
 
 static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
        "packets",
+       "bytes",
        "csum_none",
        "csum_sw",
        "lro_packets",
@@ -232,16 +233,18 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 
 struct mlx5e_rq_stats {
        u64 packets;
+       u64 bytes;
        u64 csum_none;
        u64 csum_sw;
        u64 lro_packets;
        u64 lro_bytes;
        u64 wqe_err;
-#define NUM_RQ_STATS 6
+#define NUM_RQ_STATS 7
 };
 
 static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
        "packets",
+       "bytes",
        "tso_packets",
        "tso_bytes",
        "csum_offload_none",
@@ -253,6 +256,7 @@ static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
 
 struct mlx5e_sq_stats {
        u64 packets;
+       u64 bytes;
        u64 tso_packets;
        u64 tso_bytes;
        u64 csum_offload_none;
@@ -260,7 +264,7 @@ struct mlx5e_sq_stats {
        u64 wake;
        u64 dropped;
        u64 nop;
-#define NUM_SQ_STATS 8
+#define NUM_SQ_STATS 9
 };
 
 struct mlx5e_stats {
@@ -304,14 +308,9 @@ enum {
        MLX5E_RQ_STATE_POST_WQES_ENABLE,
 };
 
-enum cq_flags {
-       MLX5E_CQ_HAS_CQES = 1,
-};
-
 struct mlx5e_cq {
        /* data path - accessed per cqe */
        struct mlx5_cqwq           wq;
-       unsigned long              flags;
 
        /* data path - accessed per napi poll */
        struct napi_struct        *napi;
@@ -452,6 +451,8 @@ enum mlx5e_traffic_types {
        MLX5E_NUM_TT,
 };
 
+#define IS_HASHING_TT(tt) (tt != MLX5E_TT_ANY)
+
 enum mlx5e_rqt_ix {
        MLX5E_INDIRECTION_RQT,
        MLX5E_SINGLE_RQ_RQT,
@@ -618,9 +619,12 @@ void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
 void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
 
 int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix);
+void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv);
 
 int mlx5e_open_locked(struct net_device *netdev);
 int mlx5e_close_locked(struct net_device *netdev);
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+                                  int num_channels);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
                                      struct mlx5e_tx_wqe *wqe, int bf_sz)
index be65435..2018eeb 100644 (file)
@@ -62,10 +62,11 @@ static void mlx5e_timestamp_overflow(struct work_struct *work)
        struct delayed_work *dwork = to_delayed_work(work);
        struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp,
                                                   overflow_work);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_read(&tstamp->clock);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
        schedule_delayed_work(&tstamp->overflow_work, tstamp->overflow_period);
 }
 
@@ -136,10 +137,11 @@ static int mlx5e_ptp_settime(struct ptp_clock_info *ptp,
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
        u64 ns = timespec64_to_ns(ts);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_init(&tstamp->clock, &tstamp->cycles, ns);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
@@ -150,10 +152,11 @@ static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp,
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
        u64 ns;
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        ns = timecounter_read(&tstamp->clock);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        *ts = ns_to_timespec64(ns);
 
@@ -164,10 +167,11 @@ static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 {
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                   ptp_info);
+       unsigned long flags;
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_adjtime(&tstamp->clock, delta);
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
@@ -176,6 +180,7 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 {
        u64 adj;
        u32 diff;
+       unsigned long flags;
        int neg_adj = 0;
        struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp,
                                                  ptp_info);
@@ -189,11 +194,11 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
        adj *= delta;
        diff = div_u64(adj, 1000000000ULL);
 
-       write_lock(&tstamp->lock);
+       write_lock_irqsave(&tstamp->lock, flags);
        timecounter_read(&tstamp->clock);
        tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff :
                                        tstamp->nominal_c_mult + diff;
-       write_unlock(&tstamp->lock);
+       write_unlock_irqrestore(&tstamp->lock, flags);
 
        return 0;
 }
index 65624ac..5abeb00 100644 (file)
@@ -385,6 +385,8 @@ static int mlx5e_set_channels(struct net_device *dev,
                mlx5e_close_locked(dev);
 
        priv->params.num_channels = count;
+       mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+                                     MLX5E_INDIR_RQT_SIZE, count);
 
        if (was_opened)
                err = mlx5e_open_locked(dev);
@@ -703,18 +705,36 @@ static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
        return 0;
 }
 
+static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen)
+{
+       struct mlx5_core_dev *mdev = priv->mdev;
+       void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+       int i;
+
+       MLX5_SET(modify_tir_in, in, bitmask.hash, 1);
+       mlx5e_build_tir_ctx_hash(tirc, priv);
+
+       for (i = 0; i < MLX5E_NUM_TT; i++)
+               if (IS_HASHING_TT(i))
+                       mlx5_core_modify_tir(mdev, priv->tirn[i], in, inlen);
+}
+
 static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
                          const u8 *key, const u8 hfunc)
 {
        struct mlx5e_priv *priv = netdev_priv(dev);
-       bool close_open;
-       int err = 0;
+       int inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+       void *in;
 
        if ((hfunc != ETH_RSS_HASH_NO_CHANGE) &&
            (hfunc != ETH_RSS_HASH_XOR) &&
            (hfunc != ETH_RSS_HASH_TOP))
                return -EINVAL;
 
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
        mutex_lock(&priv->state_lock);
 
        if (indir) {
@@ -723,11 +743,6 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
                mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT);
        }
 
-       close_open = (key || (hfunc != ETH_RSS_HASH_NO_CHANGE)) &&
-                    test_bit(MLX5E_STATE_OPENED, &priv->state);
-       if (close_open)
-               mlx5e_close_locked(dev);
-
        if (key)
                memcpy(priv->params.toeplitz_hash_key, key,
                       sizeof(priv->params.toeplitz_hash_key));
@@ -735,12 +750,13 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
        if (hfunc != ETH_RSS_HASH_NO_CHANGE)
                priv->params.rss_hfunc = hfunc;
 
-       if (close_open)
-               err = mlx5e_open_locked(priv->netdev);
+       mlx5e_modify_tirs_hash(priv, in, inlen);
 
        mutex_unlock(&priv->state_lock);
 
-       return err;
+       kvfree(in);
+
+       return 0;
 }
 
 static int mlx5e_get_rxnfc(struct net_device *netdev,
index 6a3e430..402994b 100644 (file)
@@ -141,6 +141,10 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
                return;
 
        /* Collect firts the SW counters and then HW for consistency */
+       s->rx_packets           = 0;
+       s->rx_bytes             = 0;
+       s->tx_packets           = 0;
+       s->tx_bytes             = 0;
        s->tso_packets          = 0;
        s->tso_bytes            = 0;
        s->tx_queue_stopped     = 0;
@@ -155,6 +159,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
        for (i = 0; i < priv->params.num_channels; i++) {
                rq_stats = &priv->channel[i]->rq.stats;
 
+               s->rx_packets   += rq_stats->packets;
+               s->rx_bytes     += rq_stats->bytes;
                s->lro_packets  += rq_stats->lro_packets;
                s->lro_bytes    += rq_stats->lro_bytes;
                s->rx_csum_none += rq_stats->csum_none;
@@ -164,6 +170,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
                for (j = 0; j < priv->params.num_tc; j++) {
                        sq_stats = &priv->channel[i]->sq[j].stats;
 
+                       s->tx_packets           += sq_stats->packets;
+                       s->tx_bytes             += sq_stats->bytes;
                        s->tso_packets          += sq_stats->tso_packets;
                        s->tso_bytes            += sq_stats->tso_bytes;
                        s->tx_queue_stopped     += sq_stats->stopped;
@@ -225,23 +233,6 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
        s->tx_broadcast_bytes   =
                MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
 
-       s->rx_packets =
-               s->rx_unicast_packets +
-               s->rx_multicast_packets +
-               s->rx_broadcast_packets;
-       s->rx_bytes =
-               s->rx_unicast_bytes +
-               s->rx_multicast_bytes +
-               s->rx_broadcast_bytes;
-       s->tx_packets =
-               s->tx_unicast_packets +
-               s->tx_multicast_packets +
-               s->tx_broadcast_packets;
-       s->tx_bytes =
-               s->tx_unicast_bytes +
-               s->tx_multicast_bytes +
-               s->tx_broadcast_bytes;
-
        /* Update calculated offload counters */
        s->tx_csum_offload = s->tx_packets - tx_offload_none;
        s->rx_csum_good    = s->rx_packets - s->rx_csum_none -
@@ -1199,7 +1190,6 @@ static void mlx5e_fill_indir_rqt_rqns(struct mlx5e_priv *priv, void *rqtc)
                        ix = mlx5e_bits_invert(i, MLX5E_LOG_INDIR_RQT_SIZE);
 
                ix = priv->params.indirection_rqt[ix];
-               ix = ix % priv->params.num_channels;
                MLX5_SET(rqtc, rqtc, rq_num[i],
                         test_bit(MLX5E_STATE_OPENED, &priv->state) ?
                         priv->channel[ix]->rq.rqn :
@@ -1317,7 +1307,22 @@ static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv)
                              lro_timer_supported_periods[2]));
 }
 
-static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
+void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv)
+{
+       MLX5_SET(tirc, tirc, rx_hash_fn,
+                mlx5e_rx_hash_fn(priv->params.rss_hfunc));
+       if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
+               void *rss_key = MLX5_ADDR_OF(tirc, tirc,
+                                            rx_hash_toeplitz_key);
+               size_t len = MLX5_FLD_SZ_BYTES(tirc,
+                                              rx_hash_toeplitz_key);
+
+               MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+               memcpy(rss_key, priv->params.toeplitz_hash_key, len);
+       }
+}
+
+static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
 {
        struct mlx5_core_dev *mdev = priv->mdev;
 
@@ -1325,6 +1330,7 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
        void *tirc;
        int inlen;
        int err;
+       int tt;
 
        inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
        in = mlx5_vzalloc(inlen);
@@ -1336,7 +1342,11 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
 
        mlx5e_build_tir_ctx_lro(tirc, priv);
 
-       err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+       for (tt = 0; tt < MLX5E_NUM_TT; tt++) {
+               err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+               if (err)
+                       break;
+       }
 
        kvfree(in);
 
@@ -1672,17 +1682,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
        default:
                MLX5_SET(tirc, tirc, indirect_table,
                         priv->rqtn[MLX5E_INDIRECTION_RQT]);
-               MLX5_SET(tirc, tirc, rx_hash_fn,
-                        mlx5e_rx_hash_fn(priv->params.rss_hfunc));
-               if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
-                       void *rss_key = MLX5_ADDR_OF(tirc, tirc,
-                                                    rx_hash_toeplitz_key);
-                       size_t len = MLX5_FLD_SZ_BYTES(tirc,
-                                                      rx_hash_toeplitz_key);
-
-                       MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
-                       memcpy(rss_key, priv->params.toeplitz_hash_key, len);
-               }
+               mlx5e_build_tir_ctx_hash(tirc, priv);
                break;
        }
 
@@ -1885,8 +1885,10 @@ static int mlx5e_set_features(struct net_device *netdev,
                        mlx5e_close_locked(priv->netdev);
 
                priv->params.lro_en = !!(features & NETIF_F_LRO);
-               mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP);
-               mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP);
+               err = mlx5e_modify_tirs_lro(priv);
+               if (err)
+                       mlx5_core_warn(priv->mdev, "lro modify failed, %d\n",
+                                      err);
 
                if (was_opened)
                        err = mlx5e_open_locked(priv->netdev);
@@ -2024,18 +2026,37 @@ static int mlx5e_get_vf_stats(struct net_device *dev,
                                            vf_stats);
 }
 
-static struct net_device_ops mlx5e_netdev_ops = {
+static const struct net_device_ops mlx5e_netdev_ops_basic = {
+       .ndo_open                = mlx5e_open,
+       .ndo_stop                = mlx5e_close,
+       .ndo_start_xmit          = mlx5e_xmit,
+       .ndo_get_stats64         = mlx5e_get_stats,
+       .ndo_set_rx_mode         = mlx5e_set_rx_mode,
+       .ndo_set_mac_address     = mlx5e_set_mac,
+       .ndo_vlan_rx_add_vid     = mlx5e_vlan_rx_add_vid,
+       .ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
+       .ndo_set_features        = mlx5e_set_features,
+       .ndo_change_mtu          = mlx5e_change_mtu,
+       .ndo_do_ioctl            = mlx5e_ioctl,
+};
+
+static const struct net_device_ops mlx5e_netdev_ops_sriov = {
        .ndo_open                = mlx5e_open,
        .ndo_stop                = mlx5e_close,
        .ndo_start_xmit          = mlx5e_xmit,
        .ndo_get_stats64         = mlx5e_get_stats,
        .ndo_set_rx_mode         = mlx5e_set_rx_mode,
        .ndo_set_mac_address     = mlx5e_set_mac,
-       .ndo_vlan_rx_add_vid     = mlx5e_vlan_rx_add_vid,
-       .ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
+       .ndo_vlan_rx_add_vid     = mlx5e_vlan_rx_add_vid,
+       .ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
        .ndo_set_features        = mlx5e_set_features,
-       .ndo_change_mtu          = mlx5e_change_mtu,
-       .ndo_do_ioctl            = mlx5e_ioctl,
+       .ndo_change_mtu          = mlx5e_change_mtu,
+       .ndo_do_ioctl            = mlx5e_ioctl,
+       .ndo_set_vf_mac          = mlx5e_set_vf_mac,
+       .ndo_set_vf_vlan         = mlx5e_set_vf_vlan,
+       .ndo_get_vf_config       = mlx5e_get_vf_config,
+       .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
+       .ndo_get_vf_stats        = mlx5e_get_vf_stats,
 };
 
 static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
@@ -2070,12 +2091,20 @@ u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev)
               2 /*sizeof(mlx5e_tx_wqe.inline_hdr_start)*/;
 }
 
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+                                  int num_channels)
+{
+       int i;
+
+       for (i = 0; i < len; i++)
+               indirection_rqt[i] = i % num_channels;
+}
+
 static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
                                    struct net_device *netdev,
                                    int num_channels)
 {
        struct mlx5e_priv *priv = netdev_priv(netdev);
-       int i;
 
        priv->params.log_sq_size           =
                MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
@@ -2099,8 +2128,8 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
        netdev_rss_key_fill(priv->params.toeplitz_hash_key,
                            sizeof(priv->params.toeplitz_hash_key));
 
-       for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++)
-               priv->params.indirection_rqt[i] = i % num_channels;
+       mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
+                                     MLX5E_INDIR_RQT_SIZE, num_channels);
 
        priv->params.lro_wqe_sz            =
                MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
@@ -2137,18 +2166,11 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 
        SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
 
-       if (priv->params.num_tc > 1)
-               mlx5e_netdev_ops.ndo_select_queue = mlx5e_select_queue;
-
-       if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
-               mlx5e_netdev_ops.ndo_set_vf_mac = mlx5e_set_vf_mac;
-               mlx5e_netdev_ops.ndo_set_vf_vlan = mlx5e_set_vf_vlan;
-               mlx5e_netdev_ops.ndo_get_vf_config = mlx5e_get_vf_config;
-               mlx5e_netdev_ops.ndo_set_vf_link_state = mlx5e_set_vf_link_state;
-               mlx5e_netdev_ops.ndo_get_vf_stats = mlx5e_get_vf_stats;
-       }
+       if (MLX5_CAP_GEN(mdev, vport_group_manager))
+               netdev->netdev_ops = &mlx5e_netdev_ops_sriov;
+       else
+               netdev->netdev_ops = &mlx5e_netdev_ops_basic;
 
-       netdev->netdev_ops        = &mlx5e_netdev_ops;
        netdev->watchdog_timeo    = 15 * HZ;
 
        netdev->ethtool_ops       = &mlx5e_ethtool_ops;
index dd959d9..59658b9 100644 (file)
@@ -230,10 +230,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
        struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
        int work_done;
 
-       /* avoid accessing cq (dma coherent memory) if not needed */
-       if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags))
-               return 0;
-
        for (work_done = 0; work_done < budget; work_done++) {
                struct mlx5e_rx_wqe *wqe;
                struct mlx5_cqe64 *cqe;
@@ -267,6 +263,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 
                mlx5e_build_rx_skb(cqe, rq, skb);
                rq->stats.packets++;
+               rq->stats.bytes += be32_to_cpu(cqe->byte_cnt);
                napi_gro_receive(cq->napi, skb);
 
 wq_ll_pop:
@@ -279,8 +276,5 @@ wq_ll_pop:
        /* ensure cq space is freed before enabling more cqes */
        wmb();
 
-       if (work_done == budget)
-               set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
-
        return work_done;
 }
index 2c3fba0..bb4eeeb 100644 (file)
@@ -179,6 +179,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
        unsigned int skb_len = skb->len;
        u8  opcode = MLX5_OPCODE_SEND;
        dma_addr_t dma_addr = 0;
+       unsigned int num_bytes;
        bool bf = false;
        u16 headlen;
        u16 ds_cnt;
@@ -204,8 +205,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
                opcode       = MLX5_OPCODE_LSO;
                ihs          = skb_transport_offset(skb) + tcp_hdrlen(skb);
                payload_len  = skb->len - ihs;
-               wi->num_bytes = skb->len +
-                               (skb_shinfo(skb)->gso_segs - 1) * ihs;
+               num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
                sq->stats.tso_packets++;
                sq->stats.tso_bytes += payload_len;
        } else {
@@ -213,9 +213,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
                     !skb->xmit_more &&
                     !skb_shinfo(skb)->nr_frags;
                ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
-               wi->num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+               num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
        }
 
+       wi->num_bytes = num_bytes;
+
        if (skb_vlan_tag_present(skb)) {
                mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, &skb_data,
                                  &skb_len);
@@ -307,6 +309,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
        sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
 
        sq->stats.packets++;
+       sq->stats.bytes += num_bytes;
        return NETDEV_TX_OK;
 
 dma_unmap_wqe_err:
@@ -335,10 +338,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
        u16 sqcc;
        int i;
 
-       /* avoid accessing cq (dma coherent memory) if not needed */
-       if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags))
-               return false;
-
        sq = container_of(cq, struct mlx5e_sq, cq);
 
        npkts = 0;
@@ -422,10 +421,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq)
                                netif_tx_wake_queue(sq->txq);
                                sq->stats.wake++;
        }
-       if (i == MLX5E_TX_CQ_POLL_BUDGET) {
-               set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
-               return true;
-       }
 
-       return false;
+       return (i == MLX5E_TX_CQ_POLL_BUDGET);
 }
index 4ac8d71..66d51a7 100644 (file)
@@ -88,7 +88,6 @@ void mlx5e_completion_event(struct mlx5_core_cq *mcq)
 {
        struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
 
-       set_bit(MLX5E_CQ_HAS_CQES, &cq->flags);
        set_bit(MLX5E_CHANNEL_NAPI_SCHED, &cq->channel->flags);
        barrier();
        napi_schedule(cq->napi);
index c071077..7992c55 100644 (file)
@@ -215,7 +215,7 @@ mlxsw_pci_queue_elem_info_producer_get(struct mlxsw_pci_queue *q)
 {
        int index = q->producer_counter & (q->count - 1);
 
-       if ((q->producer_counter - q->consumer_counter) == q->count)
+       if ((u16) (q->producer_counter - q->consumer_counter) == q->count)
                return NULL;
        return mlxsw_pci_queue_elem_info_get(q, index);
 }
index 726f543..ae65b99 100644 (file)
@@ -49,7 +49,7 @@
 #define MLXSW_PORT_MID                 0xd000
 
 #define MLXSW_PORT_MAX_PHY_PORTS       0x40
-#define MLXSW_PORT_MAX_PORTS           MLXSW_PORT_MAX_PHY_PORTS
+#define MLXSW_PORT_MAX_PORTS           (MLXSW_PORT_MAX_PHY_PORTS + 1)
 
 #define MLXSW_PORT_DEVID_BITS_OFFSET   10
 #define MLXSW_PORT_PHY_BITS_OFFSET     4
index 0c52372..ffe4c03 100644 (file)
@@ -873,6 +873,62 @@ static inline void mlxsw_reg_spvm_pack(char *payload, u8 local_port,
        }
 }
 
+/* SPAFT - Switch Port Acceptable Frame Types
+ * ------------------------------------------
+ * The Switch Port Acceptable Frame Types register configures the frame
+ * admittance of the port.
+ */
+#define MLXSW_REG_SPAFT_ID 0x2010
+#define MLXSW_REG_SPAFT_LEN 0x08
+
+static const struct mlxsw_reg_info mlxsw_reg_spaft = {
+       .id = MLXSW_REG_SPAFT_ID,
+       .len = MLXSW_REG_SPAFT_LEN,
+};
+
+/* reg_spaft_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is not supported (all tag types are allowed).
+ */
+MLXSW_ITEM32(reg, spaft, local_port, 0x00, 16, 8);
+
+/* reg_spaft_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, sub_port, 0x00, 8, 8);
+
+/* reg_spaft_allow_untagged
+ * When set, untagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_untagged, 0x04, 31, 1);
+
+/* reg_spaft_allow_prio_tagged
+ * When set, priority tagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_prio_tagged, 0x04, 30, 1);
+
+/* reg_spaft_allow_tagged
+ * When set, tagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_tagged, 0x04, 29, 1);
+
+static inline void mlxsw_reg_spaft_pack(char *payload, u8 local_port,
+                                       bool allow_untagged)
+{
+       MLXSW_REG_ZERO(spaft, payload);
+       mlxsw_reg_spaft_local_port_set(payload, local_port);
+       mlxsw_reg_spaft_allow_untagged_set(payload, allow_untagged);
+       mlxsw_reg_spaft_allow_prio_tagged_set(payload, true);
+       mlxsw_reg_spaft_allow_tagged_set(payload, true);
+}
+
 /* SFGC - Switch Flooding Group Configuration
  * ------------------------------------------
  * The following register controls the association of flooding tables and MIDs
@@ -1044,6 +1100,92 @@ static inline void mlxsw_reg_sftr_pack(char *payload,
        mlxsw_reg_sftr_port_mask_set(payload, port, 1);
 }
 
+/* SFDF - Switch Filtering DB Flush
+ * --------------------------------
+ * The switch filtering DB flush register is used to flush the FDB.
+ * Note that FDB notifications are flushed as well.
+ */
+#define MLXSW_REG_SFDF_ID 0x2013
+#define MLXSW_REG_SFDF_LEN 0x14
+
+static const struct mlxsw_reg_info mlxsw_reg_sfdf = {
+       .id = MLXSW_REG_SFDF_ID,
+       .len = MLXSW_REG_SFDF_LEN,
+};
+
+/* reg_sfdf_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfdf, swid, 0x00, 24, 8);
+
+enum mlxsw_reg_sfdf_flush_type {
+       MLXSW_REG_SFDF_FLUSH_PER_SWID,
+       MLXSW_REG_SFDF_FLUSH_PER_FID,
+       MLXSW_REG_SFDF_FLUSH_PER_PORT,
+       MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID,
+       MLXSW_REG_SFDF_FLUSH_PER_LAG,
+       MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID,
+};
+
+/* reg_sfdf_flush_type
+ * Flush type.
+ * 0 - All SWID dynamic entries are flushed.
+ * 1 - All FID dynamic entries are flushed.
+ * 2 - All dynamic entries pointing to port are flushed.
+ * 3 - All FID dynamic entries pointing to port are flushed.
+ * 4 - All dynamic entries pointing to LAG are flushed.
+ * 5 - All FID dynamic entries pointing to LAG are flushed.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, flush_type, 0x04, 28, 4);
+
+/* reg_sfdf_flush_static
+ * Static.
+ * 0 - Flush only dynamic entries.
+ * 1 - Flush both dynamic and static entries.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, flush_static, 0x04, 24, 1);
+
+static inline void mlxsw_reg_sfdf_pack(char *payload,
+                                      enum mlxsw_reg_sfdf_flush_type type)
+{
+       MLXSW_REG_ZERO(sfdf, payload);
+       mlxsw_reg_sfdf_flush_type_set(payload, type);
+       mlxsw_reg_sfdf_flush_static_set(payload, true);
+}
+
+/* reg_sfdf_fid
+ * FID to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, fid, 0x0C, 0, 16);
+
+/* reg_sfdf_system_port
+ * Port to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, system_port, 0x0C, 0, 16);
+
+/* reg_sfdf_port_fid_system_port
+ * Port to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, port_fid_system_port, 0x08, 0, 16);
+
+/* reg_sfdf_lag_id
+ * LAG ID to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_id, 0x0C, 0, 10);
+
+/* reg_sfdf_lag_fid_lag_id
+ * LAG ID to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_fid_lag_id, 0x08, 0, 10);
+
 /* SLDR - Switch LAG Descriptor Register
  * -----------------------------------------
  * The switch LAG descriptor register is populated by LAG descriptors.
@@ -1701,20 +1843,20 @@ MLXSW_ITEM32(reg, pmlp, width, 0x00, 0, 8);
  * Module number.
  * Access: RW
  */
-MLXSW_ITEM32_INDEXED(reg, pmlp, module, 0x04, 0, 8, 0x04, 0, false);
+MLXSW_ITEM32_INDEXED(reg, pmlp, module, 0x04, 0, 8, 0x04, 0x00, false);
 
 /* reg_pmlp_tx_lane
  * Tx Lane. When rxtx field is cleared, this field is used for Rx as well.
  * Access: RW
  */
-MLXSW_ITEM32_INDEXED(reg, pmlp, tx_lane, 0x04, 16, 2, 0x04, 16, false);
+MLXSW_ITEM32_INDEXED(reg, pmlp, tx_lane, 0x04, 16, 2, 0x04, 0x00, false);
 
 /* reg_pmlp_rx_lane
  * Rx Lane. When rxtx field is cleared, this field is ignored and Rx lane is
  * equal to Tx lane.
  * Access: RW
  */
-MLXSW_ITEM32_INDEXED(reg, pmlp, rx_lane, 0x04, 24, 2, 0x04, 24, false);
+MLXSW_ITEM32_INDEXED(reg, pmlp, rx_lane, 0x04, 24, 2, 0x04, 0x00, false);
 
 static inline void mlxsw_reg_pmlp_pack(char *payload, u8 local_port)
 {
@@ -3117,10 +3259,14 @@ static inline const char *mlxsw_reg_id_str(u16 reg_id)
                return "SPVID";
        case MLXSW_REG_SPVM_ID:
                return "SPVM";
+       case MLXSW_REG_SPAFT_ID:
+               return "SPAFT";
        case MLXSW_REG_SFGC_ID:
                return "SFGC";
        case MLXSW_REG_SFTR_ID:
                return "SFTR";
+       case MLXSW_REG_SFDF_ID:
+               return "SFDF";
        case MLXSW_REG_SLDR_ID:
                return "SLDR";
        case MLXSW_REG_SLCR_ID:
index ce6845d..a94daa8 100644 (file)
@@ -1979,6 +1979,115 @@ static struct mlxsw_driver mlxsw_sp_driver = {
        .profile                = &mlxsw_sp_config_profile,
 };
 
+static int
+mlxsw_sp_port_fdb_flush_by_port(const struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       char sfdf_pl[MLXSW_REG_SFDF_LEN];
+
+       mlxsw_reg_sfdf_pack(sfdf_pl, MLXSW_REG_SFDF_FLUSH_PER_PORT);
+       mlxsw_reg_sfdf_system_port_set(sfdf_pl, mlxsw_sp_port->local_port);
+
+       return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl);
+}
+
+static int
+mlxsw_sp_port_fdb_flush_by_port_fid(const struct mlxsw_sp_port *mlxsw_sp_port,
+                                   u16 fid)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       char sfdf_pl[MLXSW_REG_SFDF_LEN];
+
+       mlxsw_reg_sfdf_pack(sfdf_pl, MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID);
+       mlxsw_reg_sfdf_fid_set(sfdf_pl, fid);
+       mlxsw_reg_sfdf_port_fid_system_port_set(sfdf_pl,
+                                               mlxsw_sp_port->local_port);
+
+       return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl);
+}
+
+static int
+mlxsw_sp_port_fdb_flush_by_lag_id(const struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       char sfdf_pl[MLXSW_REG_SFDF_LEN];
+
+       mlxsw_reg_sfdf_pack(sfdf_pl, MLXSW_REG_SFDF_FLUSH_PER_LAG);
+       mlxsw_reg_sfdf_lag_id_set(sfdf_pl, mlxsw_sp_port->lag_id);
+
+       return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl);
+}
+
+static int
+mlxsw_sp_port_fdb_flush_by_lag_id_fid(const struct mlxsw_sp_port *mlxsw_sp_port,
+                                     u16 fid)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       char sfdf_pl[MLXSW_REG_SFDF_LEN];
+
+       mlxsw_reg_sfdf_pack(sfdf_pl, MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID);
+       mlxsw_reg_sfdf_fid_set(sfdf_pl, fid);
+       mlxsw_reg_sfdf_lag_fid_lag_id_set(sfdf_pl, mlxsw_sp_port->lag_id);
+
+       return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl);
+}
+
+static int
+__mlxsw_sp_port_fdb_flush(const struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       int err, last_err = 0;
+       u16 vid;
+
+       for (vid = 1; vid < VLAN_N_VID - 1; vid++) {
+               err = mlxsw_sp_port_fdb_flush_by_port_fid(mlxsw_sp_port, vid);
+               if (err)
+                       last_err = err;
+       }
+
+       return last_err;
+}
+
+static int
+__mlxsw_sp_port_fdb_flush_lagged(const struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       int err, last_err = 0;
+       u16 vid;
+
+       for (vid = 1; vid < VLAN_N_VID - 1; vid++) {
+               err = mlxsw_sp_port_fdb_flush_by_lag_id_fid(mlxsw_sp_port, vid);
+               if (err)
+                       last_err = err;
+       }
+
+       return last_err;
+}
+
+static int mlxsw_sp_port_fdb_flush(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       if (!list_empty(&mlxsw_sp_port->vports_list))
+               if (mlxsw_sp_port->lagged)
+                       return __mlxsw_sp_port_fdb_flush_lagged(mlxsw_sp_port);
+               else
+                       return __mlxsw_sp_port_fdb_flush(mlxsw_sp_port);
+       else
+               if (mlxsw_sp_port->lagged)
+                       return mlxsw_sp_port_fdb_flush_by_lag_id(mlxsw_sp_port);
+               else
+                       return mlxsw_sp_port_fdb_flush_by_port(mlxsw_sp_port);
+}
+
+static int mlxsw_sp_vport_fdb_flush(struct mlxsw_sp_port *mlxsw_sp_vport)
+{
+       u16 vfid = mlxsw_sp_vport_vfid_get(mlxsw_sp_vport);
+       u16 fid = mlxsw_sp_vfid_to_fid(vfid);
+
+       if (mlxsw_sp_vport->lagged)
+               return mlxsw_sp_port_fdb_flush_by_lag_id_fid(mlxsw_sp_vport,
+                                                            fid);
+       else
+               return mlxsw_sp_port_fdb_flush_by_port_fid(mlxsw_sp_vport, fid);
+}
+
 static bool mlxsw_sp_port_dev_check(const struct net_device *dev)
 {
        return dev->netdev_ops == &mlxsw_sp_port_netdev_ops;
@@ -2006,10 +2115,16 @@ static int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port)
        return 0;
 }
 
-static int mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port)
+static int mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
+                                     bool flush_fdb)
 {
        struct net_device *dev = mlxsw_sp_port->dev;
 
+       if (flush_fdb && mlxsw_sp_port_fdb_flush(mlxsw_sp_port))
+               netdev_err(mlxsw_sp_port->dev, "Failed to flush FDB\n");
+
+       mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1);
+
        mlxsw_sp_port->learning = 0;
        mlxsw_sp_port->learning_sync = 0;
        mlxsw_sp_port->uc_flood = 0;
@@ -2200,10 +2315,15 @@ err_col_port_enable:
        return err;
 }
 
+static int mlxsw_sp_vport_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_vport,
+                                      struct net_device *br_dev,
+                                      bool flush_fdb);
+
 static int mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port,
                                   struct net_device *lag_dev)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       struct mlxsw_sp_port *mlxsw_sp_vport;
        struct mlxsw_sp_upper *lag;
        u16 lag_id = mlxsw_sp_port->lag_id;
        int err;
@@ -2220,7 +2340,30 @@ static int mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port,
        if (err)
                return err;
 
+       /* In case we leave a LAG device that has bridges built on top,
+        * then their teardown sequence is never issued and we need to
+        * invoke the necessary cleanup routines ourselves.
+        */
+       list_for_each_entry(mlxsw_sp_vport, &mlxsw_sp_port->vports_list,
+                           vport.list) {
+               struct net_device *br_dev;
+
+               if (!mlxsw_sp_vport->bridged)
+                       continue;
+
+               br_dev = mlxsw_sp_vport_br_get(mlxsw_sp_vport);
+               mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport, br_dev, false);
+       }
+
+       if (mlxsw_sp_port->bridged) {
+               mlxsw_sp_port_active_vlans_del(mlxsw_sp_port);
+               mlxsw_sp_port_bridge_leave(mlxsw_sp_port, false);
+               mlxsw_sp_master_bridge_dec(mlxsw_sp, NULL);
+       }
+
        if (lag->ref_count == 1) {
+               if (mlxsw_sp_port_fdb_flush_by_lag_id(mlxsw_sp_port))
+                       netdev_err(mlxsw_sp_port->dev, "Failed to flush FDB\n");
                err = mlxsw_sp_lag_destroy(mlxsw_sp, lag_id);
                if (err)
                        return err;
@@ -2272,9 +2415,6 @@ static int mlxsw_sp_port_lag_changed(struct mlxsw_sp_port *mlxsw_sp_port,
        return mlxsw_sp_port_lag_tx_en_set(mlxsw_sp_port, info->tx_enabled);
 }
 
-static int mlxsw_sp_vport_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_vport,
-                                      struct net_device *br_dev);
-
 static int mlxsw_sp_port_vlan_link(struct mlxsw_sp_port *mlxsw_sp_port,
                                   struct net_device *vlan_dev)
 {
@@ -2312,7 +2452,7 @@ static int mlxsw_sp_port_vlan_unlink(struct mlxsw_sp_port *mlxsw_sp_port,
                struct net_device *br_dev;
 
                br_dev = mlxsw_sp_vport_br_get(mlxsw_sp_vport);
-               mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport, br_dev);
+               mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport, br_dev, true);
        }
 
        mlxsw_sp_vport->dev = mlxsw_sp_port->dev;
@@ -2374,7 +2514,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
                                }
                                mlxsw_sp_master_bridge_inc(mlxsw_sp, upper_dev);
                        } else {
-                               err = mlxsw_sp_port_bridge_leave(mlxsw_sp_port);
+                               err = mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
+                                                                true);
                                mlxsw_sp_master_bridge_dec(mlxsw_sp, upper_dev);
                                if (err) {
                                        netdev_err(dev, "Failed to leave bridge\n");
@@ -2541,7 +2682,8 @@ static void mlxsw_sp_br_vfid_destroy(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_vport_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_vport,
-                                      struct net_device *br_dev)
+                                      struct net_device *br_dev,
+                                      bool flush_fdb)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_vport->mlxsw_sp;
        u16 vid = mlxsw_sp_vport_vid_get(mlxsw_sp_vport);
@@ -2604,6 +2746,16 @@ static int mlxsw_sp_vport_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_vport,
                goto err_vport_flood_set;
        }
 
+       err = mlxsw_sp_port_stp_state_set(mlxsw_sp_vport, vid,
+                                         MLXSW_REG_SPMS_STATE_FORWARDING);
+       if (err) {
+               netdev_err(dev, "Failed to set STP state\n");
+               goto err_port_stp_state_set;
+       }
+
+       if (flush_fdb && mlxsw_sp_vport_fdb_flush(mlxsw_sp_vport))
+               netdev_err(dev, "Failed to flush FDB\n");
+
        /* Switch between the vFIDs and destroy the old one if needed. */
        new_vfid->nr_vports++;
        mlxsw_sp_vport->vport.vfid = new_vfid;
@@ -2618,6 +2770,7 @@ static int mlxsw_sp_vport_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_vport,
 
        return 0;
 
+err_port_stp_state_set:
 err_vport_flood_set:
 err_port_vid_learning_set:
 err_port_vid_to_fid_validate:
@@ -2777,7 +2930,7 @@ static int mlxsw_sp_netdevice_vport_event(struct net_device *dev,
                        if (!mlxsw_sp_vport)
                                return NOTIFY_DONE;
                        err = mlxsw_sp_vport_bridge_leave(mlxsw_sp_vport,
-                                                         upper_dev);
+                                                         upper_dev, true);
                        if (err) {
                                netdev_err(dev, "Failed to leave bridge\n");
                                return NOTIFY_BAD;
index a23dc61..3b89ed2 100644 (file)
@@ -120,7 +120,6 @@ struct mlxsw_sp {
        } fdb_notify;
 #define MLXSW_SP_DEFAULT_AGEING_TIME 300
        u32 ageing_time;
-       struct mutex fdb_lock;  /* Make sure FDB sessions are atomic. */
        struct mlxsw_sp_upper master_bridge;
        struct mlxsw_sp_upper lags[MLXSW_SP_LAG_MAX];
 };
@@ -254,5 +253,7 @@ int mlxsw_sp_port_kill_vid(struct net_device *dev,
                           __be16 __always_unused proto, u16 vid);
 int mlxsw_sp_vport_flood_set(struct mlxsw_sp_port *mlxsw_sp_vport, u16 vfid,
                             bool set, bool only_uc);
+void mlxsw_sp_port_active_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port);
+int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid);
 
 #endif
index 45479ef..7b56098 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/if_bridge.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/rtnetlink.h>
 #include <net/switchdev.h>
 
 #include "spectrum.h"
@@ -124,14 +125,14 @@ static int mlxsw_sp_port_stp_state_set(struct mlxsw_sp_port *mlxsw_sp_port,
        int err;
 
        switch (state) {
-       case BR_STATE_DISABLED: /* fall-through */
        case BR_STATE_FORWARDING:
                spms_state = MLXSW_REG_SPMS_STATE_FORWARDING;
                break;
-       case BR_STATE_LISTENING: /* fall-through */
        case BR_STATE_LEARNING:
                spms_state = MLXSW_REG_SPMS_STATE_LEARNING;
                break;
+       case BR_STATE_LISTENING: /* fall-through */
+       case BR_STATE_DISABLED: /* fall-through */
        case BR_STATE_BLOCKING:
                spms_state = MLXSW_REG_SPMS_STATE_DISCARDING;
                break;
@@ -369,7 +370,8 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
        return err;
 }
 
-static int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+static int __mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port,
+                                   u16 vid)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
        char spvid_pl[MLXSW_REG_SPVID_LEN];
@@ -378,6 +380,53 @@ static int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
        return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvid), spvid_pl);
 }
 
+static int mlxsw_sp_port_allow_untagged_set(struct mlxsw_sp_port *mlxsw_sp_port,
+                                           bool allow)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       char spaft_pl[MLXSW_REG_SPAFT_LEN];
+
+       mlxsw_reg_spaft_pack(spaft_pl, mlxsw_sp_port->local_port, allow);
+       return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spaft), spaft_pl);
+}
+
+int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+       struct net_device *dev = mlxsw_sp_port->dev;
+       int err;
+
+       if (!vid) {
+               err = mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port, false);
+               if (err) {
+                       netdev_err(dev, "Failed to disallow untagged traffic\n");
+                       return err;
+               }
+       } else {
+               err = __mlxsw_sp_port_pvid_set(mlxsw_sp_port, vid);
+               if (err) {
+                       netdev_err(dev, "Failed to set PVID\n");
+                       return err;
+               }
+
+               /* Only allow if not already allowed. */
+               if (!mlxsw_sp_port->pvid) {
+                       err = mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port,
+                                                              true);
+                       if (err) {
+                               netdev_err(dev, "Failed to allow untagged traffic\n");
+                               goto err_port_allow_untagged_set;
+                       }
+               }
+       }
+
+       mlxsw_sp_port->pvid = vid;
+       return 0;
+
+err_port_allow_untagged_set:
+       __mlxsw_sp_port_pvid_set(mlxsw_sp_port, mlxsw_sp_port->pvid);
+       return err;
+}
+
 static int mlxsw_sp_fid_create(struct mlxsw_sp *mlxsw_sp, u16 fid)
 {
        char sfmr_pl[MLXSW_REG_SFMR_LEN];
@@ -539,7 +588,12 @@ static int __mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port,
                        netdev_err(dev, "Unable to add PVID %d\n", vid_begin);
                        goto err_port_pvid_set;
                }
-               mlxsw_sp_port->pvid = vid_begin;
+       } else if (!flag_pvid && old_pvid >= vid_begin && old_pvid <= vid_end) {
+               err = mlxsw_sp_port_pvid_set(mlxsw_sp_port, 0);
+               if (err) {
+                       netdev_err(dev, "Unable to del PVID\n");
+                       goto err_port_pvid_set;
+               }
        }
 
        /* Changing activity bits only if HW operation succeded */
@@ -891,20 +945,18 @@ static int __mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port,
                return err;
        }
 
+       if (init)
+               goto out;
+
        pvid = mlxsw_sp_port->pvid;
-       if (pvid >= vid_begin && pvid <= vid_end && pvid != 1) {
-               /* Default VLAN is always 1 */
-               err = mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1);
+       if (pvid >= vid_begin && pvid <= vid_end) {
+               err = mlxsw_sp_port_pvid_set(mlxsw_sp_port, 0);
                if (err) {
                        netdev_err(dev, "Unable to del PVID %d\n", pvid);
                        return err;
                }
-               mlxsw_sp_port->pvid = 1;
        }
 
-       if (init)
-               goto out;
-
        err = __mlxsw_sp_port_flood_set(mlxsw_sp_port, vid_begin, vid_end,
                                        false, false);
        if (err) {
@@ -936,6 +988,14 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port,
                                         vlan->vid_begin, vlan->vid_end, false);
 }
 
+void mlxsw_sp_port_active_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+       u16 vid;
+
+       for_each_set_bit(vid, mlxsw_sp_port->active_vlans, VLAN_N_VID)
+               __mlxsw_sp_port_vlans_del(mlxsw_sp_port, vid, vid, false);
+}
+
 static int
 mlxsw_sp_port_fdb_static_del(struct mlxsw_sp_port *mlxsw_sp_port,
                             const struct switchdev_obj_port_fdb *fdb)
@@ -1040,10 +1100,12 @@ static struct mlxsw_sp_port *mlxsw_sp_lag_rep_port(struct mlxsw_sp *mlxsw_sp,
 
 static int mlxsw_sp_port_fdb_dump(struct mlxsw_sp_port *mlxsw_sp_port,
                                  struct switchdev_obj_port_fdb *fdb,
-                                 switchdev_obj_dump_cb_t *cb)
+                                 switchdev_obj_dump_cb_t *cb,
+                                 struct net_device *orig_dev)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-       u16 vport_vid = 0, vport_fid = 0;
+       struct mlxsw_sp_port *tmp;
+       u16 vport_fid = 0;
        char *sfd_pl;
        char mac[ETH_ALEN];
        u16 fid;
@@ -1058,13 +1120,11 @@ static int mlxsw_sp_port_fdb_dump(struct mlxsw_sp_port *mlxsw_sp_port,
        if (!sfd_pl)
                return -ENOMEM;
 
-       mutex_lock(&mlxsw_sp_port->mlxsw_sp->fdb_lock);
        if (mlxsw_sp_port_is_vport(mlxsw_sp_port)) {
                u16 tmp;
 
                tmp = mlxsw_sp_vport_vfid_get(mlxsw_sp_port);
                vport_fid = mlxsw_sp_vfid_to_fid(tmp);
-               vport_vid = mlxsw_sp_vport_vid_get(mlxsw_sp_port);
        }
 
        mlxsw_reg_sfd_pack(sfd_pl, MLXSW_REG_SFD_OP_QUERY_DUMP, 0);
@@ -1088,12 +1148,13 @@ static int mlxsw_sp_port_fdb_dump(struct mlxsw_sp_port *mlxsw_sp_port,
                                mlxsw_reg_sfd_uc_unpack(sfd_pl, i, mac, &fid,
                                                        &local_port);
                                if (local_port == mlxsw_sp_port->local_port) {
-                                       if (vport_fid && vport_fid != fid)
-                                               continue;
-                                       else if (vport_fid)
-                                               fdb->vid = vport_vid;
-                                       else
+                                       if (vport_fid && vport_fid == fid)
+                                               fdb->vid = 0;
+                                       else if (!vport_fid &&
+                                                !mlxsw_sp_fid_is_vfid(fid))
                                                fdb->vid = fid;
+                                       else
+                                               continue;
                                        ether_addr_copy(fdb->addr, mac);
                                        fdb->ndm_state = NUD_REACHABLE;
                                        err = cb(&fdb->obj);
@@ -1104,14 +1165,22 @@ static int mlxsw_sp_port_fdb_dump(struct mlxsw_sp_port *mlxsw_sp_port,
                        case MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG:
                                mlxsw_reg_sfd_uc_lag_unpack(sfd_pl, i,
                                                            mac, &fid, &lag_id);
-                               if (mlxsw_sp_port ==
-                                   mlxsw_sp_lag_rep_port(mlxsw_sp, lag_id)) {
-                                       if (vport_fid && vport_fid != fid)
+                               tmp = mlxsw_sp_lag_rep_port(mlxsw_sp, lag_id);
+                               if (tmp && tmp->local_port ==
+                                   mlxsw_sp_port->local_port) {
+                                       /* LAG records can only point to LAG
+                                        * devices or VLAN devices on top.
+                                        */
+                                       if (!netif_is_lag_master(orig_dev) &&
+                                           !is_vlan_dev(orig_dev))
                                                continue;
-                                       else if (vport_fid)
-                                               fdb->vid = vport_vid;
-                                       else
+                                       if (vport_fid && vport_fid == fid)
+                                               fdb->vid = 0;
+                                       else if (!vport_fid &&
+                                                !mlxsw_sp_fid_is_vfid(fid))
                                                fdb->vid = fid;
+                                       else
+                                               continue;
                                        ether_addr_copy(fdb->addr, mac);
                                        fdb->ndm_state = NUD_REACHABLE;
                                        err = cb(&fdb->obj);
@@ -1124,7 +1193,6 @@ static int mlxsw_sp_port_fdb_dump(struct mlxsw_sp_port *mlxsw_sp_port,
        } while (num_rec == MLXSW_REG_SFD_REC_MAX_COUNT);
 
 out:
-       mutex_unlock(&mlxsw_sp_port->mlxsw_sp->fdb_lock);
        kfree(sfd_pl);
        return stored_err ? stored_err : err;
 }
@@ -1176,7 +1244,8 @@ static int mlxsw_sp_port_obj_dump(struct net_device *dev,
                break;
        case SWITCHDEV_OBJ_ID_PORT_FDB:
                err = mlxsw_sp_port_fdb_dump(mlxsw_sp_port,
-                                            SWITCHDEV_OBJ_PORT_FDB(obj), cb);
+                                            SWITCHDEV_OBJ_PORT_FDB(obj), cb,
+                                            obj->orig_dev);
                break;
        default:
                err = -EOPNOTSUPP;
@@ -1194,14 +1263,14 @@ static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = {
        .switchdev_port_obj_dump        = mlxsw_sp_port_obj_dump,
 };
 
-static void mlxsw_sp_fdb_call_notifiers(bool learning, bool learning_sync,
-                                       bool adding, char *mac, u16 vid,
+static void mlxsw_sp_fdb_call_notifiers(bool learning_sync, bool adding,
+                                       char *mac, u16 vid,
                                        struct net_device *dev)
 {
        struct switchdev_notifier_fdb_info info;
        unsigned long notifier_type;
 
-       if (learning && learning_sync) {
+       if (learning_sync) {
                info.addr = mac;
                info.vid = vid;
                notifier_type = adding ? SWITCHDEV_FDB_ADD : SWITCHDEV_FDB_DEL;
@@ -1237,7 +1306,7 @@ static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
                        netdev_err(mlxsw_sp_port->dev, "Failed to find a matching vPort following FDB notification\n");
                        goto just_remove;
                }
-               vid = mlxsw_sp_vport_vid_get(mlxsw_sp_vport);
+               vid = 0;
                /* Override the physical port with the vPort. */
                mlxsw_sp_port = mlxsw_sp_vport;
        } else {
@@ -1257,8 +1326,7 @@ do_fdb_op:
 
        if (!do_notification)
                return;
-       mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning,
-                                   mlxsw_sp_port->learning_sync,
+       mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning_sync,
                                    adding, mac, vid, mlxsw_sp_port->dev);
        return;
 
@@ -1273,6 +1341,7 @@ static void mlxsw_sp_fdb_notify_mac_lag_process(struct mlxsw_sp *mlxsw_sp,
                                                bool adding)
 {
        struct mlxsw_sp_port *mlxsw_sp_port;
+       struct net_device *dev;
        char mac[ETH_ALEN];
        u16 lag_vid = 0;
        u16 lag_id;
@@ -1298,11 +1367,13 @@ static void mlxsw_sp_fdb_notify_mac_lag_process(struct mlxsw_sp *mlxsw_sp,
                        goto just_remove;
                }
 
-               vid = mlxsw_sp_vport_vid_get(mlxsw_sp_vport);
-               lag_vid = vid;
+               lag_vid = mlxsw_sp_vport_vid_get(mlxsw_sp_vport);
+               dev = mlxsw_sp_vport->dev;
+               vid = 0;
                /* Override the physical port with the vPort. */
                mlxsw_sp_port = mlxsw_sp_vport;
        } else {
+               dev = mlxsw_sp_lag_get(mlxsw_sp, lag_id)->dev;
                vid = fid;
        }
 
@@ -1319,10 +1390,8 @@ do_fdb_op:
 
        if (!do_notification)
                return;
-       mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning,
-                                   mlxsw_sp_port->learning_sync,
-                                   adding, mac, vid,
-                                   mlxsw_sp_lag_get(mlxsw_sp, lag_id)->dev);
+       mlxsw_sp_fdb_call_notifiers(mlxsw_sp_port->learning_sync, adding, mac,
+                                   vid, dev);
        return;
 
 just_remove:
@@ -1374,7 +1443,7 @@ static void mlxsw_sp_fdb_notify_work(struct work_struct *work)
 
        mlxsw_sp = container_of(work, struct mlxsw_sp, fdb_notify.dw.work);
 
-       mutex_lock(&mlxsw_sp->fdb_lock);
+       rtnl_lock();
        do {
                mlxsw_reg_sfn_pack(sfn_pl);
                err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(sfn), sfn_pl);
@@ -1387,7 +1456,7 @@ static void mlxsw_sp_fdb_notify_work(struct work_struct *work)
                        mlxsw_sp_fdb_notify_rec_process(mlxsw_sp, sfn_pl, i);
 
        } while (num_rec);
-       mutex_unlock(&mlxsw_sp->fdb_lock);
+       rtnl_unlock();
 
        kfree(sfn_pl);
        mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp);
@@ -1402,7 +1471,6 @@ static int mlxsw_sp_fdb_init(struct mlxsw_sp *mlxsw_sp)
                dev_err(mlxsw_sp->bus_info->dev, "Failed to set default ageing time\n");
                return err;
        }
-       mutex_init(&mlxsw_sp->fdb_lock);
        INIT_DELAYED_WORK(&mlxsw_sp->fdb_notify.dw, mlxsw_sp_fdb_notify_work);
        mlxsw_sp->fdb_notify.interval = MLXSW_SP_DEFAULT_LEARNING_INTERVAL;
        mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp);
index a10c928..3e67f45 100644 (file)
 
 #include "moxart_ether.h"
 
+static inline void moxart_desc_write(u32 data, u32 *desc)
+{
+       *desc = cpu_to_le32(data);
+}
+
+static inline u32 moxart_desc_read(u32 *desc)
+{
+       return le32_to_cpu(*desc);
+}
+
 static inline void moxart_emac_write(struct net_device *ndev,
                                     unsigned int reg, unsigned long value)
 {
@@ -112,7 +122,7 @@ static void moxart_mac_enable(struct net_device *ndev)
 static void moxart_mac_setup_desc_ring(struct net_device *ndev)
 {
        struct moxart_mac_priv_t *priv = netdev_priv(ndev);
-       void __iomem *desc;
+       void *desc;
        int i;
 
        for (i = 0; i < TX_DESC_NUM; i++) {
@@ -121,7 +131,7 @@ static void moxart_mac_setup_desc_ring(struct net_device *ndev)
 
                priv->tx_buf[i] = priv->tx_buf_base + priv->tx_buf_size * i;
        }
-       writel(TX_DESC1_END, desc + TX_REG_OFFSET_DESC1);
+       moxart_desc_write(TX_DESC1_END, desc + TX_REG_OFFSET_DESC1);
 
        priv->tx_head = 0;
        priv->tx_tail = 0;
@@ -129,8 +139,8 @@ static void moxart_mac_setup_desc_ring(struct net_device *ndev)
        for (i = 0; i < RX_DESC_NUM; i++) {
                desc = priv->rx_desc_base + i * RX_REG_DESC_SIZE;
                memset(desc, 0, RX_REG_DESC_SIZE);
-               writel(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
-               writel(RX_BUF_SIZE & RX_DESC1_BUF_SIZE_MASK,
+               moxart_desc_write(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
+               moxart_desc_write(RX_BUF_SIZE & RX_DESC1_BUF_SIZE_MASK,
                       desc + RX_REG_OFFSET_DESC1);
 
                priv->rx_buf[i] = priv->rx_buf_base + priv->rx_buf_size * i;
@@ -141,12 +151,12 @@ static void moxart_mac_setup_desc_ring(struct net_device *ndev)
                if (dma_mapping_error(&ndev->dev, priv->rx_mapping[i]))
                        netdev_err(ndev, "DMA mapping error\n");
 
-               writel(priv->rx_mapping[i],
+               moxart_desc_write(priv->rx_mapping[i],
                       desc + RX_REG_OFFSET_DESC2 + RX_DESC2_ADDRESS_PHYS);
-               writel(priv->rx_buf[i],
+               moxart_desc_write((uintptr_t)priv->rx_buf[i],
                       desc + RX_REG_OFFSET_DESC2 + RX_DESC2_ADDRESS_VIRT);
        }
-       writel(RX_DESC1_END, desc + RX_REG_OFFSET_DESC1);
+       moxart_desc_write(RX_DESC1_END, desc + RX_REG_OFFSET_DESC1);
 
        priv->rx_head = 0;
 
@@ -201,14 +211,15 @@ static int moxart_rx_poll(struct napi_struct *napi, int budget)
                                                      napi);
        struct net_device *ndev = priv->ndev;
        struct sk_buff *skb;
-       void __iomem *desc;
+       void *desc;
        unsigned int desc0, len;
        int rx_head = priv->rx_head;
        int rx = 0;
 
        while (rx < budget) {
                desc = priv->rx_desc_base + (RX_REG_DESC_SIZE * rx_head);
-               desc0 = readl(desc + RX_REG_OFFSET_DESC0);
+               desc0 = moxart_desc_read(desc + RX_REG_OFFSET_DESC0);
+               rmb(); /* ensure desc0 is up to date */
 
                if (desc0 & RX_DESC0_DMA_OWN)
                        break;
@@ -250,7 +261,8 @@ static int moxart_rx_poll(struct napi_struct *napi, int budget)
                        priv->stats.multicast++;
 
 rx_next:
-               writel(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
+               wmb(); /* prevent setting ownership back too early */
+               moxart_desc_write(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
 
                rx_head = RX_NEXT(rx_head);
                priv->rx_head = rx_head;
@@ -310,7 +322,7 @@ static irqreturn_t moxart_mac_interrupt(int irq, void *dev_id)
 static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
        struct moxart_mac_priv_t *priv = netdev_priv(ndev);
-       void __iomem *desc;
+       void *desc;
        unsigned int len;
        unsigned int tx_head = priv->tx_head;
        u32 txdes1;
@@ -319,11 +331,12 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        desc = priv->tx_desc_base + (TX_REG_DESC_SIZE * tx_head);
 
        spin_lock_irq(&priv->txlock);
-       if (readl(desc + TX_REG_OFFSET_DESC0) & TX_DESC0_DMA_OWN) {
+       if (moxart_desc_read(desc + TX_REG_OFFSET_DESC0) & TX_DESC0_DMA_OWN) {
                net_dbg_ratelimited("no TX space for packet\n");
                priv->stats.tx_dropped++;
                goto out_unlock;
        }
+       rmb(); /* ensure data is only read that had TX_DESC0_DMA_OWN cleared */
 
        len = skb->len > TX_BUF_SIZE ? TX_BUF_SIZE : skb->len;
 
@@ -337,9 +350,9 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        priv->tx_len[tx_head] = len;
        priv->tx_skb[tx_head] = skb;
 
-       writel(priv->tx_mapping[tx_head],
+       moxart_desc_write(priv->tx_mapping[tx_head],
               desc + TX_REG_OFFSET_DESC2 + TX_DESC2_ADDRESS_PHYS);
-       writel(skb->data,
+       moxart_desc_write((uintptr_t)skb->data,
               desc + TX_REG_OFFSET_DESC2 + TX_DESC2_ADDRESS_VIRT);
 
        if (skb->len < ETH_ZLEN) {
@@ -354,8 +367,9 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        txdes1 = TX_DESC1_LTS | TX_DESC1_FTS | (len & TX_DESC1_BUF_SIZE_MASK);
        if (tx_head == TX_DESC_NUM_MASK)
                txdes1 |= TX_DESC1_END;
-       writel(txdes1, desc + TX_REG_OFFSET_DESC1);
-       writel(TX_DESC0_DMA_OWN, desc + TX_REG_OFFSET_DESC0);
+       moxart_desc_write(txdes1, desc + TX_REG_OFFSET_DESC1);
+       wmb(); /* flush descriptor before transferring ownership */
+       moxart_desc_write(TX_DESC0_DMA_OWN, desc + TX_REG_OFFSET_DESC0);
 
        /* start to send packet */
        writel(0xffffffff, priv->base + REG_TX_POLL_DEMAND);
@@ -460,9 +474,9 @@ static int moxart_mac_probe(struct platform_device *pdev)
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        ndev->base_addr = res->start;
        priv->base = devm_ioremap_resource(p_dev, res);
-       ret = IS_ERR(priv->base);
-       if (ret) {
+       if (IS_ERR(priv->base)) {
                dev_err(p_dev, "devm_ioremap_resource failed\n");
+               ret = PTR_ERR(priv->base);
                goto init_fail;
        }
 
index 2be9280..93a9563 100644 (file)
@@ -300,7 +300,7 @@ struct moxart_mac_priv_t {
 
        dma_addr_t rx_base;
        dma_addr_t rx_mapping[RX_DESC_NUM];
-       void __iomem *rx_desc_base;
+       void *rx_desc_base;
        unsigned char *rx_buf_base;
        unsigned char *rx_buf[RX_DESC_NUM];
        unsigned int rx_head;
@@ -308,7 +308,7 @@ struct moxart_mac_priv_t {
 
        dma_addr_t tx_base;
        dma_addr_t tx_mapping[TX_DESC_NUM];
-       void __iomem *tx_desc_base;
+       void *tx_desc_base;
        unsigned char *tx_buf_base;
        unsigned char *tx_buf[RX_DESC_NUM];
        unsigned int tx_head;
index 50d5604..e0993eb 100644 (file)
@@ -2223,8 +2223,6 @@ static irqreturn_t vxge_isr_napi(int irq, void *dev_id)
        return IRQ_NONE;
 }
 
-#ifdef CONFIG_PCI_MSI
-
 static irqreturn_t vxge_tx_msix_handle(int irq, void *dev_id)
 {
        struct vxge_fifo *fifo = (struct vxge_fifo *)dev_id;
@@ -2442,16 +2440,13 @@ static void vxge_rem_msix_isr(struct vxgedev *vdev)
        if (vdev->config.intr_type == MSI_X)
                pci_disable_msix(vdev->pdev);
 }
-#endif
 
 static void vxge_rem_isr(struct vxgedev *vdev)
 {
-#ifdef CONFIG_PCI_MSI
-       if (vdev->config.intr_type == MSI_X) {
+       if (IS_ENABLED(CONFIG_PCI_MSI) &&
+           vdev->config.intr_type == MSI_X) {
                vxge_rem_msix_isr(vdev);
-       } else
-#endif
-       if (vdev->config.intr_type == INTA) {
+       } else if (vdev->config.intr_type == INTA) {
                        synchronize_irq(vdev->pdev->irq);
                        free_irq(vdev->pdev->irq, vdev);
        }
@@ -2460,11 +2455,10 @@ static void vxge_rem_isr(struct vxgedev *vdev)
 static int vxge_add_isr(struct vxgedev *vdev)
 {
        int ret = 0;
-#ifdef CONFIG_PCI_MSI
        int vp_idx = 0, intr_idx = 0, intr_cnt = 0, msix_idx = 0, irq_req = 0;
        int pci_fun = PCI_FUNC(vdev->pdev->devfn);
 
-       if (vdev->config.intr_type == MSI_X)
+       if (IS_ENABLED(CONFIG_PCI_MSI) && vdev->config.intr_type == MSI_X)
                ret = vxge_enable_msix(vdev);
 
        if (ret) {
@@ -2475,7 +2469,7 @@ static int vxge_add_isr(struct vxgedev *vdev)
                vdev->config.intr_type = INTA;
        }
 
-       if (vdev->config.intr_type == MSI_X) {
+       if (IS_ENABLED(CONFIG_PCI_MSI) && vdev->config.intr_type == MSI_X) {
                for (intr_idx = 0;
                     intr_idx < (vdev->no_of_vpath *
                        VXGE_HW_VPATH_MSIX_ACTIVE); intr_idx++) {
@@ -2576,9 +2570,8 @@ static int vxge_add_isr(struct vxgedev *vdev)
                vdev->vxge_entries[intr_cnt].in_use = 1;
                vdev->vxge_entries[intr_cnt].arg = &vdev->vpaths[0];
        }
-INTA_MODE:
-#endif
 
+INTA_MODE:
        if (vdev->config.intr_type == INTA) {
                snprintf(vdev->desc[0], VXGE_INTR_STRLEN,
                        "%s:vxge:INTA", vdev->ndev->name);
@@ -3889,12 +3882,12 @@ static void vxge_device_config_init(struct vxge_hw_device_config *device_config,
        if (max_mac_vpath > VXGE_MAX_MAC_ADDR_COUNT)
                max_mac_vpath = VXGE_MAX_MAC_ADDR_COUNT;
 
-#ifndef CONFIG_PCI_MSI
-       vxge_debug_init(VXGE_ERR,
-               "%s: This Kernel does not support "
-               "MSI-X. Defaulting to INTA", VXGE_DRIVER_NAME);
-       *intr_type = INTA;
-#endif
+       if (!IS_ENABLED(CONFIG_PCI_MSI)) {
+               vxge_debug_init(VXGE_ERR,
+                       "%s: This Kernel does not support "
+                       "MSI-X. Defaulting to INTA", VXGE_DRIVER_NAME);
+               *intr_type = INTA;
+       }
 
        /* Configure whether MSI-X or IRQL. */
        switch (*intr_type) {
index 689a4a5..1ef0393 100644 (file)
@@ -811,7 +811,7 @@ qcaspi_netdev_setup(struct net_device *dev)
        dev->netdev_ops = &qcaspi_netdev_ops;
        qcaspi_set_ethtool_ops(dev);
        dev->watchdog_timeo = QCASPI_TX_TIMEOUT;
-       dev->flags = IFF_MULTICAST;
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->tx_queue_len = 100;
 
        qca = netdev_priv(dev);
index 17d5571..dd2cf37 100644 (file)
@@ -4933,8 +4933,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
                RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST);
                break;
        case RTL_GIGA_MAC_VER_40:
-               RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF);
-               break;
        case RTL_GIGA_MAC_VER_41:
        case RTL_GIGA_MAC_VER_42:
        case RTL_GIGA_MAC_VER_43:
@@ -4943,8 +4941,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
        case RTL_GIGA_MAC_VER_46:
        case RTL_GIGA_MAC_VER_47:
        case RTL_GIGA_MAC_VER_48:
-               RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST | RX_EARLY_OFF);
-               break;
        case RTL_GIGA_MAC_VER_49:
        case RTL_GIGA_MAC_VER_50:
        case RTL_GIGA_MAC_VER_51:
@@ -6137,28 +6133,28 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp)
                sw_cnt_1ms_ini = 16000000/rg_saw_cnt;
                sw_cnt_1ms_ini &= 0x0fff;
                data = r8168_mac_ocp_read(tp, 0xd412);
-               data &= 0x0fff;
+               data &= ~0x0fff;
                data |= sw_cnt_1ms_ini;
                r8168_mac_ocp_write(tp, 0xd412, data);
        }
 
        data = r8168_mac_ocp_read(tp, 0xe056);
-       data &= 0xf0;
-       data |= 0x07;
+       data &= ~0xf0;
+       data |= 0x70;
        r8168_mac_ocp_write(tp, 0xe056, data);
 
        data = r8168_mac_ocp_read(tp, 0xe052);
-       data &= 0x8008;
-       data |= 0x6000;
+       data &= ~0x6000;
+       data |= 0x8008;
        r8168_mac_ocp_write(tp, 0xe052, data);
 
        data = r8168_mac_ocp_read(tp, 0xe0d6);
-       data &= 0x01ff;
+       data &= ~0x01ff;
        data |= 0x017f;
        r8168_mac_ocp_write(tp, 0xe0d6, data);
 
        data = r8168_mac_ocp_read(tp, 0xd420);
-       data &= 0x0fff;
+       data &= ~0x0fff;
        data |= 0x047f;
        r8168_mac_ocp_write(tp, 0xd420, data);
 
@@ -7730,10 +7726,13 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
+       struct pci_dev *pdev = tp->pci_dev;
        struct rtl8169_counters *counters = tp->counters;
        unsigned int start;
 
-       if (netif_running(dev))
+       pm_runtime_get_noresume(&pdev->dev);
+
+       if (netif_running(dev) && pm_runtime_active(&pdev->dev))
                rtl8169_rx_missed(dev, ioaddr);
 
        do {
@@ -7761,7 +7760,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
         * Fetch additonal counter values missing in stats collected by driver
         * from tally counters.
         */
-       rtl8169_update_counters(dev);
+       if (pm_runtime_active(&pdev->dev))
+               rtl8169_update_counters(dev);
 
        /*
         * Subtract values fetched during initalization.
@@ -7774,6 +7774,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
        stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) -
                le16_to_cpu(tp->tc_offset.tx_aborted);
 
+       pm_runtime_put_noidle(&pdev->dev);
+
        return stats;
 }
 
@@ -7853,6 +7855,10 @@ static int rtl8169_runtime_suspend(struct device *device)
 
        rtl8169_net_suspend(dev);
 
+       /* Update counters before going runtime suspend */
+       rtl8169_rx_missed(dev, tp->mmio_addr);
+       rtl8169_update_counters(dev);
+
        return 0;
 }
 
index ac43ed9..86449c3 100644 (file)
@@ -1139,7 +1139,8 @@ static int ravb_set_ringparam(struct net_device *ndev,
        if (netif_running(ndev)) {
                netif_device_detach(ndev);
                /* Stop PTP Clock driver */
-               ravb_ptp_stop(ndev);
+               if (priv->chip_id == RCAR_GEN2)
+                       ravb_ptp_stop(ndev);
                /* Wait for DMA stopping */
                error = ravb_stop_dma(ndev);
                if (error) {
@@ -1170,7 +1171,8 @@ static int ravb_set_ringparam(struct net_device *ndev,
                ravb_emac_init(ndev);
 
                /* Initialise PTP Clock driver */
-               ravb_ptp_init(ndev, priv->pdev);
+               if (priv->chip_id == RCAR_GEN2)
+                       ravb_ptp_init(ndev, priv->pdev);
 
                netif_device_attach(ndev);
        }
@@ -1298,7 +1300,8 @@ static void ravb_tx_timeout_work(struct work_struct *work)
        netif_tx_stop_all_queues(ndev);
 
        /* Stop PTP Clock driver */
-       ravb_ptp_stop(ndev);
+       if (priv->chip_id == RCAR_GEN2)
+               ravb_ptp_stop(ndev);
 
        /* Wait for DMA stopping */
        ravb_stop_dma(ndev);
@@ -1311,7 +1314,8 @@ static void ravb_tx_timeout_work(struct work_struct *work)
        ravb_emac_init(ndev);
 
        /* Initialise PTP Clock driver */
-       ravb_ptp_init(ndev, priv->pdev);
+       if (priv->chip_id == RCAR_GEN2)
+               ravb_ptp_init(ndev, priv->pdev);
 
        netif_tx_start_all_queues(ndev);
 }
@@ -1718,7 +1722,6 @@ static int ravb_set_gti(struct net_device *ndev)
 static int ravb_probe(struct platform_device *pdev)
 {
        struct device_node *np = pdev->dev.of_node;
-       const struct of_device_id *match;
        struct ravb_private *priv;
        enum ravb_chip_id chip_id;
        struct net_device *ndev;
@@ -1750,8 +1753,7 @@ static int ravb_probe(struct platform_device *pdev)
        ndev->base_addr = res->start;
        ndev->dma = -1;
 
-       match = of_match_device(of_match_ptr(ravb_match_table), &pdev->dev);
-       chip_id = (enum ravb_chip_id)match->data;
+       chip_id = (enum ravb_chip_id)of_device_get_match_data(&pdev->dev);
 
        if (chip_id == RCAR_GEN3)
                irq = platform_get_irq_byname(pdev, "ch22");
@@ -1814,10 +1816,6 @@ static int ravb_probe(struct platform_device *pdev)
                           CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB, CCC);
        }
 
-       /* Set CSEL value */
-       ravb_write(ndev, (ravb_read(ndev, CCC) & ~CCC_CSEL) | CCC_CSEL_HPB,
-                  CCC);
-
        /* Set GTI value */
        error = ravb_set_gti(ndev);
        if (error)
index dfa9e59..7384499 100644 (file)
@@ -3061,15 +3061,11 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
        mdp->ether_link_active_low = pd->ether_link_active_low;
 
        /* set cpu data */
-       if (id) {
+       if (id)
                mdp->cd = (struct sh_eth_cpu_data *)id->driver_data;
-       } else  {
-               const struct of_device_id *match;
+       else
+               mdp->cd = (struct sh_eth_cpu_data *)of_device_get_match_data(&pdev->dev);
 
-               match = of_match_device(of_match_ptr(sh_eth_match_table),
-                                       &pdev->dev);
-               mdp->cd = (struct sh_eth_cpu_data *)match->data;
-       }
        mdp->reg_offset = sh_eth_get_register_offset(mdp->cd->register_type);
        if (!mdp->reg_offset) {
                dev_err(&pdev->dev, "Unknown register type (%d)\n",
index a4ab71d..166a7fc 100644 (file)
@@ -3531,12 +3531,14 @@ static void rocker_port_fdb_learn_work(struct work_struct *work)
        info.addr = lw->addr;
        info.vid = lw->vid;
 
+       rtnl_lock();
        if (learned && removing)
                call_switchdev_notifiers(SWITCHDEV_FDB_DEL,
                                         lw->rocker_port->dev, &info.info);
        else if (learned && !removing)
                call_switchdev_notifiers(SWITCHDEV_FDB_ADD,
                                         lw->rocker_port->dev, &info.info);
+       rtnl_unlock();
 
        rocker_port_kfree(lw->trans, work);
 }
index 0e2fc1a..db7db8a 100644 (file)
@@ -2342,8 +2342,8 @@ static int smc_drv_probe(struct platform_device *pdev)
        }
 
        ndev->irq = platform_get_irq(pdev, 0);
-       if (ndev->irq <= 0) {
-               ret = -ENODEV;
+       if (ndev->irq < 0) {
+               ret = ndev->irq;
                goto out_release_io;
        }
        /*
index 0faf163..efb54f3 100644 (file)
@@ -199,21 +199,12 @@ int stmmac_mdio_register(struct net_device *ndev)
        struct stmmac_priv *priv = netdev_priv(ndev);
        struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
        int addr, found;
-       struct device_node *mdio_node = NULL;
-       struct device_node *child_node = NULL;
+       struct device_node *mdio_node = priv->plat->mdio_node;
 
        if (!mdio_bus_data)
                return 0;
 
        if (IS_ENABLED(CONFIG_OF)) {
-               for_each_child_of_node(priv->device->of_node, child_node) {
-                       if (of_device_is_compatible(child_node,
-                                                   "snps,dwmac-mdio")) {
-                               mdio_node = child_node;
-                               break;
-                       }
-               }
-
                if (mdio_node) {
                        netdev_dbg(ndev, "FOUND MDIO subnode\n");
                } else {
index 6a52fa1..4514ba7 100644 (file)
@@ -110,6 +110,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
        struct device_node *np = pdev->dev.of_node;
        struct plat_stmmacenet_data *plat;
        struct stmmac_dma_cfg *dma_cfg;
+       struct device_node *child_node = NULL;
 
        plat = devm_kzalloc(&pdev->dev, sizeof(*plat), GFP_KERNEL);
        if (!plat)
@@ -140,13 +141,19 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
                plat->phy_node = of_node_get(np);
        }
 
+       for_each_child_of_node(np, child_node)
+               if (of_device_is_compatible(child_node, "snps,dwmac-mdio")) {
+                       plat->mdio_node = child_node;
+                       break;
+               }
+
        /* "snps,phy-addr" is not a standard property. Mark it as deprecated
         * and warn of its use. Remove this when phy node support is added.
         */
        if (of_property_read_u32(np, "snps,phy-addr", &plat->phy_addr) == 0)
                dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n");
 
-       if ((plat->phy_node && !of_phy_is_fixed_link(np)) || plat->phy_bus_name)
+       if ((plat->phy_node && !of_phy_is_fixed_link(np)) || !plat->mdio_node)
                plat->mdio_bus_data = NULL;
        else
                plat->mdio_bus_data =
index cc106d8..23fa298 100644 (file)
@@ -389,17 +389,27 @@ static int vnet_rx_one(struct vnet_port *port, struct vio_net_desc *desc)
        if (vio_version_after_eq(&port->vio, 1, 8)) {
                struct vio_net_dext *dext = vio_net_ext(desc);
 
+               skb_reset_network_header(skb);
+
                if (dext->flags & VNET_PKT_HCK_IPV4_HDRCKSUM) {
                        if (skb->protocol == ETH_P_IP) {
-                               struct iphdr *iph = (struct iphdr *)skb->data;
+                               struct iphdr *iph = ip_hdr(skb);
 
                                iph->check = 0;
                                ip_send_check(iph);
                        }
                }
                if ((dext->flags & VNET_PKT_HCK_FULLCKSUM) &&
-                   skb->ip_summed == CHECKSUM_NONE)
-                       vnet_fullcsum(skb);
+                   skb->ip_summed == CHECKSUM_NONE) {
+                       if (skb->protocol == htons(ETH_P_IP)) {
+                               struct iphdr *iph = ip_hdr(skb);
+                               int ihl = iph->ihl * 4;
+
+                               skb_reset_transport_header(skb);
+                               skb_set_transport_header(skb, ihl);
+                               vnet_fullcsum(skb);
+                       }
+               }
                if (dext->flags & VNET_PKT_HCK_IPV4_HDRCKSUM_OK) {
                        skb->ip_summed = CHECKSUM_PARTIAL;
                        skb->csum_level = 0;
index 70814b7..af11ed1 100644 (file)
 #define DWC_MMC_RXOCTETCOUNT_GB          0x0784
 #define DWC_MMC_RXPACKETCOUNT_GB         0x0780
 
-static int debug = 3;
+static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "DWC_eth_qos debug level (0=none,...,16=all)");
 
@@ -650,6 +650,11 @@ struct net_local {
        u32 mmc_tx_counters_mask;
 
        struct dwceqos_flowcontrol flowcontrol;
+
+       /* Tracks the intermediate state of phy started but hardware
+        * init not finished yet.
+        */
+       bool phy_defer;
 };
 
 static void dwceqos_read_mmc_counters(struct net_local *lp, u32 rx_mask,
@@ -901,6 +906,9 @@ static void dwceqos_adjust_link(struct net_device *ndev)
        struct phy_device *phydev = lp->phy_dev;
        int status_change = 0;
 
+       if (lp->phy_defer)
+               return;
+
        if (phydev->link) {
                if ((lp->speed != phydev->speed) ||
                    (lp->duplex != phydev->duplex)) {
@@ -1113,7 +1121,7 @@ static int dwceqos_descriptor_init(struct net_local *lp)
        /* Allocate DMA descriptors */
        size = DWCEQOS_RX_DCNT * sizeof(struct dwceqos_dma_desc);
        lp->rx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size,
-                       &lp->rx_descs_addr, 0);
+                       &lp->rx_descs_addr, GFP_KERNEL);
        if (!lp->rx_descs)
                goto err_out;
        lp->rx_descs_tail_addr = lp->rx_descs_addr +
@@ -1121,7 +1129,7 @@ static int dwceqos_descriptor_init(struct net_local *lp)
 
        size = DWCEQOS_TX_DCNT * sizeof(struct dwceqos_dma_desc);
        lp->tx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size,
-                       &lp->tx_descs_addr, 0);
+                       &lp->tx_descs_addr, GFP_KERNEL);
        if (!lp->tx_descs)
                goto err_out;
        lp->tx_descs_tail_addr = lp->tx_descs_addr +
@@ -1635,6 +1643,12 @@ static void dwceqos_init_hw(struct net_local *lp)
        regval = dwceqos_read(lp, REG_DWCEQOS_MAC_CFG);
        dwceqos_write(lp, REG_DWCEQOS_MAC_CFG,
                      regval | DWCEQOS_MAC_CFG_TE | DWCEQOS_MAC_CFG_RE);
+
+       lp->phy_defer = false;
+       mutex_lock(&lp->phy_dev->lock);
+       phy_read_status(lp->phy_dev);
+       dwceqos_adjust_link(lp->ndev);
+       mutex_unlock(&lp->phy_dev->lock);
 }
 
 static void dwceqos_tx_reclaim(unsigned long data)
@@ -1880,9 +1894,13 @@ static int dwceqos_open(struct net_device *ndev)
        }
        netdev_reset_queue(ndev);
 
-       napi_enable(&lp->napi);
+       /* The dwceqos reset state machine requires all phy clocks to complete,
+        * hence the unusual init order with phy_start first.
+        */
+       lp->phy_defer = true;
        phy_start(lp->phy_dev);
        dwceqos_init_hw(lp);
+       napi_enable(&lp->napi);
 
        netif_start_queue(ndev);
        tasklet_enable(&lp->tx_bdreclaim_tasklet);
@@ -1915,18 +1933,19 @@ static int dwceqos_stop(struct net_device *ndev)
 {
        struct net_local *lp = netdev_priv(ndev);
 
-       phy_stop(lp->phy_dev);
-
        tasklet_disable(&lp->tx_bdreclaim_tasklet);
-       netif_stop_queue(ndev);
        napi_disable(&lp->napi);
 
-       dwceqos_drain_dma(lp);
+       /* Stop all tx before we drain the tx dma. */
+       netif_tx_lock_bh(lp->ndev);
+       netif_stop_queue(ndev);
+       netif_tx_unlock_bh(lp->ndev);
 
-       netif_tx_lock(lp->ndev);
+       dwceqos_drain_dma(lp);
        dwceqos_reset_hw(lp);
+       phy_stop(lp->phy_dev);
+
        dwceqos_descriptor_free(lp);
-       netif_tx_unlock(lp->ndev);
 
        return 0;
 }
@@ -2178,12 +2197,10 @@ static int dwceqos_start_xmit(struct sk_buff *skb, struct net_device *ndev)
                ((trans.initial_descriptor + trans.nr_descriptors) %
                 DWCEQOS_TX_DCNT));
 
-       dwceqos_tx_finalize(skb, lp, &trans);
-
-       netdev_sent_queue(ndev, skb->len);
-
        spin_lock_bh(&lp->tx_lock);
        lp->tx_free -= trans.nr_descriptors;
+       dwceqos_tx_finalize(skb, lp, &trans);
+       netdev_sent_queue(ndev, skb->len);
        spin_unlock_bh(&lp->tx_lock);
 
        ndev->trans_start = jiffies;
index e9cc61e..c3e85ac 100644 (file)
@@ -63,8 +63,12 @@ static void cpsw_gmii_sel_am3352(struct cpsw_phy_sel_priv *priv,
                mode = AM33XX_GMII_SEL_MODE_RGMII;
                break;
 
-       case PHY_INTERFACE_MODE_MII:
        default:
+               dev_warn(priv->dev,
+                        "Unsupported PHY mode: \"%s\". Defaulting to MII.\n",
+                       phy_modes(phy_mode));
+               /* fallthrough */
+       case PHY_INTERFACE_MODE_MII:
                mode = AM33XX_GMII_SEL_MODE_MII;
                break;
        };
@@ -106,8 +110,12 @@ static void cpsw_gmii_sel_dra7xx(struct cpsw_phy_sel_priv *priv,
                mode = AM33XX_GMII_SEL_MODE_RGMII;
                break;
 
-       case PHY_INTERFACE_MODE_MII:
        default:
+               dev_warn(priv->dev,
+                        "Unsupported PHY mode: \"%s\". Defaulting to MII.\n",
+                       phy_modes(phy_mode));
+               /* fallthrough */
+       case PHY_INTERFACE_MODE_MII:
                mode = AM33XX_GMII_SEL_MODE_MII;
                break;
        };
index 657b65b..18bf3a8 100644 (file)
@@ -82,7 +82,7 @@ struct cpdma_desc {
 
 struct cpdma_desc_pool {
        phys_addr_t             phys;
-       u32                     hw_addr;
+       dma_addr_t              hw_addr;
        void __iomem            *iomap;         /* ioremap map */
        void                    *cpumap;        /* dma_alloc map */
        int                     desc_size, mem_size;
@@ -152,7 +152,7 @@ struct cpdma_chan {
  * abstract out these details
  */
 static struct cpdma_desc_pool *
-cpdma_desc_pool_create(struct device *dev, u32 phys, u32 hw_addr,
+cpdma_desc_pool_create(struct device *dev, u32 phys, dma_addr_t hw_addr,
                                int size, int align)
 {
        int bitmap_size;
@@ -176,13 +176,13 @@ cpdma_desc_pool_create(struct device *dev, u32 phys, u32 hw_addr,
 
        if (phys) {
                pool->phys  = phys;
-               pool->iomap = ioremap(phys, size);
+               pool->iomap = ioremap(phys, size); /* should be memremap? */
                pool->hw_addr = hw_addr;
        } else {
-               pool->cpumap = dma_alloc_coherent(dev, size, &pool->phys,
+               pool->cpumap = dma_alloc_coherent(dev, size, &pool->hw_addr,
                                                  GFP_KERNEL);
-               pool->iomap = pool->cpumap;
-               pool->hw_addr = pool->phys;
+               pool->iomap = (void __iomem __force *)pool->cpumap;
+               pool->phys = pool->hw_addr; /* assumes no IOMMU, don't use this value */
        }
 
        if (pool->iomap)
index c61d66d..029841f 100644 (file)
@@ -117,21 +117,17 @@ static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, dma_addr_t *ndesc,
        *ndesc = le32_to_cpu(desc->next_desc);
 }
 
-static void get_pad_info(u32 *pad0, u32 *pad1, u32 *pad2, struct knav_dma_desc *desc)
+static u32 get_sw_data(int index, struct knav_dma_desc *desc)
 {
-       *pad0 = le32_to_cpu(desc->pad[0]);
-       *pad1 = le32_to_cpu(desc->pad[1]);
-       *pad2 = le32_to_cpu(desc->pad[2]);
+       /* No Endian conversion needed as this data is untouched by hw */
+       return desc->sw_data[index];
 }
 
-static void get_pad_ptr(void **padptr, struct knav_dma_desc *desc)
-{
-       u64 pad64;
-
-       pad64 = le32_to_cpu(desc->pad[0]) +
-               ((u64)le32_to_cpu(desc->pad[1]) << 32);
-       *padptr = (void *)(uintptr_t)pad64;
-}
+/* use these macros to get sw data */
+#define GET_SW_DATA0(desc) get_sw_data(0, desc)
+#define GET_SW_DATA1(desc) get_sw_data(1, desc)
+#define GET_SW_DATA2(desc) get_sw_data(2, desc)
+#define GET_SW_DATA3(desc) get_sw_data(3, desc)
 
 static void get_org_pkt_info(dma_addr_t *buff, u32 *buff_len,
                             struct knav_dma_desc *desc)
@@ -163,13 +159,18 @@ static void set_desc_info(u32 desc_info, u32 pkt_info,
        desc->packet_info = cpu_to_le32(pkt_info);
 }
 
-static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, struct knav_dma_desc *desc)
+static void set_sw_data(int index, u32 data, struct knav_dma_desc *desc)
 {
-       desc->pad[0] = cpu_to_le32(pad0);
-       desc->pad[1] = cpu_to_le32(pad1);
-       desc->pad[2] = cpu_to_le32(pad1);
+       /* No Endian conversion needed as this data is untouched by hw */
+       desc->sw_data[index] = data;
 }
 
+/* use these macros to set sw data */
+#define SET_SW_DATA0(data, desc) set_sw_data(0, data, desc)
+#define SET_SW_DATA1(data, desc) set_sw_data(1, data, desc)
+#define SET_SW_DATA2(data, desc) set_sw_data(2, data, desc)
+#define SET_SW_DATA3(data, desc) set_sw_data(3, data, desc)
+
 static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,
                             struct knav_dma_desc *desc)
 {
@@ -581,7 +582,6 @@ static void netcp_free_rx_desc_chain(struct netcp_intf *netcp,
        dma_addr_t dma_desc, dma_buf;
        unsigned int buf_len, dma_sz = sizeof(*ndesc);
        void *buf_ptr;
-       u32 pad[2];
        u32 tmp;
 
        get_words(&dma_desc, 1, &desc->next_desc);
@@ -593,14 +593,20 @@ static void netcp_free_rx_desc_chain(struct netcp_intf *netcp,
                        break;
                }
                get_pkt_info(&dma_buf, &tmp, &dma_desc, ndesc);
-               get_pad_ptr(&buf_ptr, ndesc);
+               /* warning!!!! We are retrieving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               buf_ptr = (void *)GET_SW_DATA0(ndesc);
+               buf_len = (int)GET_SW_DATA1(desc);
                dma_unmap_page(netcp->dev, dma_buf, PAGE_SIZE, DMA_FROM_DEVICE);
                __free_page(buf_ptr);
                knav_pool_desc_put(netcp->rx_pool, desc);
        }
-
-       get_pad_info(&pad[0], &pad[1], &buf_len, desc);
-       buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
+       /* warning!!!! We are retrieving the virtual ptr in the sw_data
+        * field as a 32bit value. Will not work on 64bit machines
+        */
+       buf_ptr = (void *)GET_SW_DATA0(desc);
+       buf_len = (int)GET_SW_DATA1(desc);
 
        if (buf_ptr)
                netcp_frag_free(buf_len <= PAGE_SIZE, buf_ptr);
@@ -639,7 +645,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
        dma_addr_t dma_desc, dma_buff;
        struct netcp_packet p_info;
        struct sk_buff *skb;
-       u32 pad[2];
        void *org_buf_ptr;
 
        dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
@@ -653,8 +658,11 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
        }
 
        get_pkt_info(&dma_buff, &buf_len, &dma_desc, desc);
-       get_pad_info(&pad[0], &pad[1], &org_buf_len, desc);
-       org_buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
+       /* warning!!!! We are retrieving the virtual ptr in the sw_data
+        * field as a 32bit value. Will not work on 64bit machines
+        */
+       org_buf_ptr = (void *)GET_SW_DATA0(desc);
+       org_buf_len = (int)GET_SW_DATA1(desc);
 
        if (unlikely(!org_buf_ptr)) {
                dev_err(netcp->ndev_dev, "NULL bufptr in desc\n");
@@ -679,7 +687,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
        /* Fill in the page fragment list */
        while (dma_desc) {
                struct page *page;
-               void *ptr;
 
                ndesc = knav_pool_desc_unmap(netcp->rx_pool, dma_desc, dma_sz);
                if (unlikely(!ndesc)) {
@@ -688,8 +695,10 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
                }
 
                get_pkt_info(&dma_buff, &buf_len, &dma_desc, ndesc);
-               get_pad_ptr(&ptr, ndesc);
-               page = ptr;
+               /* warning!!!! We are retrieving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               page = (struct page *)GET_SW_DATA0(desc);
 
                if (likely(dma_buff && buf_len && page)) {
                        dma_unmap_page(netcp->dev, dma_buff, PAGE_SIZE,
@@ -777,7 +786,10 @@ static void netcp_free_rx_buf(struct netcp_intf *netcp, int fdq)
                }
 
                get_org_pkt_info(&dma, &buf_len, desc);
-               get_pad_ptr(&buf_ptr, desc);
+               /* warning!!!! We are retrieving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               buf_ptr = (void *)GET_SW_DATA0(desc);
 
                if (unlikely(!dma)) {
                        dev_err(netcp->ndev_dev, "NULL orig_buff in desc\n");
@@ -829,7 +841,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
        struct page *page;
        dma_addr_t dma;
        void *bufptr;
-       u32 pad[3];
+       u32 sw_data[2];
 
        /* Allocate descriptor */
        hwdesc = knav_pool_desc_get(netcp->rx_pool);
@@ -846,7 +858,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
                                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
                bufptr = netdev_alloc_frag(primary_buf_len);
-               pad[2] = primary_buf_len;
+               sw_data[1] = primary_buf_len;
 
                if (unlikely(!bufptr)) {
                        dev_warn_ratelimited(netcp->ndev_dev,
@@ -858,9 +870,10 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
                if (unlikely(dma_mapping_error(netcp->dev, dma)))
                        goto fail;
 
-               pad[0] = lower_32_bits((uintptr_t)bufptr);
-               pad[1] = upper_32_bits((uintptr_t)bufptr);
-
+               /* warning!!!! We are saving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               sw_data[0] = (u32)bufptr;
        } else {
                /* Allocate a secondary receive queue entry */
                page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD);
@@ -870,9 +883,11 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
                }
                buf_len = PAGE_SIZE;
                dma = dma_map_page(netcp->dev, page, 0, buf_len, DMA_TO_DEVICE);
-               pad[0] = lower_32_bits(dma);
-               pad[1] = upper_32_bits(dma);
-               pad[2] = 0;
+               /* warning!!!! We are saving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               sw_data[0] = (u32)page;
+               sw_data[1] = 0;
        }
 
        desc_info =  KNAV_DMA_DESC_PS_INFO_IN_DESC;
@@ -882,7 +897,8 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
        pkt_info |= (netcp->rx_queue_id & KNAV_DMA_DESC_RETQ_MASK) <<
                    KNAV_DMA_DESC_RETQ_SHIFT;
        set_org_pkt_info(dma, buf_len, hwdesc);
-       set_pad_info(pad[0], pad[1], pad[2], hwdesc);
+       SET_SW_DATA0(sw_data[0], hwdesc);
+       SET_SW_DATA1(sw_data[1], hwdesc);
        set_desc_info(desc_info, pkt_info, hwdesc);
 
        /* Push to FDQs */
@@ -971,7 +987,6 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
                                          unsigned int budget)
 {
        struct knav_dma_desc *desc;
-       void *ptr;
        struct sk_buff *skb;
        unsigned int dma_sz;
        dma_addr_t dma;
@@ -988,8 +1003,10 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
                        continue;
                }
 
-               get_pad_ptr(&ptr, desc);
-               skb = ptr;
+               /* warning!!!! We are retrieving the virtual ptr in the sw_data
+                * field as a 32bit value. Will not work on 64bit machines
+                */
+               skb = (struct sk_buff *)GET_SW_DATA0(desc);
                netcp_free_tx_desc_chain(netcp, desc, dma_sz);
                if (!skb) {
                        dev_err(netcp->ndev_dev, "No skb in Tx desc\n");
@@ -1194,10 +1211,10 @@ static int netcp_tx_submit_skb(struct netcp_intf *netcp,
        }
 
        set_words(&tmp, 1, &desc->packet_info);
-       tmp = lower_32_bits((uintptr_t)&skb);
-       set_words(&tmp, 1, &desc->pad[0]);
-       tmp = upper_32_bits((uintptr_t)&skb);
-       set_words(&tmp, 1, &desc->pad[1]);
+       /* warning!!!! We are saving the virtual ptr in the sw_data
+        * field as a 32bit value. Will not work on 64bit machines
+        */
+       SET_SW_DATA0((u32)skb, desc);
 
        if (tx_pipe->flags & SWITCH_TO_PORT_IN_TAGINFO) {
                tmp = tx_pipe->switch_to_port;
index 7f975a2..b0de8ec 100644 (file)
@@ -533,8 +533,8 @@ static int dfx_register(struct device *bdev)
        const char *print_name = dev_name(bdev);
        struct net_device *dev;
        DFX_board_t       *bp;                  /* board pointer */
-       resource_size_t bar_start[3];           /* pointers to ports */
-       resource_size_t bar_len[3];             /* resource length */
+       resource_size_t bar_start[3] = {0};     /* pointers to ports */
+       resource_size_t bar_len[3] = {0};       /* resource length */
        int alloc_size;                         /* total buffer size used */
        struct resource *region;
        int err = 0;
@@ -3697,8 +3697,8 @@ static void dfx_unregister(struct device *bdev)
        int dfx_bus_pci = dev_is_pci(bdev);
        int dfx_bus_tc = DFX_BUS_TC(bdev);
        int dfx_use_mmio = DFX_MMIO || dfx_bus_tc;
-       resource_size_t bar_start[3];           /* pointers to ports */
-       resource_size_t bar_len[3];             /* resource lengths */
+       resource_size_t bar_start[3] = {0};     /* pointers to ports */
+       resource_size_t bar_len[3] = {0};       /* resource lengths */
        int             alloc_size;             /* total buffer size used */
 
        unregister_netdev(dev);
index 7456569..0bf7edd 100644 (file)
@@ -980,9 +980,9 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
                        opts = ip_tunnel_info_opts(info);
 
                if (key->tun_flags & TUNNEL_CSUM)
-                       flags |= GENEVE_F_UDP_CSUM;
+                       flags &= ~GENEVE_F_UDP_ZERO_CSUM6_TX;
                else
-                       flags &= ~GENEVE_F_UDP_CSUM;
+                       flags |= GENEVE_F_UDP_ZERO_CSUM6_TX;
 
                err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
                                        info->options_len, opts,
@@ -1039,6 +1039,34 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
        return geneve_xmit_skb(skb, dev, info);
 }
 
+static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict)
+{
+       /* The max_mtu calculation does not take account of GENEVE
+        * options, to avoid excluding potentially valid
+        * configurations.
+        */
+       int max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - sizeof(struct iphdr)
+               - dev->hard_header_len;
+
+       if (new_mtu < 68)
+               return -EINVAL;
+
+       if (new_mtu > max_mtu) {
+               if (strict)
+                       return -EINVAL;
+
+               new_mtu = max_mtu;
+       }
+
+       dev->mtu = new_mtu;
+       return 0;
+}
+
+static int geneve_change_mtu(struct net_device *dev, int new_mtu)
+{
+       return __geneve_change_mtu(dev, new_mtu, true);
+}
+
 static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 {
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
@@ -1083,7 +1111,7 @@ static const struct net_device_ops geneve_netdev_ops = {
        .ndo_stop               = geneve_stop,
        .ndo_start_xmit         = geneve_xmit,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
-       .ndo_change_mtu         = eth_change_mtu,
+       .ndo_change_mtu         = geneve_change_mtu,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_fill_metadata_dst  = geneve_fill_metadata_dst,
@@ -1150,6 +1178,7 @@ static void geneve_setup(struct net_device *dev)
        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 
        netif_keep_dst(dev);
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
        eth_hw_addr_random(dev);
 }
@@ -1441,12 +1470,23 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
                return dev;
 
        err = geneve_configure(net, dev, &geneve_remote_unspec,
-                              0, 0, 0, htons(dst_port), true, 0);
-       if (err) {
-               free_netdev(dev);
-               return ERR_PTR(err);
-       }
+                              0, 0, 0, htons(dst_port), true,
+                              GENEVE_F_UDP_ZERO_CSUM6_RX);
+       if (err)
+               goto err;
+
+       /* openvswitch users expect packet sizes to be unrestricted,
+        * so set the largest MTU we can.
+        */
+       err = __geneve_change_mtu(dev, IP_MAX_MTU, false);
+       if (err)
+               goto err;
+
        return dev;
+
+ err:
+       free_netdev(dev);
+       return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
 
index f4130af..fcb92c0 100644 (file)
@@ -624,6 +624,7 @@ struct nvsp_message {
 #define RNDIS_PKT_ALIGN_DEFAULT 8
 
 struct multi_send_data {
+       struct sk_buff *skb; /* skb containing the pkt */
        struct hv_netvsc_packet *pkt; /* netvsc pkt pending */
        u32 count; /* counter of batched packets */
 };
index 059fc52..ec313fc 100644 (file)
@@ -841,6 +841,18 @@ static inline int netvsc_send_pkt(
        return ret;
 }
 
+/* Move packet out of multi send data (msd), and clear msd */
+static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
+                               struct sk_buff **msd_skb,
+                               struct multi_send_data *msdp)
+{
+       *msd_skb = msdp->skb;
+       *msd_send = msdp->pkt;
+       msdp->skb = NULL;
+       msdp->pkt = NULL;
+       msdp->count = 0;
+}
+
 int netvsc_send(struct hv_device *device,
                struct hv_netvsc_packet *packet,
                struct rndis_message *rndis_msg,
@@ -855,6 +867,7 @@ int netvsc_send(struct hv_device *device,
        unsigned int section_index = NETVSC_INVALID_INDEX;
        struct multi_send_data *msdp;
        struct hv_netvsc_packet *msd_send = NULL, *cur_send = NULL;
+       struct sk_buff *msd_skb = NULL;
        bool try_batch;
        bool xmit_more = (skb != NULL) ? skb->xmit_more : false;
 
@@ -897,10 +910,8 @@ int netvsc_send(struct hv_device *device,
                   net_device->send_section_size) {
                section_index = netvsc_get_next_send_section(net_device);
                if (section_index != NETVSC_INVALID_INDEX) {
-                               msd_send = msdp->pkt;
-                               msdp->pkt = NULL;
-                               msdp->count = 0;
-                               msd_len = 0;
+                       move_pkt_msd(&msd_send, &msd_skb, msdp);
+                       msd_len = 0;
                }
        }
 
@@ -919,31 +930,31 @@ int netvsc_send(struct hv_device *device,
                        packet->total_data_buflen += msd_len;
                }
 
-               if (msdp->pkt)
-                       dev_kfree_skb_any(skb);
+               if (msdp->skb)
+                       dev_kfree_skb_any(msdp->skb);
 
                if (xmit_more && !packet->cp_partial) {
+                       msdp->skb = skb;
                        msdp->pkt = packet;
                        msdp->count++;
                } else {
                        cur_send = packet;
+                       msdp->skb = NULL;
                        msdp->pkt = NULL;
                        msdp->count = 0;
                }
        } else {
-               msd_send = msdp->pkt;
-               msdp->pkt = NULL;
-               msdp->count = 0;
+               move_pkt_msd(&msd_send, &msd_skb, msdp);
                cur_send = packet;
        }
 
        if (msd_send) {
-               m_ret = netvsc_send_pkt(msd_send, net_device, pb, skb);
+               m_ret = netvsc_send_pkt(msd_send, net_device, NULL, msd_skb);
 
                if (m_ret != 0) {
                        netvsc_free_send_slot(net_device,
                                              msd_send->send_buf_index);
-                       dev_kfree_skb_any(skb);
+                       dev_kfree_skb_any(msd_skb);
                }
        }
 
index 1c8db9a..98e34fe 100644 (file)
@@ -196,65 +196,6 @@ static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
        return ppi;
 }
 
-union sub_key {
-       u64 k;
-       struct {
-               u8 pad[3];
-               u8 kb;
-               u32 ka;
-       };
-};
-
-/* Toeplitz hash function
- * data: network byte order
- * return: host byte order
- */
-static u32 comp_hash(u8 *key, int klen, void *data, int dlen)
-{
-       union sub_key subk;
-       int k_next = 4;
-       u8 dt;
-       int i, j;
-       u32 ret = 0;
-
-       subk.k = 0;
-       subk.ka = ntohl(*(u32 *)key);
-
-       for (i = 0; i < dlen; i++) {
-               subk.kb = key[k_next];
-               k_next = (k_next + 1) % klen;
-               dt = ((u8 *)data)[i];
-               for (j = 0; j < 8; j++) {
-                       if (dt & 0x80)
-                               ret ^= subk.ka;
-                       dt <<= 1;
-                       subk.k <<= 1;
-               }
-       }
-
-       return ret;
-}
-
-static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
-{
-       struct flow_keys flow;
-       int data_len;
-
-       if (!skb_flow_dissect_flow_keys(skb, &flow, 0) ||
-           !(flow.basic.n_proto == htons(ETH_P_IP) ||
-             flow.basic.n_proto == htons(ETH_P_IPV6)))
-               return false;
-
-       if (flow.basic.ip_proto == IPPROTO_TCP)
-               data_len = 12;
-       else
-               data_len = 8;
-
-       *hash = comp_hash(netvsc_hash_key, HASH_KEYLEN, &flow, data_len);
-
-       return true;
-}
-
 static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
                        void *accel_priv, select_queue_fallback_t fallback)
 {
@@ -267,11 +208,9 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
        if (nvsc_dev == NULL || ndev->real_num_tx_queues <= 1)
                return 0;
 
-       if (netvsc_set_hash(&hash, skb)) {
-               q_idx = nvsc_dev->send_table[hash % VRSS_SEND_TAB_SIZE] %
-                       ndev->real_num_tx_queues;
-               skb_set_hash(skb, hash, PKT_HASH_TYPE_L3);
-       }
+       hash = skb_get_hash(skb);
+       q_idx = nvsc_dev->send_table[hash % VRSS_SEND_TAB_SIZE] %
+               ndev->real_num_tx_queues;
 
        if (!nvsc_dev->chn_table[q_idx])
                q_idx = 0;
@@ -1150,6 +1089,9 @@ static int netvsc_probe(struct hv_device *dev,
        net->ethtool_ops = &ethtool_ops;
        SET_NETDEV_DEV(net, &dev->device);
 
+       /* We always need headroom for rndis header */
+       net->needed_headroom = RNDIS_AND_PPI_SIZE;
+
        /* Notify the netvsc driver of the new device */
        memset(&device_info, 0, sizeof(device_info));
        device_info.ring_size = ring_size;
index 29cbde8..d47cf14 100644 (file)
@@ -82,9 +82,6 @@ struct bfin_sir_self {
 
 #define DRIVER_NAME "bfin_sir"
 
-#define port_membase(port)     (((struct bfin_sir_port *)(port))->membase)
-#define get_lsr_cache(port)    (((struct bfin_sir_port *)(port))->lsr)
-#define put_lsr_cache(port, v) (((struct bfin_sir_port *)(port))->lsr = (v))
 #include <asm/bfin_serial.h>
 
 static const unsigned short per[][4] = {
index 6a57a00..94e6888 100644 (file)
@@ -1323,6 +1323,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 
        list_add_tail_rcu(&vlan->list, &port->vlans);
        netif_stacked_transfer_operstate(lowerdev, dev);
+       linkwatch_fire_event(dev);
 
        return 0;
 
@@ -1522,6 +1523,7 @@ static int macvlan_device_event(struct notifier_block *unused,
        port = macvlan_port_get_rtnl(dev);
 
        switch (event) {
+       case NETDEV_UP:
        case NETDEV_CHANGE:
                list_for_each_entry(vlan, &port->vlans, list)
                        netif_stacked_transfer_operstate(vlan->lowerdev,
index 60994a8..f0a7702 100644 (file)
@@ -186,6 +186,7 @@ config MDIO_GPIO
 config MDIO_OCTEON
        tristate "Support for MDIO buses on Octeon and ThunderX SOCs"
        depends on 64BIT
+       depends on HAS_IOMEM
        help
 
          This module provides a driver for the Octeon and ThunderX MDIO
index bf241a3..db507e3 100644 (file)
@@ -250,10 +250,6 @@ static int bcm7xxx_config_init(struct phy_device *phydev)
        phy_write(phydev, MII_BCM7XXX_AUX_MODE, MII_BCM7XX_64CLK_MDIO);
        phy_read(phydev, MII_BCM7XXX_AUX_MODE);
 
-       /* Workaround only required for 100Mbits/sec capable PHYs */
-       if (phydev->supported & PHY_GBIT_FEATURES)
-               return 0;
-
        /* set shadow mode 2 */
        ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST,
                        MII_BCM7XXX_SHD_MODE_2, MII_BCM7XXX_SHD_MODE_2);
@@ -270,7 +266,7 @@ static int bcm7xxx_config_init(struct phy_device *phydev)
        phy_write(phydev, MII_BCM7XXX_100TX_FALSE_CAR, 0x7555);
 
        /* reset shadow mode 2 */
-       ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, MII_BCM7XXX_SHD_MODE_2, 0);
+       ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, MII_BCM7XXX_SHD_MODE_2);
        if (ret < 0)
                return ret;
 
@@ -307,11 +303,6 @@ static int bcm7xxx_suspend(struct phy_device *phydev)
        return 0;
 }
 
-static int bcm7xxx_dummy_config_init(struct phy_device *phydev)
-{
-       return 0;
-}
-
 #define BCM7XXX_28NM_GPHY(_oui, _name)                                 \
 {                                                                      \
        .phy_id         = (_oui),                                       \
@@ -337,7 +328,7 @@ static struct phy_driver bcm7xxx_driver[] = {
        .phy_id         = PHY_ID_BCM7425,
        .phy_id_mask    = 0xfffffff0,
        .name           = "Broadcom BCM7425",
-       .features       = PHY_GBIT_FEATURES |
+       .features       = PHY_BASIC_FEATURES |
                          SUPPORTED_Pause | SUPPORTED_Asym_Pause,
        .flags          = PHY_IS_INTERNAL,
        .config_init    = bcm7xxx_config_init,
@@ -349,7 +340,7 @@ static struct phy_driver bcm7xxx_driver[] = {
        .phy_id         = PHY_ID_BCM7429,
        .phy_id_mask    = 0xfffffff0,
        .name           = "Broadcom BCM7429",
-       .features       = PHY_GBIT_FEATURES |
+       .features       = PHY_BASIC_FEATURES |
                          SUPPORTED_Pause | SUPPORTED_Asym_Pause,
        .flags          = PHY_IS_INTERNAL,
        .config_init    = bcm7xxx_config_init,
@@ -361,7 +352,7 @@ static struct phy_driver bcm7xxx_driver[] = {
        .phy_id         = PHY_ID_BCM7435,
        .phy_id_mask    = 0xfffffff0,
        .name           = "Broadcom BCM7435",
-       .features       = PHY_GBIT_FEATURES |
+       .features       = PHY_BASIC_FEATURES |
                          SUPPORTED_Pause | SUPPORTED_Asym_Pause,
        .flags          = PHY_IS_INTERNAL,
        .config_init    = bcm7xxx_config_init,
@@ -369,30 +360,6 @@ static struct phy_driver bcm7xxx_driver[] = {
        .read_status    = genphy_read_status,
        .suspend        = bcm7xxx_suspend,
        .resume         = bcm7xxx_config_init,
-}, {
-       .phy_id         = PHY_BCM_OUI_4,
-       .phy_id_mask    = 0xffff0000,
-       .name           = "Broadcom BCM7XXX 40nm",
-       .features       = PHY_GBIT_FEATURES |
-                         SUPPORTED_Pause | SUPPORTED_Asym_Pause,
-       .flags          = PHY_IS_INTERNAL,
-       .config_init    = bcm7xxx_config_init,
-       .config_aneg    = genphy_config_aneg,
-       .read_status    = genphy_read_status,
-       .suspend        = bcm7xxx_suspend,
-       .resume         = bcm7xxx_config_init,
-}, {
-       .phy_id         = PHY_BCM_OUI_5,
-       .phy_id_mask    = 0xffffff00,
-       .name           = "Broadcom BCM7XXX 65nm",
-       .features       = PHY_BASIC_FEATURES |
-                         SUPPORTED_Pause | SUPPORTED_Asym_Pause,
-       .flags          = PHY_IS_INTERNAL,
-       .config_init    = bcm7xxx_dummy_config_init,
-       .config_aneg    = genphy_config_aneg,
-       .read_status    = genphy_read_status,
-       .suspend        = bcm7xxx_suspend,
-       .resume         = bcm7xxx_config_init,
 } };
 
 static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = {
@@ -404,8 +371,6 @@ static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = {
        { PHY_ID_BCM7439, 0xfffffff0, },
        { PHY_ID_BCM7435, 0xfffffff0, },
        { PHY_ID_BCM7445, 0xfffffff0, },
-       { PHY_BCM_OUI_4, 0xffff0000 },
-       { PHY_BCM_OUI_5, 0xffffff00 },
        { }
 };
 
index 180f699..7a240fc 100644 (file)
@@ -846,6 +846,11 @@ static void decode_rxts(struct dp83640_private *dp83640,
        struct skb_shared_hwtstamps *shhwtstamps = NULL;
        struct sk_buff *skb;
        unsigned long flags;
+       u8 overflow;
+
+       overflow = (phy_rxts->ns_hi >> 14) & 0x3;
+       if (overflow)
+               pr_debug("rx timestamp queue overflow, count %d\n", overflow);
 
        spin_lock_irqsave(&dp83640->rx_lock, flags);
 
@@ -888,6 +893,7 @@ static void decode_txts(struct dp83640_private *dp83640,
        struct skb_shared_hwtstamps shhwtstamps;
        struct sk_buff *skb;
        u64 ns;
+       u8 overflow;
 
        /* We must already have the skb that triggered this. */
 
@@ -897,6 +903,17 @@ static void decode_txts(struct dp83640_private *dp83640,
                pr_debug("have timestamp but tx_queue empty\n");
                return;
        }
+
+       overflow = (phy_txts->ns_hi >> 14) & 0x3;
+       if (overflow) {
+               pr_debug("tx timestamp queue overflow, count %d\n", overflow);
+               while (skb) {
+                       skb_complete_tx_timestamp(skb, NULL);
+                       skb = skb_dequeue(&dp83640->tx_queue);
+               }
+               return;
+       }
+
        ns = phy2txts(phy_txts);
        memset(&shhwtstamps, 0, sizeof(shhwtstamps));
        shhwtstamps.hwtstamp = ns_to_ktime(ns);
index e3eb964..ab1d0fc 100644 (file)
@@ -446,6 +446,12 @@ static int m88e1510_config_aneg(struct phy_device *phydev)
        if (err < 0)
                return err;
 
+       return 0;
+}
+
+static int marvell_config_init(struct phy_device *phydev)
+{
+       /* Set registers from marvell,reg-init DT property */
        return marvell_of_reg_init(phydev);
 }
 
@@ -495,7 +501,7 @@ static int m88e1116r_config_init(struct phy_device *phydev)
 
        mdelay(500);
 
-       return 0;
+       return marvell_config_init(phydev);
 }
 
 static int m88e3016_config_init(struct phy_device *phydev)
@@ -514,7 +520,7 @@ static int m88e3016_config_init(struct phy_device *phydev)
        if (reg < 0)
                return reg;
 
-       return 0;
+       return marvell_config_init(phydev);
 }
 
 static int m88e1111_config_init(struct phy_device *phydev)
@@ -1078,6 +1084,7 @@ static struct phy_driver marvell_drivers[] = {
                .features = PHY_GBIT_FEATURES,
                .probe = marvell_probe,
                .flags = PHY_HAS_INTERRUPT,
+               .config_init = &marvell_config_init,
                .config_aneg = &marvell_config_aneg,
                .read_status = &genphy_read_status,
                .ack_interrupt = &marvell_ack_interrupt,
@@ -1149,6 +1156,7 @@ static struct phy_driver marvell_drivers[] = {
                .features = PHY_GBIT_FEATURES,
                .flags = PHY_HAS_INTERRUPT,
                .probe = marvell_probe,
+               .config_init = &marvell_config_init,
                .config_aneg = &m88e1121_config_aneg,
                .read_status = &marvell_read_status,
                .ack_interrupt = &marvell_ack_interrupt,
@@ -1167,6 +1175,7 @@ static struct phy_driver marvell_drivers[] = {
                .features = PHY_GBIT_FEATURES,
                .flags = PHY_HAS_INTERRUPT,
                .probe = marvell_probe,
+               .config_init = &marvell_config_init,
                .config_aneg = &m88e1318_config_aneg,
                .read_status = &marvell_read_status,
                .ack_interrupt = &marvell_ack_interrupt,
@@ -1259,6 +1268,7 @@ static struct phy_driver marvell_drivers[] = {
                .features = PHY_GBIT_FEATURES,
                .flags = PHY_HAS_INTERRUPT,
                .probe = marvell_probe,
+               .config_init = &marvell_config_init,
                .config_aneg = &m88e1510_config_aneg,
                .read_status = &marvell_read_status,
                .ack_interrupt = &marvell_ack_interrupt,
@@ -1277,6 +1287,7 @@ static struct phy_driver marvell_drivers[] = {
                .features = PHY_GBIT_FEATURES,
                .flags = PHY_HAS_INTERRUPT,
                .probe = marvell_probe,
+               .config_init = &marvell_config_init,
                .config_aneg = &m88e1510_config_aneg,
                .read_status = &marvell_read_status,
                .ack_interrupt = &marvell_ack_interrupt,
index 03833db..dc85f70 100644 (file)
@@ -297,6 +297,17 @@ static int kszphy_config_init(struct phy_device *phydev)
        if (priv->led_mode >= 0)
                kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
 
+       if (phy_interrupt_is_valid(phydev)) {
+               int ctl = phy_read(phydev, MII_BMCR);
+
+               if (ctl < 0)
+                       return ctl;
+
+               ret = phy_write(phydev, MII_BMCR, ctl & ~BMCR_ANENABLE);
+               if (ret < 0)
+                       return ret;
+       }
+
        return 0;
 }
 
@@ -635,6 +646,21 @@ static void kszphy_get_stats(struct phy_device *phydev,
                data[i] = kszphy_get_stat(phydev, i);
 }
 
+static int kszphy_resume(struct phy_device *phydev)
+{
+       int value;
+
+       mutex_lock(&phydev->lock);
+
+       value = phy_read(phydev, MII_BMCR);
+       phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN);
+
+       kszphy_config_intr(phydev);
+       mutex_unlock(&phydev->lock);
+
+       return 0;
+}
+
 static int kszphy_probe(struct phy_device *phydev)
 {
        const struct kszphy_type *type = phydev->drv->driver_data;
@@ -844,7 +870,7 @@ static struct phy_driver ksphy_driver[] = {
        .get_strings    = kszphy_get_strings,
        .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
-       .resume         = genphy_resume,
+       .resume         = kszphy_resume,
 }, {
        .phy_id         = PHY_ID_KSZ8061,
        .name           = "Micrel KSZ8061",
index 8763bb2..5590b9c 100644 (file)
@@ -692,25 +692,29 @@ void phy_change(struct work_struct *work)
        struct phy_device *phydev =
                container_of(work, struct phy_device, phy_queue);
 
-       if (phydev->drv->did_interrupt &&
-           !phydev->drv->did_interrupt(phydev))
-               goto ignore;
+       if (phy_interrupt_is_valid(phydev)) {
+               if (phydev->drv->did_interrupt &&
+                   !phydev->drv->did_interrupt(phydev))
+                       goto ignore;
 
-       if (phy_disable_interrupts(phydev))
-               goto phy_err;
+               if (phy_disable_interrupts(phydev))
+                       goto phy_err;
+       }
 
        mutex_lock(&phydev->lock);
        if ((PHY_RUNNING == phydev->state) || (PHY_NOLINK == phydev->state))
                phydev->state = PHY_CHANGELINK;
        mutex_unlock(&phydev->lock);
 
-       atomic_dec(&phydev->irq_disable);
-       enable_irq(phydev->irq);
+       if (phy_interrupt_is_valid(phydev)) {
+               atomic_dec(&phydev->irq_disable);
+               enable_irq(phydev->irq);
 
-       /* Reenable interrupts */
-       if (PHY_HALTED != phydev->state &&
-           phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED))
-               goto irq_enable_err;
+               /* Reenable interrupts */
+               if (PHY_HALTED != phydev->state &&
+                   phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED))
+                       goto irq_enable_err;
+       }
 
        /* reschedule state queue work to run as soon as possible */
        cancel_delayed_work_sync(&phydev->state_queue);
@@ -905,10 +909,10 @@ void phy_state_machine(struct work_struct *work)
                phydev->adjust_link(phydev->attached_dev);
                break;
        case PHY_RUNNING:
-               /* Only register a CHANGE if we are polling or ignoring
-                * interrupts and link changed since latest checking.
+               /* Only register a CHANGE if we are polling and link changed
+                * since latest checking.
                 */
-               if (!phy_interrupt_is_valid(phydev)) {
+               if (phydev->irq == PHY_POLL) {
                        old_link = phydev->link;
                        err = phy_read_status(phydev);
                        if (err)
@@ -1000,15 +1004,21 @@ void phy_state_machine(struct work_struct *work)
                   phy_state_to_str(old_state),
                   phy_state_to_str(phydev->state));
 
-       queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
-                          PHY_STATE_TIME * HZ);
+       /* Only re-schedule a PHY state machine change if we are polling the
+        * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
+        * between states from phy_mac_interrupt()
+        */
+       if (phydev->irq == PHY_POLL)
+               queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+                                  PHY_STATE_TIME * HZ);
 }
 
 void phy_mac_interrupt(struct phy_device *phydev, int new_link)
 {
-       cancel_work_sync(&phydev->phy_queue);
        phydev->link = new_link;
-       schedule_work(&phydev->phy_queue);
+
+       /* Trigger a state machine change */
+       queue_work(system_power_efficient_wq, &phydev->phy_queue);
 }
 EXPORT_SYMBOL(phy_mac_interrupt);
 
index bad3f00..e551f3a 100644 (file)
@@ -1410,7 +1410,7 @@ int genphy_config_init(struct phy_device *phydev)
 
        features = (SUPPORTED_TP | SUPPORTED_MII
                        | SUPPORTED_AUI | SUPPORTED_FIBRE |
-                       SUPPORTED_BNC);
+                       SUPPORTED_BNC | SUPPORTED_Pause | SUPPORTED_Asym_Pause);
 
        /* Do we support autonegotiation? */
        val = phy_read(phydev, MII_BMSR);
index e485f26..2e21e93 100644 (file)
 #include <linux/netdevice.h>
 #include <linux/smscphy.h>
 
+struct smsc_phy_priv {
+       bool energy_enable;
+};
+
 static int smsc_phy_config_intr(struct phy_device *phydev)
 {
        int rc = phy_write (phydev, MII_LAN83C185_IM,
@@ -43,19 +47,14 @@ static int smsc_phy_ack_interrupt(struct phy_device *phydev)
 
 static int smsc_phy_config_init(struct phy_device *phydev)
 {
-       int __maybe_unused len;
-       struct device *dev __maybe_unused = &phydev->mdio.dev;
-       struct device_node *of_node __maybe_unused = dev->of_node;
+       struct smsc_phy_priv *priv = phydev->priv;
+
        int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
-       int enable_energy = 1;
 
        if (rc < 0)
                return rc;
 
-       if (of_find_property(of_node, "smsc,disable-energy-detect", &len))
-               enable_energy = 0;
-
-       if (enable_energy) {
+       if (priv->energy_enable) {
                /* Enable energy detect mode for this SMSC Transceivers */
                rc = phy_write(phydev, MII_LAN83C185_CTRL_STATUS,
                               rc | MII_LAN83C185_EDPWRDOWN);
@@ -110,10 +109,13 @@ static int lan911x_config_init(struct phy_device *phydev)
  */
 static int lan87xx_read_status(struct phy_device *phydev)
 {
+       struct smsc_phy_priv *priv = phydev->priv;
+
        int err = genphy_read_status(phydev);
-       int i;
 
-       if (!phydev->link) {
+       if (!phydev->link && priv->energy_enable) {
+               int i;
+
                /* Disable EDPD to wake up PHY */
                int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
                if (rc < 0)
@@ -149,6 +151,26 @@ static int lan87xx_read_status(struct phy_device *phydev)
        return err;
 }
 
+static int smsc_phy_probe(struct phy_device *phydev)
+{
+       struct device *dev = &phydev->mdio.dev;
+       struct device_node *of_node = dev->of_node;
+       struct smsc_phy_priv *priv;
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->energy_enable = true;
+
+       if (of_property_read_bool(of_node, "smsc,disable-energy-detect"))
+               priv->energy_enable = false;
+
+       phydev->priv = priv;
+
+       return 0;
+}
+
 static struct phy_driver smsc_phy_driver[] = {
 {
        .phy_id         = 0x0007c0a0, /* OUI=0x00800f, Model#=0x0a */
@@ -159,6 +181,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = genphy_read_status,
@@ -180,6 +204,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = genphy_read_status,
@@ -201,6 +227,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = lan87xx_read_status,
@@ -222,6 +250,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = genphy_read_status,
@@ -242,6 +272,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = lan87xx_read_status,
@@ -263,6 +295,8 @@ static struct phy_driver smsc_phy_driver[] = {
                                | SUPPORTED_Asym_Pause),
        .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
 
+       .probe          = smsc_phy_probe,
+
        /* basic functions */
        .config_aneg    = genphy_config_aneg,
        .read_status    = lan87xx_read_status,
index fc8ad00..d61da9e 100644 (file)
@@ -443,9 +443,14 @@ static ssize_t ppp_read(struct file *file, char __user *buf,
                         * network traffic (demand mode).
                         */
                        struct ppp *ppp = PF_TO_PPP(pf);
+
+                       ppp_recv_lock(ppp);
                        if (ppp->n_channels == 0 &&
-                           (ppp->flags & SC_LOOP_TRAFFIC) == 0)
+                           (ppp->flags & SC_LOOP_TRAFFIC) == 0) {
+                               ppp_recv_unlock(ppp);
                                break;
+                       }
+                       ppp_recv_unlock(ppp);
                }
                ret = -EAGAIN;
                if (file->f_flags & O_NONBLOCK)
@@ -532,9 +537,12 @@ static unsigned int ppp_poll(struct file *file, poll_table *wait)
        else if (pf->kind == INTERFACE) {
                /* see comment in ppp_read */
                struct ppp *ppp = PF_TO_PPP(pf);
+
+               ppp_recv_lock(ppp);
                if (ppp->n_channels == 0 &&
                    (ppp->flags & SC_LOOP_TRAFFIC) == 0)
                        mask |= POLLIN | POLLRDNORM;
+               ppp_recv_unlock(ppp);
        }
 
        return mask;
@@ -2808,6 +2816,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit,
 
 out2:
        mutex_unlock(&pn->all_ppp_mutex);
+       rtnl_unlock();
        free_netdev(dev);
 out1:
        *retp = ret;
index f3c6302..4ddae81 100644 (file)
@@ -395,6 +395,8 @@ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb)
 
                if (!__pppoe_xmit(sk_pppox(relay_po), skb))
                        goto abort_put;
+
+               sock_put(sk_pppox(relay_po));
        } else {
                if (sock_queue_rcv_skb(sk, skb))
                        goto abort_kfree;
index 90868ca..ae0905e 100644 (file)
@@ -129,24 +129,27 @@ static int lookup_chan_dst(u16 call_id, __be32 d_addr)
        return i < MAX_CALLID;
 }
 
-static int add_chan(struct pppox_sock *sock)
+static int add_chan(struct pppox_sock *sock,
+                   struct pptp_addr *sa)
 {
        static int call_id;
 
        spin_lock(&chan_lock);
-       if (!sock->proto.pptp.src_addr.call_id) {
+       if (!sa->call_id)       {
                call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1);
                if (call_id == MAX_CALLID) {
                        call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1);
                        if (call_id == MAX_CALLID)
                                goto out_err;
                }
-               sock->proto.pptp.src_addr.call_id = call_id;
-       } else if (test_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap))
+               sa->call_id = call_id;
+       } else if (test_bit(sa->call_id, callid_bitmap)) {
                goto out_err;
+       }
 
-       set_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap);
-       rcu_assign_pointer(callid_sock[sock->proto.pptp.src_addr.call_id], sock);
+       sock->proto.pptp.src_addr = *sa;
+       set_bit(sa->call_id, callid_bitmap);
+       rcu_assign_pointer(callid_sock[sa->call_id], sock);
        spin_unlock(&chan_lock);
 
        return 0;
@@ -416,7 +419,6 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
        struct sock *sk = sock->sk;
        struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr;
        struct pppox_sock *po = pppox_sk(sk);
-       struct pptp_opt *opt = &po->proto.pptp;
        int error = 0;
 
        if (sockaddr_len < sizeof(struct sockaddr_pppox))
@@ -424,10 +426,22 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
 
        lock_sock(sk);
 
-       opt->src_addr = sp->sa_addr.pptp;
-       if (add_chan(po))
+       if (sk->sk_state & PPPOX_DEAD) {
+               error = -EALREADY;
+               goto out;
+       }
+
+       if (sk->sk_state & PPPOX_BOUND) {
                error = -EBUSY;
+               goto out;
+       }
+
+       if (add_chan(po, &sp->sa_addr.pptp))
+               error = -EBUSY;
+       else
+               sk->sk_state |= PPPOX_BOUND;
 
+out:
        release_sock(sk);
        return error;
 }
@@ -498,7 +512,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
        }
 
        opt->dst_addr = sp->sa_addr.pptp;
-       sk->sk_state = PPPOX_CONNECTED;
+       sk->sk_state |= PPPOX_CONNECTED;
 
  end:
        release_sock(sk);
index 7f83504..cdde590 100644 (file)
@@ -395,6 +395,10 @@ config USB_NET_RNDIS_HOST
          The protocol specification is incomplete, and is controlled by
          (and for) Microsoft; it isn't an "Open" ecosystem or market.
 
+config USB_NET_CDC_SUBSET_ENABLE
+       tristate
+       depends on USB_NET_CDC_SUBSET
+
 config USB_NET_CDC_SUBSET
        tristate "Simple USB Network Links (CDC Ethernet subset)"
        depends on USB_USBNET
@@ -413,6 +417,7 @@ config USB_NET_CDC_SUBSET
 config USB_ALI_M5632
        bool "ALi M5632 based 'USB 2.0 Data Link' cables"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        help
          Choose this option if you're using a host-to-host cable
          based on this design, which supports USB 2.0 high speed.
@@ -420,6 +425,7 @@ config USB_ALI_M5632
 config USB_AN2720
        bool "AnchorChips 2720 based cables (Xircom PGUNET, ...)"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        help
          Choose this option if you're using a host-to-host cable
          based on this design.  Note that AnchorChips is now a
@@ -428,6 +434,7 @@ config USB_AN2720
 config USB_BELKIN
        bool "eTEK based host-to-host cables (Advance, Belkin, ...)"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        default y
        help
          Choose this option if you're using a host-to-host cable
@@ -437,6 +444,7 @@ config USB_BELKIN
 config USB_ARMLINUX
        bool "Embedded ARM Linux links (iPaq, ...)"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        default y
        help
          Choose this option to support the "usb-eth" networking driver
@@ -454,6 +462,7 @@ config USB_ARMLINUX
 config USB_EPSON2888
        bool "Epson 2888 based firmware (DEVELOPMENT)"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        help
          Choose this option to support the usb networking links used
          by some sample firmware from Epson.
@@ -461,6 +470,7 @@ config USB_EPSON2888
 config USB_KC2190
        bool "KT Technology KC2190 based cables (InstaNet)"
        depends on USB_NET_CDC_SUBSET
+       select USB_NET_CDC_SUBSET_ENABLE
        help
          Choose this option if you're using a host-to-host cable
          with one of these chips.
index b5f0406..37fb46a 100644 (file)
@@ -23,7 +23,7 @@ obj-$(CONFIG_USB_NET_GL620A)  += gl620a.o
 obj-$(CONFIG_USB_NET_NET1080)  += net1080.o
 obj-$(CONFIG_USB_NET_PLUSB)    += plusb.o
 obj-$(CONFIG_USB_NET_RNDIS_HOST)       += rndis_host.o
-obj-$(CONFIG_USB_NET_CDC_SUBSET)       += cdc_subset.o
+obj-$(CONFIG_USB_NET_CDC_SUBSET_ENABLE)        += cdc_subset.o
 obj-$(CONFIG_USB_NET_ZAURUS)   += zaurus.o
 obj-$(CONFIG_USB_NET_MCS7830)  += mcs7830.o
 obj-$(CONFIG_USB_USBNET)       += usbnet.o
index 224e7d8..cf77f2d 100644 (file)
@@ -134,7 +134,6 @@ static void ax88172a_remove_mdio(struct usbnet *dev)
 
        netdev_info(dev->net, "deregistering mdio bus %s\n", priv->mdio->id);
        mdiobus_unregister(priv->mdio);
-       kfree(priv->mdio->irq);
        mdiobus_free(priv->mdio);
 }
 
index dc0212c..86ba30b 100644 (file)
@@ -837,7 +837,11 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
 
        iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber;
 
-       /* reset data interface */
+       /* Reset data interface. Some devices will not reset properly
+        * unless they are configured first.  Toggle the altsetting to
+        * force a reset
+        */
+       usb_set_interface(dev->udev, iface_no, data_altsetting);
        temp = usb_set_interface(dev->udev, iface_no, 0);
        if (temp) {
                dev_dbg(&intf->dev, "set interface failed\n");
@@ -984,8 +988,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting);
 
 static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 {
-       int ret;
-
        /* MBIM backwards compatible function? */
        if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM)
                return -ENODEV;
@@ -994,16 +996,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
         * Additionally, generic NCM devices are assumed to accept arbitrarily
         * placed NDP.
         */
-       ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
-
-       /*
-        * We should get an event when network connection is "connected" or
-        * "disconnected". Set network connection in "disconnected" state
-        * (carrier is OFF) during attach, so the IP network stack does not
-        * start IPv6 negotiation and more.
-        */
-       usbnet_link_change(dev, 0, 0);
-       return ret;
+       return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
 }
 
 static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max)
@@ -1586,7 +1579,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb)
 
 static const struct driver_info cdc_ncm_info = {
        .description = "CDC NCM",
-       .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET,
+       .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
+                       | FLAG_LINK_INTR,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
@@ -1599,7 +1593,7 @@ static const struct driver_info cdc_ncm_info = {
 static const struct driver_info wwan_info = {
        .description = "Mobile Broadband Network Device",
        .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
-                       | FLAG_WWAN,
+                       | FLAG_LINK_INTR | FLAG_WWAN,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
@@ -1612,7 +1606,7 @@ static const struct driver_info wwan_info = {
 static const struct driver_info wwan_noarp_info = {
        .description = "Mobile Broadband Network Device (NO ARP)",
        .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
-                       | FLAG_WWAN | FLAG_NOARP,
+                       | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP,
        .bind = cdc_ncm_bind,
        .unbind = cdc_ncm_unbind,
        .manage_power = usbnet_manage_power,
index 2ed5333..1c299b8 100644 (file)
@@ -36,7 +36,7 @@
 #define DRIVER_AUTHOR  "WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC    "LAN78XX USB 3.0 Gigabit Ethernet Devices"
 #define DRIVER_NAME    "lan78xx"
-#define DRIVER_VERSION "1.0.1"
+#define DRIVER_VERSION "1.0.2"
 
 #define TX_TIMEOUT_JIFFIES             (5 * HZ)
 #define THROTTLE_JIFFIES               (HZ / 8)
@@ -462,32 +462,53 @@ static int lan78xx_read_raw_eeprom(struct lan78xx_net *dev, u32 offset,
                                   u32 length, u8 *data)
 {
        u32 val;
+       u32 saved;
        int i, ret;
+       int retval;
 
-       ret = lan78xx_eeprom_confirm_not_busy(dev);
-       if (ret)
-               return ret;
+       /* depends on chip, some EEPROM pins are muxed with LED function.
+        * disable & restore LED function to access EEPROM.
+        */
+       ret = lan78xx_read_reg(dev, HW_CFG, &val);
+       saved = val;
+       if ((dev->devid & ID_REV_CHIP_ID_MASK_) == 0x78000000) {
+               val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_);
+               ret = lan78xx_write_reg(dev, HW_CFG, val);
+       }
+
+       retval = lan78xx_eeprom_confirm_not_busy(dev);
+       if (retval)
+               return retval;
 
        for (i = 0; i < length; i++) {
                val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_READ_;
                val |= (offset & E2P_CMD_EPC_ADDR_MASK_);
                ret = lan78xx_write_reg(dev, E2P_CMD, val);
-               if (unlikely(ret < 0))
-                       return -EIO;
+               if (unlikely(ret < 0)) {
+                       retval = -EIO;
+                       goto exit;
+               }
 
-               ret = lan78xx_wait_eeprom(dev);
-               if (ret < 0)
-                       return ret;
+               retval = lan78xx_wait_eeprom(dev);
+               if (retval < 0)
+                       goto exit;
 
                ret = lan78xx_read_reg(dev, E2P_DATA, &val);
-               if (unlikely(ret < 0))
-                       return -EIO;
+               if (unlikely(ret < 0)) {
+                       retval = -EIO;
+                       goto exit;
+               }
 
                data[i] = val & 0xFF;
                offset++;
        }
 
-       return 0;
+       retval = 0;
+exit:
+       if ((dev->devid & ID_REV_CHIP_ID_MASK_) == 0x78000000)
+               ret = lan78xx_write_reg(dev, HW_CFG, saved);
+
+       return retval;
 }
 
 static int lan78xx_read_eeprom(struct lan78xx_net *dev, u32 offset,
@@ -509,44 +530,67 @@ static int lan78xx_write_raw_eeprom(struct lan78xx_net *dev, u32 offset,
                                    u32 length, u8 *data)
 {
        u32 val;
+       u32 saved;
        int i, ret;
+       int retval;
 
-       ret = lan78xx_eeprom_confirm_not_busy(dev);
-       if (ret)
-               return ret;
+       /* depends on chip, some EEPROM pins are muxed with LED function.
+        * disable & restore LED function to access EEPROM.
+        */
+       ret = lan78xx_read_reg(dev, HW_CFG, &val);
+       saved = val;
+       if ((dev->devid & ID_REV_CHIP_ID_MASK_) == 0x78000000) {
+               val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_);
+               ret = lan78xx_write_reg(dev, HW_CFG, val);
+       }
+
+       retval = lan78xx_eeprom_confirm_not_busy(dev);
+       if (retval)
+               goto exit;
 
        /* Issue write/erase enable command */
        val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_EWEN_;
        ret = lan78xx_write_reg(dev, E2P_CMD, val);
-       if (unlikely(ret < 0))
-               return -EIO;
+       if (unlikely(ret < 0)) {
+               retval = -EIO;
+               goto exit;
+       }
 
-       ret = lan78xx_wait_eeprom(dev);
-       if (ret < 0)
-               return ret;
+       retval = lan78xx_wait_eeprom(dev);
+       if (retval < 0)
+               goto exit;
 
        for (i = 0; i < length; i++) {
                /* Fill data register */
                val = data[i];
                ret = lan78xx_write_reg(dev, E2P_DATA, val);
-               if (ret < 0)
-                       return ret;
+               if (ret < 0) {
+                       retval = -EIO;
+                       goto exit;
+               }
 
                /* Send "write" command */
                val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_WRITE_;
                val |= (offset & E2P_CMD_EPC_ADDR_MASK_);
                ret = lan78xx_write_reg(dev, E2P_CMD, val);
-               if (ret < 0)
-                       return ret;
+               if (ret < 0) {
+                       retval = -EIO;
+                       goto exit;
+               }
 
-               ret = lan78xx_wait_eeprom(dev);
-               if (ret < 0)
-                       return ret;
+               retval = lan78xx_wait_eeprom(dev);
+               if (retval < 0)
+                       goto exit;
 
                offset++;
        }
 
-       return 0;
+       retval = 0;
+exit:
+       if ((dev->devid & ID_REV_CHIP_ID_MASK_) == 0x78000000)
+               ret = lan78xx_write_reg(dev, HW_CFG, saved);
+
+       return retval;
 }
 
 static int lan78xx_read_raw_otp(struct lan78xx_net *dev, u32 offset,
@@ -904,7 +948,6 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
 
        if (!phydev->link && dev->link_on) {
                dev->link_on = false;
-               netif_carrier_off(dev->net);
 
                /* reset MAC */
                ret = lan78xx_read_reg(dev, MAC_CR, &buf);
@@ -914,6 +957,8 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
                ret = lan78xx_write_reg(dev, MAC_CR, buf);
                if (unlikely(ret < 0))
                        return -EIO;
+
+               phy_mac_interrupt(phydev, 0);
        } else if (phydev->link && !dev->link_on) {
                dev->link_on = true;
 
@@ -953,7 +998,7 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
                          ethtool_cmd_speed(&ecmd), ecmd.duplex, ladv, radv);
 
                ret = lan78xx_update_flowcontrol(dev, ecmd.duplex, ladv, radv);
-               netif_carrier_on(dev->net);
+               phy_mac_interrupt(phydev, 1);
        }
 
        return ret;
@@ -1495,7 +1540,6 @@ done:
 static int lan78xx_mdio_init(struct lan78xx_net *dev)
 {
        int ret;
-       int i;
 
        dev->mdiobus = mdiobus_alloc();
        if (!dev->mdiobus) {
@@ -1511,10 +1555,6 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev)
        snprintf(dev->mdiobus->id, MII_BUS_ID_SIZE, "usb-%03d:%03d",
                 dev->udev->bus->busnum, dev->udev->devnum);
 
-       /* handle our own interrupt */
-       for (i = 0; i < PHY_MAX_ADDR; i++)
-               dev->mdiobus->irq[i] = PHY_IGNORE_INTERRUPT;
-
        switch (dev->devid & ID_REV_CHIP_ID_MASK_) {
        case 0x78000000:
        case 0x78500000:
@@ -1558,6 +1598,16 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
                return -EIO;
        }
 
+       /* Enable PHY interrupts.
+        * We handle our own interrupt
+        */
+       ret = phy_read(phydev, LAN88XX_INT_STS);
+       ret = phy_write(phydev, LAN88XX_INT_MASK,
+                       LAN88XX_INT_MASK_MDINTPIN_EN_ |
+                       LAN88XX_INT_MASK_LINK_CHANGE_);
+
+       phydev->irq = PHY_IGNORE_INTERRUPT;
+
        ret = phy_connect_direct(dev->net, phydev,
                                 lan78xx_link_status_change,
                                 PHY_INTERFACE_MODE_GMII);
@@ -1580,14 +1630,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
                              SUPPORTED_Pause | SUPPORTED_Asym_Pause);
        genphy_config_aneg(phydev);
 
-       /* Workaround to enable PHY interrupt.
-        * phy_start_interrupts() is API for requesting and enabling
-        * PHY interrupt. However, USB-to-Ethernet device can't use
-        * request_irq() called in phy_start_interrupts().
-        * Set PHY to PHY_HALTED and call phy_start()
-        * to make a call to phy_enable_interrupts()
-        */
-       phy_stop(phydev);
        phy_start(phydev);
 
        netif_dbg(dev, ifup, dev->net, "phy initialised successfully");
@@ -2221,7 +2263,9 @@ netdev_tx_t lan78xx_start_xmit(struct sk_buff *skb, struct net_device *net)
        if (skb2) {
                skb_queue_tail(&dev->txq_pend, skb2);
 
-               if (skb_queue_len(&dev->txq_pend) > 10)
+               /* throttle TX patch at slower than SUPER SPEED USB */
+               if ((dev->udev->speed < USB_SPEED_SUPER) &&
+                   (skb_queue_len(&dev->txq_pend) > 10))
                        netif_stop_queue(net);
        } else {
                netif_dbg(dev, tx_err, dev->net,
index 23e9880..a3a4ccf 100644 (file)
@@ -637,6 +637,7 @@ static const struct usb_device_id products[] = {
 
        /* 3. Combined interface devices matching on interface number */
        {QMI_FIXED_INTF(0x0408, 0xea42, 4)},    /* Yota / Megafon M100-1 */
+       {QMI_FIXED_INTF(0x05c6, 0x6001, 3)},    /* 4G LTE usb-modem U901 */
        {QMI_FIXED_INTF(0x05c6, 0x7000, 0)},
        {QMI_FIXED_INTF(0x05c6, 0x7001, 1)},
        {QMI_FIXED_INTF(0x05c6, 0x7002, 1)},
@@ -860,8 +861,10 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x1199, 0x9056, 8)},    /* Sierra Wireless Modem */
        {QMI_FIXED_INTF(0x1199, 0x9057, 8)},
        {QMI_FIXED_INTF(0x1199, 0x9061, 8)},    /* Sierra Wireless Modem */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 8)},    /* Sierra Wireless MC74xx/EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 10)},   /* Sierra Wireless MC74xx/EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9071, 8)},    /* Sierra Wireless MC74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9071, 10)},   /* Sierra Wireless MC74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9079, 8)},    /* Sierra Wireless EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x9079, 10)},   /* Sierra Wireless EM74xx */
        {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)},    /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
        {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},    /* Alcatel L800MA */
        {QMI_FIXED_INTF(0x2357, 0x0201, 4)},    /* TP-LINK HSUPA Modem MA180 */
@@ -884,6 +887,7 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x413c, 0x81a8, 8)},    /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */
        {QMI_FIXED_INTF(0x413c, 0x81a9, 8)},    /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */
        {QMI_FIXED_INTF(0x413c, 0x81b1, 8)},    /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */
+       {QMI_FIXED_INTF(0x413c, 0x81b3, 8)},    /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */
        {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},    /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
        {QMI_FIXED_INTF(0x22de, 0x9061, 3)},    /* WeTelecom WPD-600N */
        {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},    /* SIMCom 7230E */
index 0b0ba7e..1079812 100644 (file)
@@ -1769,6 +1769,13 @@ out3:
        if (info->unbind)
                info->unbind (dev, udev);
 out1:
+       /* subdrivers must undo all they did in bind() if they
+        * fail it, but we may fail later and a deferred kevent
+        * may trigger an error resubmitting itself and, worse,
+        * schedule a timer. So we kill it all just in case.
+        */
+       cancel_work_sync(&dev->kevent);
+       del_timer_sync(&dev->delay);
        free_netdev(net);
 out:
        return status;
index 221a530..72ba8ae 100644 (file)
@@ -377,7 +377,7 @@ union Vmxnet3_GenericDesc {
 #define VMXNET3_TX_RING_MAX_SIZE   4096
 #define VMXNET3_TC_RING_MAX_SIZE   4096
 #define VMXNET3_RX_RING_MAX_SIZE   4096
-#define VMXNET3_RX_RING2_MAX_SIZE  2048
+#define VMXNET3_RX_RING2_MAX_SIZE  4096
 #define VMXNET3_RC_RING_MAX_SIZE   8192
 
 /* a list of reasons for queue stop */
index 0cbf520..fc895d0 100644 (file)
@@ -814,7 +814,7 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter)
 
 
 /*
- *    parse and copy relevant protocol headers:
+ *    parse relevant protocol headers:
  *      For a tso pkt, relevant headers are L2/3/4 including options
  *      For a pkt requesting csum offloading, they are L2/3 and may include L4
  *      if it's a TCP/UDP pkt
@@ -827,15 +827,14 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter)
  * Other effects:
  *    1. related *ctx fields are updated.
  *    2. ctx->copy_size is # of bytes copied
- *    3. the portion copied is guaranteed to be in the linear part
+ *    3. the portion to be copied is guaranteed to be in the linear part
  *
  */
 static int
-vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
-                          struct vmxnet3_tx_ctx *ctx,
-                          struct vmxnet3_adapter *adapter)
+vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+                 struct vmxnet3_tx_ctx *ctx,
+                 struct vmxnet3_adapter *adapter)
 {
-       struct Vmxnet3_TxDataDesc *tdd;
        u8 protocol = 0;
 
        if (ctx->mss) { /* TSO */
@@ -892,16 +891,34 @@ vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                return 0;
        }
 
+       return 1;
+err:
+       return -1;
+}
+
+/*
+ *    copy relevant protocol headers to the transmit ring:
+ *      For a tso pkt, relevant headers are L2/3/4 including options
+ *      For a pkt requesting csum offloading, they are L2/3 and may include L4
+ *      if it's a TCP/UDP pkt
+ *
+ *
+ *    Note that this requires that vmxnet3_parse_hdr be called first to set the
+ *      appropriate bits in ctx first
+ */
+static void
+vmxnet3_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+                struct vmxnet3_tx_ctx *ctx,
+                struct vmxnet3_adapter *adapter)
+{
+       struct Vmxnet3_TxDataDesc *tdd;
+
        tdd = tq->data_ring.base + tq->tx_ring.next2fill;
 
        memcpy(tdd->data, skb->data, ctx->copy_size);
        netdev_dbg(adapter->netdev,
                "copy %u bytes to dataRing[%u]\n",
                ctx->copy_size, tq->tx_ring.next2fill);
-       return 1;
-
-err:
-       return -1;
 }
 
 
@@ -998,22 +1015,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                }
        }
 
-       spin_lock_irqsave(&tq->tx_lock, flags);
-
-       if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
-               tq->stats.tx_ring_full++;
-               netdev_dbg(adapter->netdev,
-                       "tx queue stopped on %s, next2comp %u"
-                       " next2fill %u\n", adapter->netdev->name,
-                       tq->tx_ring.next2comp, tq->tx_ring.next2fill);
-
-               vmxnet3_tq_stop(tq, adapter);
-               spin_unlock_irqrestore(&tq->tx_lock, flags);
-               return NETDEV_TX_BUSY;
-       }
-
-
-       ret = vmxnet3_parse_and_copy_hdr(skb, tq, &ctx, adapter);
+       ret = vmxnet3_parse_hdr(skb, tq, &ctx, adapter);
        if (ret >= 0) {
                BUG_ON(ret <= 0 && ctx.copy_size != 0);
                /* hdrs parsed, check against other limits */
@@ -1033,9 +1035,26 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
                }
        } else {
                tq->stats.drop_hdr_inspect_err++;
-               goto unlock_drop_pkt;
+               goto drop_pkt;
        }
 
+       spin_lock_irqsave(&tq->tx_lock, flags);
+
+       if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
+               tq->stats.tx_ring_full++;
+               netdev_dbg(adapter->netdev,
+                       "tx queue stopped on %s, next2comp %u"
+                       " next2fill %u\n", adapter->netdev->name,
+                       tq->tx_ring.next2comp, tq->tx_ring.next2fill);
+
+               vmxnet3_tq_stop(tq, adapter);
+               spin_unlock_irqrestore(&tq->tx_lock, flags);
+               return NETDEV_TX_BUSY;
+       }
+
+
+       vmxnet3_copy_hdr(skb, tq, &ctx, adapter);
+
        /* fill tx descs related to addr & len */
        if (vmxnet3_map_pkt(skb, &ctx, tq, adapter->pdev, adapter))
                goto unlock_drop_pkt;
index bdb8a6c..729c344 100644 (file)
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.4.5.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.4.6.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01040500
+#define VMXNET3_DRIVER_VERSION_NUM      0x01040600
 
 #if defined(CONFIG_PCI_MSI)
        /* RSS only makes sense if MSI-X is supported. */
index 66addb7..bdcf617 100644 (file)
@@ -104,20 +104,23 @@ static struct dst_ops vrf_dst_ops = {
 #if IS_ENABLED(CONFIG_IPV6)
 static bool check_ipv6_frame(const struct sk_buff *skb)
 {
-       const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data;
-       size_t hlen = sizeof(*ipv6h);
+       const struct ipv6hdr *ipv6h;
+       struct ipv6hdr _ipv6h;
        bool rc = true;
 
-       if (skb->len < hlen)
+       ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h);
+       if (!ipv6h)
                goto out;
 
        if (ipv6h->nexthdr == NEXTHDR_ICMP) {
                const struct icmp6hdr *icmph;
+               struct icmp6hdr _icmph;
 
-               if (skb->len < hlen + sizeof(*icmph))
+               icmph = skb_header_pointer(skb, sizeof(_ipv6h),
+                                          sizeof(_icmph), &_icmph);
+               if (!icmph)
                        goto out;
 
-               icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h));
                switch (icmph->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
index 2d88c79..1c32bd1 100644 (file)
@@ -73,7 +73,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 static int vxlan_net_id;
 static struct rtnl_link_ops vxlan_link_ops;
 
-static const u8 all_zeros_mac[ETH_ALEN];
+static const u8 all_zeros_mac[ETH_ALEN + 2];
 
 static int vxlan_sock_add(struct vxlan_dev *vxlan);
 
@@ -931,8 +931,10 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                                                     cb->nlh->nlmsg_seq,
                                                     RTM_NEWNEIGH,
                                                     NLM_F_MULTI, rd);
-                               if (err < 0)
+                               if (err < 0) {
+                                       cb->args[1] = err;
                                        goto out;
+                               }
 skip:
                                ++idx;
                        }
@@ -1306,8 +1308,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                gbp = (struct vxlanhdr_gbp *)vxh;
                md->gbp = ntohs(gbp->policy_id);
 
-               if (tun_dst)
+               if (tun_dst) {
                        tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
+                       tun_dst->u.tun_info.options_len = sizeof(*md);
+               }
 
                if (gbp->dont_learn)
                        md->gbp |= VXLAN_GBP_DONT_LEARN;
@@ -1985,11 +1989,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                                     vxlan->cfg.port_max, true);
 
        if (info) {
-               if (info->key.tun_flags & TUNNEL_CSUM)
-                       flags |= VXLAN_F_UDP_CSUM;
-               else
-                       flags &= ~VXLAN_F_UDP_CSUM;
-
                ttl = info->key.ttl;
                tos = info->key.tos;
 
@@ -2004,8 +2003,15 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                        goto drop;
                sk = vxlan->vn4_sock->sock->sk;
 
-               if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
-                       df = htons(IP_DF);
+               if (info) {
+                       if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+                               df = htons(IP_DF);
+
+                       if (info->key.tun_flags & TUNNEL_CSUM)
+                               flags |= VXLAN_F_UDP_CSUM;
+                       else
+                               flags &= ~VXLAN_F_UDP_CSUM;
+               }
 
                memset(&fl4, 0, sizeof(fl4));
                fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
@@ -2101,6 +2107,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                        return;
                }
 
+               if (info) {
+                       if (info->key.tun_flags & TUNNEL_CSUM)
+                               flags &= ~VXLAN_F_UDP_ZERO_CSUM6_TX;
+                       else
+                               flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
+               }
+
                ttl = ttl ? : ip6_dst_hoplimit(ndst);
                err = vxlan6_xmit_skb(ndst, sk, skb, dev, &saddr, &dst->sin6.sin6_addr,
                                      0, ttl, src_port, dst_port, htonl(vni << 8), md,
@@ -2162,9 +2175,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
        }
 
-       if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
-           info && info->mode & IP_TUNNEL_INFO_TX) {
-               vxlan_xmit_one(skb, dev, NULL, false);
+       if (vxlan->flags & VXLAN_F_COLLECT_METADATA) {
+               if (info && info->mode & IP_TUNNEL_INFO_TX)
+                       vxlan_xmit_one(skb, dev, NULL, false);
+               else
+                       kfree_skb(skb);
                return NETDEV_TX_OK;
        }
 
@@ -2358,29 +2373,43 @@ static void vxlan_set_multicast_list(struct net_device *dev)
 {
 }
 
-static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
+static int __vxlan_change_mtu(struct net_device *dev,
+                             struct net_device *lowerdev,
+                             struct vxlan_rdst *dst, int new_mtu, bool strict)
 {
-       struct vxlan_dev *vxlan = netdev_priv(dev);
-       struct vxlan_rdst *dst = &vxlan->default_dst;
-       struct net_device *lowerdev;
-       int max_mtu;
+       int max_mtu = IP_MAX_MTU;
 
-       lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex);
-       if (lowerdev == NULL)
-               return eth_change_mtu(dev, new_mtu);
+       if (lowerdev)
+               max_mtu = lowerdev->mtu;
 
        if (dst->remote_ip.sa.sa_family == AF_INET6)
-               max_mtu = lowerdev->mtu - VXLAN6_HEADROOM;
+               max_mtu -= VXLAN6_HEADROOM;
        else
-               max_mtu = lowerdev->mtu - VXLAN_HEADROOM;
+               max_mtu -= VXLAN_HEADROOM;
 
-       if (new_mtu < 68 || new_mtu > max_mtu)
+       if (new_mtu < 68)
                return -EINVAL;
 
+       if (new_mtu > max_mtu) {
+               if (strict)
+                       return -EINVAL;
+
+               new_mtu = max_mtu;
+       }
+
        dev->mtu = new_mtu;
        return 0;
 }
 
+static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
+{
+       struct vxlan_dev *vxlan = netdev_priv(dev);
+       struct vxlan_rdst *dst = &vxlan->default_dst;
+       struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
+                                                        dst->remote_ifindex);
+       return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true);
+}
+
 static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
                                struct ip_tunnel_info *info,
                                __be16 sport, __be16 dport)
@@ -2514,6 +2543,7 @@ static void vxlan_setup(struct net_device *dev)
        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
        dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
        netif_keep_dst(dev);
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
 
        INIT_LIST_HEAD(&vxlan->next);
@@ -2756,6 +2786,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
        int err;
        bool use_ipv6 = false;
        __be16 default_port = vxlan->cfg.dst_port;
+       struct net_device *lowerdev = NULL;
 
        vxlan->net = src_net;
 
@@ -2776,9 +2807,7 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
        }
 
        if (conf->remote_ifindex) {
-               struct net_device *lowerdev
-                        = __dev_get_by_index(src_net, conf->remote_ifindex);
-
+               lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
                dst->remote_ifindex = conf->remote_ifindex;
 
                if (!lowerdev) {
@@ -2802,6 +2831,12 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
                needed_headroom = lowerdev->hard_header_len;
        }
 
+       if (conf->mtu) {
+               err = __vxlan_change_mtu(dev, lowerdev, dst, conf->mtu, false);
+               if (err)
+                       return err;
+       }
+
        if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
                needed_headroom += VXLAN6_HEADROOM;
        else
index 7a72407..6292259 100644 (file)
@@ -1626,7 +1626,7 @@ try:
                if (state & Xpr) {
                        void __iomem *scc_addr;
                        unsigned long ring;
-                       int i;
+                       unsigned int i;
 
                        /*
                         * - the busy condition happens (sometimes);
index a7afdee..73fb423 100644 (file)
@@ -150,18 +150,18 @@ int ath9k_hw_nvram_swap_data(struct ath_hw *ah, bool *swap_needed, int size)
                return -EIO;
        }
 
-       if (magic == AR5416_EEPROM_MAGIC) {
-               *swap_needed = false;
-       } else if (swab16(magic) == AR5416_EEPROM_MAGIC) {
+       *swap_needed = false;
+       if (swab16(magic) == AR5416_EEPROM_MAGIC) {
                if (ah->ah_flags & AH_NO_EEP_SWAP) {
                        ath_info(common,
                                 "Ignoring endianness difference in EEPROM magic bytes.\n");
-
-                       *swap_needed = false;
                } else {
                        *swap_needed = true;
                }
-       } else {
+       } else if (magic != AR5416_EEPROM_MAGIC) {
+               if (ath9k_hw_use_flash(ah))
+                       return 0;
+
                ath_err(common,
                        "Invalid EEPROM Magic (0x%04x).\n", magic);
                return -EINVAL;
index 5363739..b98db8a 100644 (file)
@@ -879,11 +879,24 @@ int brcmf_sdiod_abort(struct brcmf_sdio_dev *sdiodev, uint fn)
        return 0;
 }
 
-static void brcmf_sdiod_sgtable_alloc(struct brcmf_sdio_dev *sdiodev)
+void brcmf_sdiod_sgtable_alloc(struct brcmf_sdio_dev *sdiodev)
 {
+       struct sdio_func *func;
+       struct mmc_host *host;
+       uint max_blocks;
        uint nents;
        int err;
 
+       func = sdiodev->func[2];
+       host = func->card->host;
+       sdiodev->sg_support = host->max_segs > 1;
+       max_blocks = min_t(uint, host->max_blk_count, 511u);
+       sdiodev->max_request_size = min_t(uint, host->max_req_size,
+                                         max_blocks * func->cur_blksize);
+       sdiodev->max_segment_count = min_t(uint, host->max_segs,
+                                          SG_MAX_SINGLE_ALLOC);
+       sdiodev->max_segment_size = host->max_seg_size;
+
        if (!sdiodev->sg_support)
                return;
 
@@ -1021,9 +1034,6 @@ static void brcmf_sdiod_host_fixup(struct mmc_host *host)
 
 static int brcmf_sdiod_probe(struct brcmf_sdio_dev *sdiodev)
 {
-       struct sdio_func *func;
-       struct mmc_host *host;
-       uint max_blocks;
        int ret = 0;
 
        sdiodev->num_funcs = 2;
@@ -1054,26 +1064,6 @@ static int brcmf_sdiod_probe(struct brcmf_sdio_dev *sdiodev)
                goto out;
        }
 
-       /*
-        * determine host related variables after brcmf_sdiod_probe()
-        * as func->cur_blksize is properly set and F2 init has been
-        * completed successfully.
-        */
-       func = sdiodev->func[2];
-       host = func->card->host;
-       sdiodev->sg_support = host->max_segs > 1;
-       max_blocks = min_t(uint, host->max_blk_count, 511u);
-       sdiodev->max_request_size = min_t(uint, host->max_req_size,
-                                         max_blocks * func->cur_blksize);
-       sdiodev->max_segment_count = min_t(uint, host->max_segs,
-                                          SG_MAX_SINGLE_ALLOC);
-       sdiodev->max_segment_size = host->max_seg_size;
-
-       /* allocate scatter-gather table. sg support
-        * will be disabled upon allocation failure.
-        */
-       brcmf_sdiod_sgtable_alloc(sdiodev);
-
        ret = brcmf_sdiod_freezer_attach(sdiodev);
        if (ret)
                goto out;
@@ -1084,7 +1074,7 @@ static int brcmf_sdiod_probe(struct brcmf_sdio_dev *sdiodev)
                ret = -ENODEV;
                goto out;
        }
-       brcmf_sdiod_host_fixup(host);
+       brcmf_sdiod_host_fixup(sdiodev->func[2]->card->host);
 out:
        if (ret)
                brcmf_sdiod_remove(sdiodev);
index 4265b50..cfee477 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/netdevice.h>
+#include <linux/module.h>
 #include <brcmu_wifi.h>
 #include <brcmu_utils.h>
 #include "core.h"
index dd66143..a14d9d9 100644 (file)
@@ -4114,6 +4114,11 @@ struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev)
                goto fail;
        }
 
+       /* allocate scatter-gather table. sg support
+        * will be disabled upon allocation failure.
+        */
+       brcmf_sdiod_sgtable_alloc(bus->sdiodev);
+
        /* Query the F2 block size, set roundup accordingly */
        bus->blocksize = bus->sdiodev->func[2]->cur_blksize;
        bus->roundup = min(max_roundup, bus->blocksize);
index 5ec7a6d..23f2231 100644 (file)
@@ -342,6 +342,7 @@ int brcmf_sdiod_ramrw(struct brcmf_sdio_dev *sdiodev, bool write, u32 address,
 
 /* Issue an abort to the specified function */
 int brcmf_sdiod_abort(struct brcmf_sdio_dev *sdiodev, uint fn);
+void brcmf_sdiod_sgtable_alloc(struct brcmf_sdio_dev *sdiodev);
 void brcmf_sdiod_change_state(struct brcmf_sdio_dev *sdiodev,
                              enum brcmf_sdiod_state state);
 #ifdef CONFIG_PM_SLEEP
index 8660677..7438fbe 100644 (file)
@@ -53,7 +53,6 @@ config IWLWIFI_LEDS
 
 config IWLDVM
        tristate "Intel Wireless WiFi DVM Firmware support"
-       depends on m
        help
          This is the driver that supports the DVM firmware. The list
          of the devices that use this firmware is available here:
index e60cf14..fa41a5e 100644 (file)
 #define IWL7260_UCODE_API_MAX  17
 #define IWL7265_UCODE_API_MAX  17
 #define IWL7265D_UCODE_API_MAX 20
+#define IWL3168_UCODE_API_MAX  20
 
 /* Oldest version we won't warn about */
 #define IWL7260_UCODE_API_OK   13
 #define IWL7265_UCODE_API_OK   13
 #define IWL7265D_UCODE_API_OK  13
+#define IWL3168_UCODE_API_OK   20
 
 /* Lowest firmware API version supported */
 #define IWL7260_UCODE_API_MIN  13
 #define IWL7265_UCODE_API_MIN  13
 #define IWL7265D_UCODE_API_MIN 13
+#define IWL3168_UCODE_API_MIN  20
 
 /* NVM versions */
 #define IWL7260_NVM_VERSION            0x0a1d
@@ -92,6 +95,8 @@
 #define IWL3160_TX_POWER_VERSION       0xffff /* meaningless */
 #define IWL3165_NVM_VERSION            0x709
 #define IWL3165_TX_POWER_VERSION       0xffff /* meaningless */
+#define IWL3168_NVM_VERSION            0xd01
+#define IWL3168_TX_POWER_VERSION       0xffff /* meaningless */
 #define IWL7265_NVM_VERSION            0x0a1d
 #define IWL7265_TX_POWER_VERSION       0xffff /* meaningless */
 #define IWL7265D_NVM_VERSION           0x0c11
 #define IWL3160_FW_PRE "iwlwifi-3160-"
 #define IWL3160_MODULE_FIRMWARE(api) IWL3160_FW_PRE __stringify(api) ".ucode"
 
+#define IWL3168_FW_PRE "iwlwifi-3168-"
+#define IWL3168_MODULE_FIRMWARE(api) IWL3168_FW_PRE __stringify(api) ".ucode"
+
 #define IWL7265_FW_PRE "iwlwifi-7265-"
 #define IWL7265_MODULE_FIRMWARE(api) IWL7265_FW_PRE __stringify(api) ".ucode"
 
@@ -180,6 +188,12 @@ static const struct iwl_ht_params iwl7000_ht_params = {
        .ucode_api_ok = IWL7265_UCODE_API_OK,                   \
        .ucode_api_min = IWL7265_UCODE_API_MIN
 
+#define IWL_DEVICE_3008                                                \
+       IWL_DEVICE_7000_COMMON,                                 \
+       .ucode_api_max = IWL3168_UCODE_API_MAX,                 \
+       .ucode_api_ok = IWL3168_UCODE_API_OK,                   \
+       .ucode_api_min = IWL3168_UCODE_API_MIN
+
 #define IWL_DEVICE_7005D                                       \
        IWL_DEVICE_7000_COMMON,                                 \
        .ucode_api_max = IWL7265D_UCODE_API_MAX,                \
@@ -299,11 +313,11 @@ const struct iwl_cfg iwl3165_2ac_cfg = {
 
 const struct iwl_cfg iwl3168_2ac_cfg = {
        .name = "Intel(R) Dual Band Wireless AC 3168",
-       .fw_name_pre = IWL7265D_FW_PRE,
-       IWL_DEVICE_7000,
+       .fw_name_pre = IWL3168_FW_PRE,
+       IWL_DEVICE_3008,
        .ht_params = &iwl7000_ht_params,
-       .nvm_ver = IWL3165_NVM_VERSION,
-       .nvm_calib_ver = IWL3165_TX_POWER_VERSION,
+       .nvm_ver = IWL3168_NVM_VERSION,
+       .nvm_calib_ver = IWL3168_TX_POWER_VERSION,
        .pwr_tx_backoffs = iwl7265_pwr_tx_backoffs,
        .dccm_len = IWL7265_DCCM_LEN,
 };
@@ -376,5 +390,6 @@ const struct iwl_cfg iwl7265d_n_cfg = {
 
 MODULE_FIRMWARE(IWL7260_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
 MODULE_FIRMWARE(IWL3160_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
+MODULE_FIRMWARE(IWL3168_MODULE_FIRMWARE(IWL3168_UCODE_API_OK));
 MODULE_FIRMWARE(IWL7265_MODULE_FIRMWARE(IWL7265_UCODE_API_OK));
 MODULE_FIRMWARE(IWL7265D_MODULE_FIRMWARE(IWL7265D_UCODE_API_OK));
index c84a029..bce9b34 100644 (file)
@@ -7,6 +7,7 @@
  *
  * Copyright(c) 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2014 - 2015 Intel Mobile Communications GmbH
+ * Copyright(c) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
 
 /* Highest firmware API version supported */
 #define IWL8000_UCODE_API_MAX  20
+#define IWL8265_UCODE_API_MAX  20
 
 /* Oldest version we won't warn about */
 #define IWL8000_UCODE_API_OK   13
+#define IWL8265_UCODE_API_OK   20
 
 /* Lowest firmware API version supported */
 #define IWL8000_UCODE_API_MIN  13
+#define IWL8265_UCODE_API_MIN  20
 
 /* NVM versions */
 #define IWL8000_NVM_VERSION            0x0a1d
 #define IWL8000_MODULE_FIRMWARE(api) \
        IWL8000_FW_PRE "-" __stringify(api) ".ucode"
 
+#define IWL8265_FW_PRE "iwlwifi-8265-"
+#define IWL8265_MODULE_FIRMWARE(api) \
+       IWL8265_FW_PRE __stringify(api) ".ucode"
+
 #define NVM_HW_SECTION_NUM_FAMILY_8000         10
 #define DEFAULT_NVM_FILE_FAMILY_8000B          "nvmData-8000B"
 #define DEFAULT_NVM_FILE_FAMILY_8000C          "nvmData-8000C"
@@ -144,10 +152,7 @@ static const struct iwl_tt_params iwl8000_tt_params = {
        .support_tx_backoff = true,
 };
 
-#define IWL_DEVICE_8000                                                        \
-       .ucode_api_max = IWL8000_UCODE_API_MAX,                         \
-       .ucode_api_ok = IWL8000_UCODE_API_OK,                           \
-       .ucode_api_min = IWL8000_UCODE_API_MIN,                         \
+#define IWL_DEVICE_8000_COMMON                                         \
        .device_family = IWL_DEVICE_FAMILY_8000,                        \
        .max_inst_size = IWL60_RTC_INST_SIZE,                           \
        .max_data_size = IWL60_RTC_DATA_SIZE,                           \
@@ -167,10 +172,28 @@ static const struct iwl_tt_params iwl8000_tt_params = {
        .thermal_params = &iwl8000_tt_params,                           \
        .apmg_not_supported = true
 
+#define IWL_DEVICE_8000                                                        \
+       IWL_DEVICE_8000_COMMON,                                         \
+       .ucode_api_max = IWL8000_UCODE_API_MAX,                         \
+       .ucode_api_ok = IWL8000_UCODE_API_OK,                           \
+       .ucode_api_min = IWL8000_UCODE_API_MIN                          \
+
+#define IWL_DEVICE_8260                                                        \
+       IWL_DEVICE_8000_COMMON,                                         \
+       .ucode_api_max = IWL8000_UCODE_API_MAX,                         \
+       .ucode_api_ok = IWL8000_UCODE_API_OK,                           \
+       .ucode_api_min = IWL8000_UCODE_API_MIN                          \
+
+#define IWL_DEVICE_8265                                                        \
+       IWL_DEVICE_8000_COMMON,                                         \
+       .ucode_api_max = IWL8265_UCODE_API_MAX,                         \
+       .ucode_api_ok = IWL8265_UCODE_API_OK,                           \
+       .ucode_api_min = IWL8265_UCODE_API_MIN                          \
+
 const struct iwl_cfg iwl8260_2n_cfg = {
        .name = "Intel(R) Dual Band Wireless N 8260",
        .fw_name_pre = IWL8000_FW_PRE,
-       IWL_DEVICE_8000,
+       IWL_DEVICE_8260,
        .ht_params = &iwl8000_ht_params,
        .nvm_ver = IWL8000_NVM_VERSION,
        .nvm_calib_ver = IWL8000_TX_POWER_VERSION,
@@ -179,7 +202,7 @@ const struct iwl_cfg iwl8260_2n_cfg = {
 const struct iwl_cfg iwl8260_2ac_cfg = {
        .name = "Intel(R) Dual Band Wireless AC 8260",
        .fw_name_pre = IWL8000_FW_PRE,
-       IWL_DEVICE_8000,
+       IWL_DEVICE_8260,
        .ht_params = &iwl8000_ht_params,
        .nvm_ver = IWL8000_NVM_VERSION,
        .nvm_calib_ver = IWL8000_TX_POWER_VERSION,
@@ -188,8 +211,8 @@ const struct iwl_cfg iwl8260_2ac_cfg = {
 
 const struct iwl_cfg iwl8265_2ac_cfg = {
        .name = "Intel(R) Dual Band Wireless AC 8265",
-       .fw_name_pre = IWL8000_FW_PRE,
-       IWL_DEVICE_8000,
+       .fw_name_pre = IWL8265_FW_PRE,
+       IWL_DEVICE_8265,
        .ht_params = &iwl8000_ht_params,
        .nvm_ver = IWL8000_NVM_VERSION,
        .nvm_calib_ver = IWL8000_TX_POWER_VERSION,
@@ -209,7 +232,7 @@ const struct iwl_cfg iwl4165_2ac_cfg = {
 const struct iwl_cfg iwl8260_2ac_sdio_cfg = {
        .name = "Intel(R) Dual Band Wireless-AC 8260",
        .fw_name_pre = IWL8000_FW_PRE,
-       IWL_DEVICE_8000,
+       IWL_DEVICE_8260,
        .ht_params = &iwl8000_ht_params,
        .nvm_ver = IWL8000_NVM_VERSION,
        .nvm_calib_ver = IWL8000_TX_POWER_VERSION,
@@ -236,3 +259,4 @@ const struct iwl_cfg iwl4165_2ac_sdio_cfg = {
 };
 
 MODULE_FIRMWARE(IWL8000_MODULE_FIRMWARE(IWL8000_UCODE_API_OK));
+MODULE_FIRMWARE(IWL8265_MODULE_FIRMWARE(IWL8265_UCODE_API_OK));
index 7acb490..ab4c2a0 100644 (file)
@@ -243,8 +243,10 @@ static int iwl_request_firmware(struct iwl_drv *drv, bool first)
        if (drv->trans->cfg->device_family == IWL_DEVICE_FAMILY_8000) {
                char rev_step = 'A' + CSR_HW_REV_STEP(drv->trans->hw_rev);
 
-               snprintf(drv->firmware_name, sizeof(drv->firmware_name),
-                        "%s%c-%s.ucode", name_pre, rev_step, tag);
+               if (rev_step != 'A')
+                       snprintf(drv->firmware_name,
+                                sizeof(drv->firmware_name), "%s%c-%s.ucode",
+                                name_pre, rev_step, tag);
        }
 
        IWL_DEBUG_INFO(drv, "attempting to load firmware %s'%s'\n",
index 0036d18..ba3f0bb 100644 (file)
@@ -510,6 +510,9 @@ struct iwl_mvm_tx_resp {
  * @scd_ssn: the index of the last contiguously sent packet
  * @txed: number of Txed frames in this batch
  * @txed_2_done: number of Acked frames in this batch
+ * @reduced_txp: power reduced according to TPC. This is the actual value and
+ *     not a copy from the LQ command. Thus, if not the first rate was used
+ *     for Tx-ing then this value will be set to 0 by FW.
  */
 struct iwl_mvm_ba_notif {
        __le32 sta_addr_lo32;
@@ -524,7 +527,8 @@ struct iwl_mvm_ba_notif {
        __le16 scd_ssn;
        u8 txed;
        u8 txed_2_done;
-       __le16 reserved1;
+       u8 reduced_txp;
+       u8 reserved1;
 } __packed;
 
 /*
index 4ed5180..0ccc697 100644 (file)
@@ -107,7 +107,7 @@ static int iwl_send_tx_ant_cfg(struct iwl_mvm *mvm, u8 valid_tx_ant)
                                    sizeof(tx_ant_cmd), &tx_ant_cmd);
 }
 
-static void iwl_free_fw_paging(struct iwl_mvm *mvm)
+void iwl_free_fw_paging(struct iwl_mvm *mvm)
 {
        int i;
 
@@ -127,6 +127,8 @@ static void iwl_free_fw_paging(struct iwl_mvm *mvm)
                             get_order(mvm->fw_paging_db[i].fw_paging_size));
        }
        kfree(mvm->trans->paging_download_buf);
+       mvm->trans->paging_download_buf = NULL;
+
        memset(mvm->fw_paging_db, 0, sizeof(mvm->fw_paging_db));
 }
 
index 5f3ac8c..ff7c6df 100644 (file)
@@ -1225,6 +1225,9 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm,
 void iwl_mvm_rx_umac_scan_iter_complete_notif(struct iwl_mvm *mvm,
                                              struct iwl_rx_cmd_buffer *rxb);
 
+/* Paging */
+void iwl_free_fw_paging(struct iwl_mvm *mvm);
+
 /* MVM debugfs */
 #ifdef CONFIG_IWLWIFI_DEBUGFS
 int iwl_mvm_dbgfs_register(struct iwl_mvm *mvm, struct dentry *dbgfs_dir);
index 89ea70d..e80be9a 100644 (file)
@@ -684,6 +684,8 @@ static void iwl_op_mode_mvm_stop(struct iwl_op_mode *op_mode)
        for (i = 0; i < NVM_MAX_NUM_SECTIONS; i++)
                kfree(mvm->nvm_sections[i].data);
 
+       iwl_free_fw_paging(mvm);
+
        iwl_mvm_tof_clean(mvm);
 
        ieee80211_free_hw(mvm->hw);
index 7bb6fd0..94caa88 100644 (file)
@@ -2,6 +2,7 @@
  *
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
+ * Copyright(c) 2016 Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -724,14 +725,28 @@ static int _rs_collect_tx_data(struct iwl_mvm *mvm,
        return 0;
 }
 
-static int rs_collect_tx_data(struct iwl_mvm *mvm,
-                             struct iwl_lq_sta *lq_sta,
-                             struct iwl_scale_tbl_info *tbl,
-                             int scale_index, int attempts, int successes,
-                             u8 reduced_txp)
+static int rs_collect_tpc_data(struct iwl_mvm *mvm,
+                              struct iwl_lq_sta *lq_sta,
+                              struct iwl_scale_tbl_info *tbl,
+                              int scale_index, int attempts, int successes,
+                              u8 reduced_txp)
+{
+       struct iwl_rate_scale_data *window = NULL;
+
+       if (WARN_ON_ONCE(reduced_txp > TPC_MAX_REDUCTION))
+               return -EINVAL;
+
+       window = &tbl->tpc_win[reduced_txp];
+       return  _rs_collect_tx_data(mvm, tbl, scale_index, attempts, successes,
+                                   window);
+}
+
+static int rs_collect_tlc_data(struct iwl_mvm *mvm,
+                              struct iwl_lq_sta *lq_sta,
+                              struct iwl_scale_tbl_info *tbl,
+                              int scale_index, int attempts, int successes)
 {
        struct iwl_rate_scale_data *window = NULL;
-       int ret;
 
        if (scale_index < 0 || scale_index >= IWL_RATE_COUNT)
                return -EINVAL;
@@ -745,16 +760,6 @@ static int rs_collect_tx_data(struct iwl_mvm *mvm,
 
        /* Select window for current tx bit rate */
        window = &(tbl->win[scale_index]);
-
-       ret = _rs_collect_tx_data(mvm, tbl, scale_index, attempts, successes,
-                                 window);
-       if (ret)
-               return ret;
-
-       if (WARN_ON_ONCE(reduced_txp > TPC_MAX_REDUCTION))
-               return -EINVAL;
-
-       window = &tbl->tpc_win[reduced_txp];
        return _rs_collect_tx_data(mvm, tbl, scale_index, attempts, successes,
                                   window);
 }
@@ -1301,17 +1306,30 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
         * first index into rate scale table.
         */
        if (info->flags & IEEE80211_TX_STAT_AMPDU) {
-               /* ampdu_ack_len = 0 marks no BA was received. In this case
-                * treat it as a single frame loss as we don't want the success
-                * ratio to dip too quickly because a BA wasn't received
+               rs_collect_tpc_data(mvm, lq_sta, curr_tbl, lq_rate.index,
+                                   info->status.ampdu_len,
+                                   info->status.ampdu_ack_len,
+                                   reduced_txp);
+
+               /* ampdu_ack_len = 0 marks no BA was received. For TLC, treat
+                * it as a single frame loss as we don't want the success ratio
+                * to dip too quickly because a BA wasn't received.
+                * For TPC, there's no need for this optimisation since we want
+                * to recover very quickly from a bad power reduction and,
+                * therefore we'd like the success ratio to get an immediate hit
+                * when failing to get a BA, so we'd switch back to a lower or
+                * zero power reduction. When FW transmits agg with a rate
+                * different from the initial rate, it will not use reduced txp
+                * and will send BA notification twice (one empty with reduced
+                * txp equal to the value from LQ and one with reduced txp 0).
+                * We need to update counters for each txp level accordingly.
                 */
                if (info->status.ampdu_ack_len == 0)
                        info->status.ampdu_len = 1;
 
-               rs_collect_tx_data(mvm, lq_sta, curr_tbl, lq_rate.index,
-                                  info->status.ampdu_len,
-                                  info->status.ampdu_ack_len,
-                                  reduced_txp);
+               rs_collect_tlc_data(mvm, lq_sta, curr_tbl, lq_rate.index,
+                                   info->status.ampdu_len,
+                                   info->status.ampdu_ack_len);
 
                /* Update success/fail counts if not searching for new mode */
                if (lq_sta->rs_state == RS_STATE_STAY_IN_COLUMN) {
@@ -1344,9 +1362,13 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
                        else
                                continue;
 
-                       rs_collect_tx_data(mvm, lq_sta, tmp_tbl, lq_rate.index,
-                                          1, i < retries ? 0 : legacy_success,
-                                          reduced_txp);
+                       rs_collect_tpc_data(mvm, lq_sta, tmp_tbl,
+                                           lq_rate.index, 1,
+                                           i < retries ? 0 : legacy_success,
+                                           reduced_txp);
+                       rs_collect_tlc_data(mvm, lq_sta, tmp_tbl,
+                                           lq_rate.index, 1,
+                                           i < retries ? 0 : legacy_success);
                }
 
                /* Update success/fail counts if not searching for new mode */
index 9a15642..ea1e177 100644 (file)
@@ -1298,6 +1298,10 @@ int iwl_mvm_sched_scan_start(struct iwl_mvm *mvm,
                return -EBUSY;
        }
 
+       /* we don't support "match all" in the firmware */
+       if (!req->n_match_sets)
+               return -EOPNOTSUPP;
+
        ret = iwl_mvm_check_running_scans(mvm, type);
        if (ret)
                return ret;
index 8bf48a7..a040edc 100644 (file)
@@ -423,6 +423,15 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
                return -1;
        }
 
+       /*
+        * Increase the pending frames counter, so that later when a reply comes
+        * in and the counter is decreased - we don't start getting negative
+        * values.
+        * Note that we don't need to make sure it isn't agg'd, since we're
+        * TXing non-sta
+        */
+       atomic_inc(&mvm->pending_frames[sta_id]);
+
        return 0;
 }
 
@@ -1029,7 +1038,6 @@ static void iwl_mvm_rx_tx_cmd_agg(struct iwl_mvm *mvm,
                struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
                mvmsta->tid_data[tid].rate_n_flags =
                        le32_to_cpu(tx_resp->initial_rate);
-               mvmsta->tid_data[tid].reduced_tpc = tx_resp->reduced_tpc;
                mvmsta->tid_data[tid].tx_time =
                        le16_to_cpu(tx_resp->wireless_media_time);
        }
@@ -1060,7 +1068,7 @@ static void iwl_mvm_tx_info_from_ba_notif(struct ieee80211_tx_info *info,
        /* TODO: not accounted if the whole A-MPDU failed */
        info->status.tx_time = tid_data->tx_time;
        info->status.status_driver_data[0] =
-               (void *)(uintptr_t)tid_data->reduced_tpc;
+               (void *)(uintptr_t)ba_notif->reduced_txp;
        info->status.status_driver_data[1] =
                (void *)(uintptr_t)tid_data->rate_n_flags;
 }
@@ -1133,6 +1141,8 @@ void iwl_mvm_rx_ba_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
                           scd_flow, ba_resp_scd_ssn, ba_notif->txed,
                           ba_notif->txed_2_done);
 
+       IWL_DEBUG_TX_REPLY(mvm, "reduced txp from ba notif %d\n",
+                          ba_notif->reduced_txp);
        tid_data->next_reclaimed = ba_resp_scd_ssn;
 
        iwl_mvm_check_ratid_empty(mvm, sta, tid);
index 6261a68..00335ea 100644 (file)
@@ -378,7 +378,10 @@ static const struct pci_device_id iwl_hw_card_ids[] = {
        {IWL_PCI_DEVICE(0x3165, 0x8110, iwl3165_2ac_cfg)},
 
 /* 3168 Series */
+       {IWL_PCI_DEVICE(0x24FB, 0x2010, iwl3168_2ac_cfg)},
        {IWL_PCI_DEVICE(0x24FB, 0x2110, iwl3168_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x24FB, 0x2050, iwl3168_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x24FB, 0x2150, iwl3168_2ac_cfg)},
        {IWL_PCI_DEVICE(0x24FB, 0x0000, iwl3168_2ac_cfg)},
 
 /* 7265 Series */
@@ -475,6 +478,7 @@ static const struct pci_device_id iwl_hw_card_ids[] = {
        {IWL_PCI_DEVICE(0x24F3, 0x0000, iwl8265_2ac_cfg)},
        {IWL_PCI_DEVICE(0x24FD, 0x0010, iwl8265_2ac_cfg)},
        {IWL_PCI_DEVICE(0x24FD, 0x8010, iwl8265_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x24FD, 0x0810, iwl8265_2ac_cfg)},
 
 /* 9000 Series */
        {IWL_PCI_DEVICE(0x9DF0, 0x2A10, iwl5165_2ac_cfg)},
index cc3888e..73c9559 100644 (file)
@@ -490,6 +490,15 @@ static inline void iwl_enable_interrupts(struct iwl_trans *trans)
        iwl_write32(trans, CSR_INT_MASK, trans_pcie->inta_mask);
 }
 
+static inline void iwl_enable_fw_load_int(struct iwl_trans *trans)
+{
+       struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+
+       IWL_DEBUG_ISR(trans, "Enabling FW load interrupt\n");
+       trans_pcie->inta_mask = CSR_INT_BIT_FH_TX;
+       iwl_write32(trans, CSR_INT_MASK, trans_pcie->inta_mask);
+}
+
 static inline void iwl_enable_rfkill_int(struct iwl_trans *trans)
 {
        struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
index ccafbd8..152cf9a 100644 (file)
@@ -1438,9 +1438,11 @@ irqreturn_t iwl_pcie_irq_handler(int irq, void *dev_id)
                         inta & ~trans_pcie->inta_mask);
        }
 
-       /* Re-enable all interrupts */
-       /* only Re-enable if disabled by irq */
-       if (test_bit(STATUS_INT_ENABLED, &trans->status))
+       /* we are loading the firmware, enable FH_TX interrupt only */
+       if (handled & CSR_INT_BIT_FH_TX)
+               iwl_enable_fw_load_int(trans);
+       /* only Re-enable all interrupt if disabled by irq */
+       else if (test_bit(STATUS_INT_ENABLED, &trans->status))
                iwl_enable_interrupts(trans);
        /* Re-enable RF_KILL if it occurred */
        else if (handled & CSR_INT_BIT_RF_KILL)
index d60a467..5a854c6 100644 (file)
@@ -1021,82 +1021,6 @@ static int iwl_pcie_load_given_ucode_8000(struct iwl_trans *trans,
                                               &first_ucode_section);
 }
 
-static int iwl_trans_pcie_start_fw(struct iwl_trans *trans,
-                                  const struct fw_img *fw, bool run_in_rfkill)
-{
-       struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
-       bool hw_rfkill;
-       int ret;
-
-       mutex_lock(&trans_pcie->mutex);
-
-       /* Someone called stop_device, don't try to start_fw */
-       if (trans_pcie->is_down) {
-               IWL_WARN(trans,
-                        "Can't start_fw since the HW hasn't been started\n");
-               ret = EIO;
-               goto out;
-       }
-
-       /* This may fail if AMT took ownership of the device */
-       if (iwl_pcie_prepare_card_hw(trans)) {
-               IWL_WARN(trans, "Exit HW not ready\n");
-               ret = -EIO;
-               goto out;
-       }
-
-       iwl_enable_rfkill_int(trans);
-
-       /* If platform's RF_KILL switch is NOT set to KILL */
-       hw_rfkill = iwl_is_rfkill_set(trans);
-       if (hw_rfkill)
-               set_bit(STATUS_RFKILL, &trans->status);
-       else
-               clear_bit(STATUS_RFKILL, &trans->status);
-       iwl_trans_pcie_rf_kill(trans, hw_rfkill);
-       if (hw_rfkill && !run_in_rfkill) {
-               ret = -ERFKILL;
-               goto out;
-       }
-
-       iwl_write32(trans, CSR_INT, 0xFFFFFFFF);
-
-       ret = iwl_pcie_nic_init(trans);
-       if (ret) {
-               IWL_ERR(trans, "Unable to init nic\n");
-               goto out;
-       }
-
-       /* make sure rfkill handshake bits are cleared */
-       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
-       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR,
-                   CSR_UCODE_DRV_GP1_BIT_CMD_BLOCKED);
-
-       /* clear (again), then enable host interrupts */
-       iwl_write32(trans, CSR_INT, 0xFFFFFFFF);
-       iwl_enable_interrupts(trans);
-
-       /* really make sure rfkill handshake bits are cleared */
-       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
-       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
-
-       /* Load the given image to the HW */
-       if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000)
-               ret = iwl_pcie_load_given_ucode_8000(trans, fw);
-       else
-               ret = iwl_pcie_load_given_ucode(trans, fw);
-
-out:
-       mutex_unlock(&trans_pcie->mutex);
-       return ret;
-}
-
-static void iwl_trans_pcie_fw_alive(struct iwl_trans *trans, u32 scd_addr)
-{
-       iwl_pcie_reset_ict(trans);
-       iwl_pcie_tx_start(trans, scd_addr);
-}
-
 static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
 {
        struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
@@ -1127,7 +1051,8 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
         * already dead.
         */
        if (test_and_clear_bit(STATUS_DEVICE_ENABLED, &trans->status)) {
-               IWL_DEBUG_INFO(trans, "DEVICE_ENABLED bit was set and is now cleared\n");
+               IWL_DEBUG_INFO(trans,
+                              "DEVICE_ENABLED bit was set and is now cleared\n");
                iwl_pcie_tx_stop(trans);
                iwl_pcie_rx_stop(trans);
 
@@ -1161,7 +1086,6 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
        iwl_disable_interrupts(trans);
        spin_unlock(&trans_pcie->irq_lock);
 
-
        /* clear all status bits */
        clear_bit(STATUS_SYNC_HCMD_ACTIVE, &trans->status);
        clear_bit(STATUS_INT_ENABLED, &trans->status);
@@ -1194,10 +1118,116 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
        if (hw_rfkill != was_hw_rfkill)
                iwl_trans_pcie_rf_kill(trans, hw_rfkill);
 
-       /* re-take ownership to prevent other users from stealing the deivce */
+       /* re-take ownership to prevent other users from stealing the device */
        iwl_pcie_prepare_card_hw(trans);
 }
 
+static int iwl_trans_pcie_start_fw(struct iwl_trans *trans,
+                                  const struct fw_img *fw, bool run_in_rfkill)
+{
+       struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+       bool hw_rfkill;
+       int ret;
+
+       /* This may fail if AMT took ownership of the device */
+       if (iwl_pcie_prepare_card_hw(trans)) {
+               IWL_WARN(trans, "Exit HW not ready\n");
+               ret = -EIO;
+               goto out;
+       }
+
+       iwl_enable_rfkill_int(trans);
+
+       iwl_write32(trans, CSR_INT, 0xFFFFFFFF);
+
+       /*
+        * We enabled the RF-Kill interrupt and the handler may very
+        * well be running. Disable the interrupts to make sure no other
+        * interrupt can be fired.
+        */
+       iwl_disable_interrupts(trans);
+
+       /* Make sure it finished running */
+       synchronize_irq(trans_pcie->pci_dev->irq);
+
+       mutex_lock(&trans_pcie->mutex);
+
+       /* If platform's RF_KILL switch is NOT set to KILL */
+       hw_rfkill = iwl_is_rfkill_set(trans);
+       if (hw_rfkill)
+               set_bit(STATUS_RFKILL, &trans->status);
+       else
+               clear_bit(STATUS_RFKILL, &trans->status);
+       iwl_trans_pcie_rf_kill(trans, hw_rfkill);
+       if (hw_rfkill && !run_in_rfkill) {
+               ret = -ERFKILL;
+               goto out;
+       }
+
+       /* Someone called stop_device, don't try to start_fw */
+       if (trans_pcie->is_down) {
+               IWL_WARN(trans,
+                        "Can't start_fw since the HW hasn't been started\n");
+               ret = -EIO;
+               goto out;
+       }
+
+       /* make sure rfkill handshake bits are cleared */
+       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
+       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR,
+                   CSR_UCODE_DRV_GP1_BIT_CMD_BLOCKED);
+
+       /* clear (again), then enable host interrupts */
+       iwl_write32(trans, CSR_INT, 0xFFFFFFFF);
+
+       ret = iwl_pcie_nic_init(trans);
+       if (ret) {
+               IWL_ERR(trans, "Unable to init nic\n");
+               goto out;
+       }
+
+       /*
+        * Now, we load the firmware and don't want to be interrupted, even
+        * by the RF-Kill interrupt (hence mask all the interrupt besides the
+        * FH_TX interrupt which is needed to load the firmware). If the
+        * RF-Kill switch is toggled, we will find out after having loaded
+        * the firmware and return the proper value to the caller.
+        */
+       iwl_enable_fw_load_int(trans);
+
+       /* really make sure rfkill handshake bits are cleared */
+       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
+       iwl_write32(trans, CSR_UCODE_DRV_GP1_CLR, CSR_UCODE_SW_BIT_RFKILL);
+
+       /* Load the given image to the HW */
+       if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000)
+               ret = iwl_pcie_load_given_ucode_8000(trans, fw);
+       else
+               ret = iwl_pcie_load_given_ucode(trans, fw);
+       iwl_enable_interrupts(trans);
+
+       /* re-check RF-Kill state since we may have missed the interrupt */
+       hw_rfkill = iwl_is_rfkill_set(trans);
+       if (hw_rfkill)
+               set_bit(STATUS_RFKILL, &trans->status);
+       else
+               clear_bit(STATUS_RFKILL, &trans->status);
+
+       iwl_trans_pcie_rf_kill(trans, hw_rfkill);
+       if (hw_rfkill && !run_in_rfkill)
+               ret = -ERFKILL;
+
+out:
+       mutex_unlock(&trans_pcie->mutex);
+       return ret;
+}
+
+static void iwl_trans_pcie_fw_alive(struct iwl_trans *trans, u32 scd_addr)
+{
+       iwl_pcie_reset_ict(trans);
+       iwl_pcie_tx_start(trans, scd_addr);
+}
+
 static void iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
 {
        struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
index c32889a..a28414c 100644 (file)
@@ -991,7 +991,8 @@ static void mac80211_hwsim_tx_frame_nl(struct ieee80211_hw *hw,
                goto nla_put_failure;
        }
 
-       if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, hdr->addr2))
+       if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER,
+                   ETH_ALEN, data->addresses[1].addr))
                goto nla_put_failure;
 
        /* We get the skb->data */
@@ -2736,7 +2737,7 @@ static struct mac80211_hwsim_data *get_hwsim_data_ref_from_addr(const u8 *addr)
 
        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry(data, &hwsim_radios, list) {
-               if (mac80211_hwsim_addr_match(data, addr)) {
+               if (memcmp(data->addresses[1].addr, addr, ETH_ALEN) == 0) {
                        _found = true;
                        break;
                }
index 9a3966c..155f343 100644 (file)
@@ -273,8 +273,10 @@ static void rt2400pci_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_PLCPFAIL));
        rt2x00_set_field32(&reg, RXCSR0_DROP_CONTROL,
                           !(filter_flags & FIF_CONTROL));
-       rt2x00_set_field32(&reg, RXCSR0_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field32(&reg, RXCSR0_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field32(&reg, RXCSR0_DROP_TODS,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags) &&
                           !rt2x00dev->intf_ap_count);
        rt2x00_set_field32(&reg, RXCSR0_DROP_VERSION_ERROR, 1);
        rt2x00mmio_register_write(rt2x00dev, RXCSR0, reg);
index 1a6740b..2553cdd 100644 (file)
@@ -274,8 +274,10 @@ static void rt2500pci_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_PLCPFAIL));
        rt2x00_set_field32(&reg, RXCSR0_DROP_CONTROL,
                           !(filter_flags & FIF_CONTROL));
-       rt2x00_set_field32(&reg, RXCSR0_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field32(&reg, RXCSR0_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field32(&reg, RXCSR0_DROP_TODS,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags) &&
                           !rt2x00dev->intf_ap_count);
        rt2x00_set_field32(&reg, RXCSR0_DROP_VERSION_ERROR, 1);
        rt2x00_set_field32(&reg, RXCSR0_DROP_MCAST,
index d26018f..2d64611 100644 (file)
@@ -437,8 +437,10 @@ static void rt2500usb_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_PLCPFAIL));
        rt2x00_set_field16(&reg, TXRX_CSR2_DROP_CONTROL,
                           !(filter_flags & FIF_CONTROL));
-       rt2x00_set_field16(&reg, TXRX_CSR2_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field16(&reg, TXRX_CSR2_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field16(&reg, TXRX_CSR2_DROP_TODS,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags) &&
                           !rt2x00dev->intf_ap_count);
        rt2x00_set_field16(&reg, TXRX_CSR2_DROP_VERSION_ERROR, 1);
        rt2x00_set_field16(&reg, TXRX_CSR2_DROP_MULTICAST,
index 9733b31..a26afca 100644 (file)
@@ -1490,7 +1490,8 @@ void rt2800_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_FCSFAIL));
        rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_PHY_ERROR,
                           !(filter_flags & FIF_PLCPFAIL));
-       rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_NOT_MY_BSSD, 0);
        rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_VER_ERROR, 1);
        rt2x00_set_field32(&reg, RX_FILTER_CFG_DROP_MULTICAST,
index 3282ddb..2642714 100644 (file)
@@ -669,6 +669,7 @@ enum rt2x00_state_flags {
        CONFIG_POWERSAVING,
        CONFIG_HT_DISABLED,
        CONFIG_QOS_DISABLED,
+       CONFIG_MONITORING,
 
        /*
         * Mark we currently are sequentially reading TX_STA_FIFO register
index 7e8bb11..6a1f508 100644 (file)
@@ -277,6 +277,11 @@ void rt2x00lib_config(struct rt2x00_dev *rt2x00dev,
        else
                clear_bit(CONFIG_POWERSAVING, &rt2x00dev->flags);
 
+       if (conf->flags & IEEE80211_CONF_MONITOR)
+               set_bit(CONFIG_MONITORING, &rt2x00dev->flags);
+       else
+               clear_bit(CONFIG_MONITORING, &rt2x00dev->flags);
+
        rt2x00dev->curr_band = conf->chandef.chan->band;
        rt2x00dev->curr_freq = conf->chandef.chan->center_freq;
        rt2x00dev->tx_power = conf->power_level;
index 3c26ee6..13da95a 100644 (file)
@@ -385,11 +385,6 @@ void rt2x00mac_configure_filter(struct ieee80211_hw *hw,
                        *total_flags |= FIF_PSPOLL;
        }
 
-       /*
-        * Check if there is any work left for us.
-        */
-       if (rt2x00dev->packet_filter == *total_flags)
-               return;
        rt2x00dev->packet_filter = *total_flags;
 
        rt2x00dev->ops->lib->config_filter(rt2x00dev, *total_flags);
index c0e730e..24a3436 100644 (file)
@@ -530,8 +530,10 @@ static void rt61pci_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_PLCPFAIL));
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_CONTROL,
                           !(filter_flags & (FIF_CONTROL | FIF_PSPOLL)));
-       rt2x00_set_field32(&reg, TXRX_CSR0_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field32(&reg, TXRX_CSR0_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_TO_DS,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags) &&
                           !rt2x00dev->intf_ap_count);
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_VERSION_ERROR, 1);
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_MULTICAST,
index 7081e13..7bbc869 100644 (file)
@@ -480,8 +480,10 @@ static void rt73usb_config_filter(struct rt2x00_dev *rt2x00dev,
                           !(filter_flags & FIF_PLCPFAIL));
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_CONTROL,
                           !(filter_flags & (FIF_CONTROL | FIF_PSPOLL)));
-       rt2x00_set_field32(&reg, TXRX_CSR0_DROP_NOT_TO_ME, 1);
+       rt2x00_set_field32(&reg, TXRX_CSR0_DROP_NOT_TO_ME,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags));
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_TO_DS,
+                          !test_bit(CONFIG_MONITORING, &rt2x00dev->flags) &&
                           !rt2x00dev->intf_ap_count);
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_VERSION_ERROR, 1);
        rt2x00_set_field32(&reg, TXRX_CSR0_DROP_MULTICAST,
index 74c14ce..28f7010 100644 (file)
@@ -138,6 +138,11 @@ static void _rtl_rc_rate_set_series(struct rtl_priv *rtlpriv,
                    ((wireless_mode == WIRELESS_MODE_N_5G) ||
                     (wireless_mode == WIRELESS_MODE_N_24G)))
                        rate->flags |= IEEE80211_TX_RC_MCS;
+               if (sta && sta->vht_cap.vht_supported &&
+                   (wireless_mode == WIRELESS_MODE_AC_5G ||
+                    wireless_mode == WIRELESS_MODE_AC_24G ||
+                    wireless_mode == WIRELESS_MODE_AC_ONLY))
+                       rate->flags |= IEEE80211_TX_RC_VHT_MCS;
        }
 }
 
index a62bf0a..5be3411 100644 (file)
@@ -351,7 +351,6 @@ static const struct ieee80211_regdomain *_rtl_regdomain_select(
        case COUNTRY_CODE_SPAIN:
        case COUNTRY_CODE_FRANCE:
        case COUNTRY_CODE_ISRAEL:
-       case COUNTRY_CODE_WORLD_WIDE_13:
                return &rtl_regdom_12_13;
        case COUNTRY_CODE_MKK:
        case COUNTRY_CODE_MKK1:
@@ -360,6 +359,7 @@ static const struct ieee80211_regdomain *_rtl_regdomain_select(
                return &rtl_regdom_14_60_64;
        case COUNTRY_CODE_GLOBAL_DOMAIN:
                return &rtl_regdom_14;
+       case COUNTRY_CODE_WORLD_WIDE_13:
        case COUNTRY_CODE_WORLD_WIDE_13_5G_ALL:
                return &rtl_regdom_12_13_5g_all;
        default:
index 9ac118e..564ca75 100644 (file)
@@ -175,14 +175,14 @@ int wlcore_set_partition(struct wl1271 *wl,
        if (ret < 0)
                goto out;
 
+       /* We don't need the size of the last partition, as it is
+        * automatically calculated based on the total memory size and
+        * the sizes of the previous partitions.
+        */
        ret = wlcore_raw_write32(wl, HW_PART3_START_ADDR, p->mem3.start);
        if (ret < 0)
                goto out;
 
-       ret = wlcore_raw_write32(wl, HW_PART3_SIZE_ADDR, p->mem3.size);
-       if (ret < 0)
-               goto out;
-
 out:
        return ret;
 }
index 6c257b5..10cf374 100644 (file)
@@ -36,8 +36,8 @@
 #define HW_PART1_START_ADDR             (HW_PARTITION_REGISTERS_ADDR + 12)
 #define HW_PART2_SIZE_ADDR              (HW_PARTITION_REGISTERS_ADDR + 16)
 #define HW_PART2_START_ADDR             (HW_PARTITION_REGISTERS_ADDR + 20)
-#define HW_PART3_SIZE_ADDR              (HW_PARTITION_REGISTERS_ADDR + 24)
-#define HW_PART3_START_ADDR             (HW_PARTITION_REGISTERS_ADDR + 28)
+#define HW_PART3_START_ADDR             (HW_PARTITION_REGISTERS_ADDR + 24)
+
 #define HW_ACCESS_REGISTER_SIZE         4
 
 #define HW_ACCESS_PRAM_MAX_RANGE       0x3c000
index d6abf19..96ccd4e 100644 (file)
@@ -364,6 +364,7 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
        RING_IDX cons, prod;
        unsigned short id;
        struct sk_buff *skb;
+       bool more_to_do;
 
        BUG_ON(!netif_carrier_ok(queue->info->netdev));
 
@@ -398,18 +399,8 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
 
                queue->tx.rsp_cons = prod;
 
-               /*
-                * Set a new event, then check for race with update of tx_cons.
-                * Note that it is essential to schedule a callback, no matter
-                * how few buffers are pending. Even if there is space in the
-                * transmit ring, higher layers may be blocked because too much
-                * data is outstanding: in such cases notification from Xen is
-                * likely to be the only kick that we'll get.
-                */
-               queue->tx.sring->rsp_event =
-                       prod + ((queue->tx.sring->req_prod - prod) >> 1) + 1;
-               mb();           /* update shared area */
-       } while ((cons == prod) && (prod != queue->tx.sring->rsp_prod));
+               RING_FINAL_CHECK_FOR_RESPONSES(&queue->tx, more_to_do);
+       } while (more_to_do);
 
        xennet_maybe_wake_tx(queue);
 }
index 7e2c43f..5d28e94 100644 (file)
@@ -382,18 +382,18 @@ static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
        [ND_CMD_ARS_CAP] = {
                .in_num = 2,
                .in_sizes = { 8, 8, },
-               .out_num = 2,
-               .out_sizes = { 4, 4, },
+               .out_num = 4,
+               .out_sizes = { 4, 4, 4, 4, },
        },
        [ND_CMD_ARS_START] = {
-               .in_num = 4,
-               .in_sizes = { 8, 8, 2, 6, },
-               .out_num = 1,
-               .out_sizes = { 4, },
+               .in_num = 5,
+               .in_sizes = { 8, 8, 2, 1, 5, },
+               .out_num = 2,
+               .out_sizes = { 4, 4, },
        },
        [ND_CMD_ARS_STATUS] = {
-               .out_num = 2,
-               .out_sizes = { 4, UINT_MAX, },
+               .out_num = 3,
+               .out_sizes = { 4, 4, UINT_MAX, },
        },
 };
 
@@ -442,8 +442,8 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
                return in_field[1];
        else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2)
                return out_field[1];
-       else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1)
-               return ND_CMD_ARS_STATUS_MAX;
+       else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2)
+               return out_field[1] - 8;
 
        return UINT_MAX;
 }
index b0045a5..95825b3 100644 (file)
@@ -55,7 +55,7 @@ static int e820_pmem_probe(struct platform_device *pdev)
        for (p = iomem_resource.child; p ; p = p->sibling) {
                struct nd_region_desc ndr_desc;
 
-               if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0)
+               if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
                        continue;
 
                memset(&ndr_desc, 0, sizeof(ndr_desc));
index 8ebfcaa..9edf7eb 100644 (file)
@@ -1277,10 +1277,12 @@ static ssize_t mode_show(struct device *dev,
 
        device_lock(dev);
        claim = ndns->claim;
-       if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
-               mode = "memory";
-       else if (claim && is_nd_btt(claim))
+       if (claim && is_nd_btt(claim))
                mode = "safe";
+       else if (claim && is_nd_pfn(claim))
+               mode = "memory";
+       else if (!claim && pmem_should_map_pages(dev))
+               mode = "memory";
        else
                mode = "raw";
        rc = sprintf(buf, "%s\n", mode);
index 0cc9048..ae81a2f 100644 (file)
@@ -301,10 +301,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 
        switch (le32_to_cpu(pfn_sb->mode)) {
        case PFN_MODE_RAM:
-               break;
        case PFN_MODE_PMEM:
-               /* TODO: allocate from PMEM support */
-               return -ENOTTY;
+               break;
        default:
                return -ENXIO;
        }
index 7edf316..8d0b546 100644 (file)
@@ -41,7 +41,7 @@ struct pmem_device {
        phys_addr_t             phys_addr;
        /* when non-zero this device is hosting a 'pfn' instance */
        phys_addr_t             data_offset;
-       unsigned long           pfn_flags;
+       u64                     pfn_flags;
        void __pmem             *virt_addr;
        size_t                  size;
        struct badblocks        bb;
index 5d62373..b586d84 100644 (file)
@@ -17,5 +17,6 @@ config BLK_DEV_NVME_SCSI
          and block devices nodes, as well a a translation for a small
          number of selected SCSI commands to NVMe commands to the NVMe
          driver.  If you don't know what this means you probably want
-         to say N here, and if you know what it means you probably
-         want to say N as well.
+         to say N here, unless you run a distro that abuses the SCSI
+         emulation to provide stable device names for mount by id, like
+         some OpenSuSE and SLES versions.
index c5bf001..03c4641 100644 (file)
@@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref)
        ns->disk->private_data = NULL;
        spin_unlock(&dev_list_lock);
 
-       nvme_put_ctrl(ns->ctrl);
        put_disk(ns->disk);
+       ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+       nvme_put_ctrl(ns->ctrl);
        kfree(ns);
 }
 
@@ -183,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
                        goto out_unmap;
                }
 
-               if (meta_buffer) {
+               if (meta_buffer && meta_len) {
                        struct bio_integrity_payload *bip;
 
                        meta = kmalloc(meta_len, GFP_KERNEL);
@@ -373,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
        if (copy_from_user(&io, uio, sizeof(io)))
                return -EFAULT;
+       if (io.flags)
+               return -EINVAL;
 
        switch (io.opcode) {
        case nvme_cmd_write:
@@ -424,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
                return -EACCES;
        if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
                return -EFAULT;
+       if (cmd.flags)
+               return -EINVAL;
 
        memset(&c, 0, sizeof(c));
        c.common.opcode = cmd.opcode;
@@ -556,6 +561,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
        u16 old_ms;
        unsigned short bs;
 
+       if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+               set_capacity(disk, 0);
+               return -ENODEV;
+       }
        if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
                dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
                                __func__, ns->ctrl->instance, ns->ns_id);
@@ -831,6 +840,23 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
        return ret;
 }
 
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+               struct request_queue *q)
+{
+       if (ctrl->max_hw_sectors) {
+               u32 max_segments =
+                       (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+
+               blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+               blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+       }
+       if (ctrl->stripe_size)
+               blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
+       if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+               blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+       blk_queue_virt_boundary(q, ctrl->page_size - 1);
+}
+
 /*
  * Initialize the cached copies of the Identify data and various controller
  * register in our nvme_ctrl structure.  This should be called as soon as
@@ -888,6 +914,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                }
        }
 
+       nvme_set_queue_limits(ctrl, ctrl->admin_q);
+
        kfree(id);
        return 0;
 }
@@ -1118,10 +1146,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        if (!ns)
                return;
 
+       ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
+       if (ns->instance < 0)
+               goto out_free_ns;
+
        ns->queue = blk_mq_init_queue(ctrl->tagset);
        if (IS_ERR(ns->queue))
-               goto out_free_ns;
-       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+               goto out_release_instance;
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
        ns->queue->queuedata = ns;
        ns->ctrl = ctrl;
@@ -1135,17 +1166,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        ns->disk = disk;
        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
 
+
        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-       if (ctrl->max_hw_sectors) {
-               blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
-               blk_queue_max_segments(ns->queue,
-                       (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
-       }
-       if (ctrl->stripe_size)
-               blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
-       if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-               blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
-       blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+       nvme_set_queue_limits(ctrl, ns->queue);
 
        disk->major = nvme_major;
        disk->first_minor = 0;
@@ -1154,7 +1177,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        disk->queue = ns->queue;
        disk->driverfs_dev = ctrl->device;
        disk->flags = GENHD_FL_EXT_DEVT;
-       sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+       sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
 
        if (nvme_revalidate_disk(ns->disk))
                goto out_free_disk;
@@ -1174,40 +1197,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        kfree(disk);
  out_free_queue:
        blk_cleanup_queue(ns->queue);
+ out_release_instance:
+       ida_simple_remove(&ctrl->ns_ida, ns->instance);
  out_free_ns:
        kfree(ns);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
-       bool kill = nvme_io_incapable(ns->ctrl) &&
-                       !blk_queue_dying(ns->queue);
-
-       lockdep_assert_held(&ns->ctrl->namespaces_mutex);
-
-       if (kill) {
-               blk_set_queue_dying(ns->queue);
+       if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+               return;
 
-               /*
-                * The controller was shutdown first if we got here through
-                * device removal. The shutdown may requeue outstanding
-                * requests. These need to be aborted immediately so
-                * del_gendisk doesn't block indefinitely for their completion.
-                */
-               blk_mq_abort_requeue_list(ns->queue);
-       }
        if (ns->disk->flags & GENHD_FL_UP) {
                if (blk_get_integrity(ns->disk))
                        blk_integrity_unregister(ns->disk);
                sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
                                        &nvme_ns_attr_group);
                del_gendisk(ns->disk);
-       }
-       if (kill || !blk_queue_dying(ns->queue)) {
                blk_mq_abort_requeue_list(ns->queue);
                blk_cleanup_queue(ns->queue);
        }
+       mutex_lock(&ns->ctrl->namespaces_mutex);
        list_del_init(&ns->list);
+       mutex_unlock(&ns->ctrl->namespaces_mutex);
        nvme_put_ns(ns);
 }
 
@@ -1301,10 +1313,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns, *next;
 
-       mutex_lock(&ctrl->namespaces_mutex);
        list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
                nvme_ns_remove(ns);
-       mutex_unlock(&ctrl->namespaces_mutex);
 }
 
 static DEFINE_IDA(nvme_instance_ida);
@@ -1351,6 +1361,7 @@ static void nvme_free_ctrl(struct kref *kref)
 
        put_device(ctrl->device);
        nvme_release_instance(ctrl);
+       ida_destroy(&ctrl->ns_ida);
 
        ctrl->ops->free_ctrl(ctrl);
 }
@@ -1391,6 +1402,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
        }
        get_device(ctrl->device);
        dev_set_drvdata(ctrl->device, ctrl);
+       ida_init(&ctrl->ns_ida);
 
        spin_lock(&dev_list_lock);
        list_add_tail(&ctrl->node, &nvme_ctrl_list);
@@ -1403,6 +1415,38 @@ out:
        return ret;
 }
 
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               if (!kref_get_unless_zero(&ns->kref))
+                       continue;
+
+               /*
+                * Revalidating a dead namespace sets capacity to 0. This will
+                * end buffered writers dirtying pages that can't be synced.
+                */
+               if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+                       revalidate_disk(ns->disk);
+
+               blk_set_queue_dying(ns->queue);
+               blk_mq_abort_requeue_list(ns->queue);
+               blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+               nvme_put_ns(ns);
+       }
+       mutex_unlock(&ctrl->namespaces_mutex);
+}
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns;
index 5cd3725..6bb15e4 100644 (file)
@@ -146,9 +146,10 @@ struct nvme_nvm_command {
        };
 };
 
+#define NVME_NVM_LP_MLC_PAIRS 886
 struct nvme_nvm_lp_mlc {
        __u16                   num_pairs;
-       __u8                    pairs[886];
+       __u8                    pairs[NVME_NVM_LP_MLC_PAIRS];
 };
 
 struct nvme_nvm_lp_tbl {
@@ -282,9 +283,14 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
                        memcpy(dst->lptbl.id, src->lptbl.id, 8);
                        dst->lptbl.mlc.num_pairs =
                                        le16_to_cpu(src->lptbl.mlc.num_pairs);
-                       /* 4 bits per pair */
+
+                       if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
+                               pr_err("nvm: number of MLC pairs not supported\n");
+                               return -EINVAL;
+                       }
+
                        memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
-                                               dst->lptbl.mlc.num_pairs >> 1);
+                                               dst->lptbl.mlc.num_pairs);
                }
        }
 
index 4fb5bb7..fb15ba5 100644 (file)
@@ -72,6 +72,7 @@ struct nvme_ctrl {
        struct mutex namespaces_mutex;
        struct device *device;  /* char device */
        struct list_head node;
+       struct ida ns_ida;
 
        char name[12];
        char serial[20];
@@ -102,6 +103,7 @@ struct nvme_ns {
        struct request_queue *queue;
        struct gendisk *disk;
        struct kref kref;
+       int instance;
 
        u8 eui[8];
        u8 uuid[16];
@@ -112,6 +114,11 @@ struct nvme_ns {
        bool ext;
        u8 pi_type;
        int type;
+       unsigned long flags;
+
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD     1
+
        u64 mode_select_num_blocks;
        u32 mode_select_block_len;
 };
@@ -139,9 +146,9 @@ static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl)
        u32 val = 0;
 
        if (ctrl->ops->io_incapable(ctrl))
-               return false;
+               return true;
        if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
-               return false;
+               return true;
        return val & NVME_CSTS_CFS;
 }
 
@@ -240,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
 
 struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, unsigned int flags);
index 72ef832..680f578 100644 (file)
@@ -86,7 +86,6 @@ struct nvme_queue;
 
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
 /*
@@ -120,6 +119,7 @@ struct nvme_dev {
        unsigned long flags;
 
 #define NVME_CTRL_RESETTING    0
+#define NVME_CTRL_REMOVING     1
 
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
@@ -286,6 +286,17 @@ static int nvme_init_request(void *data, struct request *req,
        return 0;
 }
 
+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+       /*
+        * Do not queue new scan work when a controller is reset during
+        * removal.
+        */
+       if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+               return;
+       queue_work(nvme_workq, &dev->scan_work);
+}
+
 static void nvme_complete_async_event(struct nvme_dev *dev,
                struct nvme_completion *cqe)
 {
@@ -300,7 +311,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
        switch (result & 0xff07) {
        case NVME_AER_NOTICE_NS_CHANGED:
                dev_info(dev->dev, "rescanning\n");
-               queue_work(nvme_workq, &dev->scan_work);
+               nvme_queue_scan(dev);
        default:
                dev_warn(dev->dev, "async event result %08x\n", result);
        }
@@ -678,6 +689,14 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        blk_mq_start_request(req);
 
        spin_lock_irq(&nvmeq->q_lock);
+       if (unlikely(nvmeq->cq_vector < 0)) {
+               if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+                       ret = BLK_MQ_RQ_QUEUE_BUSY;
+               else
+                       ret = BLK_MQ_RQ_QUEUE_ERROR;
+               spin_unlock_irq(&nvmeq->q_lock);
+               goto out;
+       }
        __nvme_submit_cmd(nvmeq, &cmnd);
        nvme_process_cq(nvmeq);
        spin_unlock_irq(&nvmeq->q_lock);
@@ -999,7 +1018,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
        if (!blk_mq_request_started(req))
                return;
 
-       dev_warn(nvmeq->q_dmadev,
+       dev_dbg_ratelimited(nvmeq->q_dmadev,
                 "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
 
        status = NVME_SC_ABORT_REQ;
@@ -1245,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
        if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+               /*
+                * If the controller was reset during removal, it's possible
+                * user requests may be waiting on a stopped queue. Start the
+                * queue to flush these to completion.
+                */
+               blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
                blk_cleanup_queue(dev->ctrl.admin_q);
                blk_mq_free_tag_set(&dev->admin_tagset);
        }
@@ -1685,14 +1710,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
                        return 0;
                dev->ctrl.tagset = &dev->tagset;
        }
-       queue_work(nvme_workq, &dev->scan_work);
+       nvme_queue_scan(dev);
        return 0;
 }
 
-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
 {
        u64 cap;
-       int bars, result = -ENOMEM;
+       int result = -ENOMEM;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
        if (pci_enable_device_mem(pdev))
@@ -1700,24 +1725,14 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
        dev->entry[0].vector = pdev->irq;
        pci_set_master(pdev);
-       bars = pci_select_bars(pdev, IORESOURCE_MEM);
-       if (!bars)
-               goto disable_pci;
-
-       if (pci_request_selected_regions(pdev, bars, "nvme"))
-               goto disable_pci;
 
        if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
            dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
                goto disable;
 
-       dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-       if (!dev->bar)
-               goto disable;
-
        if (readl(dev->bar + NVME_REG_CSTS) == -1) {
                result = -ENODEV;
-               goto unmap;
+               goto disable;
        }
 
        /*
@@ -1727,7 +1742,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
        if (!pdev->irq) {
                result = pci_enable_msix(pdev, dev->entry, 1);
                if (result < 0)
-                       goto unmap;
+                       goto disable;
        }
 
        cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1754,17 +1769,19 @@ static int nvme_dev_map(struct nvme_dev *dev)
        pci_save_state(pdev);
        return 0;
 
- unmap:
-       iounmap(dev->bar);
-       dev->bar = NULL;
  disable:
-       pci_release_regions(pdev);
- disable_pci:
        pci_disable_device(pdev);
        return result;
 }
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+       if (dev->bar)
+               iounmap(dev->bar);
+       pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
 {
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
@@ -1773,12 +1790,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
        else if (pdev->msix_enabled)
                pci_disable_msix(pdev);
 
-       if (dev->bar) {
-               iounmap(dev->bar);
-               dev->bar = NULL;
-               pci_release_regions(pdev);
-       }
-
        if (pci_is_enabled(pdev)) {
                pci_disable_pcie_error_reporting(pdev);
                pci_disable_device(pdev);
@@ -1837,7 +1848,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        nvme_dev_list_remove(dev);
 
        mutex_lock(&dev->shutdown_lock);
-       if (dev->bar) {
+       if (pci_is_enabled(to_pci_dev(dev->dev))) {
                nvme_stop_queues(&dev->ctrl);
                csts = readl(dev->bar + NVME_REG_CSTS);
        }
@@ -1850,7 +1861,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
                nvme_disable_io_queues(dev);
                nvme_disable_admin_queue(dev, shutdown);
        }
-       nvme_dev_unmap(dev);
+       nvme_pci_disable(dev);
 
        for (i = dev->queue_count - 1; i >= 0; i--)
                nvme_clear_queue(dev->queues[i]);
@@ -1894,10 +1905,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
        kfree(dev);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+       dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
+
+       kref_get(&dev->ctrl.kref);
+       nvme_dev_disable(dev, false);
+       if (!schedule_work(&dev->remove_work))
+               nvme_put_ctrl(&dev->ctrl);
+}
+
 static void nvme_reset_work(struct work_struct *work)
 {
        struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
-       int result;
+       int result = -ENODEV;
 
        if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
                goto out;
@@ -1906,37 +1927,37 @@ static void nvme_reset_work(struct work_struct *work)
         * If we're called to reset a live controller first shut it down before
         * moving on.
         */
-       if (dev->bar)
+       if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
 
        set_bit(NVME_CTRL_RESETTING, &dev->flags);
 
-       result = nvme_dev_map(dev);
+       result = nvme_pci_enable(dev);
        if (result)
                goto out;
 
        result = nvme_configure_admin_queue(dev);
        if (result)
-               goto unmap;
+               goto out;
 
        nvme_init_queue(dev->queues[0], 0);
        result = nvme_alloc_admin_tags(dev);
        if (result)
-               goto disable;
+               goto out;
 
        result = nvme_init_identify(&dev->ctrl);
        if (result)
-               goto free_tags;
+               goto out;
 
        result = nvme_setup_io_queues(dev);
        if (result)
-               goto free_tags;
+               goto out;
 
        dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
 
        result = nvme_dev_list_add(dev);
        if (result)
-               goto remove;
+               goto out;
 
        /*
         * Keep the controller around but remove all namespaces if we don't have
@@ -1953,19 +1974,8 @@ static void nvme_reset_work(struct work_struct *work)
        clear_bit(NVME_CTRL_RESETTING, &dev->flags);
        return;
 
- remove:
-       nvme_dev_list_remove(dev);
- free_tags:
-       nvme_dev_remove_admin(dev);
-       blk_put_queue(dev->ctrl.admin_q);
-       dev->ctrl.admin_q = NULL;
-       dev->queues[0]->tags = NULL;
- disable:
-       nvme_disable_admin_queue(dev, false);
- unmap:
-       nvme_dev_unmap(dev);
  out:
-       nvme_remove_dead_ctrl(dev);
+       nvme_remove_dead_ctrl(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1973,19 +1983,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
        struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+       nvme_kill_queues(&dev->ctrl);
        if (pci_get_drvdata(pdev))
                pci_stop_and_remove_bus_device_locked(pdev);
        nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
-       dev_warn(dev->dev, "Removing after probe failure\n");
-       kref_get(&dev->ctrl.kref);
-       if (!schedule_work(&dev->remove_work))
-               nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
        if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2037,6 +2040,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
        .free_ctrl              = nvme_pci_free_ctrl,
 };
 
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+       int bars;
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+       bars = pci_select_bars(pdev, IORESOURCE_MEM);
+       if (!bars)
+               return -ENODEV;
+       if (pci_request_selected_regions(pdev, bars, "nvme"))
+               return -ENODEV;
+
+       dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+       if (!dev->bar)
+               goto release;
+
+       return 0;
+  release:
+       pci_release_regions(pdev);
+       return -ENODEV;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
        int node, result = -ENOMEM;
@@ -2061,6 +2085,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);
 
+       result = nvme_dev_map(dev);
+       if (result)
+               goto free;
+
        INIT_LIST_HEAD(&dev->node);
        INIT_WORK(&dev->scan_work, nvme_dev_scan);
        INIT_WORK(&dev->reset_work, nvme_reset_work);
@@ -2084,6 +2112,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        nvme_release_prp_pools(dev);
  put_pci:
        put_device(dev->dev);
+       nvme_dev_unmap(dev);
  free:
        kfree(dev->queues);
        kfree(dev->entry);
@@ -2107,24 +2136,27 @@ static void nvme_shutdown(struct pci_dev *pdev)
        nvme_dev_disable(dev, true);
 }
 
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
 static void nvme_remove(struct pci_dev *pdev)
 {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
 
-       spin_lock(&dev_list_lock);
-       list_del_init(&dev->node);
-       spin_unlock(&dev_list_lock);
-
+       set_bit(NVME_CTRL_REMOVING, &dev->flags);
        pci_set_drvdata(pdev, NULL);
-       flush_work(&dev->reset_work);
        flush_work(&dev->scan_work);
        nvme_remove_namespaces(&dev->ctrl);
        nvme_uninit_ctrl(&dev->ctrl);
        nvme_dev_disable(dev, true);
+       flush_work(&dev->reset_work);
        nvme_dev_remove_admin(dev);
        nvme_free_queues(dev, 0);
        nvme_release_cmb(dev);
        nvme_release_prp_pools(dev);
+       nvme_dev_unmap(dev);
        nvme_put_ctrl(&dev->ctrl);
 }
 
index 6fd4e5a..9d11d98 100644 (file)
@@ -70,6 +70,9 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
        if (pos >= nvmem->size)
                return 0;
 
+       if (count < nvmem->word_size)
+               return -EINVAL;
+
        if (pos + count > nvmem->size)
                count = nvmem->size - pos;
 
@@ -95,6 +98,9 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj,
        if (pos >= nvmem->size)
                return 0;
 
+       if (count < nvmem->word_size)
+               return -EINVAL;
+
        if (pos + count > nvmem->size)
                count = nvmem->size - pos;
 
index afb67e7..3829e5f 100644 (file)
@@ -21,6 +21,7 @@ static struct regmap_config qfprom_regmap_config = {
        .reg_bits = 32,
        .val_bits = 8,
        .reg_stride = 1,
+       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 static struct nvmem_config econfig = {
index 706e3ff..e7bfc17 100644 (file)
@@ -635,6 +635,13 @@ static u32 __of_msi_map_rid(struct device *dev, struct device_node **np,
                msi_base = be32_to_cpup(msi_map + 2);
                rid_len = be32_to_cpup(msi_map + 3);
 
+               if (rid_base & ~map_mask) {
+                       dev_err(parent_dev,
+                               "Invalid msi-map translation - msi-map-mask (0x%x) ignores rid-base (0x%x)\n",
+                               map_mask, rid_base);
+                       return rid_out;
+               }
+
                msi_controller_node = of_find_node_by_phandle(phandle);
 
                matched = (masked_rid >= rid_base &&
@@ -654,7 +661,7 @@ static u32 __of_msi_map_rid(struct device *dev, struct device_node **np,
        if (!matched)
                return rid_out;
 
-       rid_out = masked_rid + msi_base;
+       rid_out = masked_rid - rid_base + msi_base;
        dev_dbg(dev,
                "msi-map at: %s, using mask %08x, rid-base: %08x, msi-base: %08x, length: %08x, rid: %08x -> %08x\n",
                dev_name(parent_dev), map_mask, rid_base, msi_base,
@@ -679,18 +686,6 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in)
        return __of_msi_map_rid(dev, &msi_np, rid_in);
 }
 
-static struct irq_domain *__of_get_msi_domain(struct device_node *np,
-                                             enum irq_domain_bus_token token)
-{
-       struct irq_domain *d;
-
-       d = irq_find_matching_host(np, token);
-       if (!d)
-               d = irq_find_host(np);
-
-       return d;
-}
-
 /**
  * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain
  * @dev: device for which the mapping is to be done.
@@ -706,7 +701,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid)
        struct device_node *np = NULL;
 
        __of_msi_map_rid(dev, &np, rid);
-       return __of_get_msi_domain(np, DOMAIN_BUS_PCI_MSI);
+       return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI);
 }
 
 /**
@@ -730,7 +725,7 @@ struct irq_domain *of_msi_get_domain(struct device *dev,
        /* Check for a single msi-parent property */
        msi_np = of_parse_phandle(np, "msi-parent", 0);
        if (msi_np && !of_property_read_bool(msi_np, "#msi-cells")) {
-               d = __of_get_msi_domain(msi_np, token);
+               d = irq_find_matching_host(msi_np, token);
                if (!d)
                        of_node_put(msi_np);
                return d;
@@ -744,7 +739,7 @@ struct irq_domain *of_msi_get_domain(struct device *dev,
                while (!of_parse_phandle_with_args(np, "msi-parent",
                                                   "#msi-cells",
                                                   index, &args)) {
-                       d = __of_get_msi_domain(args.np, token);
+                       d = irq_find_matching_host(args.np, token);
                        if (d)
                                return d;
 
index 86829f8..365dc7e 100644 (file)
@@ -143,11 +143,32 @@ int of_mdio_parse_addr(struct device *dev, const struct device_node *np)
 }
 EXPORT_SYMBOL(of_mdio_parse_addr);
 
+/* The following is a list of PHY compatible strings which appear in
+ * some DTBs. The compatible string is never matched against a PHY
+ * driver, so is pointless. We only expect devices which are not PHYs
+ * to have a compatible string, so they can be matched to an MDIO
+ * driver.  Encourage users to upgrade their DT blobs to remove these.
+ */
+static const struct of_device_id whitelist_phys[] = {
+       { .compatible = "brcm,40nm-ephy" },
+       { .compatible = "marvell,88E1111", },
+       { .compatible = "marvell,88e1116", },
+       { .compatible = "marvell,88e1118", },
+       { .compatible = "marvell,88e1145", },
+       { .compatible = "marvell,88e1149r", },
+       { .compatible = "marvell,88e1310", },
+       { .compatible = "marvell,88E1510", },
+       { .compatible = "marvell,88E1514", },
+       { .compatible = "moxa,moxart-rtl8201cp", },
+       {}
+};
+
 /*
  * Return true if the child node is for a phy. It must either:
  * o Compatible string of "ethernet-phy-idX.X"
  * o Compatible string of "ethernet-phy-ieee802.3-c45"
  * o Compatible string of "ethernet-phy-ieee802.3-c22"
+ * o In the white list above (and issue a warning)
  * o No compatibility string
  *
  * A device which is not a phy is expected to have a compatible string
@@ -166,6 +187,13 @@ static bool of_mdiobus_child_is_phy(struct device_node *child)
        if (of_device_is_compatible(child, "ethernet-phy-ieee802.3-c22"))
                return true;
 
+       if (of_match_node(whitelist_phys, child)) {
+               pr_warn(FW_WARN
+                       "%s: Whitelisted compatible string. Please remove\n",
+                       child->full_name);
+               return true;
+       }
+
        if (!of_find_property(child, "compatible", NULL))
                return true;
 
@@ -256,11 +284,19 @@ static int of_phy_match(struct device *dev, void *phy_np)
 struct phy_device *of_phy_find_device(struct device_node *phy_np)
 {
        struct device *d;
+       struct mdio_device *mdiodev;
+
        if (!phy_np)
                return NULL;
 
        d = bus_find_device(&mdio_bus_type, NULL, phy_np, of_phy_match);
-       return d ? to_phy_device(d) : NULL;
+       if (d) {
+               mdiodev = to_mdio_device(d);
+               if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY)
+                       return to_phy_device(d);
+       }
+
+       return NULL;
 }
 EXPORT_SYMBOL(of_phy_find_device);
 
@@ -269,6 +305,7 @@ EXPORT_SYMBOL(of_phy_find_device);
  * @dev: pointer to net_device claiming the phy
  * @phy_np: Pointer to device tree node for the PHY
  * @hndlr: Link state callback for the network device
+ * @flags: flags to pass to the PHY
  * @iface: PHY data interface type
  *
  * If successful, returns a pointer to the phy_device with the embedded
index a656d9e..21905fe 100644 (file)
@@ -91,7 +91,7 @@ static int configure_memory(const unsigned char *buf,
        for (i=0;i<HPEE_MEMORY_MAX_ENT;i++) {
                c = get_8(buf+len);
                
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                        int result;
                        
                        res->name = name;
@@ -183,7 +183,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent,
        for (i=0;i<HPEE_PORT_MAX_ENT;i++) {
                c = get_8(buf+len);
                
-               if (NULL != (res = kmalloc(sizeof(struct resource), GFP_KERNEL))) {
+               if (NULL != (res = kzalloc(sizeof(struct resource), GFP_KERNEL))) {
                        res->name = board;
                        res->start = get_16(buf+len+1);
                        res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1;
index 75a6054..d1cdd9c 100644 (file)
@@ -14,6 +14,7 @@ config PCI_DRA7XX
 config PCI_MVEBU
        bool "Marvell EBU PCIe controller"
        depends on ARCH_MVEBU || ARCH_DOVE
+       depends on ARM
        depends on OF
 
 config PCIE_DW
index ed34c95..6153853 100644 (file)
 
 #define to_keystone_pcie(x)    container_of(x, struct keystone_pcie, pp)
 
-static inline struct pcie_port *sys_to_pcie(struct pci_sys_data *sys)
-{
-       return sys->private_data;
-}
-
 static inline void update_reg_offset_bit_pos(u32 offset, u32 *reg_offset,
                                             u32 *bit_pos)
 {
@@ -108,7 +103,7 @@ static void ks_dw_pcie_msi_irq_ack(struct irq_data *d)
        struct pcie_port *pp;
 
        msi = irq_data_get_msi_desc(d);
-       pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi));
+       pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi);
        ks_pcie = to_keystone_pcie(pp);
        offset = d->irq - irq_linear_revmap(pp->irq_domain, 0);
        update_reg_offset_bit_pos(offset, &reg_offset, &bit_pos);
@@ -146,7 +141,7 @@ static void ks_dw_pcie_msi_irq_mask(struct irq_data *d)
        u32 offset;
 
        msi = irq_data_get_msi_desc(d);
-       pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi));
+       pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi);
        ks_pcie = to_keystone_pcie(pp);
        offset = d->irq - irq_linear_revmap(pp->irq_domain, 0);
 
@@ -167,7 +162,7 @@ static void ks_dw_pcie_msi_irq_unmask(struct irq_data *d)
        u32 offset;
 
        msi = irq_data_get_msi_desc(d);
-       pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi));
+       pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi);
        ks_pcie = to_keystone_pcie(pp);
        offset = d->irq - irq_linear_revmap(pp->irq_domain, 0);
 
index 3923bed..f39961b 100644 (file)
@@ -77,6 +77,16 @@ static void ls_pcie_fix_class(struct ls_pcie *pcie)
        iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE);
 }
 
+/* Drop MSG TLP except for Vendor MSG */
+static void ls_pcie_drop_msg_tlp(struct ls_pcie *pcie)
+{
+       u32 val;
+
+       val = ioread32(pcie->dbi + PCIE_STRFMR1);
+       val &= 0xDFFFFFFF;
+       iowrite32(val, pcie->dbi + PCIE_STRFMR1);
+}
+
 static int ls1021_pcie_link_up(struct pcie_port *pp)
 {
        u32 state;
@@ -97,7 +107,7 @@ static int ls1021_pcie_link_up(struct pcie_port *pp)
 static void ls1021_pcie_host_init(struct pcie_port *pp)
 {
        struct ls_pcie *pcie = to_ls_pcie(pp);
-       u32 val, index[2];
+       u32 index[2];
 
        pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node,
                                                     "fsl,pcie-scfg");
@@ -116,13 +126,7 @@ static void ls1021_pcie_host_init(struct pcie_port *pp)
 
        dw_pcie_setup_rc(pp);
 
-       /*
-        * LS1021A Workaround for internal TKT228622
-        * to fix the INTx hang issue
-        */
-       val = ioread32(pcie->dbi + PCIE_STRFMR1);
-       val &= 0xffff;
-       iowrite32(val, pcie->dbi + PCIE_STRFMR1);
+       ls_pcie_drop_msg_tlp(pcie);
 }
 
 static int ls_pcie_link_up(struct pcie_port *pp)
@@ -147,6 +151,7 @@ static void ls_pcie_host_init(struct pcie_port *pp)
        iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN);
        ls_pcie_fix_class(pcie);
        ls_pcie_clear_multifunction(pcie);
+       ls_pcie_drop_msg_tlp(pcie);
        iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN);
 }
 
index 5816bce..a576aee 100644 (file)
@@ -64,7 +64,6 @@
 #define OARR_SIZE_CFG                BIT(OARR_SIZE_CFG_SHIFT)
 
 #define MAX_NUM_OB_WINDOWS           2
-#define MAX_NUM_PAXC_PF              4
 
 #define IPROC_PCIE_REG_INVALID 0xffff
 
@@ -170,20 +169,6 @@ static inline void iproc_pcie_ob_write(struct iproc_pcie *pcie,
        writel(val, pcie->base + offset + (window * 8));
 }
 
-static inline bool iproc_pcie_device_is_valid(struct iproc_pcie *pcie,
-                                             unsigned int slot,
-                                             unsigned int fn)
-{
-       if (slot > 0)
-               return false;
-
-       /* PAXC can only support limited number of functions */
-       if (pcie->type == IPROC_PCIE_PAXC && fn >= MAX_NUM_PAXC_PF)
-               return false;
-
-       return true;
-}
-
 /**
  * Note access to the configuration registers are protected at the higher layer
  * by 'pci_lock' in drivers/pci/access.c
@@ -199,11 +184,11 @@ static void __iomem *iproc_pcie_map_cfg_bus(struct pci_bus *bus,
        u32 val;
        u16 offset;
 
-       if (!iproc_pcie_device_is_valid(pcie, slot, fn))
-               return NULL;
-
        /* root complex access */
        if (busno == 0) {
+               if (slot > 0 || fn > 0)
+                       return NULL;
+
                iproc_pcie_write_reg(pcie, IPROC_PCIE_CFG_IND_ADDR,
                                     where & CFG_IND_ADDR_MASK);
                offset = iproc_pcie_reg_offset(pcie, IPROC_PCIE_CFG_IND_DATA);
@@ -213,6 +198,14 @@ static void __iomem *iproc_pcie_map_cfg_bus(struct pci_bus *bus,
                        return (pcie->base + offset);
        }
 
+       /*
+        * PAXC is connected to an internally emulated EP within the SoC.  It
+        * allows only one device.
+        */
+       if (pcie->type == IPROC_PCIE_PAXC)
+               if (slot > 0)
+                       return NULL;
+
        /* EP device access */
        val = (busno << CFG_ADDR_BUS_NUM_SHIFT) |
                (slot << CFG_ADDR_DEV_NUM_SHIFT) |
index 5f2fda1..fa49f91 100644 (file)
@@ -953,8 +953,10 @@ int acpiphp_enable_slot(struct acpiphp_slot *slot)
 {
        pci_lock_rescan_remove();
 
-       if (slot->flags & SLOT_IS_GOING_AWAY)
+       if (slot->flags & SLOT_IS_GOING_AWAY) {
+               pci_unlock_rescan_remove();
                return -ENODEV;
+       }
 
        /* configure all functions */
        if (!(slot->flags & SLOT_ENABLED))
index 602eb42..f89db3a 100644 (file)
@@ -4772,8 +4772,10 @@ int pci_get_new_domain_nr(void)
 void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
 {
        static int use_dt_domains = -1;
-       int domain = of_get_pci_domain_nr(parent->of_node);
+       int domain = -1;
 
+       if (parent)
+               domain = of_get_pci_domain_nr(parent->of_node);
        /*
         * Check DT domain and use_dt_domains values.
         *
index 0bf82a2..48d21e0 100644 (file)
@@ -262,7 +262,6 @@ static struct aer_rpc *aer_alloc_rpc(struct pcie_device *dev)
        rpc->rpd = dev;
        INIT_WORK(&rpc->dpc_handler, aer_isr);
        mutex_init(&rpc->rpc_mutex);
-       init_waitqueue_head(&rpc->wait_release);
 
        /* Use PCIe bus function to store rpc into PCIe device */
        set_service_data(dev, rpc);
@@ -285,8 +284,7 @@ static void aer_remove(struct pcie_device *dev)
                if (rpc->isr)
                        free_irq(dev->irq, dev);
 
-               wait_event(rpc->wait_release, rpc->prod_idx == rpc->cons_idx);
-
+               flush_work(&rpc->dpc_handler);
                aer_disable_rootport(rpc);
                kfree(rpc);
                set_service_data(dev, NULL);
index 84420b7..945c939 100644 (file)
@@ -72,7 +72,6 @@ struct aer_rpc {
                                         * recovery on the same
                                         * root port hierarchy
                                         */
-       wait_queue_head_t wait_release;
 };
 
 struct aer_broadcast_data {
index 7123925..521e39c 100644 (file)
@@ -811,8 +811,6 @@ void aer_isr(struct work_struct *work)
        while (get_e_source(rpc, &e_src))
                aer_isr_one_error(p_device, &e_src);
        mutex_unlock(&rpc->rpc_mutex);
-
-       wake_up(&rpc->wait_release);
 }
 
 /**
index c777b97..5f70fee 100644 (file)
@@ -53,7 +53,7 @@ struct pcifront_device {
 };
 
 struct pcifront_sd {
-       int domain;
+       struct pci_sysdata sd;
        struct pcifront_device *pdev;
 };
 
@@ -67,7 +67,9 @@ static inline void pcifront_init_sd(struct pcifront_sd *sd,
                                    unsigned int domain, unsigned int bus,
                                    struct pcifront_device *pdev)
 {
-       sd->domain = domain;
+       /* Because we do not expose that information via XenBus. */
+       sd->sd.node = first_online_node;
+       sd->sd.domain = domain;
        sd->pdev = pdev;
 }
 
@@ -468,8 +470,8 @@ static int pcifront_scan_root(struct pcifront_device *pdev,
        dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
                 domain, bus);
 
-       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
-       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+       bus_entry = kzalloc(sizeof(*bus_entry), GFP_KERNEL);
+       sd = kzalloc(sizeof(*sd), GFP_KERNEL);
        if (!bus_entry || !sd) {
                err = -ENOMEM;
                goto err_out;
index e7e117d..0124d17 100644 (file)
@@ -224,6 +224,7 @@ config PHY_MT65XX_USB3
 
 config PHY_HI6220_USB
        tristate "hi6220 USB PHY support"
+       depends on (ARCH_HISI && ARM64) || COMPILE_TEST
        select GENERIC_PHY
        select MFD_SYSCON
        help
index 8c7f27d..e7e574d 100644 (file)
@@ -275,20 +275,21 @@ EXPORT_SYMBOL_GPL(phy_exit);
 
 int phy_power_on(struct phy *phy)
 {
-       int ret;
+       int ret = 0;
 
        if (!phy)
-               return 0;
+               goto out;
 
        if (phy->pwr) {
                ret = regulator_enable(phy->pwr);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
        ret = phy_pm_runtime_get_sync(phy);
        if (ret < 0 && ret != -ENOTSUPP)
-               return ret;
+               goto err_pm_sync;
+
        ret = 0; /* Override possible ret == -ENOTSUPP */
 
        mutex_lock(&phy->mutex);
@@ -296,19 +297,20 @@ int phy_power_on(struct phy *phy)
                ret = phy->ops->power_on(phy);
                if (ret < 0) {
                        dev_err(&phy->dev, "phy poweron failed --> %d\n", ret);
-                       goto out;
+                       goto err_pwr_on;
                }
        }
        ++phy->power_count;
        mutex_unlock(&phy->mutex);
        return 0;
 
-out:
+err_pwr_on:
        mutex_unlock(&phy->mutex);
        phy_pm_runtime_put_sync(phy);
+err_pm_sync:
        if (phy->pwr)
                regulator_disable(phy->pwr);
-
+out:
        return ret;
 }
 EXPORT_SYMBOL_GPL(phy_power_on);
index 4a3fc6e..840f3ea 100644 (file)
@@ -715,6 +715,7 @@ static int twl4030_usb_probe(struct platform_device *pdev)
        pm_runtime_use_autosuspend(&pdev->dev);
        pm_runtime_set_autosuspend_delay(&pdev->dev, 2000);
        pm_runtime_enable(&pdev->dev);
+       pm_runtime_get_sync(&pdev->dev);
 
        /* Our job is to use irqs and status from the power module
         * to keep the transceiver disabled when nothing's connected.
@@ -750,6 +751,7 @@ static int twl4030_usb_remove(struct platform_device *pdev)
        struct twl4030_usb *twl = platform_get_drvdata(pdev);
        int val;
 
+       usb_remove_phy(&twl->phy);
        pm_runtime_get_sync(twl->dev);
        cancel_delayed_work(&twl->id_workaround_work);
        device_remove_file(twl->dev, &dev_attr_vbus);
@@ -757,6 +759,13 @@ static int twl4030_usb_remove(struct platform_device *pdev)
        /* set transceiver mode to power on defaults */
        twl4030_usb_set_mode(twl, -1);
 
+       /* idle ulpi before powering off */
+       if (cable_present(twl->linkstat))
+               pm_runtime_put_noidle(twl->dev);
+       pm_runtime_mark_last_busy(twl->dev);
+       pm_runtime_put_sync_suspend(twl->dev);
+       pm_runtime_disable(twl->dev);
+
        /* autogate 60MHz ULPI clock,
         * clear dpll clock request for i2c access,
         * disable 32KHz
@@ -771,11 +780,6 @@ static int twl4030_usb_remove(struct platform_device *pdev)
        /* disable complete OTG block */
        twl4030_usb_clear_bits(twl, POWER_CTRL, POWER_CTRL_OTG_ENAB);
 
-       if (cable_present(twl->linkstat))
-               pm_runtime_put_noidle(twl->dev);
-       pm_runtime_mark_last_busy(twl->dev);
-       pm_runtime_put(twl->dev);
-
        return 0;
 }
 
index 16d48a4..e96e86d 100644 (file)
@@ -347,6 +347,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev,
                ret = mtk_pconf_set_pull_select(pctl, pin, true, false, arg);
                break;
        case PIN_CONFIG_INPUT_ENABLE:
+               mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true);
                ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param);
                break;
        case PIN_CONFIG_OUTPUT:
@@ -354,6 +355,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev,
                ret = mtk_pmx_gpio_set_direction(pctldev, NULL, pin, false);
                break;
        case PIN_CONFIG_INPUT_SCHMITT_ENABLE:
+               mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true);
                ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param);
                break;
        case PIN_CONFIG_DRIVE_STRENGTH:
index e4d4738..3ef798f 100644 (file)
@@ -666,16 +666,19 @@ int mvebu_pinctrl_probe(struct platform_device *pdev)
                struct mvebu_mpp_ctrl_setting *set = &mode->settings[0];
                struct mvebu_pinctrl_group *grp;
                unsigned num_settings;
+               unsigned supp_settings;
 
-               for (num_settings = 0; ; set++) {
+               for (num_settings = 0, supp_settings = 0; ; set++) {
                        if (!set->name)
                                break;
 
+                       num_settings++;
+
                        /* skip unsupported settings for this variant */
                        if (pctl->variant && !(pctl->variant & set->variant))
                                continue;
 
-                       num_settings++;
+                       supp_settings++;
 
                        /* find gpio/gpo/gpi settings */
                        if (strcmp(set->name, "gpio") == 0)
@@ -688,7 +691,7 @@ int mvebu_pinctrl_probe(struct platform_device *pdev)
                }
 
                /* skip modes with no settings for this variant */
-               if (!num_settings)
+               if (!supp_settings)
                        continue;
 
                grp = mvebu_pinctrl_find_group_by_pid(pctl, mode->pid);
index 085e601..1f7469c 100644 (file)
@@ -191,6 +191,7 @@ static void abx500_gpio_set(struct gpio_chip *chip, unsigned offset, int val)
                dev_err(pct->dev, "%s write failed (%d)\n", __func__, ret);
 }
 
+#ifdef CONFIG_DEBUG_FS
 static int abx500_get_pull_updown(struct abx500_pinctrl *pct, int offset,
                                  enum abx500_gpio_pull_updown *pull_updown)
 {
@@ -226,6 +227,7 @@ out:
 
        return ret;
 }
+#endif
 
 static int abx500_set_pull_updown(struct abx500_pinctrl *pct,
                                  int offset, enum abx500_gpio_pull_updown val)
@@ -468,6 +470,7 @@ out:
        return ret;
 }
 
+#ifdef CONFIG_DEBUG_FS
 static int abx500_get_mode(struct pinctrl_dev *pctldev, struct gpio_chip *chip,
                          unsigned gpio)
 {
@@ -553,8 +556,6 @@ out:
        return ret;
 }
 
-#ifdef CONFIG_DEBUG_FS
-
 #include <linux/seq_file.h>
 
 static void abx500_gpio_dbg_show_one(struct seq_file *s,
index d90e205..216f227 100644 (file)
@@ -426,6 +426,7 @@ int pxa2xx_pinctrl_init(struct platform_device *pdev,
 
        return 0;
 }
+EXPORT_SYMBOL(pxa2xx_pinctrl_init);
 
 int pxa2xx_pinctrl_exit(struct platform_device *pdev)
 {
index f67b1e9..5cc97f8 100644 (file)
@@ -514,25 +514,35 @@ static const struct pinconf_ops samsung_pinconf_ops = {
        .pin_config_group_set   = samsung_pinconf_group_set,
 };
 
-/* gpiolib gpio_set callback function */
-static void samsung_gpio_set(struct gpio_chip *gc, unsigned offset, int value)
+/*
+ * The samsung_gpio_set_vlaue() should be called with "bank->slock" held
+ * to avoid race condition.
+ */
+static void samsung_gpio_set_value(struct gpio_chip *gc,
+                                         unsigned offset, int value)
 {
        struct samsung_pin_bank *bank = gpiochip_get_data(gc);
        const struct samsung_pin_bank_type *type = bank->type;
-       unsigned long flags;
        void __iomem *reg;
        u32 data;
 
        reg = bank->drvdata->virt_base + bank->pctl_offset;
 
-       spin_lock_irqsave(&bank->slock, flags);
-
        data = readl(reg + type->reg_offset[PINCFG_TYPE_DAT]);
        data &= ~(1 << offset);
        if (value)
                data |= 1 << offset;
        writel(data, reg + type->reg_offset[PINCFG_TYPE_DAT]);
+}
+
+/* gpiolib gpio_set callback function */
+static void samsung_gpio_set(struct gpio_chip *gc, unsigned offset, int value)
+{
+       struct samsung_pin_bank *bank = gpiochip_get_data(gc);
+       unsigned long flags;
 
+       spin_lock_irqsave(&bank->slock, flags);
+       samsung_gpio_set_value(gc, offset, value);
        spin_unlock_irqrestore(&bank->slock, flags);
 }
 
@@ -553,6 +563,8 @@ static int samsung_gpio_get(struct gpio_chip *gc, unsigned offset)
 }
 
 /*
+ * The samsung_gpio_set_direction() should be called with "bank->slock" held
+ * to avoid race condition.
  * The calls to gpio_direction_output() and gpio_direction_input()
  * leads to this function call.
  */
@@ -564,7 +576,6 @@ static int samsung_gpio_set_direction(struct gpio_chip *gc,
        struct samsung_pinctrl_drv_data *drvdata;
        void __iomem *reg;
        u32 data, mask, shift;
-       unsigned long flags;
 
        bank = gpiochip_get_data(gc);
        type = bank->type;
@@ -581,31 +592,42 @@ static int samsung_gpio_set_direction(struct gpio_chip *gc,
                reg += 4;
        }
 
-       spin_lock_irqsave(&bank->slock, flags);
-
        data = readl(reg);
        data &= ~(mask << shift);
        if (!input)
                data |= FUNC_OUTPUT << shift;
        writel(data, reg);
 
-       spin_unlock_irqrestore(&bank->slock, flags);
-
        return 0;
 }
 
 /* gpiolib gpio_direction_input callback function. */
 static int samsung_gpio_direction_input(struct gpio_chip *gc, unsigned offset)
 {
-       return samsung_gpio_set_direction(gc, offset, true);
+       struct samsung_pin_bank *bank = gpiochip_get_data(gc);
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&bank->slock, flags);
+       ret = samsung_gpio_set_direction(gc, offset, true);
+       spin_unlock_irqrestore(&bank->slock, flags);
+       return ret;
 }
 
 /* gpiolib gpio_direction_output callback function. */
 static int samsung_gpio_direction_output(struct gpio_chip *gc, unsigned offset,
                                                        int value)
 {
-       samsung_gpio_set(gc, offset, value);
-       return samsung_gpio_set_direction(gc, offset, false);
+       struct samsung_pin_bank *bank = gpiochip_get_data(gc);
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&bank->slock, flags);
+       samsung_gpio_set_value(gc, offset, value);
+       ret = samsung_gpio_set_direction(gc, offset, false);
+       spin_unlock_irqrestore(&bank->slock, flags);
+
+       return ret;
 }
 
 /*
index 77d4cf0..11760bb 100644 (file)
@@ -492,6 +492,7 @@ static const struct sunxi_pinctrl_desc sun8i_h3_pinctrl_data = {
        .pins = sun8i_h3_pins,
        .npins = ARRAY_SIZE(sun8i_h3_pins),
        .irq_banks = 2,
+       .irq_read_needs_mux = true
 };
 
 static int sun8i_h3_pinctrl_probe(struct platform_device *pdev)
index 20f0ad9..e20f23e 100644 (file)
@@ -41,8 +41,7 @@ static const struct key_entry intel_hid_keymap[] = {
        { KE_KEY, 4, { KEY_HOME } },
        { KE_KEY, 5, { KEY_END } },
        { KE_KEY, 6, { KEY_PAGEUP } },
-       { KE_KEY, 4, { KEY_PAGEDOWN } },
-       { KE_KEY, 4, { KEY_HOME } },
+       { KE_KEY, 7, { KEY_PAGEDOWN } },
        { KE_KEY, 8, { KEY_RFKILL } },
        { KE_KEY, 9, { KEY_POWER } },
        { KE_KEY, 11, { KEY_SLEEP } },
index 02bc5a6..aa45424 100644 (file)
@@ -49,7 +49,7 @@ struct scu_ipc_data {
 
 static int scu_reg_access(u32 cmd, struct scu_ipc_data  *data)
 {
-       int count = data->count;
+       unsigned int count = data->count;
 
        if (count == 0 || count == 3 || count > 4)
                return -EINVAL;
index f700723..d28e3ab 100644 (file)
@@ -342,6 +342,7 @@ static void quirk_amd_mmconfig_area(struct pnp_dev *dev)
 /* Device IDs of parts that have 32KB MCH space */
 static const unsigned int mch_quirk_devices[] = {
        0x0154, /* Ivy Bridge */
+       0x0a04, /* Haswell-ULT */
        0x0c00, /* Haswell */
        0x1604, /* Broadwell */
 };
index 9429e66..8eafc6f 100644 (file)
@@ -21,6 +21,9 @@
 
 #include <linux/power/bq27xxx_battery.h>
 
+static DEFINE_IDR(battery_id);
+static DEFINE_MUTEX(battery_mutex);
+
 static irqreturn_t bq27xxx_battery_irq_handler_thread(int irq, void *data)
 {
        struct bq27xxx_device_info *di = data;
@@ -70,19 +73,33 @@ static int bq27xxx_battery_i2c_probe(struct i2c_client *client,
 {
        struct bq27xxx_device_info *di;
        int ret;
+       char *name;
+       int num;
+
+       /* Get new ID for the new battery device */
+       mutex_lock(&battery_mutex);
+       num = idr_alloc(&battery_id, client, 0, 0, GFP_KERNEL);
+       mutex_unlock(&battery_mutex);
+       if (num < 0)
+               return num;
+
+       name = devm_kasprintf(&client->dev, GFP_KERNEL, "%s-%d", id->name, num);
+       if (!name)
+               goto err_mem;
 
        di = devm_kzalloc(&client->dev, sizeof(*di), GFP_KERNEL);
        if (!di)
-               return -ENOMEM;
+               goto err_mem;
 
+       di->id = num;
        di->dev = &client->dev;
        di->chip = id->driver_data;
-       di->name = id->name;
+       di->name = name;
        di->bus.read = bq27xxx_battery_i2c_read;
 
        ret = bq27xxx_battery_setup(di);
        if (ret)
-               return ret;
+               goto err_failed;
 
        /* Schedule a polling after about 1 min */
        schedule_delayed_work(&di->work, 60 * HZ);
@@ -103,6 +120,16 @@ static int bq27xxx_battery_i2c_probe(struct i2c_client *client,
        }
 
        return 0;
+
+err_mem:
+       ret = -ENOMEM;
+
+err_failed:
+       mutex_lock(&battery_mutex);
+       idr_remove(&battery_id, num);
+       mutex_unlock(&battery_mutex);
+
+       return ret;
 }
 
 static int bq27xxx_battery_i2c_remove(struct i2c_client *client)
@@ -111,6 +138,10 @@ static int bq27xxx_battery_i2c_remove(struct i2c_client *client)
 
        bq27xxx_battery_teardown(di);
 
+       mutex_lock(&battery_mutex);
+       idr_remove(&battery_id, di->id);
+       mutex_unlock(&battery_mutex);
+
        return 0;
 }
 
index da7bae9..579fd65 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/timekeeping.h>
 
 #include "ptp_private.h"
 
@@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
        struct ptp_clock_caps caps;
        struct ptp_clock_request req;
        struct ptp_sys_offset *sysoff = NULL;
+       struct ptp_sys_offset_precise precise_offset;
        struct ptp_pin_desc pd;
        struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
        struct ptp_clock_info *ops = ptp->info;
        struct ptp_clock_time *pct;
        struct timespec64 ts;
+       struct system_device_crosststamp xtstamp;
        int enable, err = 0;
        unsigned int i, pin_index;
 
@@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
                caps.n_per_out = ptp->info->n_per_out;
                caps.pps = ptp->info->pps;
                caps.n_pins = ptp->info->n_pins;
+               caps.cross_timestamping = ptp->info->getcrosststamp != NULL;
                if (copy_to_user((void __user *)arg, &caps, sizeof(caps)))
                        err = -EFAULT;
                break;
@@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
                err = ops->enable(ops, &req, enable);
                break;
 
+       case PTP_SYS_OFFSET_PRECISE:
+               if (!ptp->info->getcrosststamp) {
+                       err = -EOPNOTSUPP;
+                       break;
+               }
+               err = ptp->info->getcrosststamp(ptp->info, &xtstamp);
+               if (err)
+                       break;
+
+               ts = ktime_to_timespec64(xtstamp.device);
+               precise_offset.device.sec = ts.tv_sec;
+               precise_offset.device.nsec = ts.tv_nsec;
+               ts = ktime_to_timespec64(xtstamp.sys_realtime);
+               precise_offset.sys_realtime.sec = ts.tv_sec;
+               precise_offset.sys_realtime.nsec = ts.tv_nsec;
+               ts = ktime_to_timespec64(xtstamp.sys_monoraw);
+               precise_offset.sys_monoraw.sec = ts.tv_sec;
+               precise_offset.sys_monoraw.nsec = ts.tv_nsec;
+               if (copy_to_user((void __user *)arg, &precise_offset,
+                                sizeof(precise_offset)))
+                       err = -EFAULT;
+               break;
+
        case PTP_SYS_OFFSET:
                sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL);
                if (!sysoff) {
index 934c139..ee4f183 100644 (file)
@@ -178,7 +178,6 @@ static int ptp_ixp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 static int ptp_ixp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 {
        u64 ns;
-       u32 remainder;
        unsigned long flags;
        struct ixp_clock *ixp_clock = container_of(ptp, struct ixp_clock, caps);
        struct ixp46x_ts_regs *regs = ixp_clock->regs;
@@ -189,8 +188,7 @@ static int ptp_ixp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 
        spin_unlock_irqrestore(&register_lock, flags);
 
-       ts->tv_sec = div_u64_rem(ns, 1000000000, &remainder);
-       ts->tv_nsec = remainder;
+       *ts = ns_to_timespec64(ns);
        return 0;
 }
 
@@ -202,8 +200,7 @@ static int ptp_ixp_settime(struct ptp_clock_info *ptp,
        struct ixp_clock *ixp_clock = container_of(ptp, struct ixp_clock, caps);
        struct ixp46x_ts_regs *regs = ixp_clock->regs;
 
-       ns = ts->tv_sec * 1000000000ULL;
-       ns += ts->tv_nsec;
+       ns = timespec64_to_ns(ts);
 
        spin_lock_irqsave(&register_lock, flags);
 
index d7b87c6..e220edc 100644 (file)
@@ -117,7 +117,7 @@ int rio_request_inb_mbox(struct rio_mport *mport,
        if (mport->ops->open_inb_mbox == NULL)
                goto out;
 
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_mbox_res(res, mbox, mbox);
@@ -185,7 +185,7 @@ int rio_request_outb_mbox(struct rio_mport *mport,
        if (mport->ops->open_outb_mbox == NULL)
                goto out;
 
-       res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_mbox_res(res, mbox, mbox);
@@ -285,7 +285,7 @@ int rio_request_inb_dbell(struct rio_mport *mport,
 {
        int rc = 0;
 
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_dbell_res(res, start, end);
@@ -360,7 +360,7 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end)
 struct resource *rio_request_outb_dbell(struct rio_dev *rdev, u16 start,
                                        u16 end)
 {
-       struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL);
+       struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 
        if (res) {
                rio_init_dbell_res(res, start, end);
index 41605da..c78db05 100644 (file)
@@ -3035,6 +3035,7 @@ static void dasd_setup_queue(struct dasd_block *block)
                max = block->base->discipline->max_blocks << block->s2b_shift;
        }
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, block->request_queue);
+       block->request_queue->limits.max_dev_sectors = max;
        blk_queue_logical_block_size(block->request_queue,
                                     block->bp_block);
        blk_queue_max_hw_sectors(block->request_queue, max);
index 184b1db..286782c 100644 (file)
@@ -264,8 +264,10 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device)
                spin_unlock_irqrestore(&lcu->lock, flags);
                cancel_work_sync(&lcu->suc_data.worker);
                spin_lock_irqsave(&lcu->lock, flags);
-               if (device == lcu->suc_data.device)
+               if (device == lcu->suc_data.device) {
+                       dasd_put_device(device);
                        lcu->suc_data.device = NULL;
+               }
        }
        was_pending = 0;
        if (device == lcu->ruac_data.device) {
@@ -273,8 +275,10 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device)
                was_pending = 1;
                cancel_delayed_work_sync(&lcu->ruac_data.dwork);
                spin_lock_irqsave(&lcu->lock, flags);
-               if (device == lcu->ruac_data.device)
+               if (device == lcu->ruac_data.device) {
+                       dasd_put_device(device);
                        lcu->ruac_data.device = NULL;
+               }
        }
        private->lcu = NULL;
        spin_unlock_irqrestore(&lcu->lock, flags);
@@ -549,8 +553,10 @@ static void lcu_update_work(struct work_struct *work)
        if ((rc && (rc != -EOPNOTSUPP)) || (lcu->flags & NEED_UAC_UPDATE)) {
                DBF_DEV_EVENT(DBF_WARNING, device, "could not update"
                            " alias data in lcu (rc = %d), retry later", rc);
-               schedule_delayed_work(&lcu->ruac_data.dwork, 30*HZ);
+               if (!schedule_delayed_work(&lcu->ruac_data.dwork, 30*HZ))
+                       dasd_put_device(device);
        } else {
+               dasd_put_device(device);
                lcu->ruac_data.device = NULL;
                lcu->flags &= ~UPDATE_PENDING;
        }
@@ -593,8 +599,10 @@ static int _schedule_lcu_update(struct alias_lcu *lcu,
         */
        if (!usedev)
                return -EINVAL;
+       dasd_get_device(usedev);
        lcu->ruac_data.device = usedev;
-       schedule_delayed_work(&lcu->ruac_data.dwork, 0);
+       if (!schedule_delayed_work(&lcu->ruac_data.dwork, 0))
+               dasd_put_device(usedev);
        return 0;
 }
 
@@ -723,7 +731,7 @@ static int reset_summary_unit_check(struct alias_lcu *lcu,
        ASCEBC((char *) &cqr->magic, 4);
        ccw = cqr->cpaddr;
        ccw->cmd_code = DASD_ECKD_CCW_RSCK;
-       ccw->flags = ;
+       ccw->flags = CCW_FLAG_SLI;
        ccw->count = 16;
        ccw->cda = (__u32)(addr_t) cqr->data;
        ((char *)cqr->data)[0] = reason;
@@ -930,6 +938,7 @@ static void summary_unit_check_handling_work(struct work_struct *work)
        /* 3. read new alias configuration */
        _schedule_lcu_update(lcu, device);
        lcu->suc_data.device = NULL;
+       dasd_put_device(device);
        spin_unlock_irqrestore(&lcu->lock, flags);
 }
 
@@ -989,6 +998,8 @@ void dasd_alias_handle_summary_unit_check(struct dasd_device *device,
        }
        lcu->suc_data.reason = reason;
        lcu->suc_data.device = device;
+       dasd_get_device(device);
        spin_unlock(&lcu->lock);
-       schedule_work(&lcu->suc_data.worker);
+       if (!schedule_work(&lcu->suc_data.worker))
+               dasd_put_device(device);
 };
index cb61f30..277b5c8 100644 (file)
@@ -67,7 +67,7 @@ static const u8 DASD_DIAG_CMS1[] = { 0xc3, 0xd4, 0xe2, 0xf1 };/* EBCDIC CMS1 */
  * and function code cmd.
  * In case of an exception return 3. Otherwise return result of bitwise OR of
  * resulting condition code and DIAG return code. */
-static inline int dia250(void *iob, int cmd)
+static inline int __dia250(void *iob, int cmd)
 {
        register unsigned long reg2 asm ("2") = (unsigned long) iob;
        typedef union {
@@ -77,7 +77,6 @@ static inline int dia250(void *iob, int cmd)
        int rc;
 
        rc = 3;
-       diag_stat_inc(DIAG_STAT_X250);
        asm volatile(
                "       diag    2,%2,0x250\n"
                "0:     ipm     %0\n"
@@ -91,6 +90,12 @@ static inline int dia250(void *iob, int cmd)
        return rc;
 }
 
+static inline int dia250(void *iob, int cmd)
+{
+       diag_stat_inc(DIAG_STAT_X250);
+       return __dia250(iob, cmd);
+}
+
 /* Initialize block I/O to DIAG device using the specified blocksize and
  * block offset. On success, return zero and set end_block to contain the
  * number of blocks on the device minus the specified offset. Return non-zero
index c692dfe..50597f9 100644 (file)
@@ -139,11 +139,11 @@ static ssize_t chp_measurement_chars_read(struct file *filp,
 
        device = container_of(kobj, struct device, kobj);
        chp = to_channelpath(device);
-       if (!chp->cmg_chars)
+       if (chp->cmg == -1)
                return 0;
 
-       return memory_read_from_buffer(buf, count, &off,
-                               chp->cmg_chars, sizeof(struct cmg_chars));
+       return memory_read_from_buffer(buf, count, &off, &chp->cmg_chars,
+                                      sizeof(chp->cmg_chars));
 }
 
 static struct bin_attribute chp_measurement_chars_attr = {
@@ -416,7 +416,8 @@ static void chp_release(struct device *dev)
  * chp_update_desc - update channel-path description
  * @chp - channel-path
  *
- * Update the channel-path description of the specified channel-path.
+ * Update the channel-path description of the specified channel-path
+ * including channel measurement related information.
  * Return zero on success, non-zero otherwise.
  */
 int chp_update_desc(struct channel_path *chp)
@@ -428,8 +429,10 @@ int chp_update_desc(struct channel_path *chp)
                return rc;
 
        rc = chsc_determine_fmt1_channel_path_desc(chp->chpid, &chp->desc_fmt1);
+       if (rc)
+               return rc;
 
-       return rc;
+       return chsc_get_channel_measurement_chars(chp);
 }
 
 /**
@@ -466,14 +469,6 @@ int chp_new(struct chp_id chpid)
                ret = -ENODEV;
                goto out_free;
        }
-       /* Get channel-measurement characteristics. */
-       if (css_chsc_characteristics.scmc && css_chsc_characteristics.secm) {
-               ret = chsc_get_channel_measurement_chars(chp);
-               if (ret)
-                       goto out_free;
-       } else {
-               chp->cmg = -1;
-       }
        dev_set_name(&chp->dev, "chp%x.%02x", chpid.cssid, chpid.id);
 
        /* make it known to the system */
index 4efd5b8..af02322 100644 (file)
@@ -48,7 +48,7 @@ struct channel_path {
        /* Channel-measurement related stuff: */
        int cmg;
        int shared;
-       void *cmg_chars;
+       struct cmg_chars cmg_chars;
 };
 
 /* Return channel_path struct for given chpid. */
index a831d18..c424c0c 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/device.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 
 #include <asm/cio.h>
@@ -224,8 +225,9 @@ out_unreg:
 
 void chsc_chp_offline(struct chp_id chpid)
 {
-       char dbf_txt[15];
+       struct channel_path *chp = chpid_to_chp(chpid);
        struct chp_link link;
+       char dbf_txt[15];
 
        sprintf(dbf_txt, "chpr%x.%02x", chpid.cssid, chpid.id);
        CIO_TRACE_EVENT(2, dbf_txt);
@@ -236,6 +238,11 @@ void chsc_chp_offline(struct chp_id chpid)
        link.chpid = chpid;
        /* Wait until previous actions have settled. */
        css_wait_for_slow_path();
+
+       mutex_lock(&chp->lock);
+       chp_update_desc(chp);
+       mutex_unlock(&chp->lock);
+
        for_each_subchannel_staged(s390_subchannel_remove_chpid, NULL, &link);
 }
 
@@ -690,8 +697,9 @@ static void chsc_process_crw(struct crw *crw0, struct crw *crw1, int overflow)
 
 void chsc_chp_online(struct chp_id chpid)
 {
-       char dbf_txt[15];
+       struct channel_path *chp = chpid_to_chp(chpid);
        struct chp_link link;
+       char dbf_txt[15];
 
        sprintf(dbf_txt, "cadd%x.%02x", chpid.cssid, chpid.id);
        CIO_TRACE_EVENT(2, dbf_txt);
@@ -701,6 +709,11 @@ void chsc_chp_online(struct chp_id chpid)
                link.chpid = chpid;
                /* Wait until previous actions have settled. */
                css_wait_for_slow_path();
+
+               mutex_lock(&chp->lock);
+               chp_update_desc(chp);
+               mutex_unlock(&chp->lock);
+
                for_each_subchannel_staged(__s390_process_res_acc, NULL,
                                           &link);
                css_schedule_reprobe();
@@ -967,22 +980,19 @@ static void
 chsc_initialize_cmg_chars(struct channel_path *chp, u8 cmcv,
                          struct cmg_chars *chars)
 {
-       struct cmg_chars *cmg_chars;
        int i, mask;
 
-       cmg_chars = chp->cmg_chars;
        for (i = 0; i < NR_MEASUREMENT_CHARS; i++) {
                mask = 0x80 >> (i + 3);
                if (cmcv & mask)
-                       cmg_chars->values[i] = chars->values[i];
+                       chp->cmg_chars.values[i] = chars->values[i];
                else
-                       cmg_chars->values[i] = 0;
+                       chp->cmg_chars.values[i] = 0;
        }
 }
 
 int chsc_get_channel_measurement_chars(struct channel_path *chp)
 {
-       struct cmg_chars *cmg_chars;
        int ccode, ret;
 
        struct {
@@ -1006,10 +1016,11 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
                u32 data[NR_MEASUREMENT_CHARS];
        } __attribute__ ((packed)) *scmc_area;
 
-       chp->cmg_chars = NULL;
-       cmg_chars = kmalloc(sizeof(*cmg_chars), GFP_KERNEL);
-       if (!cmg_chars)
-               return -ENOMEM;
+       chp->shared = -1;
+       chp->cmg = -1;
+
+       if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm)
+               return 0;
 
        spin_lock_irq(&chsc_page_lock);
        memset(chsc_page, 0, PAGE_SIZE);
@@ -1031,25 +1042,19 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
                              scmc_area->response.code);
                goto out;
        }
-       if (scmc_area->not_valid) {
-               chp->cmg = -1;
-               chp->shared = -1;
+       if (scmc_area->not_valid)
                goto out;
-       }
+
        chp->cmg = scmc_area->cmg;
        chp->shared = scmc_area->shared;
        if (chp->cmg != 2 && chp->cmg != 3) {
                /* No cmg-dependent data. */
                goto out;
        }
-       chp->cmg_chars = cmg_chars;
        chsc_initialize_cmg_chars(chp, scmc_area->cmcv,
                                  (struct cmg_chars *) &scmc_area->data);
 out:
        spin_unlock_irq(&chsc_page_lock);
-       if (!chp->cmg_chars)
-               kfree(cmg_chars);
-
        return ret;
 }
 
index 7b23f43..de1b6c1 100644 (file)
@@ -112,9 +112,10 @@ static inline int convert_error(struct zcrypt_device *zdev,
                atomic_set(&zcrypt_rescan_req, 1);
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%drc%d",
-                       zdev->ap_dev->qid, zdev->online, ehdr->reply_code);
+                       AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online,
+                       ehdr->reply_code);
                return -EAGAIN;
        case REP82_ERROR_TRANSPORT_FAIL:
        case REP82_ERROR_MACHINE_FAILURE:
@@ -123,16 +124,18 @@ static inline int convert_error(struct zcrypt_device *zdev,
                atomic_set(&zcrypt_rescan_req, 1);
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%drc%d",
-                       zdev->ap_dev->qid, zdev->online, ehdr->reply_code);
+                       AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online,
+                       ehdr->reply_code);
                return -EAGAIN;
        default:
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%drc%d",
-                       zdev->ap_dev->qid, zdev->online, ehdr->reply_code);
+                       AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online,
+                       ehdr->reply_code);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
index 74edf29..eedfaa2 100644 (file)
@@ -336,9 +336,10 @@ static int convert_type80(struct zcrypt_device *zdev,
                /* The result is too short, the CEX2A card may not do that.. */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%drc%d",
-                              zdev->ap_dev->qid, zdev->online, t80h->code);
+                              AP_QID_DEVICE(zdev->ap_dev->qid),
+                              zdev->online, t80h->code);
 
                return -EAGAIN; /* repeat the request on a different device. */
        }
@@ -368,9 +369,9 @@ static int convert_response(struct zcrypt_device *zdev,
        default: /* Unknown response type, this should NEVER EVER happen */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%dfail",
-                              zdev->ap_dev->qid, zdev->online);
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
index 9a2dd47..2195971 100644 (file)
@@ -572,9 +572,9 @@ static int convert_type86_ica(struct zcrypt_device *zdev,
                        return -EINVAL;
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%drc%d",
-                              zdev->ap_dev->qid, zdev->online,
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online,
                               msg->hdr.reply_code);
                return -EAGAIN; /* repeat the request on a different device. */
        }
@@ -715,9 +715,9 @@ static int convert_response_ica(struct zcrypt_device *zdev,
        default: /* Unknown response type, this should NEVER EVER happen */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%dfail",
-                              zdev->ap_dev->qid, zdev->online);
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
@@ -747,9 +747,9 @@ static int convert_response_xcrb(struct zcrypt_device *zdev,
                xcRB->status = 0x0008044DL; /* HDD_InvalidParm */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%dfail",
-                              zdev->ap_dev->qid, zdev->online);
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
@@ -773,9 +773,9 @@ static int convert_response_ep11_xcrb(struct zcrypt_device *zdev,
        default: /* Unknown response type, this should NEVER EVER happen */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%dfail",
-                              zdev->ap_dev->qid, zdev->online);
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
@@ -800,9 +800,9 @@ static int convert_response_rng(struct zcrypt_device *zdev,
        default: /* Unknown response type, this should NEVER EVER happen */
                zdev->online = 0;
                pr_err("Cryptographic device %x failed and was set offline\n",
-                      zdev->ap_dev->qid);
+                      AP_QID_DEVICE(zdev->ap_dev->qid));
                ZCRYPT_DBF_DEV(DBF_ERR, zdev, "dev%04xo%dfail",
-                              zdev->ap_dev->qid, zdev->online);
+                              AP_QID_DEVICE(zdev->ap_dev->qid), zdev->online);
                return -EAGAIN; /* repeat the request on a different device. */
        }
 }
index 3613581..93880ed 100644 (file)
@@ -562,7 +562,7 @@ static int mode_select_handle_sense(struct scsi_device *sdev,
                        /*
                         * Command Lock contention
                         */
-                       err = SCSI_DH_RETRY;
+                       err = SCSI_DH_IMM_RETRY;
                break;
        default:
                break;
@@ -612,6 +612,8 @@ retry:
                err = mode_select_handle_sense(sdev, h->sense);
                if (err == SCSI_DH_RETRY && retry_cnt--)
                        goto retry;
+               if (err == SCSI_DH_IMM_RETRY)
+                       goto retry;
        }
        if (err == SCSI_DH_OK) {
                h->state = RDAC_STATE_ACTIVE;
index 37a0c71..d1dd161 100644 (file)
@@ -1,5 +1,7 @@
 config SCSI_HISI_SAS
        tristate "HiSilicon SAS"
+       depends on HAS_DMA && HAS_IOMEM
+       depends on ARM64 || COMPILE_TEST
        select SCSI_SAS_LIBSAS
        select BLK_DEV_INTEGRITY
        help
index 057fdeb..eea24d7 100644 (file)
@@ -1289,13 +1289,10 @@ static int slot_complete_v1_hw(struct hisi_hba *hisi_hba,
                goto out;
        }
 
-       if (cmplt_hdr_data & CMPLT_HDR_ERR_RCRD_XFRD_MSK) {
-               if (!(cmplt_hdr_data & CMPLT_HDR_CMD_CMPLT_MSK) ||
-                   !(cmplt_hdr_data & CMPLT_HDR_RSPNS_XFRD_MSK))
-                       ts->stat = SAS_DATA_OVERRUN;
-               else
-                       slot_err_v1_hw(hisi_hba, task, slot);
+       if (cmplt_hdr_data & CMPLT_HDR_ERR_RCRD_XFRD_MSK &&
+               !(cmplt_hdr_data & CMPLT_HDR_RSPNS_XFRD_MSK)) {
 
+               slot_err_v1_hw(hisi_hba, task, slot);
                goto out;
        }
 
index 3b3e099..d6a691e 100644 (file)
@@ -4002,6 +4002,7 @@ static ssize_t ipr_store_update_fw(struct device *dev,
        struct ipr_sglist *sglist;
        char fname[100];
        char *src;
+       char *endline;
        int result, dnld_size;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -4009,6 +4010,10 @@ static ssize_t ipr_store_update_fw(struct device *dev,
 
        snprintf(fname, sizeof(fname), "%s", buf);
 
+       endline = strchr(fname, '\n');
+       if (endline)
+               *endline = '\0';
+
        if (request_firmware(&fw_entry, fname, &ioa_cfg->pdev->dev)) {
                dev_err(&ioa_cfg->pdev->dev, "Firmware file %s not found\n", fname);
                return -EIO;
index 52a8765..692a757 100644 (file)
@@ -2204,7 +2204,7 @@ qla2x00_init_rings(scsi_qla_host_t *vha)
        /* Clear outstanding commands array. */
        for (que = 0; que < ha->max_req_queues; que++) {
                req = ha->req_q_map[que];
-               if (!req)
+               if (!req || !test_bit(que, ha->req_qid_map))
                        continue;
                req->out_ptr = (void *)(req->ring + req->length);
                *req->out_ptr = 0;
@@ -2221,7 +2221,7 @@ qla2x00_init_rings(scsi_qla_host_t *vha)
 
        for (que = 0; que < ha->max_rsp_queues; que++) {
                rsp = ha->rsp_q_map[que];
-               if (!rsp)
+               if (!rsp || !test_bit(que, ha->rsp_qid_map))
                        continue;
                rsp->in_ptr = (void *)(rsp->ring + rsp->length);
                *rsp->in_ptr = 0;
@@ -4981,7 +4981,7 @@ qla25xx_init_queues(struct qla_hw_data *ha)
 
        for (i = 1; i < ha->max_rsp_queues; i++) {
                rsp = ha->rsp_q_map[i];
-               if (rsp) {
+               if (rsp && test_bit(i, ha->rsp_qid_map)) {
                        rsp->options &= ~BIT_0;
                        ret = qla25xx_init_rsp_que(base_vha, rsp);
                        if (ret != QLA_SUCCESS)
@@ -4996,8 +4996,8 @@ qla25xx_init_queues(struct qla_hw_data *ha)
        }
        for (i = 1; i < ha->max_req_queues; i++) {
                req = ha->req_q_map[i];
-               if (req) {
-               /* Clear outstanding commands array. */
+               if (req && test_bit(i, ha->req_qid_map)) {
+                       /* Clear outstanding commands array. */
                        req->options &= ~BIT_0;
                        ret = qla25xx_init_req_que(base_vha, req);
                        if (ret != QLA_SUCCESS)
index d4d65eb..4af9547 100644 (file)
@@ -3063,9 +3063,9 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
                    "MSI-X: Failed to enable support "
                    "-- %d/%d\n Retry with %d vectors.\n",
                    ha->msix_count, ret, ret);
+               ha->msix_count = ret;
+               ha->max_rsp_queues = ha->msix_count - 1;
        }
-       ha->msix_count = ret;
-       ha->max_rsp_queues = ha->msix_count - 1;
        ha->msix_entries = kzalloc(sizeof(struct qla_msix_entry) *
                                ha->msix_count, GFP_KERNEL);
        if (!ha->msix_entries) {
index c5dd594..cf7ba52 100644 (file)
@@ -600,7 +600,7 @@ qla25xx_delete_queues(struct scsi_qla_host *vha)
        /* Delete request queues */
        for (cnt = 1; cnt < ha->max_req_queues; cnt++) {
                req = ha->req_q_map[cnt];
-               if (req) {
+               if (req && test_bit(cnt, ha->req_qid_map)) {
                        ret = qla25xx_delete_req_que(vha, req);
                        if (ret != QLA_SUCCESS) {
                                ql_log(ql_log_warn, vha, 0x00ea,
@@ -614,7 +614,7 @@ qla25xx_delete_queues(struct scsi_qla_host *vha)
        /* Delete response queues */
        for (cnt = 1; cnt < ha->max_rsp_queues; cnt++) {
                rsp = ha->rsp_q_map[cnt];
-               if (rsp) {
+               if (rsp && test_bit(cnt, ha->rsp_qid_map)) {
                        ret = qla25xx_delete_rsp_que(vha, rsp);
                        if (ret != QLA_SUCCESS) {
                                ql_log(ql_log_warn, vha, 0x00eb,
index f1788db..f6c7ce3 100644 (file)
@@ -409,6 +409,9 @@ static void qla2x00_free_queues(struct qla_hw_data *ha)
        int cnt;
 
        for (cnt = 0; cnt < ha->max_req_queues; cnt++) {
+               if (!test_bit(cnt, ha->req_qid_map))
+                       continue;
+
                req = ha->req_q_map[cnt];
                qla2x00_free_req_que(ha, req);
        }
@@ -416,6 +419,9 @@ static void qla2x00_free_queues(struct qla_hw_data *ha)
        ha->req_q_map = NULL;
 
        for (cnt = 0; cnt < ha->max_rsp_queues; cnt++) {
+               if (!test_bit(cnt, ha->rsp_qid_map))
+                       continue;
+
                rsp = ha->rsp_q_map[cnt];
                qla2x00_free_rsp_que(ha, rsp);
        }
index 8075a4c..ee967be 100644 (file)
@@ -105,7 +105,7 @@ static void qlt_response_pkt(struct scsi_qla_host *ha, response_t *pkt);
 static int qlt_issue_task_mgmt(struct qla_tgt_sess *sess, uint32_t lun,
        int fn, void *iocb, int flags);
 static void qlt_send_term_exchange(struct scsi_qla_host *ha, struct qla_tgt_cmd
-       *cmd, struct atio_from_isp *atio, int ha_locked);
+       *cmd, struct atio_from_isp *atio, int ha_locked, int ul_abort);
 static void qlt_reject_free_srr_imm(struct scsi_qla_host *ha,
        struct qla_tgt_srr_imm *imm, int ha_lock);
 static void qlt_abort_cmd_on_host_reset(struct scsi_qla_host *vha,
@@ -1756,7 +1756,7 @@ void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *mcmd)
                qlt_send_notify_ack(vha, &mcmd->orig_iocb.imm_ntfy,
                    0, 0, 0, 0, 0, 0);
        else {
-               if (mcmd->se_cmd.se_tmr_req->function == TMR_ABORT_TASK)
+               if (mcmd->orig_iocb.atio.u.raw.entry_type == ABTS_RECV_24XX)
                        qlt_24xx_send_abts_resp(vha, &mcmd->orig_iocb.abts,
                            mcmd->fc_tm_rsp, false);
                else
@@ -2665,7 +2665,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type,
                        /* no need to terminate. FW already freed exchange. */
                        qlt_abort_cmd_on_host_reset(cmd->vha, cmd);
                else
-                       qlt_send_term_exchange(vha, cmd, &cmd->atio, 1);
+                       qlt_send_term_exchange(vha, cmd, &cmd->atio, 1, 0);
                spin_unlock_irqrestore(&ha->hardware_lock, flags);
                return 0;
        }
@@ -3173,7 +3173,8 @@ static int __qlt_send_term_exchange(struct scsi_qla_host *vha,
 }
 
 static void qlt_send_term_exchange(struct scsi_qla_host *vha,
-       struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked)
+       struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked,
+       int ul_abort)
 {
        unsigned long flags = 0;
        int rc;
@@ -3193,8 +3194,7 @@ static void qlt_send_term_exchange(struct scsi_qla_host *vha,
                qlt_alloc_qfull_cmd(vha, atio, 0, 0);
 
 done:
-       if (cmd && (!cmd->aborted ||
-           !cmd->cmd_sent_to_fw)) {
+       if (cmd && !ul_abort && !cmd->aborted) {
                if (cmd->sg_mapped)
                        qlt_unmap_sg(vha, cmd);
                vha->hw->tgt.tgt_ops->free_cmd(cmd);
@@ -3253,21 +3253,38 @@ static void qlt_chk_exch_leak_thresh_hold(struct scsi_qla_host *vha)
 
 }
 
-void qlt_abort_cmd(struct qla_tgt_cmd *cmd)
+int qlt_abort_cmd(struct qla_tgt_cmd *cmd)
 {
        struct qla_tgt *tgt = cmd->tgt;
        struct scsi_qla_host *vha = tgt->vha;
        struct se_cmd *se_cmd = &cmd->se_cmd;
+       unsigned long flags;
 
        ql_dbg(ql_dbg_tgt_mgt, vha, 0xf014,
            "qla_target(%d): terminating exchange for aborted cmd=%p "
            "(se_cmd=%p, tag=%llu)", vha->vp_idx, cmd, &cmd->se_cmd,
            se_cmd->tag);
 
+       spin_lock_irqsave(&cmd->cmd_lock, flags);
+       if (cmd->aborted) {
+               spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+               /*
+                * It's normal to see 2 calls in this path:
+                *  1) XFER Rdy completion + CMD_T_ABORT
+                *  2) TCM TMR - drain_state_list
+                */
+               ql_dbg(ql_dbg_tgt_mgt, vha, 0xffff,
+                       "multiple abort. %p transport_state %x, t_state %x,"
+                       " se_cmd_flags %x \n", cmd, cmd->se_cmd.transport_state,
+                       cmd->se_cmd.t_state,cmd->se_cmd.se_cmd_flags);
+               return EIO;
+       }
        cmd->aborted = 1;
        cmd->cmd_flags |= BIT_6;
+       spin_unlock_irqrestore(&cmd->cmd_lock, flags);
 
-       qlt_send_term_exchange(vha, cmd, &cmd->atio, 0);
+       qlt_send_term_exchange(vha, cmd, &cmd->atio, 0, 1);
+       return 0;
 }
 EXPORT_SYMBOL(qlt_abort_cmd);
 
@@ -3282,6 +3299,9 @@ void qlt_free_cmd(struct qla_tgt_cmd *cmd)
 
        BUG_ON(cmd->cmd_in_wq);
 
+       if (cmd->sg_mapped)
+               qlt_unmap_sg(cmd->vha, cmd);
+
        if (!cmd->q_full)
                qlt_decr_num_pend_cmds(cmd->vha);
 
@@ -3399,7 +3419,7 @@ static int qlt_term_ctio_exchange(struct scsi_qla_host *vha, void *ctio,
                term = 1;
 
        if (term)
-               qlt_send_term_exchange(vha, cmd, &cmd->atio, 1);
+               qlt_send_term_exchange(vha, cmd, &cmd->atio, 1, 0);
 
        return term;
 }
@@ -3580,12 +3600,13 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle,
                case CTIO_PORT_LOGGED_OUT:
                case CTIO_PORT_UNAVAILABLE:
                {
-                       int logged_out = (status & 0xFFFF);
+                       int logged_out =
+                               (status & 0xFFFF) == CTIO_PORT_LOGGED_OUT;
+
                        ql_dbg(ql_dbg_tgt_mgt, vha, 0xf059,
                            "qla_target(%d): CTIO with %s status %x "
                            "received (state %x, se_cmd %p)\n", vha->vp_idx,
-                           (logged_out == CTIO_PORT_LOGGED_OUT) ?
-                           "PORT LOGGED OUT" : "PORT UNAVAILABLE",
+                           logged_out ? "PORT LOGGED OUT" : "PORT UNAVAILABLE",
                            status, cmd->state, se_cmd);
 
                        if (logged_out && cmd->sess) {
@@ -3754,6 +3775,7 @@ static void __qlt_do_work(struct qla_tgt_cmd *cmd)
                goto out_term;
        }
 
+       spin_lock_init(&cmd->cmd_lock);
        cdb = &atio->u.isp24.fcp_cmnd.cdb[0];
        cmd->se_cmd.tag = atio->u.isp24.exchange_addr;
        cmd->unpacked_lun = scsilun_to_int(
@@ -3796,7 +3818,7 @@ out_term:
         */
        cmd->cmd_flags |= BIT_2;
        spin_lock_irqsave(&ha->hardware_lock, flags);
-       qlt_send_term_exchange(vha, NULL, &cmd->atio, 1);
+       qlt_send_term_exchange(vha, NULL, &cmd->atio, 1, 0);
 
        qlt_decr_num_pend_cmds(vha);
        percpu_ida_free(&sess->se_sess->sess_tag_pool, cmd->se_cmd.map_tag);
@@ -3918,7 +3940,7 @@ static void qlt_create_sess_from_atio(struct work_struct *work)
 
 out_term:
        spin_lock_irqsave(&ha->hardware_lock, flags);
-       qlt_send_term_exchange(vha, NULL, &op->atio, 1);
+       qlt_send_term_exchange(vha, NULL, &op->atio, 1, 0);
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
        kfree(op);
 
@@ -3982,7 +4004,8 @@ static int qlt_handle_cmd_for_atio(struct scsi_qla_host *vha,
 
        cmd->cmd_in_wq = 1;
        cmd->cmd_flags |= BIT_0;
-       cmd->se_cmd.cpuid = -1;
+       cmd->se_cmd.cpuid = ha->msix_count ?
+               ha->tgt.rspq_vector_cpuid : WORK_CPU_UNBOUND;
 
        spin_lock(&vha->cmd_list_lock);
        list_add_tail(&cmd->cmd_list, &vha->qla_cmd_list);
@@ -3990,7 +4013,6 @@ static int qlt_handle_cmd_for_atio(struct scsi_qla_host *vha,
 
        INIT_WORK(&cmd->work, qlt_do_work);
        if (ha->msix_count) {
-               cmd->se_cmd.cpuid = ha->tgt.rspq_vector_cpuid;
                if (cmd->atio.u.isp24.fcp_cmnd.rddata)
                        queue_work_on(smp_processor_id(), qla_tgt_wq,
                            &cmd->work);
@@ -4771,7 +4793,7 @@ out_reject:
                dump_stack();
        } else {
                cmd->cmd_flags |= BIT_9;
-               qlt_send_term_exchange(vha, cmd, &cmd->atio, 1);
+               qlt_send_term_exchange(vha, cmd, &cmd->atio, 1, 0);
        }
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
 }
@@ -4950,7 +4972,7 @@ static void qlt_prepare_srr_imm(struct scsi_qla_host *vha,
                                    sctio, sctio->srr_id);
                                list_del(&sctio->srr_list_entry);
                                qlt_send_term_exchange(vha, sctio->cmd,
-                                   &sctio->cmd->atio, 1);
+                                   &sctio->cmd->atio, 1, 0);
                                kfree(sctio);
                        }
                }
@@ -5123,7 +5145,7 @@ static int __qlt_send_busy(struct scsi_qla_host *vha,
            atio->u.isp24.fcp_hdr.s_id);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
        if (!sess) {
-               qlt_send_term_exchange(vha, NULL, atio, 1);
+               qlt_send_term_exchange(vha, NULL, atio, 1, 0);
                return 0;
        }
        /* Sending marker isn't necessary, since we called from ISR */
@@ -5406,7 +5428,7 @@ static void qlt_24xx_atio_pkt(struct scsi_qla_host *vha,
 #if 1 /* With TERM EXCHANGE some FC cards refuse to boot */
                                qlt_send_busy(vha, atio, SAM_STAT_BUSY);
 #else
-                               qlt_send_term_exchange(vha, NULL, atio, 1);
+                               qlt_send_term_exchange(vha, NULL, atio, 1, 0);
 #endif
 
                                if (!ha_locked)
@@ -5523,7 +5545,7 @@ static void qlt_response_pkt(struct scsi_qla_host *vha, response_t *pkt)
 #if 1 /* With TERM EXCHANGE some FC cards refuse to boot */
                                qlt_send_busy(vha, atio, 0);
 #else
-                               qlt_send_term_exchange(vha, NULL, atio, 1);
+                               qlt_send_term_exchange(vha, NULL, atio, 1, 0);
 #endif
                        } else {
                                if (tgt->tgt_stop) {
@@ -5532,7 +5554,7 @@ static void qlt_response_pkt(struct scsi_qla_host *vha, response_t *pkt)
                                            "command to target, sending TERM "
                                            "EXCHANGE for rsp\n");
                                        qlt_send_term_exchange(vha, NULL,
-                                           atio, 1);
+                                           atio, 1, 0);
                                } else {
                                        ql_dbg(ql_dbg_tgt, vha, 0xe060,
                                            "qla_target(%d): Unable to send "
@@ -5960,7 +5982,7 @@ static void qlt_tmr_work(struct qla_tgt *tgt,
        return;
 
 out_term:
-       qlt_send_term_exchange(vha, NULL, &prm->tm_iocb2, 0);
+       qlt_send_term_exchange(vha, NULL, &prm->tm_iocb2, 1, 0);
        if (sess)
                ha->tgt.tgt_ops->put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
index 71b2865..22a6a76 100644 (file)
@@ -943,6 +943,36 @@ struct qla_tgt_sess {
        qlt_plogi_ack_t *plogi_link[QLT_PLOGI_LINK_MAX];
 };
 
+typedef enum {
+       /*
+        * BIT_0 - Atio Arrival / schedule to work
+        * BIT_1 - qlt_do_work
+        * BIT_2 - qlt_do work failed
+        * BIT_3 - xfer rdy/tcm_qla2xxx_write_pending
+        * BIT_4 - read respond/tcm_qla2xx_queue_data_in
+        * BIT_5 - status respond / tcm_qla2xx_queue_status
+        * BIT_6 - tcm request to abort/Term exchange.
+        *      pre_xmit_response->qlt_send_term_exchange
+        * BIT_7 - SRR received (qlt_handle_srr->qlt_xmit_response)
+        * BIT_8 - SRR received (qlt_handle_srr->qlt_rdy_to_xfer)
+        * BIT_9 - SRR received (qla_handle_srr->qlt_send_term_exchange)
+        * BIT_10 - Data in - hanlde_data->tcm_qla2xxx_handle_data
+
+        * BIT_12 - good completion - qlt_ctio_do_completion -->free_cmd
+        * BIT_13 - Bad completion -
+        *      qlt_ctio_do_completion --> qlt_term_ctio_exchange
+        * BIT_14 - Back end data received/sent.
+        * BIT_15 - SRR prepare ctio
+        * BIT_16 - complete free
+        * BIT_17 - flush - qlt_abort_cmd_on_host_reset
+        * BIT_18 - completion w/abort status
+        * BIT_19 - completion w/unknown status
+        * BIT_20 - tcm_qla2xxx_free_cmd
+        */
+       CMD_FLAG_DATA_WORK = BIT_11,
+       CMD_FLAG_DATA_WORK_FREE = BIT_21,
+} cmd_flags_t;
+
 struct qla_tgt_cmd {
        struct se_cmd se_cmd;
        struct qla_tgt_sess *sess;
@@ -952,6 +982,7 @@ struct qla_tgt_cmd {
        /* Sense buffer that will be mapped into outgoing status */
        unsigned char sense_buffer[TRANSPORT_SENSE_BUFFER];
 
+       spinlock_t cmd_lock;
        /* to save extra sess dereferences */
        unsigned int conf_compl_supported:1;
        unsigned int sg_mapped:1;
@@ -986,30 +1017,8 @@ struct qla_tgt_cmd {
 
        uint64_t jiffies_at_alloc;
        uint64_t jiffies_at_free;
-       /* BIT_0 - Atio Arrival / schedule to work
-        * BIT_1 - qlt_do_work
-        * BIT_2 - qlt_do work failed
-        * BIT_3 - xfer rdy/tcm_qla2xxx_write_pending
-        * BIT_4 - read respond/tcm_qla2xx_queue_data_in
-        * BIT_5 - status respond / tcm_qla2xx_queue_status
-        * BIT_6 - tcm request to abort/Term exchange.
-        *      pre_xmit_response->qlt_send_term_exchange
-        * BIT_7 - SRR received (qlt_handle_srr->qlt_xmit_response)
-        * BIT_8 - SRR received (qlt_handle_srr->qlt_rdy_to_xfer)
-        * BIT_9 - SRR received (qla_handle_srr->qlt_send_term_exchange)
-        * BIT_10 - Data in - hanlde_data->tcm_qla2xxx_handle_data
-        * BIT_11 - Data actually going to TCM : tcm_qla2xx_handle_data_work
-        * BIT_12 - good completion - qlt_ctio_do_completion -->free_cmd
-        * BIT_13 - Bad completion -
-        *      qlt_ctio_do_completion --> qlt_term_ctio_exchange
-        * BIT_14 - Back end data received/sent.
-        * BIT_15 - SRR prepare ctio
-        * BIT_16 - complete free
-        * BIT_17 - flush - qlt_abort_cmd_on_host_reset
-        * BIT_18 - completion w/abort status
-        * BIT_19 - completion w/unknown status
-        */
-       uint32_t cmd_flags;
+
+       cmd_flags_t cmd_flags;
 };
 
 struct qla_tgt_sess_work_param {
@@ -1148,7 +1157,7 @@ static inline void sid_to_portid(const uint8_t *s_id, port_id_t *p)
 extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, response_t *);
 extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *);
 extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t);
-extern void qlt_abort_cmd(struct qla_tgt_cmd *);
+extern int qlt_abort_cmd(struct qla_tgt_cmd *);
 extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *);
 extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *);
 extern void qlt_free_cmd(struct qla_tgt_cmd *cmd);
index ddbe2e7..c3e6225 100644 (file)
@@ -395,6 +395,10 @@ qla27xx_fwdt_entry_t263(struct scsi_qla_host *vha,
        if (ent->t263.queue_type == T263_QUEUE_TYPE_REQ) {
                for (i = 0; i < vha->hw->max_req_queues; i++) {
                        struct req_que *req = vha->hw->req_q_map[i];
+
+                       if (!test_bit(i, vha->hw->req_qid_map))
+                               continue;
+
                        if (req || !buf) {
                                length = req ?
                                    req->length : REQUEST_ENTRY_CNT_24XX;
@@ -408,6 +412,10 @@ qla27xx_fwdt_entry_t263(struct scsi_qla_host *vha,
        } else if (ent->t263.queue_type == T263_QUEUE_TYPE_RSP) {
                for (i = 0; i < vha->hw->max_rsp_queues; i++) {
                        struct rsp_que *rsp = vha->hw->rsp_q_map[i];
+
+                       if (!test_bit(i, vha->hw->rsp_qid_map))
+                               continue;
+
                        if (rsp || !buf) {
                                length = rsp ?
                                    rsp->length : RESPONSE_ENTRY_CNT_MQ;
@@ -634,6 +642,10 @@ qla27xx_fwdt_entry_t274(struct scsi_qla_host *vha,
        if (ent->t274.queue_type == T274_QUEUE_TYPE_REQ_SHAD) {
                for (i = 0; i < vha->hw->max_req_queues; i++) {
                        struct req_que *req = vha->hw->req_q_map[i];
+
+                       if (!test_bit(i, vha->hw->req_qid_map))
+                               continue;
+
                        if (req || !buf) {
                                qla27xx_insert16(i, buf, len);
                                qla27xx_insert16(1, buf, len);
@@ -645,6 +657,10 @@ qla27xx_fwdt_entry_t274(struct scsi_qla_host *vha,
        } else if (ent->t274.queue_type == T274_QUEUE_TYPE_RSP_SHAD) {
                for (i = 0; i < vha->hw->max_rsp_queues; i++) {
                        struct rsp_que *rsp = vha->hw->rsp_q_map[i];
+
+                       if (!test_bit(i, vha->hw->rsp_qid_map))
+                               continue;
+
                        if (rsp || !buf) {
                                qla27xx_insert16(i, buf, len);
                                qla27xx_insert16(1, buf, len);
index faf0a12..1808a01 100644 (file)
@@ -298,6 +298,10 @@ static void tcm_qla2xxx_free_cmd(struct qla_tgt_cmd *cmd)
 {
        cmd->vha->tgt_counters.core_qla_free_cmd++;
        cmd->cmd_in_wq = 1;
+
+       BUG_ON(cmd->cmd_flags & BIT_20);
+       cmd->cmd_flags |= BIT_20;
+
        INIT_WORK(&cmd->work, tcm_qla2xxx_complete_free);
        queue_work_on(smp_processor_id(), tcm_qla2xxx_free_wq, &cmd->work);
 }
@@ -374,6 +378,20 @@ static int tcm_qla2xxx_write_pending(struct se_cmd *se_cmd)
 {
        struct qla_tgt_cmd *cmd = container_of(se_cmd,
                                struct qla_tgt_cmd, se_cmd);
+
+       if (cmd->aborted) {
+               /* Cmd can loop during Q-full.  tcm_qla2xxx_aborted_task
+                * can get ahead of this cmd. tcm_qla2xxx_aborted_task
+                * already kick start the free.
+                */
+               pr_debug("write_pending aborted cmd[%p] refcount %d "
+                       "transport_state %x, t_state %x, se_cmd_flags %x\n",
+                       cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+                       cmd->se_cmd.transport_state,
+                       cmd->se_cmd.t_state,
+                       cmd->se_cmd.se_cmd_flags);
+               return 0;
+       }
        cmd->cmd_flags |= BIT_3;
        cmd->bufflen = se_cmd->data_length;
        cmd->dma_data_direction = target_reverse_dma_direction(se_cmd);
@@ -405,7 +423,7 @@ static int tcm_qla2xxx_write_pending_status(struct se_cmd *se_cmd)
            se_cmd->t_state == TRANSPORT_COMPLETE_QF_WP) {
                spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
                wait_for_completion_timeout(&se_cmd->t_transport_stop_comp,
-                                           3 * HZ);
+                                               50);
                return 0;
        }
        spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
@@ -444,6 +462,9 @@ static int tcm_qla2xxx_handle_cmd(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd,
        if (bidi)
                flags |= TARGET_SCF_BIDI_OP;
 
+       if (se_cmd->cpuid != WORK_CPU_UNBOUND)
+               flags |= TARGET_SCF_USE_CPUID;
+
        sess = cmd->sess;
        if (!sess) {
                pr_err("Unable to locate struct qla_tgt_sess from qla_tgt_cmd\n");
@@ -465,13 +486,25 @@ static int tcm_qla2xxx_handle_cmd(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd,
 static void tcm_qla2xxx_handle_data_work(struct work_struct *work)
 {
        struct qla_tgt_cmd *cmd = container_of(work, struct qla_tgt_cmd, work);
+       unsigned long flags;
 
        /*
         * Ensure that the complete FCP WRITE payload has been received.
         * Otherwise return an exception via CHECK_CONDITION status.
         */
        cmd->cmd_in_wq = 0;
-       cmd->cmd_flags |= BIT_11;
+
+       spin_lock_irqsave(&cmd->cmd_lock, flags);
+       cmd->cmd_flags |= CMD_FLAG_DATA_WORK;
+       if (cmd->aborted) {
+               cmd->cmd_flags |= CMD_FLAG_DATA_WORK_FREE;
+               spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+
+               tcm_qla2xxx_free_cmd(cmd);
+               return;
+       }
+       spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+
        cmd->vha->tgt_counters.qla_core_ret_ctio++;
        if (!cmd->write_data_transferred) {
                /*
@@ -546,6 +579,20 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
        struct qla_tgt_cmd *cmd = container_of(se_cmd,
                                struct qla_tgt_cmd, se_cmd);
 
+       if (cmd->aborted) {
+               /* Cmd can loop during Q-full.  tcm_qla2xxx_aborted_task
+                * can get ahead of this cmd. tcm_qla2xxx_aborted_task
+                * already kick start the free.
+                */
+               pr_debug("queue_data_in aborted cmd[%p] refcount %d "
+                       "transport_state %x, t_state %x, se_cmd_flags %x\n",
+                       cmd,cmd->se_cmd.cmd_kref.refcount.counter,
+                       cmd->se_cmd.transport_state,
+                       cmd->se_cmd.t_state,
+                       cmd->se_cmd.se_cmd_flags);
+               return 0;
+       }
+
        cmd->cmd_flags |= BIT_4;
        cmd->bufflen = se_cmd->data_length;
        cmd->dma_data_direction = target_reverse_dma_direction(se_cmd);
@@ -637,11 +684,34 @@ static void tcm_qla2xxx_queue_tm_rsp(struct se_cmd *se_cmd)
        qlt_xmit_tm_rsp(mcmd);
 }
 
+
+#define DATA_WORK_NOT_FREE(_flags) \
+       (( _flags & (CMD_FLAG_DATA_WORK|CMD_FLAG_DATA_WORK_FREE)) == \
+        CMD_FLAG_DATA_WORK)
 static void tcm_qla2xxx_aborted_task(struct se_cmd *se_cmd)
 {
        struct qla_tgt_cmd *cmd = container_of(se_cmd,
                                struct qla_tgt_cmd, se_cmd);
-       qlt_abort_cmd(cmd);
+       unsigned long flags;
+
+       if (qlt_abort_cmd(cmd))
+               return;
+
+       spin_lock_irqsave(&cmd->cmd_lock, flags);
+       if ((cmd->state == QLA_TGT_STATE_NEW)||
+               ((cmd->state == QLA_TGT_STATE_DATA_IN) &&
+                DATA_WORK_NOT_FREE(cmd->cmd_flags)) ) {
+
+               cmd->cmd_flags |= CMD_FLAG_DATA_WORK_FREE;
+               spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+               /* Cmd have not reached firmware.
+                * Use this trigger to free it. */
+               tcm_qla2xxx_free_cmd(cmd);
+               return;
+       }
+       spin_unlock_irqrestore(&cmd->cmd_lock, flags);
+       return;
+
 }
 
 static void tcm_qla2xxx_clear_sess_lookup(struct tcm_qla2xxx_lport *,
index 47b9d13..bbfbfd9 100644 (file)
@@ -205,6 +205,8 @@ static struct {
        {"Intel", "Multi-Flex", NULL, BLIST_NO_RSOC},
        {"iRiver", "iFP Mass Driver", NULL, BLIST_NOT_LOCKABLE | BLIST_INQUIRY_36},
        {"LASOUND", "CDX7405", "3.10", BLIST_MAX5LUN | BLIST_SINGLELUN},
+       {"Marvell", "Console", NULL, BLIST_SKIP_VPD_PAGES},
+       {"Marvell", "91xx Config", "1.01", BLIST_SKIP_VPD_PAGES},
        {"MATSHITA", "PD-1", NULL, BLIST_FORCELUN | BLIST_SINGLELUN},
        {"MATSHITA", "DMC-LC5", NULL, BLIST_NOT_LOCKABLE | BLIST_INQUIRY_36},
        {"MATSHITA", "DMC-LC40", NULL, BLIST_NOT_LOCKABLE | BLIST_INQUIRY_36},
index fa6b2c4..8c6e318 100644 (file)
@@ -1344,6 +1344,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 
        switch (ret) {
        case BLKPREP_KILL:
+       case BLKPREP_INVALID:
                req->errors = DID_NO_CONNECT << 16;
                /* release the command and kill it */
                if (req->special) {
index 4f18a85..00bc721 100644 (file)
@@ -1272,16 +1272,18 @@ static void __scsi_remove_target(struct scsi_target *starget)
 void scsi_remove_target(struct device *dev)
 {
        struct Scsi_Host *shost = dev_to_shost(dev->parent);
-       struct scsi_target *starget;
+       struct scsi_target *starget, *last_target = NULL;
        unsigned long flags;
 
 restart:
        spin_lock_irqsave(shost->host_lock, flags);
        list_for_each_entry(starget, &shost->__targets, siblings) {
-               if (starget->state == STARGET_DEL)
+               if (starget->state == STARGET_DEL ||
+                   starget == last_target)
                        continue;
                if (starget->dev.parent == dev || &starget->dev == dev) {
                        kref_get(&starget->reap_ref);
+                       last_target = starget;
                        spin_unlock_irqrestore(shost->host_lock, flags);
                        __scsi_remove_target(starget);
                        scsi_target_reap(starget);
index 4e08d1c..d749da7 100644 (file)
@@ -761,7 +761,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
                break;
 
        default:
-               ret = BLKPREP_KILL;
+               ret = BLKPREP_INVALID;
                goto out;
        }
 
@@ -839,7 +839,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
        int ret;
 
        if (sdkp->device->no_write_same)
-               return BLKPREP_KILL;
+               return BLKPREP_INVALID;
 
        BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
@@ -2893,7 +2893,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
            sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS &&
            sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_CACHE_SIZE)
                rw_max = q->limits.io_opt =
-                       logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
+                       sdkp->opt_xfer_blocks * sdp->sector_size;
        else
                rw_max = BLK_DEF_MAX_SECTORS;
 
@@ -3268,8 +3268,8 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors)
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        int ret = 0;
 
-       if (!sdkp)
-               return 0;       /* this can happen */
+       if (!sdkp)      /* E.g.: runtime suspend following sd_remove() */
+               return 0;
 
        if (sdkp->WCE && sdkp->media_present) {
                sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
@@ -3308,6 +3308,9 @@ static int sd_resume(struct device *dev)
 {
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
 
+       if (!sdkp)      /* E.g.: runtime resume at the start of sd_probe() */
+               return 0;
+
        if (!sdkp->device->manage_start_stop)
                return 0;
 
index 503ab8b..5e82067 100644 (file)
@@ -1261,7 +1261,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
        }
 
        sfp->mmap_called = 1;
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data = sfp;
        vma->vm_ops = &sg_mmap_vm_ops;
        return 0;
index 8bd54a6..64c8674 100644 (file)
@@ -144,6 +144,9 @@ static int sr_runtime_suspend(struct device *dev)
 {
        struct scsi_cd *cd = dev_get_drvdata(dev);
 
+       if (!cd)        /* E.g.: runtime suspend following sr_remove() */
+               return 0;
+
        if (cd->media_present)
                return -EBUSY;
        else
@@ -985,6 +988,7 @@ static int sr_remove(struct device *dev)
        scsi_autopm_get_device(cd->device);
 
        del_gendisk(cd->disk);
+       dev_set_drvdata(dev, NULL);
 
        mutex_lock(&sr_ref_mutex);
        kref_put(&cd->kref, sr_kref_release);
index 55627d0..292c04e 100644 (file)
@@ -42,6 +42,7 @@
 #include <scsi/scsi_devinfo.h>
 #include <scsi/scsi_dbg.h>
 #include <scsi/scsi_transport_fc.h>
+#include <scsi/scsi_transport.h>
 
 /*
  * All wire protocol details (storage protocol between the guest and the host)
@@ -477,19 +478,18 @@ struct hv_host_device {
 struct storvsc_scan_work {
        struct work_struct work;
        struct Scsi_Host *host;
-       uint lun;
+       u8 lun;
+       u8 tgt_id;
 };
 
 static void storvsc_device_scan(struct work_struct *work)
 {
        struct storvsc_scan_work *wrk;
-       uint lun;
        struct scsi_device *sdev;
 
        wrk = container_of(work, struct storvsc_scan_work, work);
-       lun = wrk->lun;
 
-       sdev = scsi_device_lookup(wrk->host, 0, 0, lun);
+       sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun);
        if (!sdev)
                goto done;
        scsi_rescan_device(&sdev->sdev_gendev);
@@ -540,7 +540,7 @@ static void storvsc_remove_lun(struct work_struct *work)
        if (!scsi_host_get(wrk->host))
                goto done;
 
-       sdev = scsi_device_lookup(wrk->host, 0, 0, wrk->lun);
+       sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun);
 
        if (sdev) {
                scsi_remove_device(sdev);
@@ -940,6 +940,7 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
 
        wrk->host = host;
        wrk->lun = vm_srb->lun;
+       wrk->tgt_id = vm_srb->target_id;
        INIT_WORK(&wrk->work, process_err_fn);
        schedule_work(&wrk->work);
 }
@@ -1770,6 +1771,11 @@ static int __init storvsc_drv_init(void)
        fc_transport_template = fc_attach_transport(&fc_transport_functions);
        if (!fc_transport_template)
                return -ENODEV;
+
+       /*
+        * Install Hyper-V specific timeout handler.
+        */
+       fc_transport_template->eh_timed_out = storvsc_eh_timed_out;
 #endif
 
        ret = vmbus_driver_register(&storvsc_drv);
index 91a0030..a9bac3b 100644 (file)
@@ -34,7 +34,7 @@ static struct pm_clk_notifier_block platform_bus_notifier = {
 
 static int __init sh_pm_runtime_init(void)
 {
-       if (IS_ENABLED(CONFIG_ARCH_SHMOBILE)) {
+       if (IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_ARCH_SHMOBILE)) {
                if (!of_find_compatible_node(NULL, NULL,
                                             "renesas,cpg-mstp-clocks"))
                        return 0;
index 2d9e7f3..bb1fb77 100644 (file)
@@ -66,7 +66,7 @@ int superhyway_add_device(unsigned long base, struct superhyway_device *sdev,
        superhyway_read_vcr(dev, base, &dev->vcr);
 
        if (!dev->resource) {
-               dev->resource = kmalloc(sizeof(struct resource), GFP_KERNEL);
+               dev->resource = kzalloc(sizeof(struct resource), GFP_KERNEL);
                if (!dev->resource) {
                        kfree(dev);
                        return -ENOMEM;
index aebad36..8feac59 100644 (file)
@@ -1571,6 +1571,7 @@ static int atmel_spi_probe(struct platform_device *pdev)
 
        as->use_cs_gpios = true;
        if (atmel_spi_is_v2(as) &&
+           pdev->dev.of_node &&
            !of_get_property(pdev->dev.of_node, "cs-gpios", NULL)) {
                as->use_cs_gpios = false;
                master->num_chipselect = 4;
index 7de6f84..ecc73c0 100644 (file)
@@ -73,8 +73,8 @@
 
 /* Bitfields in CNTL1 */
 #define BCM2835_AUX_SPI_CNTL1_CSHIGH   0x00000700
-#define BCM2835_AUX_SPI_CNTL1_IDLE     0x00000080
-#define BCM2835_AUX_SPI_CNTL1_TXEMPTY  0x00000040
+#define BCM2835_AUX_SPI_CNTL1_TXEMPTY  0x00000080
+#define BCM2835_AUX_SPI_CNTL1_IDLE     0x00000040
 #define BCM2835_AUX_SPI_CNTL1_MSBF_IN  0x00000002
 #define BCM2835_AUX_SPI_CNTL1_KEEP_IN  0x00000001
 
index 7fd6a4c..7cb0c19 100644 (file)
@@ -84,7 +84,7 @@ struct fsl_espi_transfer {
 /* SPCOM register values */
 #define SPCOM_CS(x)            ((x) << 30)
 #define SPCOM_TRANLEN(x)       ((x) << 0)
-#define        SPCOM_TRANLEN_MAX       0xFFFF  /* Max transaction length */
+#define        SPCOM_TRANLEN_MAX       0x10000 /* Max transaction length */
 
 #define AUTOSUSPEND_TIMEOUT 2000
 
@@ -233,7 +233,7 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
        reinit_completion(&mpc8xxx_spi->done);
 
        /* Set SPCOM[CS] and SPCOM[TRANLEN] field */
-       if ((t->len - 1) > SPCOM_TRANLEN_MAX) {
+       if (t->len > SPCOM_TRANLEN_MAX) {
                dev_err(mpc8xxx_spi->dev, "Transaction length (%d)"
                                " beyond the SPCOM[TRANLEN] field\n", t->len);
                return -EINVAL;
index d98c33c..c688efa 100644 (file)
@@ -204,8 +204,8 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi,
 {
        struct spi_imx_data *spi_imx = spi_master_get_devdata(master);
 
-       if (spi_imx->dma_is_inited &&
-           transfer->len > spi_imx->wml * sizeof(u32))
+       if (spi_imx->dma_is_inited && transfer->len >= spi_imx->wml &&
+           (transfer->len % spi_imx->wml) == 0)
                return true;
        return false;
 }
@@ -919,8 +919,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        struct dma_async_tx_descriptor *desc_tx = NULL, *desc_rx = NULL;
        int ret;
        unsigned long timeout;
-       u32 dma;
-       int left;
        struct spi_master *master = spi_imx->bitbang.master;
        struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
 
@@ -929,7 +927,7 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
                                        tx->sgl, tx->nents, DMA_MEM_TO_DEV,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
                if (!desc_tx)
-                       goto no_dma;
+                       goto tx_nodma;
 
                desc_tx->callback = spi_imx_dma_tx_callback;
                desc_tx->callback_param = (void *)spi_imx;
@@ -941,7 +939,7 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
                                        rx->sgl, rx->nents, DMA_DEV_TO_MEM,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
                if (!desc_rx)
-                       goto no_dma;
+                       goto rx_nodma;
 
                desc_rx->callback = spi_imx_dma_rx_callback;
                desc_rx->callback_param = (void *)spi_imx;
@@ -954,13 +952,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        /* Trigger the cspi module. */
        spi_imx->dma_finished = 0;
 
-       dma = readl(spi_imx->base + MX51_ECSPI_DMA);
-       dma = dma & (~MX51_ECSPI_DMA_RXT_WML_MASK);
-       /* Change RX_DMA_LENGTH trigger dma fetch tail data */
-       left = transfer->len % spi_imx->wml;
-       if (left)
-               writel(dma | (left << MX51_ECSPI_DMA_RXT_WML_OFFSET),
-                               spi_imx->base + MX51_ECSPI_DMA);
        /*
         * Set these order to avoid potential RX overflow. The overflow may
         * happen if we enable SPI HW before starting RX DMA due to rescheduling
@@ -992,10 +983,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
                        spi_imx->devtype_data->reset(spi_imx);
                        dmaengine_terminate_all(master->dma_rx);
                }
-               dma &= ~MX51_ECSPI_DMA_RXT_WML_MASK;
-               writel(dma |
-                      spi_imx->wml << MX51_ECSPI_DMA_RXT_WML_OFFSET,
-                      spi_imx->base + MX51_ECSPI_DMA);
        }
 
        spi_imx->dma_finished = 1;
@@ -1008,7 +995,9 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
 
        return ret;
 
-no_dma:
+rx_nodma:
+       dmaengine_terminate_all(master->dma_tx);
+tx_nodma:
        pr_warn_once("%s %s: DMA not available, falling back to PIO\n",
                     dev_driver_string(&master->dev),
                     dev_name(&master->dev));
index 894616f..cf4bb36 100644 (file)
@@ -761,6 +761,7 @@ static int spi_test_run_iter(struct spi_device *spi,
                test.iterate_transfer_mask = 1;
 
        /* count number of transfers with tx/rx_buf != NULL */
+       rx_count = tx_count = 0;
        for (i = 0; i < test.transfer_count; i++) {
                if (test.transfers[i].tx_buf)
                        tx_count++;
index 7273820..0caa3c8 100644 (file)
@@ -1490,6 +1490,8 @@ static int omap2_mcspi_probe(struct platform_device *pdev)
        return status;
 
 disable_pm:
+       pm_runtime_dont_use_autosuspend(&pdev->dev);
+       pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
 free_master:
        spi_master_put(master);
@@ -1501,6 +1503,7 @@ static int omap2_mcspi_remove(struct platform_device *pdev)
        struct spi_master *master = platform_get_drvdata(pdev);
        struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
 
+       pm_runtime_dont_use_autosuspend(mcspi->dev);
        pm_runtime_put_sync(mcspi->dev);
        pm_runtime_disable(&pdev->dev);
 
index 79a8bc4..7cb1b2d 100644 (file)
@@ -749,6 +749,7 @@ static int rockchip_spi_probe(struct platform_device *pdev)
        return 0;
 
 err_register_master:
+       pm_runtime_disable(&pdev->dev);
        if (rs->dma_tx.ch)
                dma_release_channel(rs->dma_tx.ch);
        if (rs->dma_rx.ch)
@@ -778,6 +779,8 @@ static int rockchip_spi_remove(struct platform_device *pdev)
        if (rs->dma_rx.ch)
                dma_release_channel(rs->dma_rx.ch);
 
+       spi_master_put(master);
+
        return 0;
 }
 
index 0c67586..d8e4219 100644 (file)
@@ -83,6 +83,7 @@ config SSB_SDIOHOST
 config SSB_HOST_SOC
        bool "Support for SSB bus on SoC"
        depends on SSB && BCM47XX_NVRAM
+       select SSB_SPROM
        help
          Host interface for a SSB directly mapped into memory. This is
          for some Broadcom SoCs from the BCM47xx and BCM53xx lines.
index cde5ff7..d1a7507 100644 (file)
@@ -613,9 +613,10 @@ out:
        return err;
 }
 
-static int ssb_bus_register(struct ssb_bus *bus,
-                           ssb_invariants_func_t get_invariants,
-                           unsigned long baseaddr)
+static int __maybe_unused
+ssb_bus_register(struct ssb_bus *bus,
+                ssb_invariants_func_t get_invariants,
+                unsigned long baseaddr)
 {
        int err;
 
index 58d4517..b9519be 100644 (file)
@@ -6,6 +6,7 @@ menu "Analog to digital converters"
 config AD7606
        tristate "Analog Devices AD7606 ADC driver"
        depends on GPIOLIB || COMPILE_TEST
+       depends on HAS_IOMEM
        select IIO_BUFFER
        select IIO_TRIGGERED_BUFFER
        help
index f129039..6928710 100644 (file)
@@ -217,8 +217,12 @@ error_ret:
 static int ade7753_reset(struct device *dev)
 {
        u16 val;
+       int ret;
+
+       ret = ade7753_spi_read_reg_16(dev, ADE7753_MODE, &val);
+       if (ret)
+               return ret;
 
-       ade7753_spi_read_reg_16(dev, ADE7753_MODE, &val);
        val |= BIT(6); /* Software Chip Reset */
 
        return ade7753_spi_write_reg_16(dev, ADE7753_MODE, val);
@@ -343,8 +347,12 @@ error_ret:
 static int ade7753_stop_device(struct device *dev)
 {
        u16 val;
+       int ret;
+
+       ret = ade7753_spi_read_reg_16(dev, ADE7753_MODE, &val);
+       if (ret)
+               return ret;
 
-       ade7753_spi_read_reg_16(dev, ADE7753_MODE, &val);
        val |= BIT(4);  /* AD converters can be turned off */
 
        return ade7753_spi_write_reg_16(dev, ADE7753_MODE, val);
index 3ec7e65..db49af9 100644 (file)
@@ -147,7 +147,7 @@ static int vpfe_prepare_pipeline(struct vpfe_video_device *video)
        mutex_lock(&mdev->graph_mutex);
        ret = media_entity_graph_walk_init(&graph, entity->graph_obj.mdev);
        if (ret) {
-               mutex_unlock(&video->lock);
+               mutex_unlock(&mdev->graph_mutex);
                return -ENOMEM;
        }
        media_entity_graph_walk_start(&graph, entity);
index 79ac192..70b8f4f 100644 (file)
@@ -825,8 +825,7 @@ static void lcd_write_cmd_s(int cmd)
        lcd_send_serial(0x1F);  /* R/W=W, RS=0 */
        lcd_send_serial(cmd & 0x0F);
        lcd_send_serial((cmd >> 4) & 0x0F);
-       /* the shortest command takes at least 40 us */
-       usleep_range(40, 100);
+       udelay(40);             /* the shortest command takes at least 40 us */
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -837,8 +836,7 @@ static void lcd_write_data_s(int data)
        lcd_send_serial(0x5F);  /* R/W=W, RS=1 */
        lcd_send_serial(data & 0x0F);
        lcd_send_serial((data >> 4) & 0x0F);
-       /* the shortest data takes at least 40 us */
-       usleep_range(40, 100);
+       udelay(40);             /* the shortest data takes at least 40 us */
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -848,20 +846,19 @@ static void lcd_write_cmd_p8(int cmd)
        spin_lock_irq(&pprt_lock);
        /* present the data to the data port */
        w_dtr(pprt, cmd);
-       /* maintain the data during 20 us before the strobe */
-       usleep_range(20, 100);
+       udelay(20);     /* maintain the data during 20 us before the strobe */
 
        bits.e = BIT_SET;
        bits.rs = BIT_CLR;
        bits.rw = BIT_CLR;
        set_ctrl_bits();
 
-       usleep_range(40, 100);  /* maintain the strobe during 40 us */
+       udelay(40);     /* maintain the strobe during 40 us */
 
        bits.e = BIT_CLR;
        set_ctrl_bits();
 
-       usleep_range(120, 500); /* the shortest command takes at least 120 us */
+       udelay(120);    /* the shortest command takes at least 120 us */
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -871,20 +868,19 @@ static void lcd_write_data_p8(int data)
        spin_lock_irq(&pprt_lock);
        /* present the data to the data port */
        w_dtr(pprt, data);
-       /* maintain the data during 20 us before the strobe */
-       usleep_range(20, 100);
+       udelay(20);     /* maintain the data during 20 us before the strobe */
 
        bits.e = BIT_SET;
        bits.rs = BIT_SET;
        bits.rw = BIT_CLR;
        set_ctrl_bits();
 
-       usleep_range(40, 100);  /* maintain the strobe during 40 us */
+       udelay(40);     /* maintain the strobe during 40 us */
 
        bits.e = BIT_CLR;
        set_ctrl_bits();
 
-       usleep_range(45, 100);  /* the shortest data takes at least 45 us */
+       udelay(45);     /* the shortest data takes at least 45 us */
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -894,7 +890,7 @@ static void lcd_write_cmd_tilcd(int cmd)
        spin_lock_irq(&pprt_lock);
        /* present the data to the control port */
        w_ctr(pprt, cmd);
-       usleep_range(60, 120);
+       udelay(60);
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -904,7 +900,7 @@ static void lcd_write_data_tilcd(int data)
        spin_lock_irq(&pprt_lock);
        /* present the data to the data port */
        w_dtr(pprt, data);
-       usleep_range(60, 120);
+       udelay(60);
        spin_unlock_irq(&pprt_lock);
 }
 
@@ -947,7 +943,7 @@ static void lcd_clear_fast_s(void)
                lcd_send_serial(0x5F);  /* R/W=W, RS=1 */
                lcd_send_serial(' ' & 0x0F);
                lcd_send_serial((' ' >> 4) & 0x0F);
-               usleep_range(40, 100);  /* the shortest data takes at least 40 us */
+               udelay(40);     /* the shortest data takes at least 40 us */
        }
        spin_unlock_irq(&pprt_lock);
 
@@ -971,7 +967,7 @@ static void lcd_clear_fast_p8(void)
                w_dtr(pprt, ' ');
 
                /* maintain the data during 20 us before the strobe */
-               usleep_range(20, 100);
+               udelay(20);
 
                bits.e = BIT_SET;
                bits.rs = BIT_SET;
@@ -979,13 +975,13 @@ static void lcd_clear_fast_p8(void)
                set_ctrl_bits();
 
                /* maintain the strobe during 40 us */
-               usleep_range(40, 100);
+               udelay(40);
 
                bits.e = BIT_CLR;
                set_ctrl_bits();
 
                /* the shortest data takes at least 45 us */
-               usleep_range(45, 100);
+               udelay(45);
        }
        spin_unlock_irq(&pprt_lock);
 
@@ -1007,7 +1003,7 @@ static void lcd_clear_fast_tilcd(void)
        for (pos = 0; pos < lcd.height * lcd.hwidth; pos++) {
                /* present the data to the data port */
                w_dtr(pprt, ' ');
-               usleep_range(60, 120);
+               udelay(60);
        }
 
        spin_unlock_irq(&pprt_lock);
index ba87650..f1f3eca 100644 (file)
@@ -22,12 +22,6 @@ menuconfig STAGING_RDMA
 # Please keep entries in alphabetic order
 if STAGING_RDMA
 
-source "drivers/staging/rdma/amso1100/Kconfig"
-
-source "drivers/staging/rdma/ehca/Kconfig"
-
 source "drivers/staging/rdma/hfi1/Kconfig"
 
-source "drivers/staging/rdma/ipath/Kconfig"
-
 endif
index 139d78e..8c7fc1d 100644 (file)
@@ -1,5 +1,2 @@
 # Entries for RDMA_STAGING tree
-obj-$(CONFIG_INFINIBAND_AMSO1100)      += amso1100/
-obj-$(CONFIG_INFINIBAND_EHCA)  += ehca/
 obj-$(CONFIG_INFINIBAND_HFI1)  += hfi1/
-obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
diff --git a/drivers/staging/rdma/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild
deleted file mode 100644 (file)
index 950dfab..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
-
-obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
-
-iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
-       c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/staging/rdma/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig
deleted file mode 100644 (file)
index e6ce5f2..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-config INFINIBAND_AMSO1100
-       tristate "Ammasso 1100 HCA support"
-       depends on PCI && INET
-       ---help---
-         This is a low-level driver for the Ammasso 1100 host
-         channel adapter (HCA).
-
-config INFINIBAND_AMSO1100_DEBUG
-       bool "Verbose debugging output"
-       depends on INFINIBAND_AMSO1100
-       default n
-       ---help---
-         This option causes the amso1100 driver to produce a bunch of
-         debug messages.  Select this if you are developing the driver
-         or trying to diagnose a problem.
diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO
deleted file mode 100644 (file)
index 18b00a5..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-7/2015
-
-The amso1100 driver has been deprecated and moved to drivers/staging.
-It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c
deleted file mode 100644 (file)
index b46ebd1..0000000
+++ /dev/null
@@ -1,1240 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/slab.h>
-#include <linux/prefetch.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_provider.h"
-
-MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
-MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
-
-static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
-    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
-
-static int debug = -1;         /* defaults above */
-module_param(debug, int, 0);
-MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
-
-static int c2_up(struct net_device *netdev);
-static int c2_down(struct net_device *netdev);
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
-static void c2_tx_interrupt(struct net_device *netdev);
-static void c2_rx_interrupt(struct net_device *netdev);
-static irqreturn_t c2_interrupt(int irq, void *dev_id);
-static void c2_tx_timeout(struct net_device *netdev);
-static int c2_change_mtu(struct net_device *netdev, int new_mtu);
-static void c2_reset(struct c2_port *c2_port);
-
-static struct pci_device_id c2_pci_table[] = {
-       { PCI_DEVICE(0x18b8, 0xb001) },
-       { 0 }
-};
-
-MODULE_DEVICE_TABLE(pci, c2_pci_table);
-
-static void c2_set_rxbufsize(struct c2_port *c2_port)
-{
-       struct net_device *netdev = c2_port->netdev;
-
-       if (netdev->mtu > RX_BUF_SIZE)
-               c2_port->rx_buf_size =
-                   netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
-                   NET_IP_ALIGN;
-       else
-               c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
-}
-
-/*
- * Allocate TX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
-                           dma_addr_t base, void __iomem * mmio_txp_ring)
-{
-       struct c2_tx_desc *tx_desc;
-       struct c2_txp_desc __iomem *txp_desc;
-       struct c2_element *elem;
-       int i;
-
-       tx_ring->start = kmalloc_array(tx_ring->count, sizeof(*elem),
-                                      GFP_KERNEL);
-       if (!tx_ring->start)
-               return -ENOMEM;
-
-       elem = tx_ring->start;
-       tx_desc = vaddr;
-       txp_desc = mmio_txp_ring;
-       for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
-               tx_desc->len = 0;
-               tx_desc->status = 0;
-
-               /* Set TXP_HTXD_UNINIT */
-               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-                            (void __iomem *) txp_desc + C2_TXP_ADDR);
-               __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
-               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-                            (void __iomem *) txp_desc + C2_TXP_FLAGS);
-
-               elem->skb = NULL;
-               elem->ht_desc = tx_desc;
-               elem->hw_desc = txp_desc;
-
-               if (i == tx_ring->count - 1) {
-                       elem->next = tx_ring->start;
-                       tx_desc->next_offset = base;
-               } else {
-                       elem->next = elem + 1;
-                       tx_desc->next_offset =
-                           base + (i + 1) * sizeof(*tx_desc);
-               }
-       }
-
-       tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
-
-       return 0;
-}
-
-/*
- * Allocate RX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
-                           dma_addr_t base, void __iomem * mmio_rxp_ring)
-{
-       struct c2_rx_desc *rx_desc;
-       struct c2_rxp_desc __iomem *rxp_desc;
-       struct c2_element *elem;
-       int i;
-
-       rx_ring->start = kmalloc_array(rx_ring->count, sizeof(*elem),
-                                      GFP_KERNEL);
-       if (!rx_ring->start)
-               return -ENOMEM;
-
-       elem = rx_ring->start;
-       rx_desc = vaddr;
-       rxp_desc = mmio_rxp_ring;
-       for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
-               rx_desc->len = 0;
-               rx_desc->status = 0;
-
-               /* Set RXP_HRXD_UNINIT */
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
-                      (void __iomem *) rxp_desc + C2_RXP_STATUS);
-               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
-               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
-               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-                            (void __iomem *) rxp_desc + C2_RXP_ADDR);
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-                            (void __iomem *) rxp_desc + C2_RXP_FLAGS);
-
-               elem->skb = NULL;
-               elem->ht_desc = rx_desc;
-               elem->hw_desc = rxp_desc;
-
-               if (i == rx_ring->count - 1) {
-                       elem->next = rx_ring->start;
-                       rx_desc->next_offset = base;
-               } else {
-                       elem->next = elem + 1;
-                       rx_desc->next_offset =
-                           base + (i + 1) * sizeof(*rx_desc);
-               }
-       }
-
-       rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
-
-       return 0;
-}
-
-/* Setup buffer for receiving */
-static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_rx_desc *rx_desc = elem->ht_desc;
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen;
-       struct c2_rxp_hdr *rxp_hdr;
-
-       skb = dev_alloc_skb(c2_port->rx_buf_size);
-       if (unlikely(!skb)) {
-               pr_debug("%s: out of memory for receive\n",
-                       c2_port->netdev->name);
-               return -ENOMEM;
-       }
-
-       /* Zero out the rxp hdr in the sk_buff */
-       memset(skb->data, 0, sizeof(*rxp_hdr));
-
-       skb->dev = c2_port->netdev;
-
-       maplen = c2_port->rx_buf_size;
-       mapaddr =
-           pci_map_single(c2dev->pcidev, skb->data, maplen,
-                          PCI_DMA_FROMDEVICE);
-
-       /* Set the sk_buff RXP_header to RXP_HRXD_READY */
-       rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-       rxp_hdr->flags = RXP_HRXD_READY;
-
-       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-       __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
-                    elem->hw_desc + C2_RXP_LEN);
-       __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                    elem->hw_desc + C2_RXP_FLAGS);
-
-       elem->skb = skb;
-       elem->mapaddr = mapaddr;
-       elem->maplen = maplen;
-       rx_desc->len = maplen;
-
-       return 0;
-}
-
-/*
- * Allocate buffers for the Rx ring
- * For receive:  rx_ring.to_clean is next received frame
- */
-static int c2_rx_fill(struct c2_port *c2_port)
-{
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       int ret = 0;
-
-       elem = rx_ring->start;
-       do {
-               if (c2_rx_alloc(c2_port, elem)) {
-                       ret = 1;
-                       break;
-               }
-       } while ((elem = elem->next) != rx_ring->start);
-
-       rx_ring->to_clean = rx_ring->start;
-       return ret;
-}
-
-/* Free all buffers in RX ring, assumes receiver stopped */
-static void c2_rx_clean(struct c2_port *c2_port)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       struct c2_rx_desc *rx_desc;
-
-       elem = rx_ring->start;
-       do {
-               rx_desc = elem->ht_desc;
-               rx_desc->len = 0;
-
-               __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-               __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-               __raw_writew(0, elem->hw_desc + C2_RXP_LEN);
-               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-                            elem->hw_desc + C2_RXP_ADDR);
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-                            elem->hw_desc + C2_RXP_FLAGS);
-
-               if (elem->skb) {
-                       pci_unmap_single(c2dev->pcidev, elem->mapaddr,
-                                        elem->maplen, PCI_DMA_FROMDEVICE);
-                       dev_kfree_skb(elem->skb);
-                       elem->skb = NULL;
-               }
-       } while ((elem = elem->next) != rx_ring->start);
-}
-
-static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
-{
-       struct c2_tx_desc *tx_desc = elem->ht_desc;
-
-       tx_desc->len = 0;
-
-       pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
-                        PCI_DMA_TODEVICE);
-
-       if (elem->skb) {
-               dev_kfree_skb_any(elem->skb);
-               elem->skb = NULL;
-       }
-
-       return 0;
-}
-
-/* Free all buffers in TX ring, assumes transmitter stopped */
-static void c2_tx_clean(struct c2_port *c2_port)
-{
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       struct c2_txp_desc txp_htxd;
-       int retry;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-       elem = tx_ring->start;
-
-       do {
-               retry = 0;
-               do {
-                       txp_htxd.flags =
-                           readw(elem->hw_desc + C2_TXP_FLAGS);
-
-                       if (txp_htxd.flags == TXP_HTXD_READY) {
-                               retry = 1;
-                               __raw_writew(0,
-                                            elem->hw_desc + C2_TXP_LEN);
-                               __raw_writeq(0,
-                                            elem->hw_desc + C2_TXP_ADDR);
-                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
-                                            elem->hw_desc + C2_TXP_FLAGS);
-                               c2_port->netdev->stats.tx_dropped++;
-                               break;
-                       } else {
-                               __raw_writew(0,
-                                            elem->hw_desc + C2_TXP_LEN);
-                               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-                                            elem->hw_desc + C2_TXP_ADDR);
-                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-                                            elem->hw_desc + C2_TXP_FLAGS);
-                       }
-
-                       c2_tx_free(c2_port->c2dev, elem);
-
-               } while ((elem = elem->next) != tx_ring->start);
-       } while (retry);
-
-       c2_port->tx_avail = c2_port->tx_ring.count - 1;
-       c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
-
-       if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-               netif_wake_queue(c2_port->netdev);
-
-       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-}
-
-/*
- * Process transmit descriptors marked 'DONE' by the firmware,
- * freeing up their unneeded sk_buffs.
- */
-static void c2_tx_interrupt(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       struct c2_txp_desc txp_htxd;
-
-       spin_lock(&c2_port->tx_lock);
-
-       for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
-            elem = elem->next) {
-               txp_htxd.flags =
-                   be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
-
-               if (txp_htxd.flags != TXP_HTXD_DONE)
-                       break;
-
-               if (netif_msg_tx_done(c2_port)) {
-                       /* PCI reads are expensive in fast path */
-                       txp_htxd.len =
-                           be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
-                       pr_debug("%s: tx done slot %3Zu status 0x%x len "
-                               "%5u bytes\n",
-                               netdev->name, elem - tx_ring->start,
-                               txp_htxd.flags, txp_htxd.len);
-               }
-
-               c2_tx_free(c2dev, elem);
-               ++(c2_port->tx_avail);
-       }
-
-       tx_ring->to_clean = elem;
-
-       if (netif_queue_stopped(netdev)
-           && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-               netif_wake_queue(netdev);
-
-       spin_unlock(&c2_port->tx_lock);
-}
-
-static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
-{
-       struct c2_rx_desc *rx_desc = elem->ht_desc;
-       struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-
-       if (rxp_hdr->status != RXP_HRXD_OK ||
-           rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
-               pr_debug("BAD RXP_HRXD\n");
-               pr_debug("  rx_desc : %p\n", rx_desc);
-               pr_debug("    index : %Zu\n",
-                       elem - c2_port->rx_ring.start);
-               pr_debug("    len   : %u\n", rx_desc->len);
-               pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
-                       (void *) __pa((unsigned long) rxp_hdr));
-               pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
-               pr_debug("    status: 0x%x\n", rxp_hdr->status);
-               pr_debug("    len   : %u\n", rxp_hdr->len);
-               pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
-       }
-
-       /* Setup the skb for reuse since we're dropping this pkt */
-       elem->skb->data = elem->skb->head;
-       skb_reset_tail_pointer(elem->skb);
-
-       /* Zero out the rxp hdr in the sk_buff */
-       memset(elem->skb->data, 0, sizeof(*rxp_hdr));
-
-       /* Write the descriptor to the adapter's rx ring */
-       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-       __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-       __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
-                    elem->hw_desc + C2_RXP_LEN);
-       __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
-                    elem->hw_desc + C2_RXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                    elem->hw_desc + C2_RXP_FLAGS);
-
-       pr_debug("packet dropped\n");
-       c2_port->netdev->stats.rx_dropped++;
-}
-
-static void c2_rx_interrupt(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       struct c2_rx_desc *rx_desc;
-       struct c2_rxp_hdr *rxp_hdr;
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen, buflen;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2dev->lock, flags);
-
-       /* Begin where we left off */
-       rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
-
-       for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
-            elem = elem->next) {
-               rx_desc = elem->ht_desc;
-               mapaddr = elem->mapaddr;
-               maplen = elem->maplen;
-               skb = elem->skb;
-               rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-
-               if (rxp_hdr->flags != RXP_HRXD_DONE)
-                       break;
-               buflen = rxp_hdr->len;
-
-               /* Sanity check the RXP header */
-               if (rxp_hdr->status != RXP_HRXD_OK ||
-                   buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
-                       c2_rx_error(c2_port, elem);
-                       continue;
-               }
-
-               /*
-                * Allocate and map a new skb for replenishing the host
-                * RX desc
-                */
-               if (c2_rx_alloc(c2_port, elem)) {
-                       c2_rx_error(c2_port, elem);
-                       continue;
-               }
-
-               /* Unmap the old skb */
-               pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
-                                PCI_DMA_FROMDEVICE);
-
-               prefetch(skb->data);
-
-               /*
-                * Skip past the leading 8 bytes comprising of the
-                * "struct c2_rxp_hdr", prepended by the adapter
-                * to the usual Ethernet header ("struct ethhdr"),
-                * to the start of the raw Ethernet packet.
-                *
-                * Fix up the various fields in the sk_buff before
-                * passing it up to netif_rx(). The transfer size
-                * (in bytes) specified by the adapter len field of
-                * the "struct rxp_hdr_t" does NOT include the
-                * "sizeof(struct c2_rxp_hdr)".
-                */
-               skb->data += sizeof(*rxp_hdr);
-               skb_set_tail_pointer(skb, buflen);
-               skb->len = buflen;
-               skb->protocol = eth_type_trans(skb, netdev);
-
-               netif_rx(skb);
-
-               netdev->stats.rx_packets++;
-               netdev->stats.rx_bytes += buflen;
-       }
-
-       /* Save where we left off */
-       rx_ring->to_clean = elem;
-       c2dev->cur_rx = elem - rx_ring->start;
-       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-       spin_unlock_irqrestore(&c2dev->lock, flags);
-}
-
-/*
- * Handle netisr0 TX & RX interrupts.
- */
-static irqreturn_t c2_interrupt(int irq, void *dev_id)
-{
-       unsigned int netisr0, dmaisr;
-       int handled = 0;
-       struct c2_dev *c2dev = dev_id;
-
-       /* Process CCILNET interrupts */
-       netisr0 = readl(c2dev->regs + C2_NISR0);
-       if (netisr0) {
-
-               /*
-                * There is an issue with the firmware that always
-                * provides the status of RX for both TX & RX
-                * interrupts.  So process both queues here.
-                */
-               c2_rx_interrupt(c2dev->netdev);
-               c2_tx_interrupt(c2dev->netdev);
-
-               /* Clear the interrupt */
-               writel(netisr0, c2dev->regs + C2_NISR0);
-               handled++;
-       }
-
-       /* Process RNIC interrupts */
-       dmaisr = readl(c2dev->regs + C2_DISR);
-       if (dmaisr) {
-               writel(dmaisr, c2dev->regs + C2_DISR);
-               c2_rnic_interrupt(c2dev);
-               handled++;
-       }
-
-       if (handled) {
-               return IRQ_HANDLED;
-       } else {
-               return IRQ_NONE;
-       }
-}
-
-static int c2_up(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_element *elem;
-       struct c2_rxp_hdr *rxp_hdr;
-       struct in_device *in_dev;
-       size_t rx_size, tx_size;
-       int ret, i;
-       unsigned int netimr0;
-
-       if (netif_msg_ifup(c2_port))
-               pr_debug("%s: enabling interface\n", netdev->name);
-
-       /* Set the Rx buffer size based on MTU */
-       c2_set_rxbufsize(c2_port);
-
-       /* Allocate DMA'able memory for Tx/Rx host descriptor rings */
-       rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
-       tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
-
-       c2_port->mem_size = tx_size + rx_size;
-       c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
-                                            &c2_port->dma);
-       if (c2_port->mem == NULL) {
-               pr_debug("Unable to allocate memory for "
-                       "host descriptor rings\n");
-               return -ENOMEM;
-       }
-
-       /* Create the Rx host descriptor ring */
-       if ((ret =
-            c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
-                             c2dev->mmio_rxp_ring))) {
-               pr_debug("Unable to create RX ring\n");
-               goto bail0;
-       }
-
-       /* Allocate Rx buffers for the host descriptor ring */
-       if (c2_rx_fill(c2_port)) {
-               pr_debug("Unable to fill RX ring\n");
-               goto bail1;
-       }
-
-       /* Create the Tx host descriptor ring */
-       if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
-                                   c2_port->dma + rx_size,
-                                   c2dev->mmio_txp_ring))) {
-               pr_debug("Unable to create TX ring\n");
-               goto bail1;
-       }
-
-       /* Set the TX pointer to where we left off */
-       c2_port->tx_avail = c2_port->tx_ring.count - 1;
-       c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
-           c2_port->tx_ring.start + c2dev->cur_tx;
-
-       /* missing: Initialize MAC */
-
-       BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
-
-       /* Reset the adapter, ensures the driver is in sync with the RXP */
-       c2_reset(c2_port);
-
-       /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
-       for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
-            i++, elem++) {
-               rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-               rxp_hdr->flags = 0;
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                            elem->hw_desc + C2_RXP_FLAGS);
-       }
-
-       /* Enable network packets */
-       netif_start_queue(netdev);
-
-       /* Enable IRQ */
-       writel(0, c2dev->regs + C2_IDIS);
-       netimr0 = readl(c2dev->regs + C2_NIMR0);
-       netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
-       writel(netimr0, c2dev->regs + C2_NIMR0);
-
-       /* Tell the stack to ignore arp requests for ipaddrs bound to
-        * other interfaces.  This is needed to prevent the host stack
-        * from responding to arp requests to the ipaddr bound on the
-        * rdma interface.
-        */
-       in_dev = in_dev_get(netdev);
-       IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
-       in_dev_put(in_dev);
-
-       return 0;
-
-bail1:
-       c2_rx_clean(c2_port);
-       kfree(c2_port->rx_ring.start);
-
-bail0:
-       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-                           c2_port->dma);
-
-       return ret;
-}
-
-static int c2_down(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-
-       if (netif_msg_ifdown(c2_port))
-               pr_debug("%s: disabling interface\n",
-                       netdev->name);
-
-       /* Wait for all the queued packets to get sent */
-       c2_tx_interrupt(netdev);
-
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-
-       /* Disable IRQs by clearing the interrupt mask */
-       writel(1, c2dev->regs + C2_IDIS);
-       writel(0, c2dev->regs + C2_NIMR0);
-
-       /* missing: Stop transmitter */
-
-       /* missing: Stop receiver */
-
-       /* Reset the adapter, ensures the driver is in sync with the RXP */
-       c2_reset(c2_port);
-
-       /* missing: Turn off LEDs here */
-
-       /* Free all buffers in the host descriptor rings */
-       c2_tx_clean(c2_port);
-       c2_rx_clean(c2_port);
-
-       /* Free the host descriptor rings */
-       kfree(c2_port->rx_ring.start);
-       kfree(c2_port->tx_ring.start);
-       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-                           c2_port->dma);
-
-       return 0;
-}
-
-static void c2_reset(struct c2_port *c2_port)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       unsigned int cur_rx = c2dev->cur_rx;
-
-       /* Tell the hardware to quiesce */
-       C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
-
-       /*
-        * The hardware will reset the C2_PCI_HRX_QUI bit once
-        * the RXP is quiesced.  Wait 2 seconds for this.
-        */
-       ssleep(2);
-
-       cur_rx = C2_GET_CUR_RX(c2dev);
-
-       if (cur_rx & C2_PCI_HRX_QUI)
-               pr_debug("c2_reset: failed to quiesce the hardware!\n");
-
-       cur_rx &= ~C2_PCI_HRX_QUI;
-
-       c2dev->cur_rx = cur_rx;
-
-       pr_debug("Current RX: %u\n", c2dev->cur_rx);
-}
-
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       dma_addr_t mapaddr;
-       u32 maplen;
-       unsigned long flags;
-       unsigned int i;
-
-       spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-       if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
-               netif_stop_queue(netdev);
-               spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-               pr_debug("%s: Tx ring full when queue awake!\n",
-                       netdev->name);
-               return NETDEV_TX_BUSY;
-       }
-
-       maplen = skb_headlen(skb);
-       mapaddr =
-           pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
-
-       elem = tx_ring->to_use;
-       elem->skb = skb;
-       elem->mapaddr = mapaddr;
-       elem->maplen = maplen;
-
-       /* Tell HW to xmit */
-       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
-                    elem->hw_desc + C2_TXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(maplen),
-                    elem->hw_desc + C2_TXP_LEN);
-       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-                    elem->hw_desc + C2_TXP_FLAGS);
-
-       netdev->stats.tx_packets++;
-       netdev->stats.tx_bytes += maplen;
-
-       /* Loop thru additional data fragments and queue them */
-       if (skb_shinfo(skb)->nr_frags) {
-               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                       const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-                       maplen = skb_frag_size(frag);
-                       mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
-                                                  0, maplen, DMA_TO_DEVICE);
-                       elem = elem->next;
-                       elem->skb = NULL;
-                       elem->mapaddr = mapaddr;
-                       elem->maplen = maplen;
-
-                       /* Tell HW to xmit */
-                       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
-                                    elem->hw_desc + C2_TXP_ADDR);
-                       __raw_writew((__force u16) cpu_to_be16(maplen),
-                                    elem->hw_desc + C2_TXP_LEN);
-                       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-                                    elem->hw_desc + C2_TXP_FLAGS);
-
-                       netdev->stats.tx_packets++;
-                       netdev->stats.tx_bytes += maplen;
-               }
-       }
-
-       tx_ring->to_use = elem->next;
-       c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
-
-       if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
-               netif_stop_queue(netdev);
-               if (netif_msg_tx_queued(c2_port))
-                       pr_debug("%s: transmit queue full\n",
-                               netdev->name);
-       }
-
-       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-       netdev->trans_start = jiffies;
-
-       return NETDEV_TX_OK;
-}
-
-static void c2_tx_timeout(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-
-       if (netif_msg_timer(c2_port))
-               pr_debug("%s: tx timeout\n", netdev->name);
-
-       c2_tx_clean(c2_port);
-}
-
-static int c2_change_mtu(struct net_device *netdev, int new_mtu)
-{
-       int ret = 0;
-
-       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-               return -EINVAL;
-
-       netdev->mtu = new_mtu;
-
-       if (netif_running(netdev)) {
-               c2_down(netdev);
-
-               c2_up(netdev);
-       }
-
-       return ret;
-}
-
-static const struct net_device_ops c2_netdev = {
-       .ndo_open               = c2_up,
-       .ndo_stop               = c2_down,
-       .ndo_start_xmit         = c2_xmit_frame,
-       .ndo_tx_timeout         = c2_tx_timeout,
-       .ndo_change_mtu         = c2_change_mtu,
-       .ndo_set_mac_address    = eth_mac_addr,
-       .ndo_validate_addr      = eth_validate_addr,
-};
-
-/* Initialize network device */
-static struct net_device *c2_devinit(struct c2_dev *c2dev,
-                                    void __iomem * mmio_addr)
-{
-       struct c2_port *c2_port = NULL;
-       struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
-
-       if (!netdev) {
-               pr_debug("c2_port etherdev alloc failed");
-               return NULL;
-       }
-
-       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-       netdev->netdev_ops = &c2_netdev;
-       netdev->watchdog_timeo = C2_TX_TIMEOUT;
-       netdev->irq = c2dev->pcidev->irq;
-
-       c2_port = netdev_priv(netdev);
-       c2_port->netdev = netdev;
-       c2_port->c2dev = c2dev;
-       c2_port->msg_enable = netif_msg_init(debug, default_msg);
-       c2_port->tx_ring.count = C2_NUM_TX_DESC;
-       c2_port->rx_ring.count = C2_NUM_RX_DESC;
-
-       spin_lock_init(&c2_port->tx_lock);
-
-       /* Copy our 48-bit ethernet hardware address */
-       memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
-
-       /* Validate the MAC address */
-       if (!is_valid_ether_addr(netdev->dev_addr)) {
-               pr_debug("Invalid MAC Address\n");
-               pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name,
-                        netdev->dev_addr, netdev->irq);
-               free_netdev(netdev);
-               return NULL;
-       }
-
-       c2dev->netdev = netdev;
-
-       return netdev;
-}
-
-static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
-{
-       int ret = 0, i;
-       unsigned long reg0_start, reg0_flags, reg0_len;
-       unsigned long reg2_start, reg2_flags, reg2_len;
-       unsigned long reg4_start, reg4_flags, reg4_len;
-       unsigned kva_map_size;
-       struct net_device *netdev = NULL;
-       struct c2_dev *c2dev = NULL;
-       void __iomem *mmio_regs = NULL;
-
-       printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
-               DRV_VERSION);
-
-       /* Enable PCI device */
-       ret = pci_enable_device(pcidev);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
-                       pci_name(pcidev));
-               goto bail0;
-       }
-
-       reg0_start = pci_resource_start(pcidev, BAR_0);
-       reg0_len = pci_resource_len(pcidev, BAR_0);
-       reg0_flags = pci_resource_flags(pcidev, BAR_0);
-
-       reg2_start = pci_resource_start(pcidev, BAR_2);
-       reg2_len = pci_resource_len(pcidev, BAR_2);
-       reg2_flags = pci_resource_flags(pcidev, BAR_2);
-
-       reg4_start = pci_resource_start(pcidev, BAR_4);
-       reg4_len = pci_resource_len(pcidev, BAR_4);
-       reg4_flags = pci_resource_flags(pcidev, BAR_4);
-
-       pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
-       pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
-       pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
-
-       /* Make sure PCI base addr are MMIO */
-       if (!(reg0_flags & IORESOURCE_MEM) ||
-           !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
-               printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
-               ret = -ENODEV;
-               goto bail1;
-       }
-
-       /* Check for weird/broken PCI region reporting */
-       if ((reg0_len < C2_REG0_SIZE) ||
-           (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
-               printk(KERN_ERR PFX "Invalid PCI region sizes\n");
-               ret = -ENODEV;
-               goto bail1;
-       }
-
-       /* Reserve PCI I/O and memory resources */
-       ret = pci_request_regions(pcidev, DRV_NAME);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: Unable to request regions\n",
-                       pci_name(pcidev));
-               goto bail1;
-       }
-
-       if ((sizeof(dma_addr_t) > 4)) {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "64b DMA configuration failed\n");
-                       goto bail2;
-               }
-       } else {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "32b DMA configuration failed\n");
-                       goto bail2;
-               }
-       }
-
-       /* Enables bus-mastering on the device */
-       pci_set_master(pcidev);
-
-       /* Remap the adapter PCI registers in BAR4 */
-       mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-                                   sizeof(struct c2_adapter_pci_regs));
-       if (!mmio_regs) {
-               printk(KERN_ERR PFX
-                       "Unable to remap adapter PCI registers in BAR4\n");
-               ret = -EIO;
-               goto bail2;
-       }
-
-       /* Validate PCI regs magic */
-       for (i = 0; i < sizeof(c2_magic); i++) {
-               if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
-                       printk(KERN_ERR PFX "Downlevel Firmware boot loader "
-                               "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
-                              "utility to update your boot loader\n",
-                               i + 1, sizeof(c2_magic),
-                               readb(mmio_regs + C2_REGS_MAGIC + i),
-                               c2_magic[i]);
-                       printk(KERN_ERR PFX "Adapter not claimed\n");
-                       iounmap(mmio_regs);
-                       ret = -EIO;
-                       goto bail2;
-               }
-       }
-
-       /* Validate the adapter version */
-       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
-               printk(KERN_ERR PFX "Version mismatch "
-                       "[fw=%u, c2=%u], Adapter not claimed\n",
-                       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
-                       C2_VERSION);
-               ret = -EINVAL;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       /* Validate the adapter IVN */
-       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
-               printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using "
-                      "the OpenIB device support kit. "
-                      "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
-                      C2_IVN);
-               ret = -EINVAL;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       /* Allocate hardware structure */
-       c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
-       if (!c2dev) {
-               printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
-                       pci_name(pcidev));
-               ret = -ENOMEM;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       memset(c2dev, 0, sizeof(*c2dev));
-       spin_lock_init(&c2dev->lock);
-       c2dev->pcidev = pcidev;
-       c2dev->cur_tx = 0;
-
-       /* Get the last RX index */
-       c2dev->cur_rx =
-           (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
-            0xffffc000) / sizeof(struct c2_rxp_desc);
-
-       /* Request an interrupt line for the driver */
-       ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
-                       pci_name(pcidev), pcidev->irq);
-               iounmap(mmio_regs);
-               goto bail3;
-       }
-
-       /* Set driver specific data */
-       pci_set_drvdata(pcidev, c2dev);
-
-       /* Initialize network device */
-       if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
-               ret = -ENOMEM;
-               iounmap(mmio_regs);
-               goto bail4;
-       }
-
-       /* Save off the actual size prior to unmapping mmio_regs */
-       kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
-
-       /* Unmap the adapter PCI registers in BAR4 */
-       iounmap(mmio_regs);
-
-       /* Register network device */
-       ret = register_netdev(netdev);
-       if (ret) {
-               printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
-                       ret);
-               goto bail5;
-       }
-
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-
-       /* Remap the adapter HRXDQ PA space to kernel VA space */
-       c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
-                                              C2_RXP_HRXDQ_SIZE);
-       if (!c2dev->mmio_rxp_ring) {
-               printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
-               ret = -EIO;
-               goto bail6;
-       }
-
-       /* Remap the adapter HTXDQ PA space to kernel VA space */
-       c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
-                                              C2_TXP_HTXDQ_SIZE);
-       if (!c2dev->mmio_txp_ring) {
-               printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
-               ret = -EIO;
-               goto bail7;
-       }
-
-       /* Save off the current RX index in the last 4 bytes of the TXP Ring */
-       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-       /* Remap the PCI registers in adapter BAR0 to kernel VA space */
-       c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
-       if (!c2dev->regs) {
-               printk(KERN_ERR PFX "Unable to remap BAR0\n");
-               ret = -EIO;
-               goto bail8;
-       }
-
-       /* Remap the PCI registers in adapter BAR4 to kernel VA space */
-       c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
-       c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-                                    kva_map_size);
-       if (!c2dev->kva) {
-               printk(KERN_ERR PFX "Unable to remap BAR4\n");
-               ret = -EIO;
-               goto bail9;
-       }
-
-       /* Print out the MAC address */
-       pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr,
-                netdev->irq);
-
-       ret = c2_rnic_init(c2dev);
-       if (ret) {
-               printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
-               goto bail10;
-       }
-
-       ret = c2_register_device(c2dev);
-       if (ret)
-               goto bail10;
-
-       return 0;
-
- bail10:
-       iounmap(c2dev->kva);
-
- bail9:
-       iounmap(c2dev->regs);
-
- bail8:
-       iounmap(c2dev->mmio_txp_ring);
-
- bail7:
-       iounmap(c2dev->mmio_rxp_ring);
-
- bail6:
-       unregister_netdev(netdev);
-
- bail5:
-       free_netdev(netdev);
-
- bail4:
-       free_irq(pcidev->irq, c2dev);
-
- bail3:
-       ib_dealloc_device(&c2dev->ibdev);
-
- bail2:
-       pci_release_regions(pcidev);
-
- bail1:
-       pci_disable_device(pcidev);
-
- bail0:
-       return ret;
-}
-
-static void c2_remove(struct pci_dev *pcidev)
-{
-       struct c2_dev *c2dev = pci_get_drvdata(pcidev);
-       struct net_device *netdev = c2dev->netdev;
-
-       /* Unregister with OpenIB */
-       c2_unregister_device(c2dev);
-
-       /* Clean up the RNIC resources */
-       c2_rnic_term(c2dev);
-
-       /* Remove network device from the kernel */
-       unregister_netdev(netdev);
-
-       /* Free network device */
-       free_netdev(netdev);
-
-       /* Free the interrupt line */
-       free_irq(pcidev->irq, c2dev);
-
-       /* missing: Turn LEDs off here */
-
-       /* Unmap adapter PA space */
-       iounmap(c2dev->kva);
-       iounmap(c2dev->regs);
-       iounmap(c2dev->mmio_txp_ring);
-       iounmap(c2dev->mmio_rxp_ring);
-
-       /* Free the hardware structure */
-       ib_dealloc_device(&c2dev->ibdev);
-
-       /* Release reserved PCI I/O and memory resources */
-       pci_release_regions(pcidev);
-
-       /* Disable PCI device */
-       pci_disable_device(pcidev);
-
-       /* Clear driver specific data */
-       pci_set_drvdata(pcidev, NULL);
-}
-
-static struct pci_driver c2_pci_driver = {
-       .name = DRV_NAME,
-       .id_table = c2_pci_table,
-       .probe = c2_probe,
-       .remove = c2_remove,
-};
-
-module_pci_driver(c2_pci_driver);
diff --git a/drivers/staging/rdma/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h
deleted file mode 100644 (file)
index 21b565a..0000000
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __C2_H
-#define __C2_H
-
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/idr.h>
-
-#include "c2_provider.h"
-#include "c2_mq.h"
-#include "c2_status.h"
-
-#define DRV_NAME     "c2"
-#define DRV_VERSION  "1.1"
-#define PFX          DRV_NAME ": "
-
-#define BAR_0                0
-#define BAR_2                2
-#define BAR_4                4
-
-#define RX_BUF_SIZE         (1536 + 8)
-#define ETH_JUMBO_MTU        9000
-#define C2_MAGIC            "CEPHEUS"
-#define C2_VERSION           4
-#define C2_IVN              (18 & 0x7fffffff)
-
-#define C2_REG0_SIZE        (16 * 1024)
-#define C2_REG2_SIZE        (2 * 1024 * 1024)
-#define C2_REG4_SIZE        (256 * 1024 * 1024)
-#define C2_NUM_TX_DESC       341
-#define C2_NUM_RX_DESC       256
-#define C2_PCI_REGS_OFFSET  (0x10000)
-#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
-#define C2_RXP_HRXDQ_SIZE   (4096)
-#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
-#define C2_TXP_HTXDQ_SIZE   (4096)
-#define C2_TX_TIMEOUT      (6*HZ)
-
-/* CEPHEUS */
-static const u8 c2_magic[] = {
-       0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
-};
-
-enum adapter_pci_regs {
-       C2_REGS_MAGIC = 0x0000,
-       C2_REGS_VERS = 0x0008,
-       C2_REGS_IVN = 0x000C,
-       C2_REGS_PCI_WINSIZE = 0x0010,
-       C2_REGS_Q0_QSIZE = 0x0014,
-       C2_REGS_Q0_MSGSIZE = 0x0018,
-       C2_REGS_Q0_POOLSTART = 0x001C,
-       C2_REGS_Q0_SHARED = 0x0020,
-       C2_REGS_Q1_QSIZE = 0x0024,
-       C2_REGS_Q1_MSGSIZE = 0x0028,
-       C2_REGS_Q1_SHARED = 0x0030,
-       C2_REGS_Q2_QSIZE = 0x0034,
-       C2_REGS_Q2_MSGSIZE = 0x0038,
-       C2_REGS_Q2_SHARED = 0x0040,
-       C2_REGS_ENADDR = 0x004C,
-       C2_REGS_RDMA_ENADDR = 0x0054,
-       C2_REGS_HRX_CUR = 0x006C,
-};
-
-struct c2_adapter_pci_regs {
-       char reg_magic[8];
-       u32 version;
-       u32 ivn;
-       u32 pci_window_size;
-       u32 q0_q_size;
-       u32 q0_msg_size;
-       u32 q0_pool_start;
-       u32 q0_shared;
-       u32 q1_q_size;
-       u32 q1_msg_size;
-       u32 q1_pool_start;
-       u32 q1_shared;
-       u32 q2_q_size;
-       u32 q2_msg_size;
-       u32 q2_pool_start;
-       u32 q2_shared;
-       u32 log_start;
-       u32 log_size;
-       u8 host_enaddr[8];
-       u8 rdma_enaddr[8];
-       u32 crash_entry;
-       u32 crash_ready[2];
-       u32 fw_txd_cur;
-       u32 fw_hrxd_cur;
-       u32 fw_rxd_cur;
-};
-
-enum pci_regs {
-       C2_HISR = 0x0000,
-       C2_DISR = 0x0004,
-       C2_HIMR = 0x0008,
-       C2_DIMR = 0x000C,
-       C2_NISR0 = 0x0010,
-       C2_NISR1 = 0x0014,
-       C2_NIMR0 = 0x0018,
-       C2_NIMR1 = 0x001C,
-       C2_IDIS = 0x0020,
-};
-
-enum {
-       C2_PCI_HRX_INT = 1 << 8,
-       C2_PCI_HTX_INT = 1 << 17,
-       C2_PCI_HRX_QUI = 1 << 31,
-};
-
-/*
- * Cepheus registers in BAR0.
- */
-struct c2_pci_regs {
-       u32 hostisr;
-       u32 dmaisr;
-       u32 hostimr;
-       u32 dmaimr;
-       u32 netisr0;
-       u32 netisr1;
-       u32 netimr0;
-       u32 netimr1;
-       u32 int_disable;
-};
-
-/* TXP flags */
-enum c2_txp_flags {
-       TXP_HTXD_DONE = 0,
-       TXP_HTXD_READY = 1 << 0,
-       TXP_HTXD_UNINIT = 1 << 1,
-};
-
-/* RXP flags */
-enum c2_rxp_flags {
-       RXP_HRXD_UNINIT = 0,
-       RXP_HRXD_READY = 1 << 0,
-       RXP_HRXD_DONE = 1 << 1,
-};
-
-/* RXP status */
-enum c2_rxp_status {
-       RXP_HRXD_ZERO = 0,
-       RXP_HRXD_OK = 1 << 0,
-       RXP_HRXD_BUF_OV = 1 << 1,
-};
-
-/* TXP descriptor fields */
-enum txp_desc {
-       C2_TXP_FLAGS = 0x0000,
-       C2_TXP_LEN = 0x0002,
-       C2_TXP_ADDR = 0x0004,
-};
-
-/* RXP descriptor fields */
-enum rxp_desc {
-       C2_RXP_FLAGS = 0x0000,
-       C2_RXP_STATUS = 0x0002,
-       C2_RXP_COUNT = 0x0004,
-       C2_RXP_LEN = 0x0006,
-       C2_RXP_ADDR = 0x0008,
-};
-
-struct c2_txp_desc {
-       u16 flags;
-       u16 len;
-       u64 addr;
-} __attribute__ ((packed));
-
-struct c2_rxp_desc {
-       u16 flags;
-       u16 status;
-       u16 count;
-       u16 len;
-       u64 addr;
-} __attribute__ ((packed));
-
-struct c2_rxp_hdr {
-       u16 flags;
-       u16 status;
-       u16 len;
-       u16 rsvd;
-} __attribute__ ((packed));
-
-struct c2_tx_desc {
-       u32 len;
-       u32 status;
-       dma_addr_t next_offset;
-};
-
-struct c2_rx_desc {
-       u32 len;
-       u32 status;
-       dma_addr_t next_offset;
-};
-
-struct c2_alloc {
-       u32 last;
-       u32 max;
-       spinlock_t lock;
-       unsigned long *table;
-};
-
-struct c2_array {
-       struct {
-               void **page;
-               int used;
-       } *page_list;
-};
-
-/*
- * The MQ shared pointer pool is organized as a linked list of
- * chunks. Each chunk contains a linked list of free shared pointers
- * that can be allocated to a given user mode client.
- *
- */
-struct sp_chunk {
-       struct sp_chunk *next;
-       dma_addr_t dma_addr;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-       u16 head;
-       u16 shared_ptr[0];
-};
-
-struct c2_pd_table {
-       u32 last;
-       u32 max;
-       spinlock_t lock;
-       unsigned long *table;
-};
-
-struct c2_qp_table {
-       struct idr idr;
-       spinlock_t lock;
-};
-
-struct c2_element {
-       struct c2_element *next;
-       void *ht_desc;          /* host     descriptor */
-       void __iomem *hw_desc;  /* hardware descriptor */
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen;
-};
-
-struct c2_ring {
-       struct c2_element *to_clean;
-       struct c2_element *to_use;
-       struct c2_element *start;
-       unsigned long count;
-};
-
-struct c2_dev {
-       struct ib_device ibdev;
-       void __iomem *regs;
-       void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
-       void __iomem *mmio_rxp_ring;
-       spinlock_t lock;
-       struct pci_dev *pcidev;
-       struct net_device *netdev;
-       struct net_device *pseudo_netdev;
-       unsigned int cur_tx;
-       unsigned int cur_rx;
-       u32 adapter_handle;
-       int device_cap_flags;
-       void __iomem *kva;      /* KVA device memory */
-       unsigned long pa;       /* PA device memory */
-       void **qptr_array;
-
-       struct kmem_cache *host_msg_cache;
-
-       struct list_head cca_link;              /* adapter list */
-       struct list_head eh_wakeup_list;        /* event wakeup list */
-       wait_queue_head_t req_vq_wo;
-
-       /* Cached RNIC properties */
-       struct ib_device_attr props;
-
-       struct c2_pd_table pd_table;
-       struct c2_qp_table qp_table;
-       int ports;              /* num of GigE ports */
-       int devnum;
-       spinlock_t vqlock;      /* sync vbs req MQ */
-
-       /* Verbs Queues */
-       struct c2_mq req_vq;    /* Verbs Request MQ */
-       struct c2_mq rep_vq;    /* Verbs Reply MQ */
-       struct c2_mq aeq;       /* Async Events MQ */
-
-       /* Kernel client MQs */
-       struct sp_chunk *kern_mqsp_pool;
-
-       /* Device updates these values when posting messages to a host
-        * target queue */
-       u16 req_vq_shared;
-       u16 rep_vq_shared;
-       u16 aeq_shared;
-       u16 irq_claimed;
-
-       /*
-        * Shared host target pages for user-accessible MQs.
-        */
-       int hthead;             /* index of first free entry */
-       void *htpages;          /* kernel vaddr */
-       int htlen;              /* length of htpages memory */
-       void *htuva;            /* user mapped vaddr */
-       spinlock_t htlock;      /* serialize allocation */
-
-       u64 adapter_hint_uva;   /* access to the activity FIFO */
-
-       //      spinlock_t aeq_lock;
-       //      spinlock_t rnic_lock;
-
-       __be16 *hint_count;
-       dma_addr_t hint_count_dma;
-       u16 hints_read;
-
-       int init;               /* TRUE if it's ready */
-       char ae_cache_name[16];
-       char vq_cache_name[16];
-};
-
-struct c2_port {
-       u32 msg_enable;
-       struct c2_dev *c2dev;
-       struct net_device *netdev;
-
-       spinlock_t tx_lock;
-       u32 tx_avail;
-       struct c2_ring tx_ring;
-       struct c2_ring rx_ring;
-
-       void *mem;              /* PCI memory for host rings */
-       dma_addr_t dma;
-       unsigned long mem_size;
-
-       u32 rx_buf_size;
-};
-
-/*
- * Activity FIFO registers in BAR0.
- */
-#define PCI_BAR0_HOST_HINT     0x100
-#define PCI_BAR0_ADAPTER_HINT  0x2000
-
-/*
- * Ammasso PCI vendor id and Cepheus PCI device id.
- */
-#define CQ_ARMED       0x01
-#define CQ_WAIT_FOR_DMA        0x80
-
-/*
- * The format of a hint is as follows:
- * Lower 16 bits are the count of hints for the queue.
- * Next 15 bits are the qp_index
- * Upper most bit depends on who reads it:
- *    If read by producer, then it means Full (1) or Not-Full (0)
- *    If read by consumer, then it means Empty (1) or Not-Empty (0)
- */
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
-
-/*
- * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
- * struct.
- */
-#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
-
-#ifndef readq
-static inline u64 readq(const void __iomem * addr)
-{
-       u64 ret = readl(addr + 4);
-       ret <<= 32;
-       ret |= readl(addr);
-
-       return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void __raw_writeq(u64 val, void __iomem * addr)
-{
-       __raw_writel((u32) (val), addr);
-       __raw_writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
-#define C2_SET_CUR_RX(c2dev, cur_rx) \
-       __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
-
-#define C2_GET_CUR_RX(c2dev) \
-       be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
-
-static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
-{
-       return container_of(ibdev, struct c2_dev, ibdev);
-}
-
-static inline int c2_errno(void *reply)
-{
-       switch (c2_wr_get_result(reply)) {
-       case C2_OK:
-               return 0;
-       case CCERR_NO_BUFS:
-       case CCERR_INSUFFICIENT_RESOURCES:
-       case CCERR_ZERO_RDMA_READ_RESOURCES:
-               return -ENOMEM;
-       case CCERR_MR_IN_USE:
-       case CCERR_QP_IN_USE:
-               return -EBUSY;
-       case CCERR_ADDR_IN_USE:
-               return -EADDRINUSE;
-       case CCERR_ADDR_NOT_AVAIL:
-               return -EADDRNOTAVAIL;
-       case CCERR_CONN_RESET:
-               return -ECONNRESET;
-       case CCERR_NOT_IMPLEMENTED:
-       case CCERR_INVALID_WQE:
-               return -ENOSYS;
-       case CCERR_QP_NOT_PRIVILEGED:
-               return -EPERM;
-       case CCERR_STACK_ERROR:
-               return -EPROTO;
-       case CCERR_ACCESS_VIOLATION:
-       case CCERR_BASE_AND_BOUNDS_VIOLATION:
-               return -EFAULT;
-       case CCERR_STAG_STATE_NOT_INVALID:
-       case CCERR_INVALID_ADDRESS:
-       case CCERR_INVALID_CQ:
-       case CCERR_INVALID_EP:
-       case CCERR_INVALID_MODIFIER:
-       case CCERR_INVALID_MTU:
-       case CCERR_INVALID_PD_ID:
-       case CCERR_INVALID_QP:
-       case CCERR_INVALID_RNIC:
-       case CCERR_INVALID_STAG:
-               return -EINVAL;
-       default:
-               return -EAGAIN;
-       }
-}
-
-/* Device */
-int c2_register_device(struct c2_dev *c2dev);
-void c2_unregister_device(struct c2_dev *c2dev);
-int c2_rnic_init(struct c2_dev *c2dev);
-void c2_rnic_term(struct c2_dev *c2dev);
-void c2_rnic_interrupt(struct c2_dev *c2dev);
-int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-
-/* QPs */
-int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
-                      struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
-void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
-struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
-int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-                       struct ib_qp_attr *attr, int attr_mask);
-int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-                                int ord, int ird);
-int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-                       struct ib_send_wr **bad_wr);
-int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-                          struct ib_recv_wr **bad_wr);
-void c2_init_qp_table(struct c2_dev *c2dev);
-void c2_cleanup_qp_table(struct c2_dev *c2dev);
-void c2_set_qp_state(struct c2_qp *, int);
-struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
-
-/* PDs */
-int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
-void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
-int c2_init_pd_table(struct c2_dev *c2dev);
-void c2_cleanup_pd_table(struct c2_dev *c2dev);
-
-/* CQs */
-int c2_init_cq(struct c2_dev *c2dev, int entries,
-                     struct c2_ucontext *ctx, struct c2_cq *cq);
-void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
-void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
-void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
-int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
-
-/* CM */
-int c2_llp_connect(struct iw_cm_id *cm_id,
-                         struct iw_cm_conn_param *iw_param);
-int c2_llp_accept(struct iw_cm_id *cm_id,
-                        struct iw_cm_conn_param *iw_param);
-int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
-                        u8 pdata_len);
-int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
-int c2_llp_service_destroy(struct iw_cm_id *cm_id);
-
-/* MM */
-int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
-                                     int page_size, int pbl_depth, u32 length,
-                                     u32 off, u64 *va, enum c2_acf acf,
-                                     struct c2_mr *mr);
-int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
-
-/* AE */
-void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
-
-/* MQSP Allocator */
-int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-                            struct sp_chunk **root);
-void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
-__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-                            dma_addr_t *dma_addr, gfp_t gfp_mask);
-void c2_free_mqsp(__be16* mqsp);
-#endif
diff --git a/drivers/staging/rdma/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c
deleted file mode 100644 (file)
index eb7a92b..0000000
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_status.h"
-#include "c2_ae.h"
-
-static int c2_convert_cm_status(u32 c2_status)
-{
-       switch (c2_status) {
-       case C2_CONN_STATUS_SUCCESS:
-               return 0;
-       case C2_CONN_STATUS_REJECTED:
-               return -ENETRESET;
-       case C2_CONN_STATUS_REFUSED:
-               return -ECONNREFUSED;
-       case C2_CONN_STATUS_TIMEDOUT:
-               return -ETIMEDOUT;
-       case C2_CONN_STATUS_NETUNREACH:
-               return -ENETUNREACH;
-       case C2_CONN_STATUS_HOSTUNREACH:
-               return -EHOSTUNREACH;
-       case C2_CONN_STATUS_INVALID_RNIC:
-               return -EINVAL;
-       case C2_CONN_STATUS_INVALID_QP:
-               return -EINVAL;
-       case C2_CONN_STATUS_INVALID_QP_STATE:
-               return -EINVAL;
-       case C2_CONN_STATUS_ADDR_NOT_AVAIL:
-               return -EADDRNOTAVAIL;
-       default:
-               printk(KERN_ERR PFX
-                      "%s - Unable to convert CM status: %d\n",
-                      __func__, c2_status);
-               return -EIO;
-       }
-}
-
-static const char* to_event_str(int event)
-{
-       static const char* event_str[] = {
-               "CCAE_REMOTE_SHUTDOWN",
-               "CCAE_ACTIVE_CONNECT_RESULTS",
-               "CCAE_CONNECTION_REQUEST",
-               "CCAE_LLP_CLOSE_COMPLETE",
-               "CCAE_TERMINATE_MESSAGE_RECEIVED",
-               "CCAE_LLP_CONNECTION_RESET",
-               "CCAE_LLP_CONNECTION_LOST",
-               "CCAE_LLP_SEGMENT_SIZE_INVALID",
-               "CCAE_LLP_INVALID_CRC",
-               "CCAE_LLP_BAD_FPDU",
-               "CCAE_INVALID_DDP_VERSION",
-               "CCAE_INVALID_RDMA_VERSION",
-               "CCAE_UNEXPECTED_OPCODE",
-               "CCAE_INVALID_DDP_QUEUE_NUMBER",
-               "CCAE_RDMA_READ_NOT_ENABLED",
-               "CCAE_RDMA_WRITE_NOT_ENABLED",
-               "CCAE_RDMA_READ_TOO_SMALL",
-               "CCAE_NO_L_BIT",
-               "CCAE_TAGGED_INVALID_STAG",
-               "CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
-               "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
-               "CCAE_TAGGED_INVALID_PD",
-               "CCAE_WRAP_ERROR",
-               "CCAE_BAD_CLOSE",
-               "CCAE_BAD_LLP_CLOSE",
-               "CCAE_INVALID_MSN_RANGE",
-               "CCAE_INVALID_MSN_GAP",
-               "CCAE_IRRQ_OVERFLOW",
-               "CCAE_IRRQ_MSN_GAP",
-               "CCAE_IRRQ_MSN_RANGE",
-               "CCAE_IRRQ_INVALID_STAG",
-               "CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
-               "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
-               "CCAE_IRRQ_INVALID_PD",
-               "CCAE_IRRQ_WRAP_ERROR",
-               "CCAE_CQ_SQ_COMPLETION_OVERFLOW",
-               "CCAE_CQ_RQ_COMPLETION_ERROR",
-               "CCAE_QP_SRQ_WQE_ERROR",
-               "CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
-               "CCAE_CQ_OVERFLOW",
-               "CCAE_CQ_OPERATION_ERROR",
-               "CCAE_SRQ_LIMIT_REACHED",
-               "CCAE_QP_RQ_LIMIT_REACHED",
-               "CCAE_SRQ_CATASTROPHIC_ERROR",
-               "CCAE_RNIC_CATASTROPHIC_ERROR"
-       };
-
-       if (event < CCAE_REMOTE_SHUTDOWN ||
-           event > CCAE_RNIC_CATASTROPHIC_ERROR)
-               return "<invalid event>";
-
-       event -= CCAE_REMOTE_SHUTDOWN;
-       return event_str[event];
-}
-
-static const char *to_qp_state_str(int state)
-{
-       switch (state) {
-       case C2_QP_STATE_IDLE:
-               return "C2_QP_STATE_IDLE";
-       case C2_QP_STATE_CONNECTING:
-               return "C2_QP_STATE_CONNECTING";
-       case C2_QP_STATE_RTS:
-               return "C2_QP_STATE_RTS";
-       case C2_QP_STATE_CLOSING:
-               return "C2_QP_STATE_CLOSING";
-       case C2_QP_STATE_TERMINATE:
-               return "C2_QP_STATE_TERMINATE";
-       case C2_QP_STATE_ERROR:
-               return "C2_QP_STATE_ERROR";
-       default:
-               return "<invalid QP state>";
-       }
-}
-
-void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
-{
-       struct c2_mq *mq = c2dev->qptr_array[mq_index];
-       union c2wr *wr;
-       void *resource_user_context;
-       struct iw_cm_event cm_event;
-       struct ib_event ib_event;
-       enum c2_resource_indicator resource_indicator;
-       enum c2_event_id event_id;
-       unsigned long flags;
-       int status;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
-
-       /*
-        * retrieve the message
-        */
-       wr = c2_mq_consume(mq);
-       if (!wr)
-               return;
-
-       memset(&ib_event, 0, sizeof(ib_event));
-       memset(&cm_event, 0, sizeof(cm_event));
-
-       event_id = c2_wr_get_id(wr);
-       resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
-       resource_user_context =
-           (void *) (unsigned long) wr->ae.ae_generic.user_context;
-
-       status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
-
-       pr_debug("event received c2_dev=%p, event_id=%d, "
-               "resource_indicator=%d, user_context=%p, status = %d\n",
-               c2dev, event_id, resource_indicator, resource_user_context,
-               status);
-
-       switch (resource_indicator) {
-       case C2_RES_IND_QP:{
-
-               struct c2_qp *qp = resource_user_context;
-               struct iw_cm_id *cm_id = qp->cm_id;
-               struct c2wr_ae_active_connect_results *res;
-
-               if (!cm_id) {
-                       pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
-                               qp);
-                       goto ignore_it;
-               }
-               pr_debug("%s: event = %s, user_context=%llx, "
-                       "resource_type=%x, "
-                       "resource=%x, qp_state=%s\n",
-                       __func__,
-                       to_event_str(event_id),
-                       (unsigned long long) wr->ae.ae_generic.user_context,
-                       be32_to_cpu(wr->ae.ae_generic.resource_type),
-                       be32_to_cpu(wr->ae.ae_generic.resource),
-                       to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
-
-               c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
-
-               switch (event_id) {
-               case CCAE_ACTIVE_CONNECT_RESULTS:
-                       res = &wr->ae.ae_active_connect_results;
-                       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-                       laddr->sin_addr.s_addr = res->laddr;
-                       raddr->sin_addr.s_addr = res->raddr;
-                       laddr->sin_port = res->lport;
-                       raddr->sin_port = res->rport;
-                       if (status == 0) {
-                               cm_event.private_data_len =
-                                       be32_to_cpu(res->private_data_length);
-                               cm_event.private_data = res->private_data;
-                       } else {
-                               spin_lock_irqsave(&qp->lock, flags);
-                               if (qp->cm_id) {
-                                       qp->cm_id->rem_ref(qp->cm_id);
-                                       qp->cm_id = NULL;
-                               }
-                               spin_unlock_irqrestore(&qp->lock, flags);
-                               cm_event.private_data_len = 0;
-                               cm_event.private_data = NULL;
-                       }
-                       if (cm_id->event_handler)
-                               cm_id->event_handler(cm_id, &cm_event);
-                       break;
-               case CCAE_TERMINATE_MESSAGE_RECEIVED:
-               case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
-                       ib_event.device = &c2dev->ibdev;
-                       ib_event.element.qp = &qp->ibqp;
-                       ib_event.event = IB_EVENT_QP_REQ_ERR;
-
-                       if (qp->ibqp.event_handler)
-                               qp->ibqp.event_handler(&ib_event,
-                                                      qp->ibqp.
-                                                      qp_context);
-                       break;
-               case CCAE_BAD_CLOSE:
-               case CCAE_LLP_CLOSE_COMPLETE:
-               case CCAE_LLP_CONNECTION_RESET:
-               case CCAE_LLP_CONNECTION_LOST:
-                       BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
-
-                       spin_lock_irqsave(&qp->lock, flags);
-                       if (qp->cm_id) {
-                               qp->cm_id->rem_ref(qp->cm_id);
-                               qp->cm_id = NULL;
-                       }
-                       spin_unlock_irqrestore(&qp->lock, flags);
-                       cm_event.event = IW_CM_EVENT_CLOSE;
-                       cm_event.status = 0;
-                       if (cm_id->event_handler)
-                               cm_id->event_handler(cm_id, &cm_event);
-                       break;
-               default:
-                       BUG_ON(1);
-                       pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
-                               "CM_ID=%p\n",
-                               __func__, __LINE__,
-                               event_id, qp, cm_id);
-                       break;
-               }
-               break;
-       }
-
-       case C2_RES_IND_EP:{
-
-               struct c2wr_ae_connection_request *req =
-                       &wr->ae.ae_connection_request;
-               struct iw_cm_id *cm_id =
-                       resource_user_context;
-
-               pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
-               if (event_id != CCAE_CONNECTION_REQUEST) {
-                       pr_debug("%s: Invalid event_id: %d\n",
-                               __func__, event_id);
-                       break;
-               }
-               cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
-               cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
-               laddr->sin_addr.s_addr = req->laddr;
-               raddr->sin_addr.s_addr = req->raddr;
-               laddr->sin_port = req->lport;
-               raddr->sin_port = req->rport;
-               cm_event.private_data_len =
-                       be32_to_cpu(req->private_data_length);
-               cm_event.private_data = req->private_data;
-               /*
-                * Until ird/ord negotiation via MPAv2 support is added, send
-                * max supported values
-                */
-               cm_event.ird = cm_event.ord = 128;
-
-               if (cm_id->event_handler)
-                       cm_id->event_handler(cm_id, &cm_event);
-               break;
-       }
-
-       case C2_RES_IND_CQ:{
-               struct c2_cq *cq =
-                   resource_user_context;
-
-               pr_debug("IB_EVENT_CQ_ERR\n");
-               ib_event.device = &c2dev->ibdev;
-               ib_event.element.cq = &cq->ibcq;
-               ib_event.event = IB_EVENT_CQ_ERR;
-
-               if (cq->ibcq.event_handler)
-                       cq->ibcq.event_handler(&ib_event,
-                                              cq->ibcq.cq_context);
-               break;
-       }
-
-       default:
-               printk("Bad resource indicator = %d\n",
-                      resource_indicator);
-               break;
-       }
-
- ignore_it:
-       c2_mq_free(mq);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h
deleted file mode 100644 (file)
index 3a065c3..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_AE_H_
-#define _C2_AE_H_
-
-/*
- * WARNING: If you change this file, also bump C2_IVN_BASE
- * in common/include/clustercore/c2_ivn.h.
- */
-
-/*
- * Asynchronous Event Identifiers
- *
- * These start at 0x80 only so it's obvious from inspection that
- * they are not work-request statuses.  This isn't critical.
- *
- * NOTE: these event id's must fit in eight bits.
- */
-enum c2_event_id {
-       CCAE_REMOTE_SHUTDOWN = 0x80,
-       CCAE_ACTIVE_CONNECT_RESULTS,
-       CCAE_CONNECTION_REQUEST,
-       CCAE_LLP_CLOSE_COMPLETE,
-       CCAE_TERMINATE_MESSAGE_RECEIVED,
-       CCAE_LLP_CONNECTION_RESET,
-       CCAE_LLP_CONNECTION_LOST,
-       CCAE_LLP_SEGMENT_SIZE_INVALID,
-       CCAE_LLP_INVALID_CRC,
-       CCAE_LLP_BAD_FPDU,
-       CCAE_INVALID_DDP_VERSION,
-       CCAE_INVALID_RDMA_VERSION,
-       CCAE_UNEXPECTED_OPCODE,
-       CCAE_INVALID_DDP_QUEUE_NUMBER,
-       CCAE_RDMA_READ_NOT_ENABLED,
-       CCAE_RDMA_WRITE_NOT_ENABLED,
-       CCAE_RDMA_READ_TOO_SMALL,
-       CCAE_NO_L_BIT,
-       CCAE_TAGGED_INVALID_STAG,
-       CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
-       CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
-       CCAE_TAGGED_INVALID_PD,
-       CCAE_WRAP_ERROR,
-       CCAE_BAD_CLOSE,
-       CCAE_BAD_LLP_CLOSE,
-       CCAE_INVALID_MSN_RANGE,
-       CCAE_INVALID_MSN_GAP,
-       CCAE_IRRQ_OVERFLOW,
-       CCAE_IRRQ_MSN_GAP,
-       CCAE_IRRQ_MSN_RANGE,
-       CCAE_IRRQ_INVALID_STAG,
-       CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
-       CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
-       CCAE_IRRQ_INVALID_PD,
-       CCAE_IRRQ_WRAP_ERROR,
-       CCAE_CQ_SQ_COMPLETION_OVERFLOW,
-       CCAE_CQ_RQ_COMPLETION_ERROR,
-       CCAE_QP_SRQ_WQE_ERROR,
-       CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
-       CCAE_CQ_OVERFLOW,
-       CCAE_CQ_OPERATION_ERROR,
-       CCAE_SRQ_LIMIT_REACHED,
-       CCAE_QP_RQ_LIMIT_REACHED,
-       CCAE_SRQ_CATASTROPHIC_ERROR,
-       CCAE_RNIC_CATASTROPHIC_ERROR
-/* WARNING If you add more id's, make sure their values fit in eight bits. */
-};
-
-/*
- * Resource Indicators and Identifiers
- */
-enum c2_resource_indicator {
-       C2_RES_IND_QP = 1,
-       C2_RES_IND_EP,
-       C2_RES_IND_CQ,
-       C2_RES_IND_SRQ,
-};
-
-#endif /* _C2_AE_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c
deleted file mode 100644 (file)
index 039872d..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/errno.h>
-#include <linux/bitmap.h>
-
-#include "c2.h"
-
-static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
-                              struct sp_chunk **head)
-{
-       int i;
-       struct sp_chunk *new_head;
-       dma_addr_t dma_addr;
-
-       new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
-                                     &dma_addr, gfp_mask);
-       if (new_head == NULL)
-               return -ENOMEM;
-
-       new_head->dma_addr = dma_addr;
-       dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
-
-       new_head->next = NULL;
-       new_head->head = 0;
-
-       /* build list where each index is the next free slot */
-       for (i = 0;
-            i < (PAGE_SIZE - sizeof(struct sp_chunk) -
-                 sizeof(u16)) / sizeof(u16) - 1;
-            i++) {
-               new_head->shared_ptr[i] = i + 1;
-       }
-       /* terminate list */
-       new_head->shared_ptr[i] = 0xFFFF;
-
-       *head = new_head;
-       return 0;
-}
-
-int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-                     struct sp_chunk **root)
-{
-       return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
-}
-
-void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
-{
-       struct sp_chunk *next;
-
-       while (root) {
-               next = root->next;
-               dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
-                                 dma_unmap_addr(root, mapping));
-               root = next;
-       }
-}
-
-__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-                     dma_addr_t *dma_addr, gfp_t gfp_mask)
-{
-       u16 mqsp;
-
-       while (head) {
-               mqsp = head->head;
-               if (mqsp != 0xFFFF) {
-                       head->head = head->shared_ptr[mqsp];
-                       break;
-               } else if (head->next == NULL) {
-                       if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
-                           0) {
-                               head = head->next;
-                               mqsp = head->head;
-                               head->head = head->shared_ptr[mqsp];
-                               break;
-                       } else
-                               return NULL;
-               } else
-                       head = head->next;
-       }
-       if (head) {
-               *dma_addr = head->dma_addr +
-                           ((unsigned long) &(head->shared_ptr[mqsp]) -
-                            (unsigned long) head);
-               pr_debug("%s addr %p dma_addr %llx\n", __func__,
-                        &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
-               return (__force __be16 *) &(head->shared_ptr[mqsp]);
-       }
-       return NULL;
-}
-
-void c2_free_mqsp(__be16 *mqsp)
-{
-       struct sp_chunk *head;
-       u16 idx;
-
-       /* The chunk containing this ptr begins at the page boundary */
-       head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
-
-       /* Link head to new mqsp */
-       *mqsp = (__force __be16) head->head;
-
-       /* Compute the shared_ptr index */
-       idx = (offset_in_page(mqsp)) >> 1;
-       idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
-
-       /* Point this index at the head */
-       head->shared_ptr[idx] = head->head;
-
-       /* Point head at this index */
-       head->head = idx;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c
deleted file mode 100644 (file)
index f8dbdb9..0000000
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_vq.h"
-#include <rdma/iw_cm.h>
-
-int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       struct c2_dev *c2dev = to_c2dev(cm_id->device);
-       struct ib_qp *ibqp;
-       struct c2_qp *qp;
-       struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */
-       struct c2_vq_req *vq_req;
-       int err;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
-
-       if (cm_id->remote_addr.ss_family != AF_INET)
-               return -ENOSYS;
-
-       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-       qp = to_c2qp(ibqp);
-
-       /* Associate QP <--> CM_ID */
-       cm_id->provider_data = qp;
-       cm_id->add_ref(cm_id);
-       qp->cm_id = cm_id;
-
-       /*
-        * only support the max private_data length
-        */
-       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-               err = -EINVAL;
-               goto bail0;
-       }
-       /*
-        * Set the rdma read limits
-        */
-       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-       if (err)
-               goto bail0;
-
-       /*
-        * Create and send a WR_QP_CONNECT...
-        */
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       c2_wr_set_id(wr, CCWR_QP_CONNECT);
-       wr->hdr.context = 0;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->qp_handle = qp->adapter_handle;
-
-       wr->remote_addr = raddr->sin_addr.s_addr;
-       wr->remote_port = raddr->sin_port;
-
-       /*
-        * Move any private data from the callers's buf into
-        * the WR.
-        */
-       if (iw_param->private_data) {
-               wr->private_data_length =
-                       cpu_to_be32(iw_param->private_data_len);
-               memcpy(&wr->private_data[0], iw_param->private_data,
-                      iw_param->private_data_len);
-       } else
-               wr->private_data_length = 0;
-
-       /*
-        * Send WR to adapter.  NOTE: There is no synch reply from
-        * the adapter.
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       vq_req_free(c2dev, vq_req);
-
- bail1:
-       kfree(wr);
- bail0:
-       if (err) {
-               /*
-                * If we fail, release reference on QP and
-                * disassociate QP from CM_ID
-                */
-               cm_id->provider_data = NULL;
-               qp->cm_id = NULL;
-               cm_id->rem_ref(cm_id);
-       }
-       return err;
-}
-
-int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-       struct c2_dev *c2dev;
-       struct c2wr_ep_listen_create_req wr;
-       struct c2wr_ep_listen_create_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
-
-       if (cm_id->local_addr.ss_family != AF_INET)
-               return -ENOSYS;
-
-       c2dev = to_c2dev(cm_id->device);
-       if (c2dev == NULL)
-               return -EINVAL;
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
-       wr.hdr.context = (u64) (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.local_addr = laddr->sin_addr.s_addr;
-       wr.local_port = laddr->sin_port;
-       wr.backlog = cpu_to_be32(backlog);
-       wr.user_context = (u64) (unsigned long) cm_id;
-
-       /*
-        * Reference the request struct.  Dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply =
-           (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       if ((err = c2_errno(reply)) != 0)
-               goto bail1;
-
-       /*
-        * Keep the adapter handle. Used in subsequent destroy
-        */
-       cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
-
-       /*
-        * free vq stuff
-        */
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       return 0;
-
- bail1:
-       vq_repbuf_free(c2dev, reply);
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-
-int c2_llp_service_destroy(struct iw_cm_id *cm_id)
-{
-
-       struct c2_dev *c2dev;
-       struct c2wr_ep_listen_destroy_req wr;
-       struct c2wr_ep_listen_destroy_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       c2dev = to_c2dev(cm_id->device);
-       if (c2dev == NULL)
-               return -EINVAL;
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       vq_repbuf_free(c2dev, reply);
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       struct c2_dev *c2dev = to_c2dev(cm_id->device);
-       struct c2_qp *qp;
-       struct ib_qp *ibqp;
-       struct c2wr_cr_accept_req *wr;  /* variable length WR */
-       struct c2_vq_req *vq_req;
-       struct c2wr_cr_accept_rep *reply;       /* VQ Reply msg ptr. */
-       int err;
-
-       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-       qp = to_c2qp(ibqp);
-
-       /* Set the RDMA read limits */
-       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-       if (err)
-               goto bail0;
-
-       /* Allocate verbs request. */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-       vq_req->qp = qp;
-       vq_req->cm_id = cm_id;
-       vq_req->event = IW_CM_EVENT_ESTABLISHED;
-
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       /* Build the WR */
-       c2_wr_set_id(wr, CCWR_CR_ACCEPT);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
-       wr->qp_handle = qp->adapter_handle;
-
-       /* Replace the cr_handle with the QP after accept */
-       cm_id->provider_data = qp;
-       cm_id->add_ref(cm_id);
-       qp->cm_id = cm_id;
-
-       cm_id->provider_data = qp;
-
-       /* Validate private_data length */
-       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-               err = -EINVAL;
-               goto bail1;
-       }
-
-       if (iw_param->private_data) {
-               wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
-               memcpy(&wr->private_data[0],
-                      iw_param->private_data, iw_param->private_data_len);
-       } else
-               wr->private_data_length = 0;
-
-       /* Reference the request struct.  Dereferenced in the int handler. */
-       vq_req_get(c2dev, vq_req);
-
-       /* Send WR to adapter */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       /* Wait for reply from adapter */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       /* Check that reply is present */
-       reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-       if (!err)
-               c2_set_qp_state(qp, C2_QP_STATE_RTS);
- bail1:
-       kfree(wr);
-       vq_req_free(c2dev, vq_req);
- bail0:
-       if (err) {
-               /*
-                * If we fail, release reference on QP and
-                * disassociate QP from CM_ID
-                */
-               cm_id->provider_data = NULL;
-               qp->cm_id = NULL;
-               cm_id->rem_ref(cm_id);
-       }
-       return err;
-}
-
-int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-       struct c2_dev *c2dev;
-       struct c2wr_cr_reject_req wr;
-       struct c2_vq_req *vq_req;
-       struct c2wr_cr_reject_rep *reply;
-       int err;
-
-       c2dev = to_c2dev(cm_id->device);
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_CR_REJECT);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_cr_reject_rep *) (unsigned long)
-               vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-       err = c2_errno(reply);
-       /*
-        * free vq stuff
-        */
-       vq_repbuf_free(c2dev, reply);
-
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
deleted file mode 100644 (file)
index 7ad0c08..0000000
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
-
-static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
-{
-       struct c2_cq *cq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2dev->lock, flags);
-       cq = c2dev->qptr_array[cqn];
-       if (!cq) {
-               spin_unlock_irqrestore(&c2dev->lock, flags);
-               return NULL;
-       }
-       atomic_inc(&cq->refcount);
-       spin_unlock_irqrestore(&c2dev->lock, flags);
-       return cq;
-}
-
-static void c2_cq_put(struct c2_cq *cq)
-{
-       if (atomic_dec_and_test(&cq->refcount))
-               wake_up(&cq->wait);
-}
-
-void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
-{
-       struct c2_cq *cq;
-
-       cq = c2_cq_get(c2dev, mq_index);
-       if (!cq) {
-               printk("discarding events on destroyed CQN=%d\n", mq_index);
-               return;
-       }
-
-       (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
-       c2_cq_put(cq);
-}
-
-void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
-{
-       struct c2_cq *cq;
-       struct c2_mq *q;
-
-       cq = c2_cq_get(c2dev, mq_index);
-       if (!cq)
-               return;
-
-       spin_lock_irq(&cq->lock);
-       q = &cq->mq;
-       if (q && !c2_mq_empty(q)) {
-               u16 priv = q->priv;
-               struct c2wr_ce *msg;
-
-               while (priv != be16_to_cpu(*q->shared)) {
-                       msg = (struct c2wr_ce *)
-                               (q->msg_pool.host + priv * q->msg_size);
-                       if (msg->qp_user_context == (u64) (unsigned long) qp) {
-                               msg->qp_user_context = (u64) 0;
-                       }
-                       priv = (priv + 1) % q->q_size;
-               }
-       }
-       spin_unlock_irq(&cq->lock);
-       c2_cq_put(cq);
-}
-
-static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
-{
-       switch (status) {
-       case C2_OK:
-               return IB_WC_SUCCESS;
-       case CCERR_FLUSHED:
-               return IB_WC_WR_FLUSH_ERR;
-       case CCERR_BASE_AND_BOUNDS_VIOLATION:
-               return IB_WC_LOC_PROT_ERR;
-       case CCERR_ACCESS_VIOLATION:
-               return IB_WC_LOC_ACCESS_ERR;
-       case CCERR_TOTAL_LENGTH_TOO_BIG:
-               return IB_WC_LOC_LEN_ERR;
-       case CCERR_INVALID_WINDOW:
-               return IB_WC_MW_BIND_ERR;
-       default:
-               return IB_WC_GENERAL_ERR;
-       }
-}
-
-
-static inline int c2_poll_one(struct c2_dev *c2dev,
-                             struct c2_cq *cq, struct ib_wc *entry)
-{
-       struct c2wr_ce *ce;
-       struct c2_qp *qp;
-       int is_recv = 0;
-
-       ce = c2_mq_consume(&cq->mq);
-       if (!ce) {
-               return -EAGAIN;
-       }
-
-       /*
-        * if the qp returned is null then this qp has already
-        * been freed and we are unable process the completion.
-        * try pulling the next message
-        */
-       while ((qp =
-               (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
-               c2_mq_free(&cq->mq);
-               ce = c2_mq_consume(&cq->mq);
-               if (!ce)
-                       return -EAGAIN;
-       }
-
-       entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
-       entry->wr_id = ce->hdr.context;
-       entry->qp = &qp->ibqp;
-       entry->wc_flags = 0;
-       entry->slid = 0;
-       entry->sl = 0;
-       entry->src_qp = 0;
-       entry->dlid_path_bits = 0;
-       entry->pkey_index = 0;
-
-       switch (c2_wr_get_id(ce)) {
-       case C2_WR_TYPE_SEND:
-               entry->opcode = IB_WC_SEND;
-               break;
-       case C2_WR_TYPE_RDMA_WRITE:
-               entry->opcode = IB_WC_RDMA_WRITE;
-               break;
-       case C2_WR_TYPE_RDMA_READ:
-               entry->opcode = IB_WC_RDMA_READ;
-               break;
-       case C2_WR_TYPE_RECV:
-               entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
-               entry->opcode = IB_WC_RECV;
-               is_recv = 1;
-               break;
-       default:
-               break;
-       }
-
-       /* consume the WQEs */
-       if (is_recv)
-               c2_mq_lconsume(&qp->rq_mq, 1);
-       else
-               c2_mq_lconsume(&qp->sq_mq,
-                              be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
-
-       /* free the message */
-       c2_mq_free(&cq->mq);
-
-       return 0;
-}
-
-int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-       struct c2_dev *c2dev = to_c2dev(ibcq->device);
-       struct c2_cq *cq = to_c2cq(ibcq);
-       unsigned long flags;
-       int npolled, err;
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       for (npolled = 0; npolled < num_entries; ++npolled) {
-
-               err = c2_poll_one(c2dev, cq, entry + npolled);
-               if (err)
-                       break;
-       }
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       return npolled;
-}
-
-int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-       struct c2_mq_shared __iomem *shared;
-       struct c2_cq *cq;
-       unsigned long flags;
-       int ret = 0;
-
-       cq = to_c2cq(ibcq);
-       shared = cq->mq.peer;
-
-       if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
-               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
-       else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
-       else
-               return -EINVAL;
-
-       writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
-
-       /*
-        * Now read back shared->armed to make the PCI
-        * write synchronous.  This is necessary for
-        * correct cq notification semantics.
-        */
-       readb(&shared->armed);
-
-       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-               spin_lock_irqsave(&cq->lock, flags);
-               ret = !c2_mq_empty(&cq->mq);
-               spin_unlock_irqrestore(&cq->lock, flags);
-       }
-
-       return ret;
-}
-
-static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
-{
-       dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
-                         mq->msg_pool.host, dma_unmap_addr(mq, mapping));
-}
-
-static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
-                          size_t q_size, size_t msg_size)
-{
-       u8 *pool_start;
-
-       if (q_size > SIZE_MAX / msg_size)
-               return -EINVAL;
-
-       pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
-                                       &mq->host_dma, GFP_KERNEL);
-       if (!pool_start)
-               return -ENOMEM;
-
-       c2_mq_rep_init(mq,
-                      0,               /* index (currently unknown) */
-                      q_size,
-                      msg_size,
-                      pool_start,
-                      NULL,    /* peer (currently unknown) */
-                      C2_MQ_HOST_TARGET);
-
-       dma_unmap_addr_set(mq, mapping, mq->host_dma);
-
-       return 0;
-}
-
-int c2_init_cq(struct c2_dev *c2dev, int entries,
-              struct c2_ucontext *ctx, struct c2_cq *cq)
-{
-       struct c2wr_cq_create_req wr;
-       struct c2wr_cq_create_rep *reply;
-       unsigned long peer_pa;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       might_sleep();
-
-       cq->ibcq.cqe = entries - 1;
-       cq->is_kernel = !ctx;
-
-       /* Allocate a shared pointer */
-       cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                     &cq->mq.shared_dma, GFP_KERNEL);
-       if (!cq->mq.shared)
-               return -ENOMEM;
-
-       /* Allocate pages for the message pool */
-       err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
-       if (err)
-               goto bail0;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_CQ_CREATE);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.msg_size = cpu_to_be32(cq->mq.msg_size);
-       wr.depth = cpu_to_be32(cq->mq.q_size);
-       wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
-       wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
-       wr.user_context = (u64) (unsigned long) (cq);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail2;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail2;
-
-       reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-
-       if ((err = c2_errno(reply)) != 0)
-               goto bail3;
-
-       cq->adapter_handle = reply->cq_handle;
-       cq->mq.index = be32_to_cpu(reply->mq_index);
-
-       peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
-       cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
-       if (!cq->mq.peer) {
-               err = -ENOMEM;
-               goto bail3;
-       }
-
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       spin_lock_init(&cq->lock);
-       atomic_set(&cq->refcount, 1);
-       init_waitqueue_head(&cq->wait);
-
-       /*
-        * Use the MQ index allocated by the adapter to
-        * store the CQ in the qptr_array
-        */
-       cq->cqn = cq->mq.index;
-       c2dev->qptr_array[cq->cqn] = cq;
-
-       return 0;
-
-bail3:
-       vq_repbuf_free(c2dev, reply);
-bail2:
-       vq_req_free(c2dev, vq_req);
-bail1:
-       c2_free_cq_buf(c2dev, &cq->mq);
-bail0:
-       c2_free_mqsp(cq->mq.shared);
-
-       return err;
-}
-
-void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
-{
-       int err;
-       struct c2_vq_req *vq_req;
-       struct c2wr_cq_destroy_req wr;
-       struct c2wr_cq_destroy_rep *reply;
-
-       might_sleep();
-
-       /* Clear CQ from the qptr array */
-       spin_lock_irq(&c2dev->lock);
-       c2dev->qptr_array[cq->mq.index] = NULL;
-       atomic_dec(&cq->refcount);
-       spin_unlock_irq(&c2dev->lock);
-
-       wait_event(cq->wait, !atomic_read(&cq->refcount));
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               goto bail0;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.cq_handle = cq->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-       if (reply)
-               vq_repbuf_free(c2dev, reply);
-bail1:
-       vq_req_free(c2dev, vq_req);
-bail0:
-       if (cq->is_kernel) {
-               c2_free_cq_buf(c2dev, &cq->mq);
-       }
-
-       return;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c
deleted file mode 100644 (file)
index 74b32a9..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_vq.h"
-
-static void handle_mq(struct c2_dev *c2dev, u32 index);
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
-
-/*
- * Handle RNIC interrupts
- */
-void c2_rnic_interrupt(struct c2_dev *c2dev)
-{
-       unsigned int mq_index;
-
-       while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
-               mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
-               if (mq_index & 0x80000000) {
-                       break;
-               }
-
-               c2dev->hints_read++;
-               handle_mq(c2dev, mq_index);
-       }
-
-}
-
-/*
- * Top level MQ handler
- */
-static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
-{
-       if (c2dev->qptr_array[mq_index] == NULL) {
-               pr_debug("handle_mq: stray activity for mq_index=%d\n",
-                        mq_index);
-               return;
-       }
-
-       switch (mq_index) {
-       case (0):
-               /*
-                * An index of 0 in the activity queue
-                * indicates the req vq now has messages
-                * available...
-                *
-                * Wake up any waiters waiting on req VQ
-                * message availability.
-                */
-               wake_up(&c2dev->req_vq_wo);
-               break;
-       case (1):
-               handle_vq(c2dev, mq_index);
-               break;
-       case (2):
-               /* We have to purge the VQ in case there are pending
-                * accept reply requests that would result in the
-                * generation of an ESTABLISHED event. If we don't
-                * generate these first, a CLOSE event could end up
-                * being delivered before the ESTABLISHED event.
-                */
-               handle_vq(c2dev, 1);
-
-               c2_ae_event(c2dev, mq_index);
-               break;
-       default:
-               /* There is no event synchronization between CQ events
-                * and AE or CM events. In fact, CQE could be
-                * delivered for all of the I/O up to and including the
-                * FLUSH for a peer disconenct prior to the ESTABLISHED
-                * event being delivered to the app. The reason for this
-                * is that CM events are delivered on a thread, while AE
-                * and CM events are delivered on interrupt context.
-                */
-               c2_cq_event(c2dev, mq_index);
-               break;
-       }
-
-       return;
-}
-
-/*
- * Handles verbs WR replies.
- */
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
-{
-       void *adapter_msg, *reply_msg;
-       struct c2wr_hdr *host_msg;
-       struct c2wr_hdr tmp;
-       struct c2_mq *reply_vq;
-       struct c2_vq_req *req;
-       struct iw_cm_event cm_event;
-       int err;
-
-       reply_vq = c2dev->qptr_array[mq_index];
-
-       /*
-        * get next msg from mq_index into adapter_msg.
-        * don't free it yet.
-        */
-       adapter_msg = c2_mq_consume(reply_vq);
-       if (adapter_msg == NULL) {
-               return;
-       }
-
-       host_msg = vq_repbuf_alloc(c2dev);
-
-       /*
-        * If we can't get a host buffer, then we'll still
-        * wakeup the waiter, we just won't give him the msg.
-        * It is assumed the waiter will deal with this...
-        */
-       if (!host_msg) {
-               pr_debug("handle_vq: no repbufs!\n");
-
-               /*
-                * just copy the WR header into a local variable.
-                * this allows us to still demux on the context
-                */
-               host_msg = &tmp;
-               memcpy(host_msg, adapter_msg, sizeof(tmp));
-               reply_msg = NULL;
-       } else {
-               memcpy(host_msg, adapter_msg, reply_vq->msg_size);
-               reply_msg = host_msg;
-       }
-
-       /*
-        * consume the msg from the MQ
-        */
-       c2_mq_free(reply_vq);
-
-       /*
-        * wakeup the waiter.
-        */
-       req = (struct c2_vq_req *) (unsigned long) host_msg->context;
-       if (req == NULL) {
-               /*
-                * We should never get here, as the adapter should
-                * never send us a reply that we're not expecting.
-                */
-               if (reply_msg != NULL)
-                       vq_repbuf_free(c2dev, host_msg);
-               pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
-               return;
-       }
-
-       if (reply_msg)
-               err = c2_errno(reply_msg);
-       else
-               err = -ENOMEM;
-
-       if (!err) switch (req->event) {
-       case IW_CM_EVENT_ESTABLISHED:
-               c2_set_qp_state(req->qp,
-                               C2_QP_STATE_RTS);
-               /*
-                * Until ird/ord negotiation via MPAv2 support is added, send
-                * max supported values
-                */
-               cm_event.ird = cm_event.ord = 128;
-       case IW_CM_EVENT_CLOSE:
-
-               /*
-                * Move the QP to RTS if this is
-                * the established event
-                */
-               cm_event.event = req->event;
-               cm_event.status = 0;
-               cm_event.local_addr = req->cm_id->local_addr;
-               cm_event.remote_addr = req->cm_id->remote_addr;
-               cm_event.private_data = NULL;
-               cm_event.private_data_len = 0;
-               req->cm_id->event_handler(req->cm_id, &cm_event);
-               break;
-       default:
-               break;
-       }
-
-       req->reply_msg = (u64) (unsigned long) (reply_msg);
-       atomic_set(&req->reply_ready, 1);
-       wake_up(&req->wait_object);
-
-       /*
-        * If the request was cancelled, then this put will
-        * free the vq_req memory...and reply_msg!!!
-        */
-       vq_req_put(c2dev, req);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c
deleted file mode 100644 (file)
index 25081e2..0000000
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-
-#define PBL_VIRT 1
-#define PBL_PHYS 2
-
-/*
- * Send all the PBL messages to convey the remainder of the PBL
- * Wait for the adapter's reply on the last one.
- * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
- *
- * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
- *       Reply buffer _is_ freed by this function.
- */
-static int
-send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
-                 unsigned long va, u32 pbl_depth,
-                 struct c2_vq_req *vq_req, int pbl_type)
-{
-       u32 pbe_count;          /* amt that fits in a PBL msg */
-       u32 count;              /* amt in this PBL MSG. */
-       struct c2wr_nsmr_pbl_req *wr;   /* PBL WR ptr */
-       struct c2wr_nsmr_pbl_rep *reply;        /* reply ptr */
-       int err, pbl_virt, pbl_index, i;
-
-       switch (pbl_type) {
-       case PBL_VIRT:
-               pbl_virt = 1;
-               break;
-       case PBL_PHYS:
-               pbl_virt = 0;
-               break;
-       default:
-               return -EINVAL;
-               break;
-       }
-
-       pbe_count = (c2dev->req_vq.msg_size -
-                    sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               return -ENOMEM;
-       }
-       c2_wr_set_id(wr, CCWR_NSMR_PBL);
-
-       /*
-        * Only the last PBL message will generate a reply from the verbs,
-        * so we set the context to 0 indicating there is no kernel verbs
-        * handler blocked awaiting this reply.
-        */
-       wr->hdr.context = 0;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->stag_index = stag_index;    /* already swapped */
-       wr->flags = 0;
-       pbl_index = 0;
-       while (pbl_depth) {
-               count = min(pbe_count, pbl_depth);
-               wr->addrs_length = cpu_to_be32(count);
-
-               /*
-                *  If this is the last message, then reference the
-                *  vq request struct cuz we're gonna wait for a reply.
-                *  also make this PBL msg as the last one.
-                */
-               if (count == pbl_depth) {
-                       /*
-                        * reference the request struct.  dereferenced in the
-                        * int handler.
-                        */
-                       vq_req_get(c2dev, vq_req);
-                       wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
-
-                       /*
-                        * This is the last PBL message.
-                        * Set the context to our VQ Request Object so we can
-                        * wait for the reply.
-                        */
-                       wr->hdr.context = (unsigned long) vq_req;
-               }
-
-               /*
-                * If pbl_virt is set then va is a virtual address
-                * that describes a virtually contiguous memory
-                * allocation. The wr needs the start of each virtual page
-                * to be converted to the corresponding physical address
-                * of the page. If pbl_virt is not set then va is an array
-                * of physical addresses and there is no conversion to do.
-                * Just fill in the wr with what is in the array.
-                */
-               for (i = 0; i < count; i++) {
-                       if (pbl_virt) {
-                               va += PAGE_SIZE;
-                       } else {
-                               wr->paddrs[i] =
-                                   cpu_to_be64(((u64 *)va)[pbl_index + i]);
-                       }
-               }
-
-               /*
-                * Send WR to adapter
-                */
-               err = vq_send_wr(c2dev, (union c2wr *) wr);
-               if (err) {
-                       if (count <= pbe_count) {
-                               vq_req_put(c2dev, vq_req);
-                       }
-                       goto bail0;
-               }
-               pbl_depth -= count;
-               pbl_index += count;
-       }
-
-       /*
-        *  Now wait for the reply...
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       kfree(wr);
-       return err;
-}
-
-#define C2_PBL_MAX_DEPTH 131072
-int
-c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
-                          int page_size, int pbl_depth, u32 length,
-                          u32 offset, u64 *va, enum c2_acf acf,
-                          struct c2_mr *mr)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_nsmr_register_req *wr;
-       struct c2wr_nsmr_register_rep *reply;
-       u16 flags;
-       int i, pbe_count, count;
-       int err;
-
-       if (!va || !length || !addr_list || !pbl_depth)
-               return -EINTR;
-
-       /*
-        * Verify PBL depth is within rnic max
-        */
-       if (pbl_depth > C2_PBL_MAX_DEPTH) {
-               return -EINTR;
-       }
-
-       /*
-        * allocate verbs request object
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       /*
-        * build the WR
-        */
-       c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-
-       flags = (acf | MEM_VA_BASED | MEM_REMOTE);
-
-       /*
-        * compute how many pbes can fit in the message
-        */
-       pbe_count = (c2dev->req_vq.msg_size -
-                    sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
-
-       if (pbl_depth <= pbe_count) {
-               flags |= MEM_PBL_COMPLETE;
-       }
-       wr->flags = cpu_to_be16(flags);
-       wr->stag_key = 0;       //stag_key;
-       wr->va = cpu_to_be64(*va);
-       wr->pd_id = mr->pd->pd_id;
-       wr->pbe_size = cpu_to_be32(page_size);
-       wr->length = cpu_to_be32(length);
-       wr->pbl_depth = cpu_to_be32(pbl_depth);
-       wr->fbo = cpu_to_be32(offset);
-       count = min(pbl_depth, pbe_count);
-       wr->addrs_length = cpu_to_be32(count);
-
-       /*
-        * fill out the PBL for this message
-        */
-       for (i = 0; i < count; i++) {
-               wr->paddrs[i] = cpu_to_be64(addr_list[i]);
-       }
-
-       /*
-        * regerence the request struct
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * send the WR to the adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       /*
-        * wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail1;
-       }
-
-       /*
-        * process reply
-        */
-       reply =
-           (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-       if ((err = c2_errno(reply))) {
-               goto bail2;
-       }
-       //*p_pb_entries = be32_to_cpu(reply->pbl_depth);
-       mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
-       vq_repbuf_free(c2dev, reply);
-
-       /*
-        * if there are still more PBEs we need to send them to
-        * the adapter and wait for a reply on the final one.
-        * reuse vq_req for this purpose.
-        */
-       pbl_depth -= count;
-       if (pbl_depth) {
-
-               vq_req->reply_msg = (unsigned long) NULL;
-               atomic_set(&vq_req->reply_ready, 0);
-               err = send_pbl_messages(c2dev,
-                                       cpu_to_be32(mr->ibmr.lkey),
-                                       (unsigned long) &addr_list[i],
-                                       pbl_depth, vq_req, PBL_PHYS);
-               if (err) {
-                       goto bail1;
-               }
-       }
-
-       vq_req_free(c2dev, vq_req);
-       kfree(wr);
-
-       return err;
-
-bail2:
-       vq_repbuf_free(c2dev, reply);
-bail1:
-       kfree(wr);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
-{
-       struct c2_vq_req *vq_req;       /* verbs request object */
-       struct c2wr_stag_dealloc_req wr;        /* work request */
-       struct c2wr_stag_dealloc_rep *reply;    /* WR reply  */
-       int err;
-
-
-       /*
-        * allocate verbs request object
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               return -ENOMEM;
-       }
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
-       wr.hdr.context = (u64) (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.stag_index = cpu_to_be32(stag_index);
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c
deleted file mode 100644 (file)
index 7827fb8..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include "c2_mq.h"
-
-void *c2_mq_alloc(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       if (c2_mq_full(q)) {
-               return NULL;
-       } else {
-#ifdef DEBUG
-               struct c2wr_hdr *m =
-                   (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-               BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
-               m->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-               return m;
-#else
-               return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-       }
-}
-
-void c2_mq_produce(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       if (!c2_mq_full(q)) {
-               q->priv = (q->priv + 1) % q->q_size;
-               q->hint_count++;
-               /* Update peer's offset. */
-               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-       }
-}
-
-void *c2_mq_consume(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-       if (c2_mq_empty(q)) {
-               return NULL;
-       } else {
-#ifdef DEBUG
-               struct c2wr_hdr *m = (struct c2wr_hdr *)
-                   (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-               BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
-#endif
-               return m;
-#else
-               return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-       }
-}
-
-void c2_mq_free(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-       if (!c2_mq_empty(q)) {
-
-#ifdef CCMSGMAGIC
-               {
-                       struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
-                           (q->msg_pool.adapter + q->priv * q->msg_size);
-                       __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
-               }
-#endif
-               q->priv = (q->priv + 1) % q->q_size;
-               /* Update peer's offset. */
-               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-       }
-}
-
-
-void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       while (wqe_count--) {
-               BUG_ON(c2_mq_empty(q));
-               *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
-       }
-}
-
-#if 0
-u32 c2_mq_count(struct c2_mq *q)
-{
-       s32 count;
-
-       if (q->type == C2_MQ_HOST_TARGET)
-               count = be16_to_cpu(*q->shared) - q->priv;
-       else
-               count = q->priv - be16_to_cpu(*q->shared);
-
-       if (count < 0)
-               count += q->q_size;
-
-       return (u32) count;
-}
-#endif  /*  0  */
-
-void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                   u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
-{
-       BUG_ON(!q->shared);
-
-       /* This code assumes the byte swapping has already been done! */
-       q->index = index;
-       q->q_size = q_size;
-       q->msg_size = msg_size;
-       q->msg_pool.adapter = pool_start;
-       q->peer = (struct c2_mq_shared __iomem *) peer;
-       q->magic = C2_MQ_MAGIC;
-       q->type = type;
-       q->priv = 0;
-       q->hint_count = 0;
-       return;
-}
-
-void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                   u8 *pool_start, u16 __iomem *peer, u32 type)
-{
-       BUG_ON(!q->shared);
-
-       /* This code assumes the byte swapping has already been done! */
-       q->index = index;
-       q->q_size = q_size;
-       q->msg_size = msg_size;
-       q->msg_pool.host = pool_start;
-       q->peer = (struct c2_mq_shared __iomem *) peer;
-       q->magic = C2_MQ_MAGIC;
-       q->type = type;
-       q->priv = 0;
-       q->hint_count = 0;
-       return;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h
deleted file mode 100644 (file)
index 8e1b4d1..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _C2_MQ_H_
-#define _C2_MQ_H_
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include "c2_wr.h"
-
-enum c2_shared_regs {
-
-       C2_SHARED_ARMED = 0x10,
-       C2_SHARED_NOTIFY = 0x18,
-       C2_SHARED_SHARED = 0x40,
-};
-
-struct c2_mq_shared {
-       u16 unused1;
-       u8 armed;
-       u8 notification_type;
-       u32 unused2;
-       u16 shared;
-       /* Pad to 64 bytes. */
-       u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
-};
-
-enum c2_mq_type {
-       C2_MQ_HOST_TARGET = 1,
-       C2_MQ_ADAPTER_TARGET = 2,
-};
-
-/*
- * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ.
- * c2_user_mq_t (which is the same format) is for user-mode MQs...
- */
-#define C2_MQ_MAGIC 0x4d512020 /* 'MQ  ' */
-struct c2_mq {
-       u32 magic;
-       union {
-               u8 *host;
-               u8 __iomem *adapter;
-       } msg_pool;
-       dma_addr_t host_dma;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-       u16 hint_count;
-       u16 priv;
-       struct c2_mq_shared __iomem *peer;
-       __be16 *shared;
-       dma_addr_t shared_dma;
-       u32 q_size;
-       u32 msg_size;
-       u32 index;
-       enum c2_mq_type type;
-};
-
-static __inline__ int c2_mq_empty(struct c2_mq *q)
-{
-       return q->priv == be16_to_cpu(*q->shared);
-}
-
-static __inline__ int c2_mq_full(struct c2_mq *q)
-{
-       return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
-}
-
-void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
-void *c2_mq_alloc(struct c2_mq *q);
-void c2_mq_produce(struct c2_mq *q);
-void *c2_mq_consume(struct c2_mq *q);
-void c2_mq_free(struct c2_mq *q);
-void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                      u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
-void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                          u8 *pool_start, u16 __iomem *peer, u32 type);
-
-#endif                         /* _C2_MQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c
deleted file mode 100644 (file)
index f3e81dc..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-
-#include "c2.h"
-#include "c2_provider.h"
-
-int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
-{
-       u32 obj;
-       int ret = 0;
-
-       spin_lock(&c2dev->pd_table.lock);
-       obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
-                                c2dev->pd_table.last);
-       if (obj >= c2dev->pd_table.max)
-               obj = find_first_zero_bit(c2dev->pd_table.table,
-                                         c2dev->pd_table.max);
-       if (obj < c2dev->pd_table.max) {
-               pd->pd_id = obj;
-               __set_bit(obj, c2dev->pd_table.table);
-               c2dev->pd_table.last = obj+1;
-               if (c2dev->pd_table.last >= c2dev->pd_table.max)
-                       c2dev->pd_table.last = 0;
-       } else
-               ret = -ENOMEM;
-       spin_unlock(&c2dev->pd_table.lock);
-       return ret;
-}
-
-void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
-{
-       spin_lock(&c2dev->pd_table.lock);
-       __clear_bit(pd->pd_id, c2dev->pd_table.table);
-       spin_unlock(&c2dev->pd_table.lock);
-}
-
-int c2_init_pd_table(struct c2_dev *c2dev)
-{
-
-       c2dev->pd_table.last = 0;
-       c2dev->pd_table.max = c2dev->props.max_pd;
-       spin_lock_init(&c2dev->pd_table.lock);
-       c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
-                                       sizeof(long), GFP_KERNEL);
-       if (!c2dev->pd_table.table)
-               return -ENOMEM;
-       bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
-       return 0;
-}
-
-void c2_cleanup_pd_table(struct c2_dev *c2dev)
-{
-       kfree(c2dev->pd_table.table);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
deleted file mode 100644 (file)
index de8d10e..0000000
+++ /dev/null
@@ -1,862 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/if_arp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_umem.h>
-#include <rdma/ib_user_verbs.h>
-#include "c2.h"
-#include "c2_provider.h"
-#include "c2_user.h"
-
-static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                          struct ib_udata *uhw)
-{
-       struct c2_dev *c2dev = to_c2dev(ibdev);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       *props = c2dev->props;
-       return 0;
-}
-
-static int c2_query_port(struct ib_device *ibdev,
-                        u8 port, struct ib_port_attr *props)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       props->max_mtu = IB_MTU_4096;
-       props->lid = 0;
-       props->lmc = 0;
-       props->sm_lid = 0;
-       props->sm_sl = 0;
-       props->state = IB_PORT_ACTIVE;
-       props->phys_state = 0;
-       props->port_cap_flags =
-           IB_PORT_CM_SUP |
-           IB_PORT_REINIT_SUP |
-           IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-       props->gid_tbl_len = 1;
-       props->pkey_tbl_len = 1;
-       props->qkey_viol_cntr = 0;
-       props->active_width = 1;
-       props->active_speed = IB_SPEED_SDR;
-
-       return 0;
-}
-
-static int c2_query_pkey(struct ib_device *ibdev,
-                        u8 port, u16 index, u16 * pkey)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       *pkey = 0;
-       return 0;
-}
-
-static int c2_query_gid(struct ib_device *ibdev, u8 port,
-                       int index, union ib_gid *gid)
-{
-       struct c2_dev *c2dev = to_c2dev(ibdev);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-       memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
-
-       return 0;
-}
-
-/* Allocate the user context data structure. This keeps track
- * of all objects associated with a particular user-mode client.
- */
-static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
-                                            struct ib_udata *udata)
-{
-       struct c2_ucontext *context;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       context = kmalloc(sizeof(*context), GFP_KERNEL);
-       if (!context)
-               return ERR_PTR(-ENOMEM);
-
-       return &context->ibucontext;
-}
-
-static int c2_dealloc_ucontext(struct ib_ucontext *context)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       kfree(context);
-       return 0;
-}
-
-static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
-                                struct ib_ucontext *context,
-                                struct ib_udata *udata)
-{
-       struct c2_pd *pd;
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-       if (!pd)
-               return ERR_PTR(-ENOMEM);
-
-       err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
-       if (err) {
-               kfree(pd);
-               return ERR_PTR(err);
-       }
-
-       if (context) {
-               if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
-                       c2_pd_free(to_c2dev(ibdev), pd);
-                       kfree(pd);
-                       return ERR_PTR(-EFAULT);
-               }
-       }
-
-       return &pd->ibpd;
-}
-
-static int c2_dealloc_pd(struct ib_pd *pd)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
-       kfree(pd);
-
-       return 0;
-}
-
-static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return ERR_PTR(-ENOSYS);
-}
-
-static int c2_ah_destroy(struct ib_ah *ah)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static void c2_add_ref(struct ib_qp *ibqp)
-{
-       struct c2_qp *qp;
-       BUG_ON(!ibqp);
-       qp = to_c2qp(ibqp);
-       atomic_inc(&qp->refcount);
-}
-
-static void c2_rem_ref(struct ib_qp *ibqp)
-{
-       struct c2_qp *qp;
-       BUG_ON(!ibqp);
-       qp = to_c2qp(ibqp);
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
-{
-       struct c2_dev* c2dev = to_c2dev(device);
-       struct c2_qp *qp;
-
-       qp = c2_find_qpn(c2dev, qpn);
-       pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
-               __func__, qp, qpn, device,
-               (qp?atomic_read(&qp->refcount):0));
-
-       return (qp?&qp->ibqp:NULL);
-}
-
-static struct ib_qp *c2_create_qp(struct ib_pd *pd,
-                                 struct ib_qp_init_attr *init_attr,
-                                 struct ib_udata *udata)
-{
-       struct c2_qp *qp;
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       if (init_attr->create_flags)
-               return ERR_PTR(-EINVAL);
-
-       switch (init_attr->qp_type) {
-       case IB_QPT_RC:
-               qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-               if (!qp) {
-                       pr_debug("%s: Unable to allocate QP\n", __func__);
-                       return ERR_PTR(-ENOMEM);
-               }
-               spin_lock_init(&qp->lock);
-               if (pd->uobject) {
-                       /* userspace specific */
-               }
-
-               err = c2_alloc_qp(to_c2dev(pd->device),
-                                 to_c2pd(pd), init_attr, qp);
-
-               if (err && pd->uobject) {
-                       /* userspace specific */
-               }
-
-               break;
-       default:
-               pr_debug("%s: Invalid QP type: %d\n", __func__,
-                       init_attr->qp_type);
-               return ERR_PTR(-EINVAL);
-       }
-
-       if (err) {
-               kfree(qp);
-               return ERR_PTR(err);
-       }
-
-       return &qp->ibqp;
-}
-
-static int c2_destroy_qp(struct ib_qp *ib_qp)
-{
-       struct c2_qp *qp = to_c2qp(ib_qp);
-
-       pr_debug("%s:%u qp=%p,qp->state=%d\n",
-               __func__, __LINE__, ib_qp, qp->state);
-       c2_free_qp(to_c2dev(ib_qp->device), qp);
-       kfree(qp);
-       return 0;
-}
-
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
-                                 const struct ib_cq_init_attr *attr,
-                                 struct ib_ucontext *context,
-                                 struct ib_udata *udata)
-{
-       int entries = attr->cqe;
-       struct c2_cq *cq;
-       int err;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               pr_debug("%s: Unable to allocate CQ\n", __func__);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
-       if (err) {
-               pr_debug("%s: error initializing CQ\n", __func__);
-               kfree(cq);
-               return ERR_PTR(err);
-       }
-
-       return &cq->ibcq;
-}
-
-static int c2_destroy_cq(struct ib_cq *ib_cq)
-{
-       struct c2_cq *cq = to_c2cq(ib_cq);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       c2_free_cq(to_c2dev(ib_cq->device), cq);
-       kfree(cq);
-
-       return 0;
-}
-
-static inline u32 c2_convert_access(int acc)
-{
-       return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
-           (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
-           (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
-           C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
-}
-
-static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct c2_mr *mr;
-       u64 *page_list;
-       const u32 total_len = 0xffffffff;       /* AMSO1100 limit */
-       int err, page_shift, pbl_depth, i;
-       u64 kva = 0;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /*
-        * This is a map of all phy mem...use a 32k page_shift.
-        */
-       page_shift = PAGE_SHIFT + 3;
-       pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift;
-
-       page_list = vmalloc(sizeof(u64) * pbl_depth);
-       if (!page_list) {
-               pr_debug("couldn't vmalloc page_list of size %zd\n",
-                       (sizeof(u64) * pbl_depth));
-               return ERR_PTR(-ENOMEM);
-       }
-
-       for (i = 0; i < pbl_depth; i++)
-               page_list[i] = (i << page_shift);
-
-       mr = kmalloc(sizeof(*mr), GFP_KERNEL);
-       if (!mr) {
-               vfree(page_list);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       mr->pd = to_c2pd(pd);
-       mr->umem = NULL;
-       pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
-               "*iova_start %llx, first pa %llx, last pa %llx\n",
-               __func__, page_shift, pbl_depth, total_len,
-               (unsigned long long) kva,
-               (unsigned long long) page_list[0],
-               (unsigned long long) page_list[pbl_depth-1]);
-       err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), page_list,
-                                        BIT(page_shift), pbl_depth,
-                                        total_len, 0, &kva,
-                                        c2_convert_access(acc), mr);
-       vfree(page_list);
-       if (err) {
-               kfree(mr);
-               return ERR_PTR(err);
-       }
-
-       return &mr->ibmr;
-}
-
-static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                                   u64 virt, int acc, struct ib_udata *udata)
-{
-       u64 *pages;
-       u64 kva = 0;
-       int shift, n, len;
-       int i, k, entry;
-       int err = 0;
-       struct scatterlist *sg;
-       struct c2_pd *c2pd = to_c2pd(pd);
-       struct c2_mr *c2mr;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
-       if (!c2mr)
-               return ERR_PTR(-ENOMEM);
-       c2mr->pd = c2pd;
-
-       c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
-       if (IS_ERR(c2mr->umem)) {
-               err = PTR_ERR(c2mr->umem);
-               kfree(c2mr);
-               return ERR_PTR(err);
-       }
-
-       shift = ffs(c2mr->umem->page_size) - 1;
-       n = c2mr->umem->nmap;
-
-       pages = kmalloc_array(n, sizeof(u64), GFP_KERNEL);
-       if (!pages) {
-               err = -ENOMEM;
-               goto err;
-       }
-
-       i = 0;
-       for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
-               len = sg_dma_len(sg) >> shift;
-               for (k = 0; k < len; ++k) {
-                       pages[i++] =
-                               sg_dma_address(sg) +
-                               (c2mr->umem->page_size * k);
-               }
-       }
-
-       kva = virt;
-       err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
-                                        pages,
-                                        c2mr->umem->page_size,
-                                        i,
-                                        length,
-                                        ib_umem_offset(c2mr->umem),
-                                        &kva,
-                                        c2_convert_access(acc),
-                                        c2mr);
-       kfree(pages);
-       if (err)
-               goto err;
-       return &c2mr->ibmr;
-
-err:
-       ib_umem_release(c2mr->umem);
-       kfree(c2mr);
-       return ERR_PTR(err);
-}
-
-static int c2_dereg_mr(struct ib_mr *ib_mr)
-{
-       struct c2_mr *mr = to_c2mr(ib_mr);
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
-       if (err)
-               pr_debug("c2_stag_dealloc failed: %d\n", err);
-       else {
-               if (mr->umem)
-                       ib_umem_release(mr->umem);
-               kfree(mr);
-       }
-
-       return err;
-}
-
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-                       char *buf)
-{
-       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%x\n", c2dev->props.hw_ver);
-}
-
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%x.%x.%x\n",
-                      (int) (c2dev->props.fw_ver >> 32),
-                      (int) (c2dev->props.fw_ver >> 16) & 0xffff,
-                      (int) (c2dev->props.fw_ver & 0xffff));
-}
-
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-                       char *buf)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "AMSO1100\n");
-}
-
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-                         char *buf)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-
-static struct device_attribute *c2_dev_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
-       &dev_attr_hca_type,
-       &dev_attr_board_id
-};
-
-static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                       int attr_mask, struct ib_udata *udata)
-{
-       int err;
-
-       err =
-           c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
-                        attr_mask);
-
-       return err;
-}
-
-static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_process_mad(struct ib_device *ibdev,
-                         int mad_flags,
-                         u8 port_num,
-                         const struct ib_wc *in_wc,
-                         const struct ib_grh *in_grh,
-                         const struct ib_mad_hdr *in_mad,
-                         size_t in_mad_size,
-                         struct ib_mad_hdr *out_mad,
-                         size_t *out_mad_size,
-                         u16 *out_mad_pkey_index)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* Request a connection */
-       return c2_llp_connect(cm_id, iw_param);
-}
-
-static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* Accept the new connection */
-       return c2_llp_accept(cm_id, iw_param);
-}
-
-static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       return c2_llp_reject(cm_id, pdata, pdata_len);
-}
-
-static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       err = c2_llp_service_create(cm_id, backlog);
-       pr_debug("%s:%u err=%d\n",
-               __func__, __LINE__,
-               err);
-       return err;
-}
-
-static int c2_service_destroy(struct iw_cm_id *cm_id)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       return c2_llp_service_destroy(cm_id);
-}
-
-static int c2_pseudo_up(struct net_device *netdev)
-{
-       struct in_device *ind;
-       struct c2_dev *c2dev = netdev->ml_priv;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       pr_debug("adding...\n");
-       for_ifa(ind) {
-#ifdef DEBUG
-               u8 *ip = (u8 *) & ifa->ifa_address;
-
-               pr_debug("%s: %d.%d.%d.%d\n",
-                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-               c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-
-       return 0;
-}
-
-static int c2_pseudo_down(struct net_device *netdev)
-{
-       struct in_device *ind;
-       struct c2_dev *c2dev = netdev->ml_priv;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       pr_debug("deleting...\n");
-       for_ifa(ind) {
-#ifdef DEBUG
-               u8 *ip = (u8 *) & ifa->ifa_address;
-
-               pr_debug("%s: %d.%d.%d.%d\n",
-                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-               c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-
-       return 0;
-}
-
-static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-       kfree_skb(skb);
-       return NETDEV_TX_OK;
-}
-
-static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
-{
-       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-               return -EINVAL;
-
-       netdev->mtu = new_mtu;
-
-       /* TODO: Tell rnic about new rmda interface mtu */
-       return 0;
-}
-
-static const struct net_device_ops c2_pseudo_netdev_ops = {
-       .ndo_open               = c2_pseudo_up,
-       .ndo_stop               = c2_pseudo_down,
-       .ndo_start_xmit         = c2_pseudo_xmit_frame,
-       .ndo_change_mtu         = c2_pseudo_change_mtu,
-       .ndo_validate_addr      = eth_validate_addr,
-};
-
-static void setup(struct net_device *netdev)
-{
-       netdev->netdev_ops = &c2_pseudo_netdev_ops;
-
-       netdev->watchdog_timeo = 0;
-       netdev->type = ARPHRD_ETHER;
-       netdev->mtu = 1500;
-       netdev->hard_header_len = ETH_HLEN;
-       netdev->addr_len = ETH_ALEN;
-       netdev->tx_queue_len = 0;
-       netdev->flags |= IFF_NOARP;
-}
-
-static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
-{
-       char name[IFNAMSIZ];
-       struct net_device *netdev;
-
-       /* change ethxxx to iwxxx */
-       strcpy(name, "iw");
-       strcat(name, &c2dev->netdev->name[3]);
-       netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
-       if (!netdev) {
-               printk(KERN_ERR PFX "%s -  etherdev alloc failed",
-                       __func__);
-               return NULL;
-       }
-
-       netdev->ml_priv = c2dev;
-
-       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-       memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
-
-       /* Print out the MAC address */
-       pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
-
-#if 0
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-#endif
-       return netdev;
-}
-
-static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
-                            struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = c2_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-       return 0;
-}
-
-int c2_register_device(struct c2_dev *dev)
-{
-       int ret = -ENOMEM;
-       int i;
-
-       /* Register pseudo network device */
-       dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
-       if (!dev->pseudo_netdev)
-               goto out;
-
-       ret = register_netdev(dev->pseudo_netdev);
-       if (ret)
-               goto out_free_netdev;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
-       dev->ibdev.owner = THIS_MODULE;
-       dev->ibdev.uverbs_cmd_mask =
-           (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-           (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-           (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-           (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-           (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-           (1ull << IB_USER_VERBS_CMD_REG_MR) |
-           (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-           (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-           (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-           (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-           (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-           (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-           (1ull << IB_USER_VERBS_CMD_POST_SEND) |
-           (1ull << IB_USER_VERBS_CMD_POST_RECV);
-
-       dev->ibdev.node_type = RDMA_NODE_RNIC;
-       memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
-       memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
-       dev->ibdev.phys_port_cnt = 1;
-       dev->ibdev.num_comp_vectors = 1;
-       dev->ibdev.dma_device = &dev->pcidev->dev;
-       dev->ibdev.query_device = c2_query_device;
-       dev->ibdev.query_port = c2_query_port;
-       dev->ibdev.query_pkey = c2_query_pkey;
-       dev->ibdev.query_gid = c2_query_gid;
-       dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
-       dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
-       dev->ibdev.mmap = c2_mmap_uar;
-       dev->ibdev.alloc_pd = c2_alloc_pd;
-       dev->ibdev.dealloc_pd = c2_dealloc_pd;
-       dev->ibdev.create_ah = c2_ah_create;
-       dev->ibdev.destroy_ah = c2_ah_destroy;
-       dev->ibdev.create_qp = c2_create_qp;
-       dev->ibdev.modify_qp = c2_modify_qp;
-       dev->ibdev.destroy_qp = c2_destroy_qp;
-       dev->ibdev.create_cq = c2_create_cq;
-       dev->ibdev.destroy_cq = c2_destroy_cq;
-       dev->ibdev.poll_cq = c2_poll_cq;
-       dev->ibdev.get_dma_mr = c2_get_dma_mr;
-       dev->ibdev.reg_user_mr = c2_reg_user_mr;
-       dev->ibdev.dereg_mr = c2_dereg_mr;
-       dev->ibdev.get_port_immutable = c2_port_immutable;
-
-       dev->ibdev.alloc_fmr = NULL;
-       dev->ibdev.unmap_fmr = NULL;
-       dev->ibdev.dealloc_fmr = NULL;
-       dev->ibdev.map_phys_fmr = NULL;
-
-       dev->ibdev.attach_mcast = c2_multicast_attach;
-       dev->ibdev.detach_mcast = c2_multicast_detach;
-       dev->ibdev.process_mad = c2_process_mad;
-
-       dev->ibdev.req_notify_cq = c2_arm_cq;
-       dev->ibdev.post_send = c2_post_send;
-       dev->ibdev.post_recv = c2_post_receive;
-
-       dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
-       if (dev->ibdev.iwcm == NULL) {
-               ret = -ENOMEM;
-               goto out_unregister_netdev;
-       }
-       dev->ibdev.iwcm->add_ref = c2_add_ref;
-       dev->ibdev.iwcm->rem_ref = c2_rem_ref;
-       dev->ibdev.iwcm->get_qp = c2_get_qp;
-       dev->ibdev.iwcm->connect = c2_connect;
-       dev->ibdev.iwcm->accept = c2_accept;
-       dev->ibdev.iwcm->reject = c2_reject;
-       dev->ibdev.iwcm->create_listen = c2_service_create;
-       dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
-
-       ret = ib_register_device(&dev->ibdev, NULL);
-       if (ret)
-               goto out_free_iwcm;
-
-       for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
-               ret = device_create_file(&dev->ibdev.dev,
-                                              c2_dev_attributes[i]);
-               if (ret)
-                       goto out_unregister_ibdev;
-       }
-       goto out;
-
-out_unregister_ibdev:
-       ib_unregister_device(&dev->ibdev);
-out_free_iwcm:
-       kfree(dev->ibdev.iwcm);
-out_unregister_netdev:
-       unregister_netdev(dev->pseudo_netdev);
-out_free_netdev:
-       free_netdev(dev->pseudo_netdev);
-out:
-       pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
-       return ret;
-}
-
-void c2_unregister_device(struct c2_dev *dev)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       unregister_netdev(dev->pseudo_netdev);
-       free_netdev(dev->pseudo_netdev);
-       ib_unregister_device(&dev->ibdev);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h
deleted file mode 100644 (file)
index bf18998..0000000
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_PROVIDER_H
-#define C2_PROVIDER_H
-#include <linux/inetdevice.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-
-#include "c2_mq.h"
-#include <rdma/iw_cm.h>
-
-#define C2_MPT_FLAG_ATOMIC        (1 << 14)
-#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
-#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
-#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
-#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
-
-struct c2_buf_list {
-       void *buf;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-};
-
-
-/* The user context keeps track of objects allocated for a
- * particular user-mode client. */
-struct c2_ucontext {
-       struct ib_ucontext ibucontext;
-};
-
-struct c2_mtt;
-
-/* All objects associated with a PD are kept in the
- * associated user context if present.
- */
-struct c2_pd {
-       struct ib_pd ibpd;
-       u32 pd_id;
-};
-
-struct c2_mr {
-       struct ib_mr ibmr;
-       struct c2_pd *pd;
-       struct ib_umem *umem;
-};
-
-struct c2_av;
-
-enum c2_ah_type {
-       C2_AH_ON_HCA,
-       C2_AH_PCI_POOL,
-       C2_AH_KMALLOC
-};
-
-struct c2_ah {
-       struct ib_ah ibah;
-};
-
-struct c2_cq {
-       struct ib_cq ibcq;
-       spinlock_t lock;
-       atomic_t refcount;
-       int cqn;
-       int is_kernel;
-       wait_queue_head_t wait;
-
-       u32 adapter_handle;
-       struct c2_mq mq;
-};
-
-struct c2_wq {
-       spinlock_t lock;
-};
-struct iw_cm_id;
-struct c2_qp {
-       struct ib_qp ibqp;
-       struct iw_cm_id *cm_id;
-       spinlock_t lock;
-       atomic_t refcount;
-       wait_queue_head_t wait;
-       int qpn;
-
-       u32 adapter_handle;
-       u32 send_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u8 state;
-
-       struct c2_mq sq_mq;
-       struct c2_mq rq_mq;
-};
-
-struct c2_cr_query_attrs {
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-};
-
-static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
-{
-       return container_of(ibpd, struct c2_pd, ibpd);
-}
-
-static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
-{
-       return container_of(ibucontext, struct c2_ucontext, ibucontext);
-}
-
-static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
-{
-       return container_of(ibmr, struct c2_mr, ibmr);
-}
-
-
-static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
-{
-       return container_of(ibah, struct c2_ah, ibah);
-}
-
-static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
-{
-       return container_of(ibcq, struct c2_cq, ibcq);
-}
-
-static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct c2_qp, ibqp);
-}
-
-static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
-{
-       struct in_device *ind;
-       int ret = 0;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       for_ifa(ind) {
-               if (ifa->ifa_address == addr) {
-                       ret = 1;
-                       break;
-               }
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-       return ret;
-}
-#endif                         /* C2_PROVIDER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c
deleted file mode 100644 (file)
index ca364db..0000000
+++ /dev/null
@@ -1,1024 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_MAX_ORD_PER_QP 128
-#define C2_MAX_IRD_PER_QP 128
-
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
-#define NO_SUPPORT -1
-static const u8 c2_opcode[] = {
-       [IB_WR_SEND] = C2_WR_TYPE_SEND,
-       [IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
-       [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
-       [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
-};
-
-static int to_c2_state(enum ib_qp_state ib_state)
-{
-       switch (ib_state) {
-       case IB_QPS_RESET:
-               return C2_QP_STATE_IDLE;
-       case IB_QPS_RTS:
-               return C2_QP_STATE_RTS;
-       case IB_QPS_SQD:
-               return C2_QP_STATE_CLOSING;
-       case IB_QPS_SQE:
-               return C2_QP_STATE_CLOSING;
-       case IB_QPS_ERR:
-               return C2_QP_STATE_ERROR;
-       default:
-               return -1;
-       }
-}
-
-static int to_ib_state(enum c2_qp_state c2_state)
-{
-       switch (c2_state) {
-       case C2_QP_STATE_IDLE:
-               return IB_QPS_RESET;
-       case C2_QP_STATE_CONNECTING:
-               return IB_QPS_RTR;
-       case C2_QP_STATE_RTS:
-               return IB_QPS_RTS;
-       case C2_QP_STATE_CLOSING:
-               return IB_QPS_SQD;
-       case C2_QP_STATE_ERROR:
-               return IB_QPS_ERR;
-       case C2_QP_STATE_TERMINATE:
-               return IB_QPS_SQE;
-       default:
-               return -1;
-       }
-}
-
-static const char *to_ib_state_str(int ib_state)
-{
-       static const char *state_str[] = {
-               "IB_QPS_RESET",
-               "IB_QPS_INIT",
-               "IB_QPS_RTR",
-               "IB_QPS_RTS",
-               "IB_QPS_SQD",
-               "IB_QPS_SQE",
-               "IB_QPS_ERR"
-       };
-       if (ib_state < IB_QPS_RESET ||
-           ib_state > IB_QPS_ERR)
-               return "<invalid IB QP state>";
-
-       ib_state -= IB_QPS_RESET;
-       return state_str[ib_state];
-}
-
-void c2_set_qp_state(struct c2_qp *qp, int c2_state)
-{
-       int new_state = to_ib_state(c2_state);
-
-       pr_debug("%s: qp[%p] state modify %s --> %s\n",
-              __func__,
-               qp,
-               to_ib_state_str(qp->state),
-               to_ib_state_str(new_state));
-       qp->state = new_state;
-}
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-                struct ib_qp_attr *attr, int attr_mask)
-{
-       struct c2wr_qp_modify_req wr;
-       struct c2wr_qp_modify_rep *reply;
-       struct c2_vq_req *vq_req;
-       unsigned long flags;
-       u8 next_state;
-       int err;
-
-       pr_debug("%s:%d qp=%p, %s --> %s\n",
-               __func__, __LINE__,
-               qp,
-               to_ib_state_str(qp->state),
-               to_ib_state_str(attr->qp_state));
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-       wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-       if (attr_mask & IB_QP_STATE) {
-               /* Ensure the state is valid */
-               if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
-                       err = -EINVAL;
-                       goto bail0;
-               }
-
-               wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
-
-               if (attr->qp_state == IB_QPS_ERR) {
-                       spin_lock_irqsave(&qp->lock, flags);
-                       if (qp->cm_id && qp->state == IB_QPS_RTS) {
-                               pr_debug("Generating CLOSE event for QP-->ERR, "
-                                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
-                               /* Generate an CLOSE event */
-                               vq_req->cm_id = qp->cm_id;
-                               vq_req->event = IW_CM_EVENT_CLOSE;
-                       }
-                       spin_unlock_irqrestore(&qp->lock, flags);
-               }
-               next_state =  attr->qp_state;
-
-       } else if (attr_mask & IB_QP_CUR_STATE) {
-
-               if (attr->cur_qp_state != IB_QPS_RTR &&
-                   attr->cur_qp_state != IB_QPS_RTS &&
-                   attr->cur_qp_state != IB_QPS_SQD &&
-                   attr->cur_qp_state != IB_QPS_SQE) {
-                       err = -EINVAL;
-                       goto bail0;
-               } else
-                       wr.next_qp_state =
-                           cpu_to_be32(to_c2_state(attr->cur_qp_state));
-
-               next_state = attr->cur_qp_state;
-
-       } else {
-               err = 0;
-               goto bail0;
-       }
-
-       /* reference the request struct */
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-       if (!err)
-               qp->state = next_state;
-#ifdef DEBUG
-       else
-               pr_debug("%s: c2_errno=%d\n", __func__, err);
-#endif
-       /*
-        * If we're going to error and generating the event here, then
-        * we need to remove the reference because there will be no
-        * close event generated by the adapter
-       */
-       spin_lock_irqsave(&qp->lock, flags);
-       if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
-               qp->cm_id->rem_ref(qp->cm_id);
-               qp->cm_id = NULL;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-
-       pr_debug("%s:%d qp=%p, cur_state=%s\n",
-               __func__, __LINE__,
-               qp,
-               to_ib_state_str(qp->state));
-       return err;
-}
-
-int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-                         int ord, int ird)
-{
-       struct c2wr_qp_modify_req wr;
-       struct c2wr_qp_modify_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-       wr.ord = cpu_to_be32(ord);
-       wr.ird = cpu_to_be32(ird);
-       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-       /* reference the request struct */
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       reply = (struct c2wr_qp_modify_rep *) (unsigned long)
-               vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_qp_destroy_req wr;
-       struct c2wr_qp_destroy_rep *reply;
-       unsigned long flags;
-       int err;
-
-       /*
-        * Allocate a verb request message
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               return -ENOMEM;
-       }
-
-       /*
-        * Initialize the WR
-        */
-       c2_wr_set_id(&wr, CCWR_QP_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       spin_lock_irqsave(&qp->lock, flags);
-       if (qp->cm_id && qp->state == IB_QPS_RTS) {
-               pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
-                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
-               /* Generate an CLOSE event */
-               vq_req->qp = qp;
-               vq_req->cm_id = qp->cm_id;
-               vq_req->event = IW_CM_EVENT_CLOSE;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       spin_lock_irqsave(&qp->lock, flags);
-       if (qp->cm_id) {
-               qp->cm_id->rem_ref(qp->cm_id);
-               qp->cm_id = NULL;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       int ret;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irq(&c2dev->qp_table.lock);
-
-       ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
-       if (ret >= 0)
-               qp->qpn = ret;
-
-       spin_unlock_irq(&c2dev->qp_table.lock);
-       idr_preload_end();
-       return ret < 0 ? ret : 0;
-}
-
-static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
-{
-       spin_lock_irq(&c2dev->qp_table.lock);
-       idr_remove(&c2dev->qp_table.idr, qpn);
-       spin_unlock_irq(&c2dev->qp_table.lock);
-}
-
-struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
-{
-       unsigned long flags;
-       struct c2_qp *qp;
-
-       spin_lock_irqsave(&c2dev->qp_table.lock, flags);
-       qp = idr_find(&c2dev->qp_table.idr, qpn);
-       spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
-       return qp;
-}
-
-int c2_alloc_qp(struct c2_dev *c2dev,
-               struct c2_pd *pd,
-               struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
-{
-       struct c2wr_qp_create_req wr;
-       struct c2wr_qp_create_rep *reply;
-       struct c2_vq_req *vq_req;
-       struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
-       struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
-       unsigned long peer_pa;
-       u32 q_size, msg_size, mmap_size;
-       void __iomem *mmap;
-       int err;
-
-       err = c2_alloc_qpn(c2dev, qp);
-       if (err)
-               return err;
-       qp->ibqp.qp_num = qp->qpn;
-       qp->ibqp.qp_type = IB_QPT_RC;
-
-       /* Allocate the SQ and RQ shared pointers */
-       qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                        &qp->sq_mq.shared_dma, GFP_KERNEL);
-       if (!qp->sq_mq.shared) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                        &qp->rq_mq.shared_dma, GFP_KERNEL);
-       if (!qp->rq_mq.shared) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       /* Allocate the verbs request */
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-
-       /* Initialize the work request */
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_QP_CREATE);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.sq_cq_handle = send_cq->adapter_handle;
-       wr.rq_cq_handle = recv_cq->adapter_handle;
-       wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
-       wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
-       wr.srq_handle = 0;
-       wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
-                              QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
-       wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-       wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
-       wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-       wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
-       wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
-       wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
-       wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
-       wr.pd_id = pd->pd_id;
-       wr.user_context = (unsigned long) qp;
-
-       vq_req_get(c2dev, vq_req);
-
-       /* Send the WR to the adapter */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail3;
-       }
-
-       /* Wait for the verb reply  */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail3;
-       }
-
-       /* Process the reply */
-       reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail3;
-       }
-
-       if ((err = c2_wr_get_result(reply)) != 0) {
-               goto bail4;
-       }
-
-       /* Fill in the kernel QP struct */
-       atomic_set(&qp->refcount, 1);
-       qp->adapter_handle = reply->qp_handle;
-       qp->state = IB_QPS_RESET;
-       qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
-       qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
-       qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
-       init_waitqueue_head(&qp->wait);
-
-       /* Initialize the SQ MQ */
-       q_size = be32_to_cpu(reply->sq_depth);
-       msg_size = be32_to_cpu(reply->sq_msg_size);
-       peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
-       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-       mmap = ioremap_nocache(peer_pa, mmap_size);
-       if (!mmap) {
-               err = -ENOMEM;
-               goto bail5;
-       }
-
-       c2_mq_req_init(&qp->sq_mq,
-                      be32_to_cpu(reply->sq_mq_index),
-                      q_size,
-                      msg_size,
-                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
-                      mmap,                            /* peer */
-                      C2_MQ_ADAPTER_TARGET);
-
-       /* Initialize the RQ mq */
-       q_size = be32_to_cpu(reply->rq_depth);
-       msg_size = be32_to_cpu(reply->rq_msg_size);
-       peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
-       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-       mmap = ioremap_nocache(peer_pa, mmap_size);
-       if (!mmap) {
-               err = -ENOMEM;
-               goto bail6;
-       }
-
-       c2_mq_req_init(&qp->rq_mq,
-                      be32_to_cpu(reply->rq_mq_index),
-                      q_size,
-                      msg_size,
-                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
-                      mmap,                            /* peer */
-                      C2_MQ_ADAPTER_TARGET);
-
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       return 0;
-
-bail6:
-       iounmap(qp->sq_mq.peer);
-bail5:
-       destroy_qp(c2dev, qp);
-bail4:
-       vq_repbuf_free(c2dev, reply);
-bail3:
-       vq_req_free(c2dev, vq_req);
-bail2:
-       c2_free_mqsp(qp->rq_mq.shared);
-bail1:
-       c2_free_mqsp(qp->sq_mq.shared);
-bail0:
-       c2_free_qpn(c2dev, qp->qpn);
-       return err;
-}
-
-static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-       if (send_cq == recv_cq)
-               spin_lock_irq(&send_cq->lock);
-       else if (send_cq > recv_cq) {
-               spin_lock_irq(&send_cq->lock);
-               spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
-       } else {
-               spin_lock_irq(&recv_cq->lock);
-               spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
-       }
-}
-
-static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-       if (send_cq == recv_cq)
-               spin_unlock_irq(&send_cq->lock);
-       else if (send_cq > recv_cq) {
-               spin_unlock(&recv_cq->lock);
-               spin_unlock_irq(&send_cq->lock);
-       } else {
-               spin_unlock(&send_cq->lock);
-               spin_unlock_irq(&recv_cq->lock);
-       }
-}
-
-void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       struct c2_cq *send_cq;
-       struct c2_cq *recv_cq;
-
-       send_cq = to_c2cq(qp->ibqp.send_cq);
-       recv_cq = to_c2cq(qp->ibqp.recv_cq);
-
-       /*
-        * Lock CQs here, so that CQ polling code can do QP lookup
-        * without taking a lock.
-        */
-       c2_lock_cqs(send_cq, recv_cq);
-       c2_free_qpn(c2dev, qp->qpn);
-       c2_unlock_cqs(send_cq, recv_cq);
-
-       /*
-        * Destroy qp in the rnic...
-        */
-       destroy_qp(c2dev, qp);
-
-       /*
-        * Mark any unreaped CQEs as null and void.
-        */
-       c2_cq_clean(c2dev, qp, send_cq->cqn);
-       if (send_cq != recv_cq)
-               c2_cq_clean(c2dev, qp, recv_cq->cqn);
-       /*
-        * Unmap the MQs and return the shared pointers
-        * to the message pool.
-        */
-       iounmap(qp->sq_mq.peer);
-       iounmap(qp->rq_mq.peer);
-       c2_free_mqsp(qp->sq_mq.shared);
-       c2_free_mqsp(qp->rq_mq.shared);
-
-       atomic_dec(&qp->refcount);
-       wait_event(qp->wait, !atomic_read(&qp->refcount));
-}
-
-/*
- * Function: move_sgl
- *
- * Description:
- * Move an SGL from the user's work request struct into a CCIL Work Request
- * message, swapping to WR byte order and ensure the total length doesn't
- * overflow.
- *
- * IN:
- * dst         - ptr to CCIL Work Request message SGL memory.
- * src         - ptr to the consumers SGL memory.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int
-move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
-        u8 * actual_count)
-{
-       u32 tot = 0;            /* running total */
-       u8 acount = 0;          /* running total non-0 len sge's */
-
-       while (count > 0) {
-               /*
-                * If the addition of this SGE causes the
-                * total SGL length to exceed 2^32-1, then
-                * fail-n-bail.
-                *
-                * If the current total plus the next element length
-                * wraps, then it will go negative and be less than the
-                * current total...
-                */
-               if ((tot + src->length) < tot) {
-                       return -EINVAL;
-               }
-               /*
-                * Bug: 1456 (as well as 1498 & 1643)
-                * Skip over any sge's supplied with len=0
-                */
-               if (src->length) {
-                       tot += src->length;
-                       dst->stag = cpu_to_be32(src->lkey);
-                       dst->to = cpu_to_be64(src->addr);
-                       dst->length = cpu_to_be32(src->length);
-                       dst++;
-                       acount++;
-               }
-               src++;
-               count--;
-       }
-
-       if (acount == 0) {
-               /*
-                * Bug: 1476 (as well as 1498, 1456 and 1643)
-                * Setup the SGL in the WR to make it easier for the RNIC.
-                * This way, the FW doesn't have to deal with special cases.
-                * Setting length=0 should be sufficient.
-                */
-               dst->stag = 0;
-               dst->to = 0;
-               dst->length = 0;
-       }
-
-       *p_len = tot;
-       *actual_count = acount;
-       return 0;
-}
-
-/*
- * Function: c2_activity (private function)
- *
- * Description:
- * Post an mq index to the host->adapter activity fifo.
- *
- * IN:
- * c2dev       - ptr to c2dev structure
- * mq_index    - mq index to post
- * shared      - value most recently written to shared
- *
- * OUT:
- *
- * Return:
- * none
- */
-static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
-{
-       /*
-        * First read the register to see if the FIFO is full, and if so,
-        * spin until it's not.  This isn't perfect -- there is no
-        * synchronization among the clients of the register, but in
-        * practice it prevents multiple CPU from hammering the bus
-        * with PCI RETRY. Note that when this does happen, the card
-        * cannot get on the bus and the card and system hang in a
-        * deadlock -- thus the need for this code. [TOT]
-        */
-       while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
-               udelay(10);
-
-       __raw_writel(C2_HINT_MAKE(mq_index, shared),
-                    c2dev->regs + PCI_BAR0_ADAPTER_HINT);
-}
-
-/*
- * Function: qp_wr_post
- *
- * Description:
- * This in-line function allocates a MQ msg, then moves the host-copy of
- * the completed WR into msg.  Then it posts the message.
- *
- * IN:
- * q           - ptr to user MQ.
- * wr          - ptr to host-copy of the WR.
- * qp          - ptr to user qp
- * size                - Number of bytes to post.  Assumed to be divisible by 4.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
-{
-       union c2wr *msg;
-
-       msg = c2_mq_alloc(q);
-       if (msg == NULL) {
-               return -EINVAL;
-       }
-#ifdef CCMSGMAGIC
-       ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-
-       /*
-        * Since all header fields in the WR are the same as the
-        * CQE, set the following so the adapter need not.
-        */
-       c2_wr_set_result(wr, CCERR_PENDING);
-
-       /*
-        * Copy the wr down to the adapter
-        */
-       memcpy((void *) msg, (void *) wr, size);
-
-       c2_mq_produce(q);
-       return 0;
-}
-
-
-int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-                struct ib_send_wr **bad_wr)
-{
-       struct c2_dev *c2dev = to_c2dev(ibqp->device);
-       struct c2_qp *qp = to_c2qp(ibqp);
-       union c2wr wr;
-       unsigned long lock_flags;
-       int err = 0;
-
-       u32 flags;
-       u32 tot_len;
-       u8 actual_sge_count;
-       u32 msg_size;
-
-       if (qp->state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       while (ib_wr) {
-
-               flags = 0;
-               wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-               if (ib_wr->send_flags & IB_SEND_SIGNALED) {
-                       flags |= SQ_SIGNALED;
-               }
-
-               switch (ib_wr->opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_INV:
-                       if (ib_wr->opcode == IB_WR_SEND) {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
-                               else
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
-                               wr.sqwr.send.remote_stag = 0;
-                       } else {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
-                               else
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
-                               wr.sqwr.send.remote_stag =
-                                       cpu_to_be32(ib_wr->ex.invalidate_rkey);
-                       }
-
-                       msg_size = sizeof(struct c2wr_send_req) +
-                               sizeof(struct c2_data_addr) * ib_wr->num_sge;
-                       if (ib_wr->num_sge > qp->send_sgl_depth) {
-                               err = -EINVAL;
-                               break;
-                       }
-                       if (ib_wr->send_flags & IB_SEND_FENCE) {
-                               flags |= SQ_READ_FENCE;
-                       }
-                       err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
-                                      ib_wr->sg_list,
-                                      ib_wr->num_sge,
-                                      &tot_len, &actual_sge_count);
-                       wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
-                       c2_wr_set_sge_count(&wr, actual_sge_count);
-                       break;
-               case IB_WR_RDMA_WRITE:
-                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
-                       msg_size = sizeof(struct c2wr_rdma_write_req) +
-                           (sizeof(struct c2_data_addr) * ib_wr->num_sge);
-                       if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
-                               err = -EINVAL;
-                               break;
-                       }
-                       if (ib_wr->send_flags & IB_SEND_FENCE) {
-                               flags |= SQ_READ_FENCE;
-                       }
-                       wr.sqwr.rdma_write.remote_stag =
-                           cpu_to_be32(rdma_wr(ib_wr)->rkey);
-                       wr.sqwr.rdma_write.remote_to =
-                           cpu_to_be64(rdma_wr(ib_wr)->remote_addr);
-                       err = move_sgl((struct c2_data_addr *)
-                                      & (wr.sqwr.rdma_write.data),
-                                      ib_wr->sg_list,
-                                      ib_wr->num_sge,
-                                      &tot_len, &actual_sge_count);
-                       wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
-                       c2_wr_set_sge_count(&wr, actual_sge_count);
-                       break;
-               case IB_WR_RDMA_READ:
-                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
-                       msg_size = sizeof(struct c2wr_rdma_read_req);
-
-                       /* IWarp only suppots 1 sge for RDMA reads */
-                       if (ib_wr->num_sge > 1) {
-                               err = -EINVAL;
-                               break;
-                       }
-
-                       /*
-                        * Move the local and remote stag/to/len into the WR.
-                        */
-                       wr.sqwr.rdma_read.local_stag =
-                           cpu_to_be32(ib_wr->sg_list->lkey);
-                       wr.sqwr.rdma_read.local_to =
-                           cpu_to_be64(ib_wr->sg_list->addr);
-                       wr.sqwr.rdma_read.remote_stag =
-                           cpu_to_be32(rdma_wr(ib_wr)->rkey);
-                       wr.sqwr.rdma_read.remote_to =
-                           cpu_to_be64(rdma_wr(ib_wr)->remote_addr);
-                       wr.sqwr.rdma_read.length =
-                           cpu_to_be32(ib_wr->sg_list->length);
-                       break;
-               default:
-                       /* error */
-                       msg_size = 0;
-                       err = -EINVAL;
-                       break;
-               }
-
-               /*
-                * If we had an error on the last wr build, then
-                * break out.  Possible errors include bogus WR
-                * type, and a bogus SGL length...
-                */
-               if (err) {
-                       break;
-               }
-
-               /*
-                * Store flags
-                */
-               c2_wr_set_flags(&wr, flags);
-
-               /*
-                * Post the puppy!
-                */
-               spin_lock_irqsave(&qp->lock, lock_flags);
-               err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
-               if (err) {
-                       spin_unlock_irqrestore(&qp->lock, lock_flags);
-                       break;
-               }
-
-               /*
-                * Enqueue mq index to activity FIFO.
-                */
-               c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
-               spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-               ib_wr = ib_wr->next;
-       }
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-                   struct ib_recv_wr **bad_wr)
-{
-       struct c2_dev *c2dev = to_c2dev(ibqp->device);
-       struct c2_qp *qp = to_c2qp(ibqp);
-       union c2wr wr;
-       unsigned long lock_flags;
-       int err = 0;
-
-       if (qp->state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       /*
-        * Try and post each work request
-        */
-       while (ib_wr) {
-               u32 tot_len;
-               u8 actual_sge_count;
-
-               if (ib_wr->num_sge > qp->recv_sgl_depth) {
-                       err = -EINVAL;
-                       break;
-               }
-
-               /*
-                * Create local host-copy of the WR
-                */
-               wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-               c2_wr_set_id(&wr, CCWR_RECV);
-               c2_wr_set_flags(&wr, 0);
-
-               /* sge_count is limited to eight bits. */
-               BUG_ON(ib_wr->num_sge >= 256);
-               err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
-                              ib_wr->sg_list,
-                              ib_wr->num_sge, &tot_len, &actual_sge_count);
-               c2_wr_set_sge_count(&wr, actual_sge_count);
-
-               /*
-                * If we had an error on the last wr build, then
-                * break out.  Possible errors include bogus WR
-                * type, and a bogus SGL length...
-                */
-               if (err) {
-                       break;
-               }
-
-               spin_lock_irqsave(&qp->lock, lock_flags);
-               err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
-               if (err) {
-                       spin_unlock_irqrestore(&qp->lock, lock_flags);
-                       break;
-               }
-
-               /*
-                * Enqueue mq index to activity FIFO
-                */
-               c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
-               spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-               ib_wr = ib_wr->next;
-       }
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-void c2_init_qp_table(struct c2_dev *c2dev)
-{
-       spin_lock_init(&c2dev->qp_table.lock);
-       idr_init(&c2dev->qp_table.idr);
-}
-
-void c2_cleanup_qp_table(struct c2_dev *c2dev)
-{
-       idr_destroy(&c2dev->qp_table.idr);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c
deleted file mode 100644 (file)
index 5e65c6d..0000000
+++ /dev/null
@@ -1,652 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/inet.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <linux/route.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_vq.h"
-
-/* Device capabilities */
-#define C2_MIN_PAGESIZE  1024
-
-#define C2_MAX_MRS       32768
-#define C2_MAX_QPS       16000
-#define C2_MAX_WQE_SZ    256
-#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
-#define C2_MAX_SGES      4
-#define C2_MAX_SGE_RD    1
-#define C2_MAX_CQS       32768
-#define C2_MAX_CQES      4096
-#define C2_MAX_PDS       16384
-
-/*
- * Send the adapter INIT message to the amso1100
- */
-static int c2_adapter_init(struct c2_dev *c2dev)
-{
-       struct c2wr_init_req wr;
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_INIT);
-       wr.hdr.context = 0;
-       wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
-       wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
-       wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
-       wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
-       wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
-       wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
-
-       /* Post the init message */
-       return vq_send_wr(c2dev, (union c2wr *) & wr);
-}
-
-/*
- * Send the adapter TERM message to the amso1100
- */
-static void c2_adapter_term(struct c2_dev *c2dev)
-{
-       struct c2wr_init_req wr;
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_TERM);
-       wr.hdr.context = 0;
-
-       /* Post the init message */
-       vq_send_wr(c2dev, (union c2wr *) & wr);
-       c2dev->init = 0;
-
-       return;
-}
-
-/*
- * Query the adapter
- */
-static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_query_req wr;
-       struct c2wr_rnic_query_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply)
-               err = -ENOMEM;
-       else
-               err = c2_errno(reply);
-       if (err)
-               goto bail2;
-
-       props->fw_ver =
-               ((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
-               ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
-               (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
-       memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
-       props->max_mr_size         = 0xFFFFFFFF;
-       props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
-       props->vendor_id           = be32_to_cpu(reply->vendor_id);
-       props->vendor_part_id      = be32_to_cpu(reply->part_number);
-       props->hw_ver              = be32_to_cpu(reply->hw_version);
-       props->max_qp              = be32_to_cpu(reply->max_qps);
-       props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
-       props->device_cap_flags    = c2dev->device_cap_flags;
-       props->max_sge             = C2_MAX_SGES;
-       props->max_sge_rd          = C2_MAX_SGE_RD;
-       props->max_cq              = be32_to_cpu(reply->max_cqs);
-       props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
-       props->max_mr              = be32_to_cpu(reply->max_mrs);
-       props->max_pd              = be32_to_cpu(reply->max_pds);
-       props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
-       props->max_ee_rd_atom      = 0;
-       props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
-       props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
-       props->max_ee_init_rd_atom = 0;
-       props->atomic_cap          = IB_ATOMIC_NONE;
-       props->max_ee              = 0;
-       props->max_rdd             = 0;
-       props->max_mw              = be32_to_cpu(reply->max_mws);
-       props->max_raw_ipv6_qp     = 0;
-       props->max_raw_ethy_qp     = 0;
-       props->max_mcast_grp       = 0;
-       props->max_mcast_qp_attach = 0;
-       props->max_total_mcast_qp_attach = 0;
-       props->max_ah              = 0;
-       props->max_fmr             = 0;
-       props->max_map_per_fmr     = 0;
-       props->max_srq             = 0;
-       props->max_srq_wr          = 0;
-       props->max_srq_sge         = 0;
-       props->max_pkeys           = 0;
-       props->local_ca_ack_delay  = 0;
-
- bail2:
-       vq_repbuf_free(c2dev, reply);
-
- bail1:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Add an IP address to the RNIC interface
- */
-int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_setconfig_req *wr;
-       struct c2wr_rnic_setconfig_rep *reply;
-       struct c2_netaddr netaddr;
-       int err, len;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       len = sizeof(struct c2_netaddr);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
-
-       netaddr.ip_addr = inaddr;
-       netaddr.netmask = inmask;
-       netaddr.mtu = 0;
-
-       memcpy(wr->data, &netaddr, len);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-bail1:
-       kfree(wr);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Delete an IP address from the RNIC interface
- */
-int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_setconfig_req *wr;
-       struct c2wr_rnic_setconfig_rep *reply;
-       struct c2_netaddr netaddr;
-       int err, len;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       len = sizeof(struct c2_netaddr);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
-
-       netaddr.ip_addr = inaddr;
-       netaddr.netmask = inmask;
-       netaddr.mtu = 0;
-
-       memcpy(wr->data, &netaddr, len);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-bail1:
-       kfree(wr);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Open a single RNIC instance to use with all
- * low level openib calls
- */
-static int c2_rnic_open(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *vq_req;
-       union c2wr wr;
-       struct c2wr_rnic_open_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               return -ENOMEM;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
-       wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
-       wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
-       wr.rnic_open.req.port_num = cpu_to_be16(0);
-       wr.rnic_open.req.user_context = (unsigned long) c2dev;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       if ((err = c2_errno(reply)) != 0) {
-               goto bail1;
-       }
-
-       c2dev->adapter_handle = reply->rnic_handle;
-
-bail1:
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Close the RNIC instance
- */
-static int c2_rnic_close(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *vq_req;
-       union c2wr wr;
-       struct c2wr_rnic_close_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               return -ENOMEM;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
-       wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
-       wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       if ((err = c2_errno(reply)) != 0) {
-               goto bail1;
-       }
-
-       c2dev->adapter_handle = 0;
-
-bail1:
-       vq_repbuf_free(c2dev, reply);
-bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Called by c2_probe to initialize the RNIC. This principally
- * involves initializing the various limits and resource pools that
- * comprise the RNIC instance.
- */
-int c2_rnic_init(struct c2_dev *c2dev)
-{
-       int err;
-       u32 qsize, msgsize;
-       void *q1_pages;
-       void *q2_pages;
-       void __iomem *mmio_regs;
-
-       /* Device capabilities */
-       c2dev->device_cap_flags =
-           (IB_DEVICE_RESIZE_MAX_WR |
-            IB_DEVICE_CURR_QP_STATE_MOD |
-            IB_DEVICE_SYS_IMAGE_GUID |
-            IB_DEVICE_LOCAL_DMA_LKEY |
-            IB_DEVICE_MEM_WINDOW);
-
-       /* Allocate the qptr_array */
-       c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
-       if (!c2dev->qptr_array) {
-               return -ENOMEM;
-       }
-
-       /* Initialize the qptr_array */
-       c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
-       c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
-       c2dev->qptr_array[2] = (void *) &c2dev->aeq;
-
-       /* Initialize data structures */
-       init_waitqueue_head(&c2dev->req_vq_wo);
-       spin_lock_init(&c2dev->vqlock);
-       spin_lock_init(&c2dev->lock);
-
-       /* Allocate MQ shared pointer pool for kernel clients. User
-        * mode client pools are hung off the user context
-        */
-       err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
-       if (err) {
-               goto bail0;
-       }
-
-       /* Allocate shared pointers for Q0, Q1, and Q2 from
-        * the shared pointer pool.
-        */
-
-       c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->hint_count_dma,
-                                            GFP_KERNEL);
-       c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->req_vq.shared_dma,
-                                            GFP_KERNEL);
-       c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->rep_vq.shared_dma,
-                                            GFP_KERNEL);
-       c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                         &c2dev->aeq.shared_dma, GFP_KERNEL);
-       if (!c2dev->hint_count || !c2dev->req_vq.shared ||
-           !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       mmio_regs = c2dev->kva;
-       /* Initialize the Verbs Request Queue */
-       c2_mq_req_init(&c2dev->req_vq, 0,
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
-                      C2_MQ_ADAPTER_TARGET);
-
-       /* Initialize the Verbs Reply Queue */
-       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
-       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
-       q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-                                     &c2dev->rep_vq.host_dma, GFP_KERNEL);
-       if (!q1_pages) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-       dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
-       pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
-                (unsigned long long) c2dev->rep_vq.host_dma);
-       c2_mq_rep_init(&c2dev->rep_vq,
-                  1,
-                  qsize,
-                  msgsize,
-                  q1_pages,
-                  mmio_regs +
-                  be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
-                  C2_MQ_HOST_TARGET);
-
-       /* Initialize the Asynchronus Event Queue */
-       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
-       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
-       q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-                                     &c2dev->aeq.host_dma, GFP_KERNEL);
-       if (!q2_pages) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-       dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
-       pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
-                (unsigned long long) c2dev->aeq.host_dma);
-       c2_mq_rep_init(&c2dev->aeq,
-                      2,
-                      qsize,
-                      msgsize,
-                      q2_pages,
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
-                      C2_MQ_HOST_TARGET);
-
-       /* Initialize the verbs request allocator */
-       err = vq_init(c2dev);
-       if (err)
-               goto bail3;
-
-       /* Enable interrupts on the adapter */
-       writel(0, c2dev->regs + C2_IDIS);
-
-       /* create the WR init message */
-       err = c2_adapter_init(c2dev);
-       if (err)
-               goto bail4;
-       c2dev->init++;
-
-       /* open an adapter instance */
-       err = c2_rnic_open(c2dev);
-       if (err)
-               goto bail4;
-
-       /* Initialize cached the adapter limits */
-       err = c2_rnic_query(c2dev, &c2dev->props);
-       if (err)
-               goto bail5;
-
-       /* Initialize the PD pool */
-       err = c2_init_pd_table(c2dev);
-       if (err)
-               goto bail5;
-
-       /* Initialize the QP pool */
-       c2_init_qp_table(c2dev);
-       return 0;
-
-bail5:
-       c2_rnic_close(c2dev);
-bail4:
-       vq_term(c2dev);
-bail3:
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
-                         q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
-bail2:
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-                         q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
-bail1:
-       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-bail0:
-       vfree(c2dev->qptr_array);
-
-       return err;
-}
-
-/*
- * Called by c2_remove to cleanup the RNIC resources.
- */
-void c2_rnic_term(struct c2_dev *c2dev)
-{
-
-       /* Close the open adapter instance */
-       c2_rnic_close(c2dev);
-
-       /* Send the TERM message to the adapter */
-       c2_adapter_term(c2dev);
-
-       /* Disable interrupts on the adapter */
-       writel(1, c2dev->regs + C2_IDIS);
-
-       /* Free the QP pool */
-       c2_cleanup_qp_table(c2dev);
-
-       /* Free the PD pool */
-       c2_cleanup_pd_table(c2dev);
-
-       /* Free the verbs request allocator */
-       vq_term(c2dev);
-
-       /* Free the asynchronus event queue */
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
-                         c2dev->aeq.msg_pool.host,
-                         dma_unmap_addr(&c2dev->aeq, mapping));
-
-       /* Free the verbs reply queue */
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-                         c2dev->rep_vq.msg_pool.host,
-                         dma_unmap_addr(&c2dev->rep_vq, mapping));
-
-       /* Free the MQ shared pointer pool */
-       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-
-       /* Free the qptr_array */
-       vfree(c2dev->qptr_array);
-
-       return;
-}
diff --git a/drivers/staging/rdma/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h
deleted file mode 100644 (file)
index 6ee4aa9..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef        _C2_STATUS_H_
-#define _C2_STATUS_H_
-
-/*
- * Verbs Status Codes
- */
-enum c2_status {
-       C2_OK = 0,              /* This must be zero */
-       CCERR_INSUFFICIENT_RESOURCES = 1,
-       CCERR_INVALID_MODIFIER = 2,
-       CCERR_INVALID_MODE = 3,
-       CCERR_IN_USE = 4,
-       CCERR_INVALID_RNIC = 5,
-       CCERR_INTERRUPTED_OPERATION = 6,
-       CCERR_INVALID_EH = 7,
-       CCERR_INVALID_CQ = 8,
-       CCERR_CQ_EMPTY = 9,
-       CCERR_NOT_IMPLEMENTED = 10,
-       CCERR_CQ_DEPTH_TOO_SMALL = 11,
-       CCERR_PD_IN_USE = 12,
-       CCERR_INVALID_PD = 13,
-       CCERR_INVALID_SRQ = 14,
-       CCERR_INVALID_ADDRESS = 15,
-       CCERR_INVALID_NETMASK = 16,
-       CCERR_INVALID_QP = 17,
-       CCERR_INVALID_QP_STATE = 18,
-       CCERR_TOO_MANY_WRS_POSTED = 19,
-       CCERR_INVALID_WR_TYPE = 20,
-       CCERR_INVALID_SGL_LENGTH = 21,
-       CCERR_INVALID_SQ_DEPTH = 22,
-       CCERR_INVALID_RQ_DEPTH = 23,
-       CCERR_INVALID_ORD = 24,
-       CCERR_INVALID_IRD = 25,
-       CCERR_QP_ATTR_CANNOT_CHANGE = 26,
-       CCERR_INVALID_STAG = 27,
-       CCERR_QP_IN_USE = 28,
-       CCERR_OUTSTANDING_WRS = 29,
-       CCERR_STAG_IN_USE = 30,
-       CCERR_INVALID_STAG_INDEX = 31,
-       CCERR_INVALID_SGL_FORMAT = 32,
-       CCERR_ADAPTER_TIMEOUT = 33,
-       CCERR_INVALID_CQ_DEPTH = 34,
-       CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
-       CCERR_INVALID_EP = 36,
-       CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
-       CCERR_FLUSHED = 38,
-       CCERR_INVALID_WQE = 39,
-       CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
-       CCERR_REMOTE_TERMINATION_ERROR = 41,
-       CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
-       CCERR_ACCESS_VIOLATION = 43,
-       CCERR_INVALID_PD_ID = 44,
-       CCERR_WRAP_ERROR = 45,
-       CCERR_INV_STAG_ACCESS_ERROR = 46,
-       CCERR_ZERO_RDMA_READ_RESOURCES = 47,
-       CCERR_QP_NOT_PRIVILEGED = 48,
-       CCERR_STAG_STATE_NOT_INVALID = 49,
-       CCERR_INVALID_PAGE_SIZE = 50,
-       CCERR_INVALID_BUFFER_SIZE = 51,
-       CCERR_INVALID_PBE = 52,
-       CCERR_INVALID_FBO = 53,
-       CCERR_INVALID_LENGTH = 54,
-       CCERR_INVALID_ACCESS_RIGHTS = 55,
-       CCERR_PBL_TOO_BIG = 56,
-       CCERR_INVALID_VA = 57,
-       CCERR_INVALID_REGION = 58,
-       CCERR_INVALID_WINDOW = 59,
-       CCERR_TOTAL_LENGTH_TOO_BIG = 60,
-       CCERR_INVALID_QP_ID = 61,
-       CCERR_ADDR_IN_USE = 62,
-       CCERR_ADDR_NOT_AVAIL = 63,
-       CCERR_NET_DOWN = 64,
-       CCERR_NET_UNREACHABLE = 65,
-       CCERR_CONN_ABORTED = 66,
-       CCERR_CONN_RESET = 67,
-       CCERR_NO_BUFS = 68,
-       CCERR_CONN_TIMEDOUT = 69,
-       CCERR_CONN_REFUSED = 70,
-       CCERR_HOST_UNREACHABLE = 71,
-       CCERR_INVALID_SEND_SGL_DEPTH = 72,
-       CCERR_INVALID_RECV_SGL_DEPTH = 73,
-       CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
-       CCERR_INSUFFICIENT_PRIVILEGES = 75,
-       CCERR_STACK_ERROR = 76,
-       CCERR_INVALID_VERSION = 77,
-       CCERR_INVALID_MTU = 78,
-       CCERR_INVALID_IMAGE = 79,
-       CCERR_PENDING = 98,     /* not an error; user internally by adapter */
-       CCERR_DEFER = 99,       /* not an error; used internally by adapter */
-       CCERR_FAILED_WRITE = 100,
-       CCERR_FAILED_ERASE = 101,
-       CCERR_FAILED_VERIFICATION = 102,
-       CCERR_NOT_FOUND = 103,
-
-};
-
-/*
- * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
- */
-enum c2_connect_status {
-       C2_CONN_STATUS_SUCCESS = C2_OK,
-       C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
-       C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
-       C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
-       C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
-       C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
-       C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
-       C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
-       C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
-       C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
-       C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
-};
-
-/*
- * Flash programming status codes.
- */
-enum c2_flash_status {
-       C2_FLASH_STATUS_SUCCESS = 0x0000,
-       C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
-       C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
-       C2_FLASH_STATUS_ECLBS = 0x0400,
-       C2_FLASH_STATUS_PSLBS = 0x0800,
-       C2_FLASH_STATUS_VPENS = 0x1000,
-};
-
-#endif                         /* _C2_STATUS_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h
deleted file mode 100644 (file)
index 7e9e7ad..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_USER_H
-#define C2_USER_H
-
-#include <linux/types.h>
-
-/*
- * Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * In particular do not use pointer types -- pass pointers in __u64
- * instead.
- */
-
-struct c2_alloc_ucontext_resp {
-       __u32 qp_tab_size;
-       __u32 uarc_size;
-};
-
-struct c2_alloc_pd_resp {
-       __u32 pdn;
-       __u32 reserved;
-};
-
-struct c2_create_cq {
-       __u32 lkey;
-       __u32 pdn;
-       __u64 arm_db_page;
-       __u64 set_db_page;
-       __u32 arm_db_index;
-       __u32 set_db_index;
-};
-
-struct c2_create_cq_resp {
-       __u32 cqn;
-       __u32 reserved;
-};
-
-struct c2_create_qp {
-       __u32 lkey;
-       __u32 reserved;
-       __u64 sq_db_page;
-       __u64 rq_db_page;
-       __u32 sq_db_index;
-       __u32 rq_db_index;
-};
-
-#endif                         /* C2_USER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c
deleted file mode 100644 (file)
index 2ec716f..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-
-#include "c2_vq.h"
-#include "c2_provider.h"
-
-/*
- * Verbs Request Objects:
- *
- * VQ Request Objects are allocated by the kernel verbs handlers.
- * They contain a wait object, a refcnt, an atomic bool indicating that the
- * adapter has replied, and a copy of the verb reply work request.
- * A pointer to the VQ Request Object is passed down in the context
- * field of the work request message, and reflected back by the adapter
- * in the verbs reply message.  The function handle_vq() in the interrupt
- * path will use this pointer to:
- *     1) append a copy of the verbs reply message
- *     2) mark that the reply is ready
- *     3) wake up the kernel verbs handler blocked awaiting the reply.
- *
- *
- * The kernel verbs handlers do a "get" to put a 2nd reference on the
- * VQ Request object.  If the kernel verbs handler exits before the adapter
- * can respond, this extra reference will keep the VQ Request object around
- * until the adapter's reply can be processed.  The reason we need this is
- * because a pointer to this object is stuffed into the context field of
- * the verbs work request message, and reflected back in the reply message.
- * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
- * kernel verb handler that is blocked awaiting the verb reply.
- * So handle_vq() will do a "put" on the object when it's done accessing it.
- * NOTE:  If we guarantee that the kernel verb handler will never bail before
- *        getting the reply, then we don't need these refcnts.
- *
- *
- * VQ Request objects are freed by the kernel verbs handlers only
- * after the verb has been processed, or when the adapter fails and
- * does not reply.
- *
- *
- * Verbs Reply Buffers:
- *
- * VQ Reply bufs are local host memory copies of a
- * outstanding Verb Request reply
- * message.  The are always allocated by the kernel verbs handlers, and _may_ be
- * freed by either the kernel verbs handler -or- the interrupt handler.  The
- * kernel verbs handler _must_ free the repbuf, then free the vq request object
- * in that order.
- */
-
-int vq_init(struct c2_dev *c2dev)
-{
-       sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
-               (char) ('0' + c2dev->devnum));
-       c2dev->host_msg_cache =
-           kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
-                             SLAB_HWCACHE_ALIGN, NULL);
-       if (c2dev->host_msg_cache == NULL) {
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-void vq_term(struct c2_dev *c2dev)
-{
-       kmem_cache_destroy(c2dev->host_msg_cache);
-}
-
-/* vq_req_alloc - allocate a VQ Request Object and initialize it.
- * The refcnt is set to 1.
- */
-struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *r;
-
-       r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
-       if (r) {
-               init_waitqueue_head(&r->wait_object);
-               r->reply_msg = 0;
-               r->event = 0;
-               r->cm_id = NULL;
-               r->qp = NULL;
-               atomic_set(&r->refcnt, 1);
-               atomic_set(&r->reply_ready, 0);
-       }
-       return r;
-}
-
-
-/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
- * has already free the VQ Reply Buffer if it existed.
- */
-void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       r->reply_msg = 0;
-       if (atomic_dec_and_test(&r->refcnt)) {
-               kfree(r);
-       }
-}
-
-/* vq_req_get - reference a VQ Request Object.  Done
- * only in the kernel verbs handlers.
- */
-void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       atomic_inc(&r->refcnt);
-}
-
-
-/* vq_req_put - dereference and potentially free a VQ Request Object.
- *
- * This is only called by handle_vq() on the
- * interrupt when it is done processing
- * a verb reply message.  If the associated
- * kernel verbs handler has already bailed,
- * then this put will actually free the VQ
- * Request object _and_ the VQ Reply Buffer
- * if it exists.
- */
-void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       if (atomic_dec_and_test(&r->refcnt)) {
-               if (r->reply_msg != 0)
-                       vq_repbuf_free(c2dev,
-                                      (void *) (unsigned long) r->reply_msg);
-               kfree(r);
-       }
-}
-
-
-/*
- * vq_repbuf_alloc - allocate a VQ Reply Buffer.
- */
-void *vq_repbuf_alloc(struct c2_dev *c2dev)
-{
-       return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
-}
-
-/*
- * vq_send_wr - post a verbs request message to the Verbs Request Queue.
- * If a message is not available in the MQ, then block until one is available.
- * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
- * When the adapter drains the Verbs Request Queue,
- * it inserts MQ index 0 in to the
- * adapter->host activity fifo and interrupts the host.
- */
-int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
-{
-       void *msg;
-       wait_queue_t __wait;
-
-       /*
-        * grab adapter vq lock
-        */
-       spin_lock(&c2dev->vqlock);
-
-       /*
-        * allocate msg
-        */
-       msg = c2_mq_alloc(&c2dev->req_vq);
-
-       /*
-        * If we cannot get a msg, then we'll wait
-        * When a messages are available, the int handler will wake_up()
-        * any waiters.
-        */
-       while (msg == NULL) {
-               pr_debug("%s:%d no available msg in VQ, waiting...\n",
-                      __func__, __LINE__);
-               init_waitqueue_entry(&__wait, current);
-               add_wait_queue(&c2dev->req_vq_wo, &__wait);
-               spin_unlock(&c2dev->vqlock);
-               for (;;) {
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       if (!c2_mq_full(&c2dev->req_vq)) {
-                               break;
-                       }
-                       if (!signal_pending(current)) {
-                               schedule_timeout(1 * HZ);       /* 1 second... */
-                               continue;
-                       }
-                       set_current_state(TASK_RUNNING);
-                       remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-                       return -EINTR;
-               }
-               set_current_state(TASK_RUNNING);
-               remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-               spin_lock(&c2dev->vqlock);
-               msg = c2_mq_alloc(&c2dev->req_vq);
-       }
-
-       /*
-        * copy wr into adapter msg
-        */
-       memcpy(msg, wr, c2dev->req_vq.msg_size);
-
-       /*
-        * post msg
-        */
-       c2_mq_produce(&c2dev->req_vq);
-
-       /*
-        * release adapter vq lock
-        */
-       spin_unlock(&c2dev->vqlock);
-       return 0;
-}
-
-
-/*
- * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
- */
-int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
-{
-       if (!wait_event_timeout(req->wait_object,
-                               atomic_read(&req->reply_ready),
-                               60*HZ))
-               return -ETIMEDOUT;
-
-       return 0;
-}
-
-/*
- * vq_repbuf_free - Free a Verbs Reply Buffer.
- */
-void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
-{
-       kmem_cache_free(c2dev->host_msg_cache, reply);
-}
diff --git a/drivers/staging/rdma/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h
deleted file mode 100644 (file)
index c1f6cef..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_VQ_H_
-#define _C2_VQ_H_
-#include <linux/sched.h>
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_provider.h"
-
-struct c2_vq_req {
-       u64 reply_msg;          /* ptr to reply msg */
-       wait_queue_head_t wait_object;  /* wait object for vq reqs */
-       atomic_t reply_ready;   /* set when reply is ready */
-       atomic_t refcnt;        /* used to cancel WRs... */
-       int event;
-       struct iw_cm_id *cm_id;
-       struct c2_qp *qp;
-};
-
-int vq_init(struct c2_dev *c2dev);
-void vq_term(struct c2_dev *c2dev);
-
-struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
-void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
-void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
-void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
-int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
-
-void *vq_repbuf_alloc(struct c2_dev *c2dev);
-void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
-
-int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
-#endif                         /* _C2_VQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h
deleted file mode 100644 (file)
index 8d4b4ca..0000000
+++ /dev/null
@@ -1,1520 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_WR_H_
-#define _C2_WR_H_
-
-#ifdef CCDEBUG
-#define CCWR_MAGIC             0xb07700b0
-#endif
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-/* Maximum allowed size in bytes of private_data exchange
- * on connect.
- */
-#define C2_MAX_PRIVATE_DATA_SIZE 200
-
-/*
- * These types are shared among the adapter, host, and CCIL consumer.
- */
-enum c2_cq_notification_type {
-       C2_CQ_NOTIFICATION_TYPE_NONE = 1,
-       C2_CQ_NOTIFICATION_TYPE_NEXT,
-       C2_CQ_NOTIFICATION_TYPE_NEXT_SE
-};
-
-enum c2_setconfig_cmd {
-       C2_CFG_ADD_ADDR = 1,
-       C2_CFG_DEL_ADDR = 2,
-       C2_CFG_ADD_ROUTE = 3,
-       C2_CFG_DEL_ROUTE = 4
-};
-
-enum c2_getconfig_cmd {
-       C2_GETCONFIG_ROUTES = 1,
-       C2_GETCONFIG_ADDRS
-};
-
-/*
- *  CCIL Work Request Identifiers
- */
-enum c2wr_ids {
-       CCWR_RNIC_OPEN = 1,
-       CCWR_RNIC_QUERY,
-       CCWR_RNIC_SETCONFIG,
-       CCWR_RNIC_GETCONFIG,
-       CCWR_RNIC_CLOSE,
-       CCWR_CQ_CREATE,
-       CCWR_CQ_QUERY,
-       CCWR_CQ_MODIFY,
-       CCWR_CQ_DESTROY,
-       CCWR_QP_CONNECT,
-       CCWR_PD_ALLOC,
-       CCWR_PD_DEALLOC,
-       CCWR_SRQ_CREATE,
-       CCWR_SRQ_QUERY,
-       CCWR_SRQ_MODIFY,
-       CCWR_SRQ_DESTROY,
-       CCWR_QP_CREATE,
-       CCWR_QP_QUERY,
-       CCWR_QP_MODIFY,
-       CCWR_QP_DESTROY,
-       CCWR_NSMR_STAG_ALLOC,
-       CCWR_NSMR_REGISTER,
-       CCWR_NSMR_PBL,
-       CCWR_STAG_DEALLOC,
-       CCWR_NSMR_REREGISTER,
-       CCWR_SMR_REGISTER,
-       CCWR_MR_QUERY,
-       CCWR_MW_ALLOC,
-       CCWR_MW_QUERY,
-       CCWR_EP_CREATE,
-       CCWR_EP_GETOPT,
-       CCWR_EP_SETOPT,
-       CCWR_EP_DESTROY,
-       CCWR_EP_BIND,
-       CCWR_EP_CONNECT,
-       CCWR_EP_LISTEN,
-       CCWR_EP_SHUTDOWN,
-       CCWR_EP_LISTEN_CREATE,
-       CCWR_EP_LISTEN_DESTROY,
-       CCWR_EP_QUERY,
-       CCWR_CR_ACCEPT,
-       CCWR_CR_REJECT,
-       CCWR_CONSOLE,
-       CCWR_TERM,
-       CCWR_FLASH_INIT,
-       CCWR_FLASH,
-       CCWR_BUF_ALLOC,
-       CCWR_BUF_FREE,
-       CCWR_FLASH_WRITE,
-       CCWR_INIT,              /* WARNING: Don't move this ever again! */
-
-
-
-       /* Add new IDs here */
-
-
-
-       /*
-        * WARNING: CCWR_LAST must always be the last verbs id defined!
-        *          All the preceding IDs are fixed, and must not change.
-        *          You can add new IDs, but must not remove or reorder
-        *          any IDs. If you do, YOU will ruin any hope of
-        *          compatibility between versions.
-        */
-       CCWR_LAST,
-
-       /*
-        * Start over at 1 so that arrays indexed by user wr id's
-        * begin at 1.  This is OK since the verbs and user wr id's
-        * are always used on disjoint sets of queues.
-        */
-       /*
-        * The order of the CCWR_SEND_XX verbs must
-        * match the order of the RDMA_OPs
-        */
-       CCWR_SEND = 1,
-       CCWR_SEND_INV,
-       CCWR_SEND_SE,
-       CCWR_SEND_SE_INV,
-       CCWR_RDMA_WRITE,
-       CCWR_RDMA_READ,
-       CCWR_RDMA_READ_INV,
-       CCWR_MW_BIND,
-       CCWR_NSMR_FASTREG,
-       CCWR_STAG_INVALIDATE,
-       CCWR_RECV,
-       CCWR_NOP,
-       CCWR_UNIMPL,
-/* WARNING: This must always be the last user wr id defined! */
-};
-#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
-
-/*
- * SQ/RQ Work Request Types
- */
-enum c2_wr_type {
-       C2_WR_TYPE_SEND = CCWR_SEND,
-       C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
-       C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
-       C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
-       C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
-       C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
-       C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
-       C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
-       C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
-       C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
-       C2_WR_TYPE_RECV = CCWR_RECV,
-       C2_WR_TYPE_NOP = CCWR_NOP,
-};
-
-struct c2_netaddr {
-       __be32 ip_addr;
-       __be32 netmask;
-       u32 mtu;
-};
-
-struct c2_route {
-       u32 ip_addr;            /* 0 indicates the default route */
-       u32 netmask;            /* netmask associated with dst */
-       u32 flags;
-       union {
-               u32 ipaddr;     /* address of the nexthop interface */
-               u8 enaddr[6];
-       } nexthop;
-};
-
-/*
- * A Scatter Gather Entry.
- */
-struct c2_data_addr {
-       __be32 stag;
-       __be32 length;
-       __be64 to;
-};
-
-/*
- * MR and MW flags used by the consumer, RI, and RNIC.
- */
-enum c2_mm_flags {
-       MEM_REMOTE = 0x0001,    /* allow mw binds with remote access. */
-       MEM_VA_BASED = 0x0002,  /* Not Zero-based */
-       MEM_PBL_COMPLETE = 0x0004,      /* PBL array is complete in this msg */
-       MEM_LOCAL_READ = 0x0008,        /* allow local reads */
-       MEM_LOCAL_WRITE = 0x0010,       /* allow local writes */
-       MEM_REMOTE_READ = 0x0020,       /* allow remote reads */
-       MEM_REMOTE_WRITE = 0x0040,      /* allow remote writes */
-       MEM_WINDOW_BIND = 0x0080,       /* binds allowed */
-       MEM_SHARED = 0x0100,    /* set if MR is shared */
-       MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */
-};
-
-/*
- * CCIL API ACF flags defined in terms of the low level mem flags.
- * This minimizes translation needed in the user API
- */
-enum c2_acf {
-       C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
-       C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
-       C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
-       C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
-       C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
-};
-
-/*
- * Image types of objects written to flash
- */
-#define C2_FLASH_IMG_BITFILE 1
-#define C2_FLASH_IMG_OPTION_ROM 2
-#define C2_FLASH_IMG_VPD 3
-
-/*
- *  to fix bug 1815 we define the max size allowable of the
- *  terminate message (per the IETF spec).Refer to the IETF
- *  protocol specification, section 12.1.6, page 64)
- *  The message is prefixed by 20 types of DDP info.
- *
- *  Then the message has 6 bytes for the terminate control
- *  and DDP segment length info plus a DDP header (either
- *  14 or 18 byts) plus 28 bytes for the RDMA header.
- *  Thus the max size in:
- *  20 + (6 + 18 + 28) = 72
- */
-#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
-
-/*
- * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
- */
-#define WR_BUILD_STR_LEN 64
-
-/*
- * WARNING:  All of these structs need to align any 64bit types on
- * 64 bit boundaries!  64bit types include u64 and u64.
- */
-
-/*
- * Clustercore Work Request Header.  Be sensitive to field layout
- * and alignment.
- */
-struct c2wr_hdr {
-       /* wqe_count is part of the cqe.  It is put here so the
-        * adapter can write to it while the wr is pending without
-        * clobbering part of the wr.  This word need not be dma'd
-        * from the host to adapter by libccil, but we copy it anyway
-        * to make the memcpy to the adapter better aligned.
-        */
-       __be32 wqe_count;
-
-       /* Put these fields next so that later 32- and 64-bit
-        * quantities are naturally aligned.
-        */
-       u8 id;
-       u8 result;              /* adapter -> host */
-       u8 sge_count;           /* host -> adapter */
-       u8 flags;               /* host -> adapter */
-
-       u64 context;
-#ifdef CCMSGMAGIC
-       u32 magic;
-       u32 pad;
-#endif
-} __attribute__((packed));
-
-/*
- *------------------------ RNIC ------------------------
- */
-
-/*
- * WR_RNIC_OPEN
- */
-
-/*
- * Flags for the RNIC WRs
- */
-enum c2_rnic_flags {
-       RNIC_IRD_STATIC = 0x0001,
-       RNIC_ORD_STATIC = 0x0002,
-       RNIC_QP_STATIC = 0x0004,
-       RNIC_SRQ_SUPPORTED = 0x0008,
-       RNIC_PBL_BLOCK_MODE = 0x0010,
-       RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
-       RNIC_CQ_OVF_DETECTED = 0x0040,
-       RNIC_PRIV_MODE = 0x0080
-};
-
-struct c2wr_rnic_open_req {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       __be16 flags;           /* See enum c2_rnic_flags */
-       __be16 port_num;
-} __attribute__((packed));
-
-struct c2wr_rnic_open_rep {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-union c2wr_rnic_open {
-       struct c2wr_rnic_open_req req;
-       struct c2wr_rnic_open_rep rep;
-} __attribute__((packed));
-
-struct c2wr_rnic_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_QUERY
- */
-struct c2wr_rnic_query_rep {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       __be32 vendor_id;
-       __be32 part_number;
-       __be32 hw_version;
-       __be32 fw_ver_major;
-       __be32 fw_ver_minor;
-       __be32 fw_ver_patch;
-       char fw_ver_build_str[WR_BUILD_STR_LEN];
-       __be32 max_qps;
-       __be32 max_qp_depth;
-       u32 max_srq_depth;
-       u32 max_send_sgl_depth;
-       u32 max_rdma_sgl_depth;
-       __be32 max_cqs;
-       __be32 max_cq_depth;
-       u32 max_cq_event_handlers;
-       __be32 max_mrs;
-       u32 max_pbl_depth;
-       __be32 max_pds;
-       __be32 max_global_ird;
-       u32 max_global_ord;
-       __be32 max_qp_ird;
-       __be32 max_qp_ord;
-       u32 flags;
-       __be32 max_mws;
-       u32 pbe_range_low;
-       u32 pbe_range_high;
-       u32 max_srqs;
-       u32 page_size;
-} __attribute__((packed));
-
-union c2wr_rnic_query {
-       struct c2wr_rnic_query_req req;
-       struct c2wr_rnic_query_rep rep;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_GETCONFIG
- */
-
-struct c2wr_rnic_getconfig_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 option;             /* see c2_getconfig_cmd_t */
-       u64 reply_buf;
-       u32 reply_buf_len;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_getconfig_rep {
-       struct c2wr_hdr hdr;
-       u32 option;             /* see c2_getconfig_cmd_t */
-       u32 count_len;          /* length of the number of addresses configured */
-} __attribute__((packed)) ;
-
-union c2wr_rnic_getconfig {
-       struct c2wr_rnic_getconfig_req req;
-       struct c2wr_rnic_getconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_SETCONFIG
- */
-struct c2wr_rnic_setconfig_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 option;          /* See c2_setconfig_cmd_t */
-       /* variable data and pad. See c2_netaddr and c2_route */
-       u8 data[0];
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_setconfig_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_setconfig {
-       struct c2wr_rnic_setconfig_req req;
-       struct c2wr_rnic_setconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_CLOSE
- */
-struct c2wr_rnic_close_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_close_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_close {
-       struct c2wr_rnic_close_req req;
-       struct c2wr_rnic_close_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ CQ ------------------------
- */
-struct c2wr_cq_create_req {
-       struct c2wr_hdr hdr;
-       __be64 shared_ht;
-       u64 user_context;
-       __be64 msg_pool;
-       u32 rnic_handle;
-       __be32 msg_size;
-       __be32 depth;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_create_rep {
-       struct c2wr_hdr hdr;
-       __be32 mq_index;
-       __be32 adapter_shared;
-       u32 cq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_cq_create {
-       struct c2wr_cq_create_req req;
-       struct c2wr_cq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 cq_handle;
-       u32 new_depth;
-       u64 new_msg_pool;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_modify {
-       struct c2wr_cq_modify_req req;
-       struct c2wr_cq_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 cq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_destroy {
-       struct c2wr_cq_destroy_req req;
-       struct c2wr_cq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ PD ------------------------
- */
-struct c2wr_pd_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_alloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_alloc {
-       struct c2wr_pd_alloc_req req;
-       struct c2wr_pd_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_dealloc {
-       struct c2wr_pd_dealloc_req req;
-       struct c2wr_pd_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ SRQ ------------------------
- */
-struct c2wr_srq_create_req {
-       struct c2wr_hdr hdr;
-       u64 shared_ht;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 srq_depth;
-       u32 srq_limit;
-       u32 sgl_depth;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_create_rep {
-       struct c2wr_hdr hdr;
-       u32 srq_depth;
-       u32 sgl_depth;
-       u32 msg_size;
-       u32 mq_index;
-       u32 mq_start;
-       u32 srq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_srq_create {
-       struct c2wr_srq_create_req req;
-       struct c2wr_srq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 srq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_srq_destroy {
-       struct c2wr_srq_destroy_req req;
-       struct c2wr_srq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ QP ------------------------
- */
-enum c2wr_qp_flags {
-       QP_RDMA_READ = 0x00000001,      /* RDMA read enabled? */
-       QP_RDMA_WRITE = 0x00000002,     /* RDMA write enabled? */
-       QP_MW_BIND = 0x00000004,        /* MWs enabled */
-       QP_ZERO_STAG = 0x00000008,      /* enabled? */
-       QP_REMOTE_TERMINATION = 0x00000010,     /* remote end terminated */
-       QP_RDMA_READ_RESPONSE = 0x00000020      /* Remote RDMA read  */
-           /* enabled? */
-};
-
-struct c2wr_qp_create_req {
-       struct c2wr_hdr hdr;
-       __be64 shared_sq_ht;
-       __be64 shared_rq_ht;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 sq_cq_handle;
-       u32 rq_cq_handle;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 srq_handle;
-       u32 srq_limit;
-       __be32 flags;           /* see enum c2wr_qp_flags */
-       __be32 send_sgl_depth;
-       __be32 recv_sgl_depth;
-       __be32 rdma_write_sgl_depth;
-       __be32 ord;
-       __be32 ird;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_create_rep {
-       struct c2wr_hdr hdr;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 send_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u32 ord;
-       u32 ird;
-       __be32 sq_msg_size;
-       __be32 sq_mq_index;
-       __be32 sq_mq_start;
-       __be32 rq_msg_size;
-       __be32 rq_mq_index;
-       __be32 rq_mq_start;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-union c2wr_qp_create {
-       struct c2wr_qp_create_req req;
-       struct c2wr_qp_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_rep {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 sq_depth;
-       u32 rq_depth;
-       u32 send_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 ord;
-       u32 ird;
-       u16 qp_state;
-       u16 flags;              /* see c2wr_qp_flags_t */
-       u32 qp_id;
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-       u32 terminate_msg_length;       /* 0 if not present */
-       u8 data[0];
-       /* Terminate Message in-line here. */
-} __attribute__((packed)) ;
-
-union c2wr_qp_query {
-       struct c2wr_qp_query_req req;
-       struct c2wr_qp_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_req {
-       struct c2wr_hdr hdr;
-       u64 stream_msg;
-       u32 stream_msg_length;
-       u32 rnic_handle;
-       u32 qp_handle;
-       __be32 next_qp_state;
-       __be32 ord;
-       __be32 ird;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 llp_ep_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_rep {
-       struct c2wr_hdr hdr;
-       u32 ord;
-       u32 ird;
-       u32 sq_depth;
-       u32 rq_depth;
-       u32 sq_msg_size;
-       u32 sq_mq_index;
-       u32 sq_mq_start;
-       u32 rq_msg_size;
-       u32 rq_mq_index;
-       u32 rq_mq_start;
-} __attribute__((packed)) ;
-
-union c2wr_qp_modify {
-       struct c2wr_qp_modify_req req;
-       struct c2wr_qp_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_qp_destroy {
-       struct c2wr_qp_destroy_req req;
-       struct c2wr_qp_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
- * only be posted when a QP is in IDLE state.  After the connect request is
- * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
- * No synchronous reply from adapter to this WR.  The results of
- * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
- * See c2wr_ae_active_connect_results_t
- */
-struct c2wr_qp_connect_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-       __be32 remote_addr;
-       __be16 remote_port;
-       u16 pad;
-       __be32 private_data_length;
-       u8 private_data[0];     /* Private data in-line. */
-} __attribute__((packed)) ;
-
-struct c2wr_qp_connect {
-       struct c2wr_qp_connect_req req;
-       /* no synchronous reply.         */
-} __attribute__((packed)) ;
-
-
-/*
- *------------------------ MM ------------------------
- */
-
-struct c2wr_nsmr_stag_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pbl_depth;
-       u32 pd_id;
-       u32 flags;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_stag_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_stag_alloc {
-       struct c2wr_nsmr_stag_alloc_req req;
-       struct c2wr_nsmr_stag_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_req {
-       struct c2wr_hdr hdr;
-       __be64 va;
-       u32 rnic_handle;
-       __be16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 pd_id;
-       __be32 pbl_depth;
-       __be32 pbe_size;
-       __be32 fbo;
-       __be32 length;
-       __be32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       __be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       __be32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_register {
-       struct c2wr_nsmr_register_req req;
-       struct c2wr_nsmr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 flags;
-       __be32 stag_index;
-       __be32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       __be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_pbl {
-       struct c2wr_nsmr_pbl_req req;
-       struct c2wr_nsmr_pbl_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_rep {
-       struct c2wr_hdr hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 pd_id;
-       u32 flags;
-       u32 pbl_depth;
-} __attribute__((packed)) ;
-
-union c2wr_mr_query {
-       struct c2wr_mr_query_req req;
-       struct c2wr_mr_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_rep {
-       struct c2wr_hdr hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 pd_id;
-       u32 flags;
-} __attribute__((packed)) ;
-
-union c2wr_mw_query {
-       struct c2wr_mw_query_req req;
-       struct c2wr_mw_query_rep rep;
-} __attribute__((packed)) ;
-
-
-struct c2wr_stag_dealloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_stag_dealloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_stag_dealloc {
-       struct c2wr_stag_dealloc_req req;
-       struct c2wr_stag_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_req {
-       struct c2wr_hdr hdr;
-       u64 va;
-       u32 rnic_handle;
-       u16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 stag_index;
-       u32 pd_id;
-       u32 pbl_depth;
-       u32 pbe_size;
-       u32 fbo;
-       u32 length;
-       u32 addrs_length;
-       u32 pad1;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       u64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_reregister {
-       struct c2wr_nsmr_reregister_req req;
-       struct c2wr_nsmr_reregister_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_req {
-       struct c2wr_hdr hdr;
-       u64 va;
-       u32 rnic_handle;
-       u16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 stag_index;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_rep {
-       struct c2wr_hdr hdr;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_smr_register {
-       struct c2wr_smr_register_req req;
-       struct c2wr_smr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_mw_alloc {
-       struct c2wr_mw_alloc_req req;
-       struct c2wr_mw_alloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ WRs -----------------------
- */
-
-struct c2wr_user_hdr {
-       struct c2wr_hdr hdr;            /* Has status and WR Type */
-} __attribute__((packed)) ;
-
-enum c2_qp_state {
-       C2_QP_STATE_IDLE = 0x01,
-       C2_QP_STATE_CONNECTING = 0x02,
-       C2_QP_STATE_RTS = 0x04,
-       C2_QP_STATE_CLOSING = 0x08,
-       C2_QP_STATE_TERMINATE = 0x10,
-       C2_QP_STATE_ERROR = 0x20,
-};
-
-/* Completion queue entry. */
-struct c2wr_ce {
-       struct c2wr_hdr hdr;            /* Has status and WR Type */
-       u64 qp_user_context;    /* c2_user_qp_t * */
-       u32 qp_state;           /* Current QP State */
-       u32 handle;             /* QPID or EP Handle */
-       __be32 bytes_rcvd;              /* valid for RECV WCs */
-       u32 stag;
-} __attribute__((packed)) ;
-
-
-/*
- * Flags used for all post-sq WRs.  These must fit in the flags
- * field of the struct c2wr_hdr (eight bits).
- */
-enum {
-       SQ_SIGNALED = 0x01,
-       SQ_READ_FENCE = 0x02,
-       SQ_FENCE = 0x04,
-};
-
-/*
- * Common fields for all post-sq WRs.  Namely the standard header and a
- * secondary header with fields common to all post-sq WRs.
- */
-struct c2_sq_hdr {
-       struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * Same as above but for post-rq WRs.
- */
-struct c2_rq_hdr {
-       struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * use the same struct for all sends.
- */
-struct c2wr_send_req {
-       struct c2_sq_hdr sq_hdr;
-       __be32 sge_len;
-       __be32 remote_stag;
-       u8 data[0];             /* SGE array */
-} __attribute__((packed));
-
-union c2wr_send {
-       struct c2wr_send_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_write_req {
-       struct c2_sq_hdr sq_hdr;
-       __be64 remote_to;
-       __be32 remote_stag;
-       __be32 sge_len;
-       u8 data[0];             /* SGE array */
-} __attribute__((packed));
-
-union c2wr_rdma_write {
-       struct c2wr_rdma_write_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_read_req {
-       struct c2_sq_hdr sq_hdr;
-       __be64 local_to;
-       __be64 remote_to;
-       __be32 local_stag;
-       __be32 remote_stag;
-       __be32 length;
-} __attribute__((packed));
-
-union c2wr_rdma_read {
-       struct c2wr_rdma_read_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_mw_bind_req {
-       struct c2_sq_hdr sq_hdr;
-       u64 va;
-       u8 stag_key;
-       u8 pad[3];
-       u32 mw_stag_index;
-       u32 mr_stag_index;
-       u32 length;
-       u32 flags;
-} __attribute__((packed));
-
-union c2wr_mw_bind {
-       struct c2wr_mw_bind_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_nsmr_fastreg_req {
-       struct c2_sq_hdr sq_hdr;
-       u64 va;
-       u8 stag_key;
-       u8 pad[3];
-       u32 stag_index;
-       u32 pbe_size;
-       u32 fbo;
-       u32 length;
-       u32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       u64 paddrs[0];
-} __attribute__((packed));
-
-union c2wr_nsmr_fastreg {
-       struct c2wr_nsmr_fastreg_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_stag_invalidate_req {
-       struct c2_sq_hdr sq_hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 stag_index;
-} __attribute__((packed));
-
-union c2wr_stag_invalidate {
-       struct c2wr_stag_invalidate_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-union c2wr_sqwr {
-       struct c2_sq_hdr sq_hdr;
-       struct c2wr_send_req send;
-       struct c2wr_send_req send_se;
-       struct c2wr_send_req send_inv;
-       struct c2wr_send_req send_se_inv;
-       struct c2wr_rdma_write_req rdma_write;
-       struct c2wr_rdma_read_req rdma_read;
-       struct c2wr_mw_bind_req mw_bind;
-       struct c2wr_nsmr_fastreg_req nsmr_fastreg;
-       struct c2wr_stag_invalidate_req stag_inv;
-} __attribute__((packed));
-
-
-/*
- * RQ WRs
- */
-struct c2wr_rqwr {
-       struct c2_rq_hdr rq_hdr;
-       u8 data[0];             /* array of SGEs */
-} __attribute__((packed));
-
-union c2wr_recv {
-       struct c2wr_rqwr req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-/*
- * All AEs start with this header.  Most AEs only need to convey the
- * information in the header.  Some, like LLP connection events, need
- * more info.  The union typdef c2wr_ae_t has all the possible AEs.
- *
- * hdr.context is the user_context from the rnic_open WR.  NULL If this
- * is not affiliated with an rnic
- *
- * hdr.id is the AE identifier (eg;  CCAE_REMOTE_SHUTDOWN,
- * CCAE_LLP_CLOSE_COMPLETE)
- *
- * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
- *
- * user_context is the context passed down when the host created the resource.
- */
-struct c2wr_ae_hdr {
-       struct c2wr_hdr hdr;
-       u64 user_context;       /* user context for this res. */
-       __be32 resource_type;   /* see enum c2_resource_indicator */
-       __be32 resource;        /* handle for resource */
-       __be32 qp_state;        /* current QP State */
-} __attribute__((packed));
-
-/*
- * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
- * the adapter moves the QP into RTS state
- */
-struct c2wr_ae_active_connect_results {
-       struct c2wr_ae_hdr ae_hdr;
-       __be32 laddr;
-       __be32 raddr;
-       __be16 lport;
-       __be16 rport;
-       __be32 private_data_length;
-       u8 private_data[0];     /* data is in-line in the msg. */
-} __attribute__((packed));
-
-/*
- * When connections are established by the stack (and the private data
- * MPA frame is received), the adapter will generate an event to the host.
- * The details of the connection, any private data, and the new connection
- * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
- * AE queue:
- */
-struct c2wr_ae_connection_request {
-       struct c2wr_ae_hdr ae_hdr;
-       u32 cr_handle;          /* connreq handle (sock ptr) */
-       __be32 laddr;
-       __be32 raddr;
-       __be16 lport;
-       __be16 rport;
-       __be32 private_data_length;
-       u8 private_data[0];     /* data is in-line in the msg. */
-} __attribute__((packed));
-
-union c2wr_ae {
-       struct c2wr_ae_hdr ae_generic;
-       struct c2wr_ae_active_connect_results ae_active_connect_results;
-       struct c2wr_ae_connection_request ae_connection_request;
-} __attribute__((packed));
-
-struct c2wr_init_req {
-       struct c2wr_hdr hdr;
-       __be64 hint_count;
-       __be64 q0_host_shared;
-       __be64 q1_host_shared;
-       __be64 q1_host_msg_pool;
-       __be64 q2_host_shared;
-       __be64 q2_host_msg_pool;
-} __attribute__((packed));
-
-struct c2wr_init_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_init {
-       struct c2wr_init_req req;
-       struct c2wr_init_rep rep;
-} __attribute__((packed));
-
-/*
- * For upgrading flash.
- */
-
-struct c2wr_flash_init_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-struct c2wr_flash_init_rep {
-       struct c2wr_hdr hdr;
-       u32 adapter_flash_buf_offset;
-       u32 adapter_flash_len;
-} __attribute__((packed));
-
-union c2wr_flash_init {
-       struct c2wr_flash_init_req req;
-       struct c2wr_flash_init_rep rep;
-} __attribute__((packed));
-
-struct c2wr_flash_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 len;
-} __attribute__((packed));
-
-struct c2wr_flash_rep {
-       struct c2wr_hdr hdr;
-       u32 status;
-} __attribute__((packed));
-
-union c2wr_flash {
-       struct c2wr_flash_req req;
-       struct c2wr_flash_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 size;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 offset;             /* 0 if mem not available */
-       u32 size;               /* 0 if mem not available */
-} __attribute__((packed));
-
-union c2wr_buf_alloc {
-       struct c2wr_buf_alloc_req req;
-       struct c2wr_buf_alloc_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_free_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 offset;             /* Must match value from alloc */
-       u32 size;               /* Must match value from alloc */
-} __attribute__((packed));
-
-struct c2wr_buf_free_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_buf_free {
-       struct c2wr_buf_free_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_flash_write_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 offset;
-       u32 size;
-       u32 type;
-       u32 flags;
-} __attribute__((packed));
-
-struct c2wr_flash_write_rep {
-       struct c2wr_hdr hdr;
-       u32 status;
-} __attribute__((packed));
-
-union c2wr_flash_write {
-       struct c2wr_flash_write_req req;
-       struct c2wr_flash_write_rep rep;
-} __attribute__((packed));
-
-/*
- * Messages for LLP connection setup.
- */
-
-/*
- * Listen Request.  This allocates a listening endpoint to allow passive
- * connection setup.  Newly established LLP connections are passed up
- * via an AE.  See c2wr_ae_connection_request_t
- */
-struct c2wr_ep_listen_create_req {
-       struct c2wr_hdr hdr;
-       u64 user_context;       /* returned in AEs. */
-       u32 rnic_handle;
-       __be32 local_addr;              /* local addr, or 0  */
-       __be16 local_port;              /* 0 means "pick one" */
-       u16 pad;
-       __be32 backlog;         /* tradional tcp listen bl */
-} __attribute__((packed));
-
-struct c2wr_ep_listen_create_rep {
-       struct c2wr_hdr hdr;
-       u32 ep_handle;          /* handle to new listening ep */
-       u16 local_port;         /* resulting port... */
-       u16 pad;
-} __attribute__((packed));
-
-union c2wr_ep_listen_create {
-       struct c2wr_ep_listen_create_req req;
-       struct c2wr_ep_listen_create_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_ep_listen_destroy {
-       struct c2wr_ep_listen_destroy_req req;
-       struct c2wr_ep_listen_destroy_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_query_rep {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-} __attribute__((packed));
-
-union c2wr_ep_query {
-       struct c2wr_ep_query_req req;
-       struct c2wr_ep_query_rep rep;
-} __attribute__((packed));
-
-
-/*
- * The host passes this down to indicate acceptance of a pending iWARP
- * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
- * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
- */
-struct c2wr_cr_accept_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;          /* QP to bind to this LLP conn */
-       u32 ep_handle;          /* LLP  handle to accept */
-       __be32 private_data_length;
-       u8 private_data[0];     /* data in-line in msg. */
-} __attribute__((packed));
-
-/*
- * adapter sends reply when private data is successfully submitted to
- * the LLP.
- */
-struct c2wr_cr_accept_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_accept {
-       struct c2wr_cr_accept_req req;
-       struct c2wr_cr_accept_rep rep;
-} __attribute__((packed));
-
-/*
- * The host sends this down if a given iWARP connection request was
- * rejected by the consumer.  The cr_handle was obtained from a
- * previous c2wr_ae_connection_request_t AE sent by the adapter.
- */
-struct  c2wr_cr_reject_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;          /* LLP handle to reject */
-} __attribute__((packed));
-
-/*
- * Dunno if this is needed, but we'll add it for now.  The adapter will
- * send the reject_reply after the LLP endpoint has been destroyed.
- */
-struct  c2wr_cr_reject_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_reject {
-       struct c2wr_cr_reject_req req;
-       struct c2wr_cr_reject_rep rep;
-} __attribute__((packed));
-
-/*
- * console command.  Used to implement a debug console over the verbs
- * request and reply queues.
- */
-
-/*
- * Console request message.  It contains:
- *     - message hdr with id = CCWR_CONSOLE
- *     - the physaddr/len of host memory to be used for the reply.
- *     - the command string.  eg:  "netstat -s" or "zoneinfo"
- */
-struct c2wr_console_req {
-       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
-       u64 reply_buf;          /* pinned host buf for reply */
-       u32 reply_buf_len;      /* length of reply buffer */
-       u8 command[0];          /* NUL terminated ascii string */
-       /* containing the command req */
-} __attribute__((packed));
-
-/*
- * flags used in the console reply.
- */
-enum c2_console_flags {
-       CONS_REPLY_TRUNCATED = 0x00000001       /* reply was truncated */
-} __attribute__((packed));
-
-/*
- * Console reply message.
- * hdr.result contains the c2_status_t error if the reply was _not_ generated,
- * or C2_OK if the reply was generated.
- */
-struct c2wr_console_rep {
-       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
-       u32 flags;
-} __attribute__((packed));
-
-union c2wr_console {
-       struct c2wr_console_req req;
-       struct c2wr_console_rep rep;
-} __attribute__((packed));
-
-
-/*
- * Giant union with all WRs.  Makes life easier...
- */
-union c2wr {
-       struct c2wr_hdr hdr;
-       struct c2wr_user_hdr user_hdr;
-       union c2wr_rnic_open rnic_open;
-       union c2wr_rnic_query rnic_query;
-       union c2wr_rnic_getconfig rnic_getconfig;
-       union c2wr_rnic_setconfig rnic_setconfig;
-       union c2wr_rnic_close rnic_close;
-       union c2wr_cq_create cq_create;
-       union c2wr_cq_modify cq_modify;
-       union c2wr_cq_destroy cq_destroy;
-       union c2wr_pd_alloc pd_alloc;
-       union c2wr_pd_dealloc pd_dealloc;
-       union c2wr_srq_create srq_create;
-       union c2wr_srq_destroy srq_destroy;
-       union c2wr_qp_create qp_create;
-       union c2wr_qp_query qp_query;
-       union c2wr_qp_modify qp_modify;
-       union c2wr_qp_destroy qp_destroy;
-       struct c2wr_qp_connect qp_connect;
-       union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
-       union c2wr_nsmr_register nsmr_register;
-       union c2wr_nsmr_pbl nsmr_pbl;
-       union c2wr_mr_query mr_query;
-       union c2wr_mw_query mw_query;
-       union c2wr_stag_dealloc stag_dealloc;
-       union c2wr_sqwr sqwr;
-       struct c2wr_rqwr rqwr;
-       struct c2wr_ce ce;
-       union c2wr_ae ae;
-       union c2wr_init init;
-       union c2wr_ep_listen_create ep_listen_create;
-       union c2wr_ep_listen_destroy ep_listen_destroy;
-       union c2wr_cr_accept cr_accept;
-       union c2wr_cr_reject cr_reject;
-       union c2wr_console console;
-       union c2wr_flash_init flash_init;
-       union c2wr_flash flash;
-       union c2wr_buf_alloc buf_alloc;
-       union c2wr_buf_free buf_free;
-       union c2wr_flash_write flash_write;
-} __attribute__((packed));
-
-
-/*
- * Accessors for the wr fields that are packed together tightly to
- * reduce the wr message size.  The wr arguments are void* so that
- * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
- * in the struct c2wr union can be passed in.
- */
-static __inline__ u8 c2_wr_get_id(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->id;
-}
-static __inline__ void c2_wr_set_id(void *wr, u8 id)
-{
-       ((struct c2wr_hdr *) wr)->id = id;
-}
-static __inline__ u8 c2_wr_get_result(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->result;
-}
-static __inline__ void c2_wr_set_result(void *wr, u8 result)
-{
-       ((struct c2wr_hdr *) wr)->result = result;
-}
-static __inline__ u8 c2_wr_get_flags(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->flags;
-}
-static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
-{
-       ((struct c2wr_hdr *) wr)->flags = flags;
-}
-static __inline__ u8 c2_wr_get_sge_count(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->sge_count;
-}
-static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
-{
-       ((struct c2wr_hdr *) wr)->sge_count = sge_count;
-}
-static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->wqe_count;
-}
-static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
-{
-       ((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
-}
-
-#endif                         /* _C2_WR_H_ */
diff --git a/drivers/staging/rdma/ehca/Kconfig b/drivers/staging/rdma/ehca/Kconfig
deleted file mode 100644 (file)
index 3fadd2a..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-config INFINIBAND_EHCA
-       tristate "eHCA support"
-       depends on IBMEBUS
-       ---help---
-       This driver supports the deprecated IBM pSeries eHCA InfiniBand
-       adapter.
-
-       To compile the driver as a module, choose M here. The module
-       will be called ib_ehca.
-
diff --git a/drivers/staging/rdma/ehca/Makefile b/drivers/staging/rdma/ehca/Makefile
deleted file mode 100644 (file)
index 74d284e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#  Authors: Heiko J Schick <schickhj@de.ibm.com>
-#           Christoph Raisch <raisch@de.ibm.com>
-#           Joachim Fenkes <fenkes@de.ibm.com>
-#
-#  Copyright (c) 2005 IBM Corporation
-#
-#  All rights reserved.
-#
-#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
-
-obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
-
-ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
-               ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
-               ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
-
diff --git a/drivers/staging/rdma/ehca/TODO b/drivers/staging/rdma/ehca/TODO
deleted file mode 100644 (file)
index 199a4a6..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-9/2015
-
-The ehca driver has been deprecated and moved to drivers/staging/rdma.
-It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/ehca/ehca_av.c b/drivers/staging/rdma/ehca/ehca_av.c
deleted file mode 100644 (file)
index 94e088c..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  address vector functions
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *av_cache;
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd)
-{
-       int path = ib_rate_to_mult(path_rate);
-       int link, ret;
-       struct ib_port_attr pa;
-
-       if (path_rate == IB_RATE_PORT_CURRENT) {
-               *ipd = 0;
-               return 0;
-       }
-
-       if (unlikely(path < 0)) {
-               ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
-                        path_rate);
-               return -EINVAL;
-       }
-
-       ret = ehca_query_port(&shca->ib_device, port, &pa);
-       if (unlikely(ret < 0)) {
-               ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
-               return ret;
-       }
-
-       link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
-
-       if (path >= link)
-               /* no need to throttle if path faster than link */
-               *ipd = 0;
-       else
-               /* IPD = round((link / path) - 1) */
-               *ipd = ((link + (path >> 1)) / path) - 1;
-
-       return 0;
-}
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-       int ret;
-       struct ehca_av *av;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-
-       av = kmem_cache_alloc(av_cache, GFP_KERNEL);
-       if (!av) {
-               ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
-                        pd, ah_attr);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       av->av.sl = ah_attr->sl;
-       av->av.dlid = ah_attr->dlid;
-       av->av.slid_path_bits = ah_attr->src_path_bits;
-
-       if (ehca_static_rate < 0) {
-               u32 ipd;
-
-               if (ehca_calc_ipd(shca, ah_attr->port_num,
-                                 ah_attr->static_rate, &ipd)) {
-                       ret = -EINVAL;
-                       goto create_ah_exit1;
-               }
-               av->av.ipd = ipd;
-       } else
-               av->av.ipd = ehca_static_rate;
-
-       av->av.lnh = ah_attr->ah_flags;
-       av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                           ah_attr->grh.traffic_class);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                           ah_attr->grh.flow_label);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                           ah_attr->grh.hop_limit);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(pd->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(pd->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
-       }
-       av->av.pmtu = shca->max_mtu;
-
-       /* dgid comes in grh.word_3 */
-       memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       return &av->ib_ah;
-
-create_ah_exit1:
-       kmem_cache_free(av_cache, av);
-
-       return ERR_PTR(ret);
-}
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av;
-       struct ehca_ud_av new_ehca_av;
-       struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
-                                             ib_device);
-
-       memset(&new_ehca_av, 0, sizeof(new_ehca_av));
-       new_ehca_av.sl = ah_attr->sl;
-       new_ehca_av.dlid = ah_attr->dlid;
-       new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
-       new_ehca_av.ipd = ah_attr->static_rate;
-       new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
-                                        (ah_attr->ah_flags & IB_AH_GRH) > 0);
-       new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                               ah_attr->grh.traffic_class);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                                ah_attr->grh.flow_label);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                                ah_attr->grh.hop_limit);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
-
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(ah->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ehca_err(ah->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x",
-                                rc, ah, ah_attr, ah_attr->port_num);
-                       return -EINVAL;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(ah->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ehca_err(ah->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x "
-                                "sgid_index=%x",
-                                rc, ah, ah_attr, ah_attr->port_num,
-                                ah_attr->grh.sgid_index);
-                       return -EINVAL;
-               }
-               memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
-       }
-
-       new_ehca_av.pmtu = shca->max_mtu;
-
-       memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       av = container_of(ah, struct ehca_av, ib_ah);
-       av->av = new_ehca_av;
-
-       return 0;
-}
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
-
-       memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
-              sizeof(ah_attr->grh.dgid));
-       ah_attr->sl = av->av.sl;
-
-       ah_attr->dlid = av->av.dlid;
-
-       ah_attr->src_path_bits = av->av.slid_path_bits;
-       ah_attr->static_rate = av->av.ipd;
-       ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
-       ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
-                                                   av->av.grh.word_0);
-       ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
-                                               av->av.grh.word_0);
-       ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
-                                                av->av.grh.word_0);
-
-       return 0;
-}
-
-int ehca_destroy_ah(struct ib_ah *ah)
-{
-       kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
-
-       return 0;
-}
-
-int ehca_init_av_cache(void)
-{
-       av_cache = kmem_cache_create("ehca_cache_av",
-                                  sizeof(struct ehca_av), 0,
-                                  SLAB_HWCACHE_ALIGN,
-                                  NULL);
-       if (!av_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_av_cache(void)
-{
-       kmem_cache_destroy(av_cache);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h
deleted file mode 100644 (file)
index e8c3387..0000000
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Struct definition for eHCA internal structures
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_H__
-#define __EHCA_CLASSES_H__
-
-struct ehca_module;
-struct ehca_qp;
-struct ehca_cq;
-struct ehca_eq;
-struct ehca_mr;
-struct ehca_mw;
-struct ehca_pd;
-struct ehca_av;
-
-#include <linux/wait.h>
-#include <linux/mutex.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_user_verbs.h>
-
-#ifdef CONFIG_PPC64
-#include "ehca_classes_pSeries.h"
-#endif
-#include "ipz_pt_fn.h"
-#include "ehca_qes.h"
-#include "ehca_irq.h"
-
-#define EHCA_EQE_CACHE_SIZE 20
-#define EHCA_MAX_NUM_QUEUES 0xffff
-
-struct ehca_eqe_cache_entry {
-       struct ehca_eqe *eqe;
-       struct ehca_cq *cq;
-};
-
-struct ehca_eq {
-       u32 length;
-       struct ipz_queue ipz_queue;
-       struct ipz_eq_handle ipz_eq_handle;
-       struct work_struct work;
-       struct h_galpas galpas;
-       int is_initialized;
-       struct ehca_pfeq pf;
-       spinlock_t spinlock;
-       struct tasklet_struct interrupt_task;
-       u32 ist;
-       spinlock_t irq_spinlock;
-       struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
-};
-
-struct ehca_sma_attr {
-       u16 lid, lmc, sm_sl, sm_lid;
-       u16 pkey_tbl_len, pkeys[16];
-};
-
-struct ehca_sport {
-       struct ib_cq *ibcq_aqp1;
-       struct ib_qp *ibqp_sqp[2];
-       /* lock to serialze modify_qp() calls for sqp in normal
-        * and irq path (when event PORT_ACTIVE is received first time)
-        */
-       spinlock_t mod_sqp_lock;
-       enum ib_port_state port_state;
-       struct ehca_sma_attr saved_attr;
-       u32 pma_qp_nr;
-};
-
-#define HCA_CAP_MR_PGSIZE_4K  0x80000000
-#define HCA_CAP_MR_PGSIZE_64K 0x40000000
-#define HCA_CAP_MR_PGSIZE_1M  0x20000000
-#define HCA_CAP_MR_PGSIZE_16M 0x10000000
-
-struct ehca_shca {
-       struct ib_device ib_device;
-       struct platform_device *ofdev;
-       u8 num_ports;
-       int hw_level;
-       struct list_head shca_list;
-       struct ipz_adapter_handle ipz_hca_handle;
-       struct ehca_sport sport[2];
-       struct ehca_eq eq;
-       struct ehca_eq neq;
-       struct ehca_mr *maxmr;
-       struct ehca_pd *pd;
-       struct h_galpas galpas;
-       struct mutex modify_mutex;
-       u64 hca_cap;
-       /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
-       u32 hca_cap_mr_pgsize;
-       int max_mtu;
-       int max_num_qps;
-       int max_num_cqs;
-       atomic_t num_cqs;
-       atomic_t num_qps;
-};
-
-struct ehca_pd {
-       struct ib_pd ib_pd;
-       struct ipz_pd fw_pd;
-       /* small queue mgmt */
-       struct mutex lock;
-       struct list_head free[2];
-       struct list_head full[2];
-};
-
-enum ehca_ext_qp_type {
-       EQPT_NORMAL    = 0,
-       EQPT_LLQP      = 1,
-       EQPT_SRQBASE   = 2,
-       EQPT_SRQ       = 3,
-};
-
-/* struct to cache modify_qp()'s parms for GSI/SMI qp */
-struct ehca_mod_qp_parm {
-       int mask;
-       struct ib_qp_attr attr;
-};
-
-#define EHCA_MOD_QP_PARM_MAX 4
-
-#define QMAP_IDX_MASK 0xFFFFULL
-
-/* struct for tracking if cqes have been reported to the application */
-struct ehca_qmap_entry {
-       u16 app_wr_id;
-       u8 reported;
-       u8 cqe_req;
-};
-
-struct ehca_queue_map {
-       struct ehca_qmap_entry *map;
-       unsigned int entries;
-       unsigned int tail;
-       unsigned int left_to_poll;
-       unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
-};
-
-/* function to calculate the next index for the qmap */
-static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
-{
-       unsigned int temp = cur_index + 1;
-       return (temp == limit) ? 0 : temp;
-}
-
-struct ehca_qp {
-       union {
-               struct ib_qp ib_qp;
-               struct ib_srq ib_srq;
-       };
-       u32 qp_type;
-       enum ehca_ext_qp_type ext_type;
-       enum ib_qp_state state;
-       struct ipz_queue ipz_squeue;
-       struct ehca_queue_map sq_map;
-       struct ipz_queue ipz_rqueue;
-       struct ehca_queue_map rq_map;
-       struct h_galpas galpas;
-       u32 qkey;
-       u32 real_qp_num;
-       u32 token;
-       spinlock_t spinlock_s;
-       spinlock_t spinlock_r;
-       u32 sq_max_inline_data_size;
-       struct ipz_qp_handle ipz_qp_handle;
-       struct ehca_pfqp pf;
-       struct ib_qp_init_attr init_attr;
-       struct ehca_cq *send_cq;
-       struct ehca_cq *recv_cq;
-       unsigned int sqerr_purgeflag;
-       struct hlist_node list_entries;
-       /* array to cache modify_qp()'s parms for GSI/SMI qp */
-       struct ehca_mod_qp_parm *mod_qp_parm;
-       int mod_qp_parm_idx;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_squeue;
-       u32 mm_count_rqueue;
-       u32 mm_count_galpa;
-       /* unsolicited ack circumvention */
-       int unsol_ack_circ;
-       int mtu_shift;
-       u32 message_count;
-       u32 packet_count;
-       atomic_t nr_events; /* events seen */
-       wait_queue_head_t wait_completion;
-       int mig_armed;
-       struct list_head sq_err_node;
-       struct list_head rq_err_node;
-};
-
-#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
-#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
-#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
-
-/* must be power of 2 */
-#define QP_HASHTAB_LEN 8
-
-struct ehca_cq {
-       struct ib_cq ib_cq;
-       struct ipz_queue ipz_queue;
-       struct h_galpas galpas;
-       spinlock_t spinlock;
-       u32 cq_number;
-       u32 token;
-       u32 nr_of_entries;
-       struct ipz_cq_handle ipz_cq_handle;
-       struct ehca_pfcq pf;
-       spinlock_t cb_lock;
-       struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
-       struct list_head entry;
-       u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
-       atomic_t nr_events; /* #events seen */
-       wait_queue_head_t wait_completion;
-       spinlock_t task_lock;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_queue;
-       u32 mm_count_galpa;
-       struct list_head sqp_err_list;
-       struct list_head rqp_err_list;
-};
-
-enum ehca_mr_flag {
-       EHCA_MR_FLAG_FMR = 0x80000000,   /* FMR, created with ehca_alloc_fmr */
-       EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
-};
-
-struct ehca_mr {
-       union {
-               struct ib_mr ib_mr;     /* must always be first in ehca_mr */
-               struct ib_fmr ib_fmr;   /* must always be first in ehca_mr */
-       } ib;
-       struct ib_umem *umem;
-       spinlock_t mrlock;
-
-       enum ehca_mr_flag flags;
-       u32 num_kpages;         /* number of kernel pages */
-       u32 num_hwpages;        /* number of hw pages to form MR */
-       u64 hwpage_size;        /* hw page size used for this MR */
-       int acl;                /* ACL (stored here for usage in reregister) */
-       u64 *start;             /* virtual start address (stored here for */
-                               /* usage in reregister) */
-       u64 size;               /* size (stored here for usage in reregister) */
-       u32 fmr_page_size;      /* page size for FMR */
-       u32 fmr_max_pages;      /* max pages for FMR */
-       u32 fmr_max_maps;       /* max outstanding maps for FMR */
-       u32 fmr_map_cnt;        /* map counter for FMR */
-       /* fw specific data */
-       struct ipz_mrmw_handle ipz_mr_handle;   /* MR handle for h-calls */
-       struct h_galpas galpas;
-};
-
-struct ehca_mw {
-       struct ib_mw ib_mw;     /* gen2 mw, must always be first in ehca_mw */
-       spinlock_t mwlock;
-
-       u8 never_bound;         /* indication MW was never bound */
-       struct ipz_mrmw_handle ipz_mw_handle;   /* MW handle for h-calls */
-       struct h_galpas galpas;
-};
-
-enum ehca_mr_pgi_type {
-       EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
-                                 * ehca_rereg_phys_mr,
-                                 * ehca_reg_internal_maxmr */
-       EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
-       EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
-};
-
-struct ehca_mr_pginfo {
-       enum ehca_mr_pgi_type type;
-       u64 num_kpages;
-       u64 kpage_cnt;
-       u64 hwpage_size;     /* hw page size used for this MR */
-       u64 num_hwpages;     /* number of hw pages */
-       u64 hwpage_cnt;      /* counter for hw pages */
-       u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
-
-       union {
-               struct { /* type EHCA_MR_PGI_PHYS section */
-                       u64 addr;
-                       u16 size;
-               } phy;
-               struct { /* type EHCA_MR_PGI_USER section */
-                       struct ib_umem *region;
-                       struct scatterlist *next_sg;
-                       u64 next_nmap;
-               } usr;
-               struct { /* type EHCA_MR_PGI_FMR section */
-                       u64 fmr_pgsize;
-                       u64 *page_list;
-                       u64 next_listelem;
-               } fmr;
-       } u;
-};
-
-/* output parameters for MR/FMR hipz calls */
-struct ehca_mr_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 lkey;
-       u32 rkey;
-       u64 len;
-       u64 vaddr;
-       u32 acl;
-};
-
-/* output parameters for MW hipz calls */
-struct ehca_mw_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 rkey;
-};
-
-struct ehca_av {
-       struct ib_ah ib_ah;
-       struct ehca_ud_av av;
-};
-
-struct ehca_ucontext {
-       struct ib_ucontext ib_ucontext;
-};
-
-int ehca_init_pd_cache(void);
-void ehca_cleanup_pd_cache(void);
-int ehca_init_cq_cache(void);
-void ehca_cleanup_cq_cache(void);
-int ehca_init_qp_cache(void);
-void ehca_cleanup_qp_cache(void);
-int ehca_init_av_cache(void);
-void ehca_cleanup_av_cache(void);
-int ehca_init_mrmw_cache(void);
-void ehca_cleanup_mrmw_cache(void);
-int ehca_init_small_qp_cache(void);
-void ehca_cleanup_small_qp_cache(void);
-
-extern rwlock_t ehca_qp_idr_lock;
-extern rwlock_t ehca_cq_idr_lock;
-extern struct idr ehca_qp_idr;
-extern struct idr ehca_cq_idr;
-extern spinlock_t shca_list_lock;
-
-extern int ehca_static_rate;
-extern int ehca_port_act_time;
-extern bool ehca_use_hp_mr;
-extern bool ehca_scaling_code;
-extern int ehca_lock_hcalls;
-extern int ehca_nr_ports;
-extern int ehca_max_cq;
-extern int ehca_max_qp;
-
-struct ipzu_queue_resp {
-       u32 qe_size;      /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length; /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;
-       u32 offset; /* save offset within a page for small_qp */
-};
-
-struct ehca_create_cq_resp {
-       u32 cq_number;
-       u32 token;
-       struct ipzu_queue_resp ipz_queue;
-       u32 fw_handle_ofs;
-       u32 dummy;
-};
-
-struct ehca_create_qp_resp {
-       u32 qp_num;
-       u32 token;
-       u32 qp_type;
-       u32 ext_type;
-       u32 qkey;
-       /* qp_num assigned by ehca: sqp0/1 may have got different numbers */
-       u32 real_qp_num;
-       u32 fw_handle_ofs;
-       u32 dummy;
-       struct ipzu_queue_resp ipz_squeue;
-       struct ipzu_queue_resp ipz_rqueue;
-};
-
-struct ehca_alloc_cq_parms {
-       u32 nr_cqe;
-       u32 act_nr_of_entries;
-       u32 act_pages;
-       struct ipz_eq_handle eq_handle;
-};
-
-enum ehca_service_type {
-       ST_RC  = 0,
-       ST_UC  = 1,
-       ST_RD  = 2,
-       ST_UD  = 3,
-};
-
-enum ehca_ll_comp_flags {
-       LLQP_SEND_COMP = 0x20,
-       LLQP_RECV_COMP = 0x40,
-       LLQP_COMP_MASK = 0x60,
-};
-
-struct ehca_alloc_queue_parms {
-       /* input parameters */
-       int max_wr;
-       int max_sge;
-       int page_size;
-       int is_small;
-
-       /* output parameters */
-       u16 act_nr_wqes;
-       u8  act_nr_sges;
-       u32 queue_size; /* bytes for small queues, pages otherwise */
-};
-
-struct ehca_alloc_qp_parms {
-       struct ehca_alloc_queue_parms squeue;
-       struct ehca_alloc_queue_parms rqueue;
-
-       /* input parameters */
-       enum ehca_service_type servicetype;
-       int qp_storage;
-       int sigtype;
-       enum ehca_ext_qp_type ext_type;
-       enum ehca_ll_comp_flags ll_comp_flags;
-       int ud_av_l_key_ctl;
-
-       u32 token;
-       struct ipz_eq_handle eq_handle;
-       struct ipz_pd pd;
-       struct ipz_cq_handle send_cq_handle, recv_cq_handle;
-
-       u32 srq_qpn, srq_token, srq_limit;
-
-       /* output parameters */
-       u32 real_qp_num;
-       struct ipz_qp_handle qp_handle;
-       struct h_galpas galpas;
-};
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
-
-#endif
diff --git a/drivers/staging/rdma/ehca/ehca_classes_pSeries.h b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h
deleted file mode 100644 (file)
index 689c357..0000000
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  pSeries interface definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_PSERIES_H__
-#define __EHCA_CLASSES_PSERIES_H__
-
-#include "hcp_phyp.h"
-#include "ipz_pt_fn.h"
-
-
-struct ehca_pfqp {
-       struct ipz_qpt sqpt;
-       struct ipz_qpt rqpt;
-};
-
-struct ehca_pfcq {
-       struct ipz_qpt qpt;
-       u32 cqnr;
-};
-
-struct ehca_pfeq {
-       struct ipz_qpt qpt;
-       struct h_galpa galpa;
-       u32 eqnr;
-};
-
-struct ipz_adapter_handle {
-       u64 handle;
-};
-
-struct ipz_cq_handle {
-       u64 handle;
-};
-
-struct ipz_eq_handle {
-       u64 handle;
-};
-
-struct ipz_qp_handle {
-       u64 handle;
-};
-struct ipz_mrmw_handle {
-       u64 handle;
-};
-
-struct ipz_pd {
-       u32 value;
-};
-
-struct hcp_modify_qp_control_block {
-       u32 qkey;                      /* 00 */
-       u32 rdd;                       /* reliable datagram domain */
-       u32 send_psn;                  /* 02 */
-       u32 receive_psn;               /* 03 */
-       u32 prim_phys_port;            /* 04 */
-       u32 alt_phys_port;             /* 05 */
-       u32 prim_p_key_idx;            /* 06 */
-       u32 alt_p_key_idx;             /* 07 */
-       u32 rdma_atomic_ctrl;          /* 08 */
-       u32 qp_state;                  /* 09 */
-       u32 reserved_10;               /* 10 */
-       u32 rdma_nr_atomic_resp_res;   /* 11 */
-       u32 path_migration_state;      /* 12 */
-       u32 rdma_atomic_outst_dest_qp; /* 13 */
-       u32 dest_qp_nr;                /* 14 */
-       u32 min_rnr_nak_timer_field;   /* 15 */
-       u32 service_level;             /* 16 */
-       u32 send_grh_flag;             /* 17 */
-       u32 retry_count;               /* 18 */
-       u32 timeout;                   /* 19 */
-       u32 path_mtu;                  /* 20 */
-       u32 max_static_rate;           /* 21 */
-       u32 dlid;                      /* 22 */
-       u32 rnr_retry_count;           /* 23 */
-       u32 source_path_bits;          /* 24 */
-       u32 traffic_class;             /* 25 */
-       u32 hop_limit;                 /* 26 */
-       u32 source_gid_idx;            /* 27 */
-       u32 flow_label;                /* 28 */
-       u32 reserved_29;               /* 29 */
-       union {                        /* 30 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid;
-       u32 service_level_al;          /* 34 */
-       u32 send_grh_flag_al;          /* 35 */
-       u32 retry_count_al;            /* 36 */
-       u32 timeout_al;                /* 37 */
-       u32 max_static_rate_al;        /* 38 */
-       u32 dlid_al;                   /* 39 */
-       u32 rnr_retry_count_al;        /* 40 */
-       u32 source_path_bits_al;       /* 41 */
-       u32 traffic_class_al;          /* 42 */
-       u32 hop_limit_al;              /* 43 */
-       u32 source_gid_idx_al;         /* 44 */
-       u32 flow_label_al;             /* 45 */
-       u32 reserved_46;               /* 46 */
-       u32 reserved_47;               /* 47 */
-       union {                        /* 48 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid_al;
-       u32 max_nr_outst_send_wr;      /* 52 */
-       u32 max_nr_outst_recv_wr;      /* 53 */
-       u32 disable_ete_credit_check;  /* 54 */
-       u32 qp_number;                 /* 55 */
-       u64 send_queue_handle;         /* 56 */
-       u64 recv_queue_handle;         /* 58 */
-       u32 actual_nr_sges_in_sq_wqe;  /* 60 */
-       u32 actual_nr_sges_in_rq_wqe;  /* 61 */
-       u32 qp_enable;                 /* 62 */
-       u32 curr_srq_limit;            /* 63 */
-       u64 qp_aff_asyn_ev_log_reg;    /* 64 */
-       u64 shared_rq_hndl;            /* 66 */
-       u64 trigg_doorbell_qp_hndl;    /* 68 */
-       u32 reserved_70_127[58];       /* 70 */
-};
-
-#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
-#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
-#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
-#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
-#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
-#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
-#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
-#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
-#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
-#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
-#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
-#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
-#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
-#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
-#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
-#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
-#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
-#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
-#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
-#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
-#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
-#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
-#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
-#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
-#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
-#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
-#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
-#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
-#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
-#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
-#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
-#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
-#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
-#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
-#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
-#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
-#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
-#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
-#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
-#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
-#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
-#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
-#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
-#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
-#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
-#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
-#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
-#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
-
-#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/staging/rdma/ehca/ehca_cq.c b/drivers/staging/rdma/ehca/ehca_cq.c
deleted file mode 100644 (file)
index 1aa7931..0000000
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Completion queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *cq_cache;
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
-{
-       unsigned int qp_num = qp->real_qp_num;
-       unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-
-       ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
-                cq->cq_number, qp_num);
-
-       return 0;
-}
-
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
-{
-       int ret = -EINVAL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       hlist_del(iter);
-                       ehca_dbg(cq->ib_cq.device,
-                                "removed qp from cq .cq_num=%x real_qp_num=%x",
-                                cq->cq_number, real_qp_num);
-                       ret = 0;
-                       break;
-               }
-       }
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-       if (ret)
-               ehca_err(cq->ib_cq.device,
-                        "qp not found cq_num=%x real_qp_num=%x",
-                        cq->cq_number, real_qp_num);
-
-       return ret;
-}
-
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
-{
-       struct ehca_qp *ret = NULL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       ret = qp;
-                       break;
-               }
-       }
-       return ret;
-}
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata)
-{
-       int cqe = attr->cqe;
-       static const u32 additional_cqe = 20;
-       struct ib_cq *cq;
-       struct ehca_cq *my_cq;
-       struct ehca_shca *shca =
-               container_of(device, struct ehca_shca, ib_device);
-       struct ipz_adapter_handle adapter_handle;
-       struct ehca_alloc_cq_parms param; /* h_call's out parameters */
-       struct h_galpa gal;
-       void *vpage;
-       u32 counter;
-       u64 rpage, cqx_fec, h_ret;
-       int rc, i;
-       unsigned long flags;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
-               return ERR_PTR(-EINVAL);
-
-       if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
-               ehca_err(device, "Unable to create CQ, max number of %i "
-                       "CQs reached.", shca->max_num_cqs);
-               ehca_err(device, "To increase the maximum number of CQs "
-                       "use the number_of_cqs module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
-       if (!my_cq) {
-               ehca_err(device, "Out of memory for ehca_cq struct device=%p",
-                        device);
-               atomic_dec(&shca->num_cqs);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
-
-       spin_lock_init(&my_cq->spinlock);
-       spin_lock_init(&my_cq->cb_lock);
-       spin_lock_init(&my_cq->task_lock);
-       atomic_set(&my_cq->nr_events, 0);
-       init_waitqueue_head(&my_cq->wait_completion);
-
-       cq = &my_cq->ib_cq;
-
-       adapter_handle = shca->ipz_hca_handle;
-       param.eq_handle = shca->eq.ipz_eq_handle;
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       rc = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-       idr_preload_end();
-
-       if (rc < 0) {
-               cq = ERR_PTR(-ENOMEM);
-               ehca_err(device, "Can't allocate new idr entry. device=%p",
-                        device);
-               goto create_cq_exit1;
-       }
-       my_cq->token = rc;
-
-       /*
-        * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
-        * for receiving errors CQEs.
-        */
-       param.nr_cqe = cqe + additional_cqe;
-       h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_alloc_resource_cq() failed "
-                        "h_ret=%lli device=%p", h_ret, device);
-               cq = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto create_cq_exit2;
-       }
-
-       rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
-                               EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
-       if (!rc) {
-               ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
-                        rc, device);
-               cq = ERR_PTR(-EINVAL);
-               goto create_cq_exit3;
-       }
-
-       for (counter = 0; counter < param.act_pages; counter++) {
-               vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-               if (!vpage) {
-                       ehca_err(device, "ipz_qpageit_get_inc() "
-                                "returns NULL device=%p", device);
-                       cq = ERR_PTR(-EAGAIN);
-                       goto create_cq_exit4;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_cq(adapter_handle,
-                                                my_cq->ipz_cq_handle,
-                                                &my_cq->pf,
-                                                0,
-                                                0,
-                                                rpage,
-                                                1,
-                                                my_cq->galpas.
-                                                kernel);
-
-               if (h_ret < H_SUCCESS) {
-                       ehca_err(device, "hipz_h_register_rpage_cq() failed "
-                                "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
-                                "act_pages=%i", my_cq, my_cq->cq_number,
-                                h_ret, counter, param.act_pages);
-                       cq = ERR_PTR(-EINVAL);
-                       goto create_cq_exit4;
-               }
-
-               if (counter == (param.act_pages - 1)) {
-                       vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-                       if ((h_ret != H_SUCCESS) || vpage) {
-                               ehca_err(device, "Registration of pages not "
-                                        "complete ehca_cq=%p cq_num=%x "
-                                        "h_ret=%lli", my_cq, my_cq->cq_number,
-                                        h_ret);
-                               cq = ERR_PTR(-EAGAIN);
-                               goto create_cq_exit4;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(device, "Registration of page failed "
-                                        "ehca_cq=%p cq_num=%x h_ret=%lli "
-                                        "counter=%i act_pages=%i",
-                                        my_cq, my_cq->cq_number,
-                                        h_ret, counter, param.act_pages);
-                               cq = ERR_PTR(-ENOMEM);
-                               goto create_cq_exit4;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(&my_cq->ipz_queue);
-
-       gal = my_cq->galpas.kernel;
-       cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
-       ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
-                my_cq, my_cq->cq_number, cqx_fec);
-
-       my_cq->ib_cq.cqe = my_cq->nr_of_entries =
-               param.act_nr_of_entries - additional_cqe;
-       my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
-
-       for (i = 0; i < QP_HASHTAB_LEN; i++)
-               INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
-
-       INIT_LIST_HEAD(&my_cq->sqp_err_list);
-       INIT_LIST_HEAD(&my_cq->rqp_err_list);
-
-       if (context) {
-               struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
-               struct ehca_create_cq_resp resp;
-               memset(&resp, 0, sizeof(resp));
-               resp.cq_number = my_cq->cq_number;
-               resp.token = my_cq->token;
-               resp.ipz_queue.qe_size = ipz_queue->qe_size;
-               resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
-               resp.ipz_queue.queue_length = ipz_queue->queue_length;
-               resp.ipz_queue.pagesize = ipz_queue->pagesize;
-               resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
-               resp.fw_handle_ofs = (u32)
-                       (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
-               if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-                       ehca_err(device, "Copy to udata failed.");
-                       cq = ERR_PTR(-EFAULT);
-                       goto create_cq_exit4;
-               }
-       }
-
-       return cq;
-
-create_cq_exit4:
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-
-create_cq_exit3:
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-       if (h_ret != H_SUCCESS)
-               ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
-                        "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
-
-create_cq_exit2:
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-create_cq_exit1:
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return cq;
-}
-
-int ehca_destroy_cq(struct ib_cq *cq)
-{
-       u64 h_ret;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int cq_num = my_cq->cq_number;
-       struct ib_device *device = cq->device;
-       struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       unsigned long flags;
-
-       if (cq->uobject) {
-               if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
-                       ehca_err(device, "Resources still referenced in "
-                                "user space cq_num=%x", my_cq->cq_number);
-                       return -EINVAL;
-               }
-       }
-
-       /*
-        * remove the CQ from the idr first to make sure
-        * no more interrupt tasklets will touch this CQ
-        */
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
-
-       /* nobody's using our CQ any longer -- we can destroy it */
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
-       if (h_ret == H_R_STATE) {
-               /* cq in err: read err data and destroy it forcibly */
-               ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
-                        "state. Try to delete it forcibly.",
-                        my_cq, cq_num, my_cq->ipz_cq_handle.handle);
-               ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
-               h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-               if (h_ret == H_SUCCESS)
-                       ehca_dbg(device, "cq_num=%x deleted successfully.",
-                                cq_num);
-       }
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
-                        "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
-               return ehca2ib_return_code(h_ret);
-       }
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return 0;
-}
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
-{
-       /* TODO: proper resize needs to be done */
-       ehca_err(cq->device, "not implemented yet");
-
-       return -EFAULT;
-}
-
-int ehca_init_cq_cache(void)
-{
-       cq_cache = kmem_cache_create("ehca_cache_cq",
-                                    sizeof(struct ehca_cq), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!cq_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_cq_cache(void)
-{
-       kmem_cache_destroy(cq_cache);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_eq.c b/drivers/staging/rdma/ehca/ehca_eq.c
deleted file mode 100644 (file)
index 90da674..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Event queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_qes.h"
-#include "hcp_if.h"
-#include "ipz_pt_fn.h"
-
-int ehca_create_eq(struct ehca_shca *shca,
-                  struct ehca_eq *eq,
-                  const enum ehca_eq_type type, const u32 length)
-{
-       int ret;
-       u64 h_ret;
-       u32 nr_pages;
-       u32 i;
-       void *vpage;
-       struct ib_device *ib_dev = &shca->ib_device;
-
-       spin_lock_init(&eq->spinlock);
-       spin_lock_init(&eq->irq_spinlock);
-       eq->is_initialized = 0;
-
-       if (type != EHCA_EQ && type != EHCA_NEQ) {
-               ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
-               return -EINVAL;
-       }
-       if (!length) {
-               ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
-                                        &eq->pf,
-                                        type,
-                                        length,
-                                        &eq->ipz_eq_handle,
-                                        &eq->length,
-                                        &nr_pages, &eq->ist);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
-                            EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
-       if (!ret) {
-               ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
-               goto create_eq_exit1;
-       }
-
-       for (i = 0; i < nr_pages; i++) {
-               u64 rpage;
-
-               vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-               if (!vpage)
-                       goto create_eq_exit2;
-
-               rpage = __pa(vpage);
-               h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
-                                                eq->ipz_eq_handle,
-                                                &eq->pf,
-                                                0, 0, rpage, 1);
-
-               if (i == (nr_pages - 1)) {
-                       /* last page */
-                       vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-                       if (h_ret != H_SUCCESS || vpage)
-                               goto create_eq_exit2;
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED)
-                               goto create_eq_exit2;
-               }
-       }
-
-       ipz_qeit_reset(&eq->ipz_queue);
-
-       /* register interrupt handlers and initialize work queues */
-       if (type == EHCA_EQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
-                                         0, "ehca_eq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       } else if (type == EHCA_NEQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
-                                         0, "ehca_neq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       }
-
-       eq->is_initialized = 1;
-
-       return 0;
-
-create_eq_exit2:
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-create_eq_exit1:
-       hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       return -EINVAL;
-}
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       void *eqe;
-
-       spin_lock_irqsave(&eq->spinlock, flags);
-       eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
-       spin_unlock_irqrestore(&eq->spinlock, flags);
-
-       return eqe;
-}
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       u64 h_ret;
-
-       ibmebus_free_irq(eq->ist, (void *)shca);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       eq->is_initialized = 0;
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       tasklet_kill(&eq->interrupt_task);
-
-       h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't free EQ resources.");
-               return -EINVAL;
-       }
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-       return 0;
-}
diff --git a/drivers/staging/rdma/ehca/ehca_hca.c b/drivers/staging/rdma/ehca/ehca_hca.c
deleted file mode 100644 (file)
index e8b1bb6..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HCA query functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/gfp.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static unsigned int limit_uint(unsigned int value)
-{
-       return min_t(unsigned int, value, INT_MAX);
-}
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw)
-{
-       int i, ret = 0;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_hca *rblock;
-
-       static const u32 cap_mapping[] = {
-               IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
-               IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
-               IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
-               IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
-               IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
-               IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
-               IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
-               IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
-               IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
-               IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
-               IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
-       };
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto query_device1;
-       }
-
-       memset(props, 0, sizeof(struct ib_device_attr));
-       props->page_size_cap   = shca->hca_cap_mr_pgsize;
-       props->fw_ver          = rblock->hw_ver;
-       props->max_mr_size     = rblock->max_mr_size;
-       props->vendor_id       = rblock->vendor_id >> 8;
-       props->vendor_part_id  = rblock->vendor_part_id >> 16;
-       props->hw_ver          = rblock->hw_ver;
-       props->max_qp          = limit_uint(rblock->max_qp);
-       props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
-       props->max_sge         = limit_uint(rblock->max_sge);
-       props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
-       props->max_cq          = limit_uint(rblock->max_cq);
-       props->max_cqe         = limit_uint(rblock->max_cqe);
-       props->max_mr          = limit_uint(rblock->max_mr);
-       props->max_mw          = limit_uint(rblock->max_mw);
-       props->max_pd          = limit_uint(rblock->max_pd);
-       props->max_ah          = limit_uint(rblock->max_ah);
-       props->max_ee          = limit_uint(rblock->max_rd_ee_context);
-       props->max_rdd         = limit_uint(rblock->max_rd_domain);
-       props->max_fmr         = limit_uint(rblock->max_mr);
-       props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
-       props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
-       props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
-       props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
-       props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               props->max_srq         = limit_uint(props->max_qp);
-               props->max_srq_wr      = limit_uint(props->max_qp_wr);
-               props->max_srq_sge     = 3;
-       }
-
-       props->max_pkeys           = 16;
-       /* Some FW versions say 0 here; insert sensible value in that case */
-       props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
-               min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
-       props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
-       props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
-       props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
-       props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
-       props->max_total_mcast_qp_attach
-               = limit_uint(rblock->max_total_mcast_qp_attach);
-
-       /* translate device capabilities */
-       props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
-               IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
-       for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
-               if (rblock->hca_cap_indicators & cap_mapping[i + 1])
-                       props->device_cap_flags |= cap_mapping[i];
-
-query_device1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
-{
-       switch (fw_mtu) {
-       case 0x1:
-               return IB_MTU_256;
-       case 0x2:
-               return IB_MTU_512;
-       case 0x3:
-               return IB_MTU_1024;
-       case 0x4:
-               return IB_MTU_2048;
-       case 0x5:
-               return IB_MTU_4096;
-       default:
-               ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
-                        fw_mtu);
-               return 0;
-       }
-}
-
-static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
-{
-       switch (vl_cap) {
-       case 0x1:
-               return 1;
-       case 0x2:
-               return 2;
-       case 0x3:
-               return 4;
-       case 0x4:
-               return 8;
-       case 0x5:
-               return 15;
-       default:
-               ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
-                        vl_cap);
-               return 0;
-       }
-}
-
-int ehca_query_port(struct ib_device *ibdev,
-                   u8 port, struct ib_port_attr *props)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_port1;
-       }
-
-       memset(props, 0, sizeof(struct ib_port_attr));
-
-       props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
-       props->port_cap_flags  = rblock->capability_mask;
-       props->gid_tbl_len     = rblock->gid_tbl_len;
-       if (rblock->max_msg_sz)
-               props->max_msg_sz      = rblock->max_msg_sz;
-       else
-               props->max_msg_sz      = 0x1 << 31;
-       props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
-       props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
-       props->pkey_tbl_len    = rblock->pkey_tbl_len;
-       props->lid             = rblock->lid;
-       props->sm_lid          = rblock->sm_lid;
-       props->lmc             = rblock->lmc;
-       props->sm_sl           = rblock->sm_sl;
-       props->subnet_timeout  = rblock->subnet_timeout;
-       props->init_type_reply = rblock->init_type_reply;
-       props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
-
-       if (rblock->state && rblock->phys_width) {
-               props->phys_state      = rblock->phys_pstate;
-               props->state           = rblock->phys_state;
-               props->active_width    = rblock->phys_width;
-               props->active_speed    = rblock->phys_speed;
-       } else {
-               /* old firmware releases don't report physical
-                * port info, so use default values
-                */
-               props->phys_state      = 5;
-               props->state           = rblock->state;
-               props->active_width    = IB_WIDTH_12X;
-               props->active_speed    = IB_SPEED_SDR;
-       }
-
-query_port1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_sma_attr(struct ehca_shca *shca,
-                       u8 port, struct ehca_sma_attr *attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_sma_attr1;
-       }
-
-       memset(attr, 0, sizeof(struct ehca_sma_attr));
-
-       attr->lid    = rblock->lid;
-       attr->lmc    = rblock->lmc;
-       attr->sm_sl  = rblock->sm_sl;
-       attr->sm_lid = rblock->sm_lid;
-
-       attr->pkey_tbl_len = rblock->pkey_tbl_len;
-       memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
-
-query_sma_attr1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if (index > 16) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_pkey1;
-       }
-
-       memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
-
-query_pkey1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port,
-                  int index, union ib_gid *gid)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       if (index < 0 || index > 255) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_gid1;
-       }
-
-       memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
-       memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
-
-query_gid1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static const u32 allowed_port_caps = (
-       IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
-       IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
-       IB_PORT_VENDOR_CLASS_SUP);
-
-int ehca_modify_port(struct ib_device *ibdev,
-                    u8 port, int port_modify_mask,
-                    struct ib_port_modify *props)
-{
-       int ret = 0;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-       u32 cap;
-       u64 hret;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if ((props->set_port_cap_mask | props->clr_port_cap_mask)
-           & ~allowed_port_caps) {
-               ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
-                        "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
-                        props->clr_port_cap_mask, allowed_port_caps);
-               return -EINVAL;
-       }
-
-       if (mutex_lock_interruptible(&shca->modify_mutex))
-               return -ERESTARTSYS;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               ret = -ENOMEM;
-               goto modify_port1;
-       }
-
-       hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto modify_port2;
-       }
-
-       cap = (rblock->capability_mask | props->set_port_cap_mask)
-               & ~props->clr_port_cap_mask;
-
-       hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
-                                 cap, props->init_type, port_modify_mask);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
-                        hret);
-               ret = -EINVAL;
-       }
-
-modify_port2:
-       ehca_free_fw_ctrlblock(rblock);
-
-modify_port1:
-       mutex_unlock(&shca->modify_mutex);
-
-       return ret;
-}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.c b/drivers/staging/rdma/ehca/ehca_irq.c
deleted file mode 100644 (file)
index 8615d7c..0000000
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Functions for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <linux/smpboot.h>
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
-#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
-#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
-#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
-#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
-
-#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
-#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
-#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
-#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
-#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
-#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
-
-#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
-#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
-
-static void queue_comp_task(struct ehca_cq *__cq);
-
-static struct ehca_comp_pool *pool;
-
-static inline void comp_event_callback(struct ehca_cq *cq)
-{
-       if (!cq->ib_cq.comp_handler)
-               return;
-
-       spin_lock(&cq->cb_lock);
-       cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
-       spin_unlock(&cq->cb_lock);
-
-       return;
-}
-
-static void print_error_data(struct ehca_shca *shca, void *data,
-                            u64 *rblock, int length)
-{
-       u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
-       u64 resource = rblock[1];
-
-       switch (type) {
-       case 0x1: /* Queue Pair */
-       {
-               struct ehca_qp *qp = (struct ehca_qp *)data;
-
-               /* only print error data if AER is set */
-               if (rblock[6] == 0)
-                       return;
-
-               ehca_err(&shca->ib_device,
-                        "QP 0x%x (resource=%llx) has errors.",
-                        qp->ib_qp.qp_num, resource);
-               break;
-       }
-       case 0x4: /* Completion Queue */
-       {
-               struct ehca_cq *cq = (struct ehca_cq *)data;
-
-               ehca_err(&shca->ib_device,
-                        "CQ 0x%x (resource=%llx) has errors.",
-                        cq->cq_number, resource);
-               break;
-       }
-       default:
-               ehca_err(&shca->ib_device,
-                        "Unknown error type: %llx on %s.",
-                        type, shca->ib_device.name);
-               break;
-       }
-
-       ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data begin "
-                "---------------------------------------------------");
-       ehca_dmp(rblock, length, "resource=%llx", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data end "
-                "----------------------------------------------------");
-
-       return;
-}
-
-int ehca_error_data(struct ehca_shca *shca, void *data,
-                   u64 resource)
-{
-
-       unsigned long ret;
-       u64 *rblock;
-       unsigned long block_count;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
-               ret = -ENOMEM;
-               goto error_data1;
-       }
-
-       /* rblock must be 4K aligned and should be 4K large */
-       ret = hipz_h_error_data(shca->ipz_hca_handle,
-                               resource,
-                               rblock,
-                               &block_count);
-
-       if (ret == H_R_STATE)
-               ehca_err(&shca->ib_device,
-                        "No error data is available: %llx.", resource);
-       else if (ret == H_SUCCESS) {
-               int length;
-
-               length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
-
-               if (length > EHCA_PAGESIZE)
-                       length = EHCA_PAGESIZE;
-
-               print_error_data(shca, data, rblock, length);
-       } else
-               ehca_err(&shca->ib_device,
-                        "Error data could not be fetched: %llx", resource);
-
-       ehca_free_fw_ctrlblock(rblock);
-
-error_data1:
-       return ret;
-
-}
-
-static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
-                             enum ib_event_type event_type)
-{
-       struct ib_event event;
-
-       /* PATH_MIG without the QP ever having been armed is false alarm */
-       if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
-               return;
-
-       event.device = &shca->ib_device;
-       event.event = event_type;
-
-       if (qp->ext_type == EQPT_SRQ) {
-               if (!qp->ib_srq.event_handler)
-                       return;
-
-               event.element.srq = &qp->ib_srq;
-               qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
-       } else {
-               if (!qp->ib_qp.event_handler)
-                       return;
-
-               event.element.qp = &qp->ib_qp;
-               qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
-       }
-}
-
-static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
-                             enum ib_event_type event_type, int fatal)
-{
-       struct ehca_qp *qp;
-       u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
-
-       read_lock(&ehca_qp_idr_lock);
-       qp = idr_find(&ehca_qp_idr, token);
-       if (qp)
-               atomic_inc(&qp->nr_events);
-       read_unlock(&ehca_qp_idr_lock);
-
-       if (!qp)
-               return;
-
-       if (fatal)
-               ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
-
-       dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
-                         IB_EVENT_SRQ_ERR : event_type);
-
-       /*
-        * eHCA only processes one WQE at a time for SRQ base QPs,
-        * so the last WQE has been processed as soon as the QP enters
-        * error state.
-        */
-       if (fatal && qp->ext_type == EQPT_SRQBASE)
-               dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
-
-       if (atomic_dec_and_test(&qp->nr_events))
-               wake_up(&qp->wait_completion);
-       return;
-}
-
-static void cq_event_callback(struct ehca_shca *shca,
-                             u64 eqe)
-{
-       struct ehca_cq *cq;
-       u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
-
-       read_lock(&ehca_cq_idr_lock);
-       cq = idr_find(&ehca_cq_idr, token);
-       if (cq)
-               atomic_inc(&cq->nr_events);
-       read_unlock(&ehca_cq_idr_lock);
-
-       if (!cq)
-               return;
-
-       ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
-
-       if (atomic_dec_and_test(&cq->nr_events))
-               wake_up(&cq->wait_completion);
-
-       return;
-}
-
-static void parse_identifier(struct ehca_shca *shca, u64 eqe)
-{
-       u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
-
-       switch (identifier) {
-       case 0x02: /* path migrated */
-               qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
-               break;
-       case 0x03: /* communication established */
-               qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
-               break;
-       case 0x04: /* send queue drained */
-               qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
-               break;
-       case 0x05: /* QP error */
-       case 0x06: /* QP error */
-               qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
-               break;
-       case 0x07: /* CQ error */
-       case 0x08: /* CQ error */
-               cq_event_callback(shca, eqe);
-               break;
-       case 0x09: /* MRMWPTE error */
-               ehca_err(&shca->ib_device, "MRMWPTE error.");
-               break;
-       case 0x0A: /* port event */
-               ehca_err(&shca->ib_device, "Port event.");
-               break;
-       case 0x0B: /* MR access error */
-               ehca_err(&shca->ib_device, "MR access error.");
-               break;
-       case 0x0C: /* EQ error */
-               ehca_err(&shca->ib_device, "EQ error.");
-               break;
-       case 0x0D: /* P/Q_Key mismatch */
-               ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
-               break;
-       case 0x10: /* sampling complete */
-               ehca_err(&shca->ib_device, "Sampling complete.");
-               break;
-       case 0x11: /* unaffiliated access error */
-               ehca_err(&shca->ib_device, "Unaffiliated access error.");
-               break;
-       case 0x12: /* path migrating */
-               ehca_err(&shca->ib_device, "Path migrating.");
-               break;
-       case 0x13: /* interface trace stopped */
-               ehca_err(&shca->ib_device, "Interface trace stopped.");
-               break;
-       case 0x14: /* first error capture info available */
-               ehca_info(&shca->ib_device, "First error capture available");
-               break;
-       case 0x15: /* SRQ limit reached */
-               qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
-                        identifier, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static void dispatch_port_event(struct ehca_shca *shca, int port_num,
-                               enum ib_event_type type, const char *msg)
-{
-       struct ib_event event;
-
-       ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
-       event.device = &shca->ib_device;
-       event.event = type;
-       event.element.port_num = port_num;
-       ib_dispatch_event(&event);
-}
-
-static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
-{
-       struct ehca_sma_attr  new_attr;
-       struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
-
-       ehca_query_sma_attr(shca, port_num, &new_attr);
-
-       if (new_attr.sm_sl  != old_attr->sm_sl ||
-           new_attr.sm_lid != old_attr->sm_lid)
-               dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
-                                   "SM changed");
-
-       if (new_attr.lid != old_attr->lid ||
-           new_attr.lmc != old_attr->lmc)
-               dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
-                                   "LID changed");
-
-       if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
-           memcmp(new_attr.pkeys, old_attr->pkeys,
-                  sizeof(u16) * new_attr.pkey_tbl_len))
-               dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
-                                   "P_Key changed");
-
-       *old_attr = new_attr;
-}
-
-/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
-static int replay_modify_qp(struct ehca_sport *sport)
-{
-       int aqp1_destroyed;
-       unsigned long flags;
-
-       spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-
-       aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
-
-       if (sport->ibqp_sqp[IB_QPT_SMI])
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
-       if (!aqp1_destroyed)
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
-
-       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-
-       return aqp1_destroyed;
-}
-
-static void parse_ec(struct ehca_shca *shca, u64 eqe)
-{
-       u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
-       u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
-       u8 spec_event;
-       struct ehca_sport *sport = &shca->sport[port - 1];
-
-       switch (ec) {
-       case 0x30: /* port availability change */
-               if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
-                       /* only replay modify_qp calls in autodetect mode;
-                        * if AQP1 was destroyed, the port is already down
-                        * again and we can drop the event.
-                        */
-                       if (ehca_nr_ports < 0)
-                               if (replay_modify_qp(sport))
-                                       break;
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port, &sport->saved_attr);
-               } else {
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-               }
-               break;
-       case 0x31:
-               /* port configuration change
-                * disruptive change is caused by
-                * LID, PKEY or SM change
-                */
-               if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
-                       ehca_warn(&shca->ib_device, "disruptive port "
-                                 "%d configuration change", port);
-
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port,
-                                           &sport->saved_attr);
-               } else
-                       notify_port_conf_change(shca, port);
-               break;
-       case 0x32: /* adapter malfunction */
-               ehca_err(&shca->ib_device, "Adapter malfunction.");
-               break;
-       case 0x33:  /* trace stopped */
-               ehca_err(&shca->ib_device, "Traced stopped.");
-               break;
-       case 0x34: /* util async event */
-               spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
-               if (spec_event == 0x80) /* client reregister required */
-                       dispatch_port_event(shca, port,
-                                           IB_EVENT_CLIENT_REREGISTER,
-                                           "client reregister req.");
-               else
-                       ehca_warn(&shca->ib_device, "Unknown util async "
-                                 "event %x on port %x", spec_event, port);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
-                        ec, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static inline void reset_eq_pending(struct ehca_cq *cq)
-{
-       u64 CQx_EP;
-       struct h_galpa gal = cq->galpas.kernel;
-
-       hipz_galpa_store_cq(gal, cqx_ep, 0x0);
-       CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->neq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-void ehca_tasklet_neq(unsigned long data)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)data;
-       struct ehca_eqe *eqe;
-       u64 ret;
-
-       eqe = ehca_poll_eq(shca, &shca->neq);
-
-       while (eqe) {
-               if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
-                       parse_ec(shca, eqe->entry);
-
-               eqe = ehca_poll_eq(shca, &shca->neq);
-       }
-
-       ret = hipz_h_reset_event(shca->ipz_hca_handle,
-                                shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
-
-       if (ret != H_SUCCESS)
-               ehca_err(&shca->ib_device, "Can't clear notification events.");
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->eq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-
-static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
-{
-       u64 eqe_value;
-       u32 token;
-       struct ehca_cq *cq;
-
-       eqe_value = eqe->entry;
-       ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
-       if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-               ehca_dbg(&shca->ib_device, "Got completion event");
-               token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, token);
-               if (cq)
-                       atomic_inc(&cq->nr_events);
-               read_unlock(&ehca_cq_idr_lock);
-               if (cq == NULL) {
-                       ehca_err(&shca->ib_device,
-                                "Invalid eqe for non-existing cq token=%x",
-                                token);
-                       return;
-               }
-               reset_eq_pending(cq);
-               if (ehca_scaling_code)
-                       queue_comp_task(cq);
-               else {
-                       comp_event_callback(cq);
-                       if (atomic_dec_and_test(&cq->nr_events))
-                               wake_up(&cq->wait_completion);
-               }
-       } else {
-               ehca_dbg(&shca->ib_device, "Got non completion event");
-               parse_identifier(shca, eqe_value);
-       }
-}
-
-void ehca_process_eq(struct ehca_shca *shca, int is_irq)
-{
-       struct ehca_eq *eq = &shca->eq;
-       struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-       u64 eqe_value, ret;
-       int eqe_cnt, i;
-       int eq_empty = 0;
-
-       spin_lock(&eq->irq_spinlock);
-       if (is_irq) {
-               const int max_query_cnt = 100;
-               int query_cnt = 0;
-               int int_state = 1;
-               do {
-                       int_state = hipz_h_query_int_state(
-                               shca->ipz_hca_handle, eq->ist);
-                       query_cnt++;
-                       iosync();
-               } while (int_state && query_cnt < max_query_cnt);
-               if (unlikely((query_cnt == max_query_cnt)))
-                       ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
-                                int_state, query_cnt);
-       }
-
-       /* read out all eqes */
-       eqe_cnt = 0;
-       do {
-               u32 token;
-               eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
-               if (!eqe_cache[eqe_cnt].eqe)
-                       break;
-               eqe_value = eqe_cache[eqe_cnt].eqe->entry;
-               if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-                       token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-                       read_lock(&ehca_cq_idr_lock);
-                       eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
-                       if (eqe_cache[eqe_cnt].cq)
-                               atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
-                       read_unlock(&ehca_cq_idr_lock);
-                       if (!eqe_cache[eqe_cnt].cq) {
-                               ehca_err(&shca->ib_device,
-                                        "Invalid eqe for non-existing cq "
-                                        "token=%x", token);
-                               continue;
-                       }
-               } else
-                       eqe_cache[eqe_cnt].cq = NULL;
-               eqe_cnt++;
-       } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
-       if (!eqe_cnt) {
-               if (is_irq)
-                       ehca_dbg(&shca->ib_device,
-                                "No eqe found for irq event");
-               goto unlock_irq_spinlock;
-       } else if (!is_irq) {
-               ret = hipz_h_eoi(eq->ist);
-               if (ret != H_SUCCESS)
-                       ehca_err(&shca->ib_device,
-                                "bad return code EOI -rc = %lld\n", ret);
-               ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
-       }
-       if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
-               ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
-       /* enable irq for new packets */
-       for (i = 0; i < eqe_cnt; i++) {
-               if (eq->eqe_cache[i].cq)
-                       reset_eq_pending(eq->eqe_cache[i].cq);
-       }
-       /* check eq */
-       spin_lock(&eq->spinlock);
-       eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
-       spin_unlock(&eq->spinlock);
-       /* call completion handler for cached eqes */
-       for (i = 0; i < eqe_cnt; i++)
-               if (eq->eqe_cache[i].cq) {
-                       if (ehca_scaling_code)
-                               queue_comp_task(eq->eqe_cache[i].cq);
-                       else {
-                               struct ehca_cq *cq = eq->eqe_cache[i].cq;
-                               comp_event_callback(cq);
-                               if (atomic_dec_and_test(&cq->nr_events))
-                                       wake_up(&cq->wait_completion);
-                       }
-               } else {
-                       ehca_dbg(&shca->ib_device, "Got non completion event");
-                       parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
-               }
-       /* poll eq if not empty */
-       if (eq_empty)
-               goto unlock_irq_spinlock;
-       do {
-               struct ehca_eqe *eqe;
-               eqe = ehca_poll_eq(shca, &shca->eq);
-               if (!eqe)
-                       break;
-               process_eqe(shca, eqe);
-       } while (1);
-
-unlock_irq_spinlock:
-       spin_unlock(&eq->irq_spinlock);
-}
-
-void ehca_tasklet_eq(unsigned long data)
-{
-       ehca_process_eq((struct ehca_shca*)data, 1);
-}
-
-static int find_next_online_cpu(struct ehca_comp_pool *pool)
-{
-       int cpu;
-       unsigned long flags;
-
-       WARN_ON_ONCE(!in_interrupt());
-       if (ehca_debug_level >= 3)
-               ehca_dmp(cpu_online_mask, cpumask_size(), "");
-
-       spin_lock_irqsave(&pool->last_cpu_lock, flags);
-       do {
-               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
-               if (cpu >= nr_cpu_ids)
-                       cpu = cpumask_first(cpu_online_mask);
-               pool->last_cpu = cpu;
-       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
-       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
-
-       return cpu;
-}
-
-static void __queue_comp_task(struct ehca_cq *__cq,
-                             struct ehca_cpu_comp_task *cct,
-                             struct task_struct *thread)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       spin_lock(&__cq->task_lock);
-
-       if (__cq->nr_callbacks == 0) {
-               __cq->nr_callbacks++;
-               list_add_tail(&__cq->entry, &cct->cq_list);
-               cct->cq_jobs++;
-               wake_up_process(thread);
-       } else
-               __cq->nr_callbacks++;
-
-       spin_unlock(&__cq->task_lock);
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-}
-
-static void queue_comp_task(struct ehca_cq *__cq)
-{
-       int cpu_id;
-       struct ehca_cpu_comp_task *cct;
-       struct task_struct *thread;
-       int cq_jobs;
-       unsigned long flags;
-
-       cpu_id = find_next_online_cpu(pool);
-       BUG_ON(!cpu_online(cpu_id));
-
-       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-       BUG_ON(!cct || !thread);
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       cq_jobs = cct->cq_jobs;
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-       if (cq_jobs > 0) {
-               cpu_id = find_next_online_cpu(pool);
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-               BUG_ON(!cct || !thread);
-       }
-       __queue_comp_task(__cq, cct, thread);
-}
-
-static void run_comp_task(struct ehca_cpu_comp_task *cct)
-{
-       struct ehca_cq *cq;
-
-       while (!list_empty(&cct->cq_list)) {
-               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-               spin_unlock_irq(&cct->task_lock);
-
-               comp_event_callback(cq);
-               if (atomic_dec_and_test(&cq->nr_events))
-                       wake_up(&cq->wait_completion);
-
-               spin_lock_irq(&cct->task_lock);
-               spin_lock(&cq->task_lock);
-               cq->nr_callbacks--;
-               if (!cq->nr_callbacks) {
-                       list_del_init(cct->cq_list.next);
-                       cct->cq_jobs--;
-               }
-               spin_unlock(&cq->task_lock);
-       }
-}
-
-static void comp_task_park(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       struct ehca_cpu_comp_task *target;
-       struct task_struct *thread;
-       struct ehca_cq *cq, *tmp;
-       LIST_HEAD(list);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       list_splice_init(&cct->cq_list, &list);
-       spin_unlock_irq(&cct->task_lock);
-
-       cpu = find_next_online_cpu(pool);
-       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
-       spin_lock_irq(&target->task_lock);
-       list_for_each_entry_safe(cq, tmp, &list, entry) {
-               list_del(&cq->entry);
-               __queue_comp_task(cq, target, thread);
-       }
-       spin_unlock_irq(&target->task_lock);
-}
-
-static void comp_task_stop(unsigned int cpu, bool online)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       WARN_ON(!list_empty(&cct->cq_list));
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static int comp_task_should_run(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       return cct->cq_jobs;
-}
-
-static void comp_task(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
-       int cql_empty;
-
-       spin_lock_irq(&cct->task_lock);
-       cql_empty = list_empty(&cct->cq_list);
-       if (!cql_empty) {
-               __set_current_state(TASK_RUNNING);
-               run_comp_task(cct);
-       }
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static struct smp_hotplug_thread comp_pool_threads = {
-       .thread_should_run      = comp_task_should_run,
-       .thread_fn              = comp_task,
-       .thread_comm            = "ehca_comp/%u",
-       .cleanup                = comp_task_stop,
-       .park                   = comp_task_park,
-};
-
-int ehca_create_comp_pool(void)
-{
-       int cpu, ret = -ENOMEM;
-
-       if (!ehca_scaling_code)
-               return 0;
-
-       pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
-       if (pool == NULL)
-               return -ENOMEM;
-
-       spin_lock_init(&pool->last_cpu_lock);
-       pool->last_cpu = cpumask_any(cpu_online_mask);
-
-       pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
-       if (!pool->cpu_comp_tasks)
-               goto out_pool;
-
-       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
-       if (!pool->cpu_comp_threads)
-               goto out_tasks;
-
-       for_each_present_cpu(cpu) {
-               struct ehca_cpu_comp_task *cct;
-
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-               spin_lock_init(&cct->task_lock);
-               INIT_LIST_HEAD(&cct->cq_list);
-       }
-
-       comp_pool_threads.store = pool->cpu_comp_threads;
-       ret = smpboot_register_percpu_thread(&comp_pool_threads);
-       if (ret)
-               goto out_threads;
-
-       pr_info("eHCA scaling code enabled\n");
-       return ret;
-
-out_threads:
-       free_percpu(pool->cpu_comp_threads);
-out_tasks:
-       free_percpu(pool->cpu_comp_tasks);
-out_pool:
-       kfree(pool);
-       return ret;
-}
-
-void ehca_destroy_comp_pool(void)
-{
-       if (!ehca_scaling_code)
-               return;
-
-       smpboot_unregister_percpu_thread(&comp_pool_threads);
-
-       free_percpu(pool->cpu_comp_threads);
-       free_percpu(pool->cpu_comp_tasks);
-       kfree(pool);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.h b/drivers/staging/rdma/ehca/ehca_irq.h
deleted file mode 100644 (file)
index 5370199..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions and structs for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IRQ_H
-#define __EHCA_IRQ_H
-
-
-struct ehca_shca;
-
-#include <linux/interrupt.h>
-#include <linux/types.h>
-
-int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
-void ehca_tasklet_neq(unsigned long data);
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
-void ehca_tasklet_eq(unsigned long data);
-void ehca_process_eq(struct ehca_shca *shca, int is_irq);
-
-struct ehca_cpu_comp_task {
-       struct list_head cq_list;
-       spinlock_t task_lock;
-       int cq_jobs;
-       int active;
-};
-
-struct ehca_comp_pool {
-       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
-       struct task_struct * __percpu *cpu_comp_threads;
-       int last_cpu;
-       spinlock_t last_cpu_lock;
-};
-
-int ehca_create_comp_pool(void);
-void ehca_destroy_comp_pool(void);
-
-#endif
diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h
deleted file mode 100644 (file)
index cca5933..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions for internal functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Dietmar Decker <ddecker@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IVERBS_H__
-#define __EHCA_IVERBS_H__
-
-#include "ehca_classes.h"
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw);
-
-int ehca_query_port(struct ib_device *ibdev, u8 port,
-                   struct ib_port_attr *props);
-
-enum rdma_protocol_type
-ehca_query_protocol(struct ib_device *device, u8 port_num);
-
-int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
-                       struct ehca_sma_attr *attr);
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
-                  union ib_gid *gid);
-
-int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
-                    struct ib_port_modify *props);
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context,
-                           struct ib_udata *udata);
-
-int ehca_dealloc_pd(struct ib_pd *pd);
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_destroy_ah(struct ib_ah *ah);
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata);
-
-int ehca_dereg_mr(struct ib_mr *mr);
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-int ehca_dealloc_mw(struct ib_mw *mw);
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr);
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list, int list_len, u64 iova);
-
-int ehca_unmap_fmr(struct list_head *fmr_list);
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr);
-
-enum ehca_eq_type {
-       EHCA_EQ = 0, /* Event Queue              */
-       EHCA_NEQ     /* Notification Event Queue */
-};
-
-int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
-                  enum ehca_eq_type type, const u32 length);
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata);
-
-int ehca_destroy_cq(struct ib_cq *cq);
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
-
-int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *init_attr,
-                            struct ib_udata *udata);
-
-int ehca_destroy_qp(struct ib_qp *qp);
-
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata);
-
-int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr);
-
-int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr);
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *init_attr,
-                              struct ib_udata *udata);
-
-int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-
-int ehca_destroy_srq(struct ib_srq *srq);
-
-u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
-                   struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata);
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context);
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index);
-
-void ehca_poll_eqs(unsigned long data);
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd);
-
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
-
-#ifdef CONFIG_PPC_64K_PAGES
-void *ehca_alloc_fw_ctrlblock(gfp_t flags);
-void ehca_free_fw_ctrlblock(void *ptr);
-#else
-#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
-#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
-#endif
-
-void ehca_recover_sqp(struct ib_qp *sqp);
-
-#endif
diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c
deleted file mode 100644 (file)
index 832f22f..0000000
+++ /dev/null
@@ -1,1118 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  module start stop, hca detection
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef CONFIG_PPC_64K_PAGES
-#include <linux/slab.h>
-#endif
-
-#include <linux/notifier.h>
-#include <linux/memory.h>
-#include <rdma/ib_mad.h>
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-#define HCAD_VERSION "0029"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
-MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
-MODULE_VERSION(HCAD_VERSION);
-
-static bool ehca_open_aqp1    = 0;
-static int ehca_hw_level      = 0;
-static bool ehca_poll_all_eqs = 1;
-
-int ehca_debug_level   = 0;
-int ehca_nr_ports      = -1;
-bool ehca_use_hp_mr    = 0;
-int ehca_port_act_time = 30;
-int ehca_static_rate   = -1;
-bool ehca_scaling_code = 0;
-int ehca_lock_hcalls   = -1;
-int ehca_max_cq        = -1;
-int ehca_max_qp        = -1;
-
-module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
-module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
-module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
-module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
-module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
-module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
-module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
-module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
-module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
-module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
-module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
-module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
-
-MODULE_PARM_DESC(open_aqp1,
-                "Open AQP1 on startup (default: no)");
-MODULE_PARM_DESC(debug_level,
-                "Amount of debug output (0: none (default), 1: traces, "
-                "2: some dumps, 3: lots)");
-MODULE_PARM_DESC(hw_level,
-                "Hardware level (0: autosensing (default), "
-                "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
-MODULE_PARM_DESC(nr_ports,
-                "number of connected ports (-1: autodetect (default), "
-                "1: port one only, 2: two ports)");
-MODULE_PARM_DESC(use_hp_mr,
-                "Use high performance MRs (default: no)");
-MODULE_PARM_DESC(port_act_time,
-                "Time to wait for port activation (default: 30 sec)");
-MODULE_PARM_DESC(poll_all_eqs,
-                "Poll all event queues periodically (default: yes)");
-MODULE_PARM_DESC(static_rate,
-                "Set permanent static rate (default: no static rate)");
-MODULE_PARM_DESC(scaling_code,
-                "Enable scaling code (default: no)");
-MODULE_PARM_DESC(lock_hcalls,
-                "Serialize all hCalls made by the driver "
-                "(default: autodetect)");
-MODULE_PARM_DESC(number_of_cqs,
-               "Max number of CQs which can be allocated "
-               "(default: autodetect)");
-MODULE_PARM_DESC(number_of_qps,
-               "Max number of QPs which can be allocated "
-               "(default: autodetect)");
-
-DEFINE_RWLOCK(ehca_qp_idr_lock);
-DEFINE_RWLOCK(ehca_cq_idr_lock);
-DEFINE_IDR(ehca_qp_idr);
-DEFINE_IDR(ehca_cq_idr);
-
-static LIST_HEAD(shca_list); /* list of all registered ehcas */
-DEFINE_SPINLOCK(shca_list_lock);
-
-static struct timer_list poll_eqs_timer;
-
-#ifdef CONFIG_PPC_64K_PAGES
-static struct kmem_cache *ctblk_cache;
-
-void *ehca_alloc_fw_ctrlblock(gfp_t flags)
-{
-       void *ret = kmem_cache_zalloc(ctblk_cache, flags);
-       if (!ret)
-               ehca_gen_err("Out of memory for ctblk");
-       return ret;
-}
-
-void ehca_free_fw_ctrlblock(void *ptr)
-{
-       if (ptr)
-               kmem_cache_free(ctblk_cache, ptr);
-
-}
-#endif
-
-int ehca2ib_return_code(u64 ehca_rc)
-{
-       switch (ehca_rc) {
-       case H_SUCCESS:
-               return 0;
-       case H_RESOURCE:             /* Resource in use */
-       case H_BUSY:
-               return -EBUSY;
-       case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
-       case H_CONSTRAINED:          /* resource constraint */
-       case H_NO_MEM:
-               return -ENOMEM;
-       default:
-               return -EINVAL;
-       }
-}
-
-static int ehca_create_slab_caches(void)
-{
-       int ret;
-
-       ret = ehca_init_pd_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create PD SLAB cache.");
-               return ret;
-       }
-
-       ret = ehca_init_cq_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create CQ SLAB cache.");
-               goto create_slab_caches2;
-       }
-
-       ret = ehca_init_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create QP SLAB cache.");
-               goto create_slab_caches3;
-       }
-
-       ret = ehca_init_av_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create AV SLAB cache.");
-               goto create_slab_caches4;
-       }
-
-       ret = ehca_init_mrmw_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create MR&MW SLAB cache.");
-               goto create_slab_caches5;
-       }
-
-       ret = ehca_init_small_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create small queue SLAB cache.");
-               goto create_slab_caches6;
-       }
-
-#ifdef CONFIG_PPC_64K_PAGES
-       ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
-                                       EHCA_PAGESIZE, H_CB_ALIGNMENT,
-                                       SLAB_HWCACHE_ALIGN,
-                                       NULL);
-       if (!ctblk_cache) {
-               ehca_gen_err("Cannot create ctblk SLAB cache.");
-               ehca_cleanup_small_qp_cache();
-               ret = -ENOMEM;
-               goto create_slab_caches6;
-       }
-#endif
-       return 0;
-
-create_slab_caches6:
-       ehca_cleanup_mrmw_cache();
-
-create_slab_caches5:
-       ehca_cleanup_av_cache();
-
-create_slab_caches4:
-       ehca_cleanup_qp_cache();
-
-create_slab_caches3:
-       ehca_cleanup_cq_cache();
-
-create_slab_caches2:
-       ehca_cleanup_pd_cache();
-
-       return ret;
-}
-
-static void ehca_destroy_slab_caches(void)
-{
-       ehca_cleanup_small_qp_cache();
-       ehca_cleanup_mrmw_cache();
-       ehca_cleanup_av_cache();
-       ehca_cleanup_qp_cache();
-       ehca_cleanup_cq_cache();
-       ehca_cleanup_pd_cache();
-#ifdef CONFIG_PPC_64K_PAGES
-       kmem_cache_destroy(ctblk_cache);
-#endif
-}
-
-#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
-#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
-
-static struct cap_descr {
-       u64 mask;
-       char *descr;
-} hca_cap_descr[] = {
-       { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
-       { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
-       { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
-       { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
-       { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
-       { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
-       { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
-       { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
-       { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
-       { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
-       { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
-       { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
-       { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
-       { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
-       { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
-       { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
-       { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
-       { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
-};
-
-static int ehca_sense_attributes(struct ehca_shca *shca)
-{
-       int i, ret = 0;
-       u64 h_ret;
-       struct hipz_query_hca *rblock;
-       struct hipz_query_port *port;
-       const char *loc_code;
-
-       static const u32 pgsize_map[] = {
-               HCA_CAP_MR_PGSIZE_4K,  0x1000,
-               HCA_CAP_MR_PGSIZE_64K, 0x10000,
-               HCA_CAP_MR_PGSIZE_1M,  0x100000,
-               HCA_CAP_MR_PGSIZE_16M, 0x1000000,
-       };
-
-       ehca_gen_dbg("Probing adapter %s...",
-                    shca->ofdev->dev.of_node->full_name);
-       loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
-                                  NULL);
-       if (loc_code)
-               ehca_gen_dbg(" ... location lode=%s", loc_code);
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_gen_err("Cannot allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query device properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       if (ehca_nr_ports == 1)
-               shca->num_ports = 1;
-       else
-               shca->num_ports = (u8)rblock->num_ports;
-
-       ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
-
-       if (ehca_hw_level == 0) {
-               u32 hcaaver;
-               u32 revid;
-
-               hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
-               revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
-
-               ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
-
-               if (hcaaver == 1) {
-                       if (revid <= 3)
-                               shca->hw_level = 0x10 | (revid + 1);
-                       else
-                               shca->hw_level = 0x14;
-               } else if (hcaaver == 2) {
-                       if (revid == 0)
-                               shca->hw_level = 0x21;
-                       else if (revid == 0x10)
-                               shca->hw_level = 0x22;
-                       else if (revid == 0x20 || revid == 0x21)
-                               shca->hw_level = 0x23;
-               }
-
-               if (!shca->hw_level) {
-                       ehca_gen_warn("unknown hardware version"
-                                     " - assuming default level");
-                       shca->hw_level = 0x22;
-               }
-       } else
-               shca->hw_level = ehca_hw_level;
-       ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
-
-       shca->hca_cap = rblock->hca_cap_indicators;
-       ehca_gen_dbg(" ... HCA capabilities:");
-       for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
-               if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
-                       ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
-
-       /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
-        * a firmware property, so it's valid across all adapters
-        */
-       if (ehca_lock_hcalls == -1)
-               ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
-                                       shca->hca_cap);
-
-       /* translate supported MR page sizes; always support 4K */
-       shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
-       for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
-               if (rblock->memory_page_size_supported & pgsize_map[i])
-                       shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
-
-       /* Set maximum number of CQs and QPs to calculate EQ size */
-       if (shca->max_num_qps == -1)
-               shca->max_num_qps = min_t(int, rblock->max_qp,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
-               ehca_gen_warn("The requested number of QPs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_qp, rblock->max_qp);
-               shca->max_num_qps = rblock->max_qp;
-       }
-
-       if (shca->max_num_cqs == -1)
-               shca->max_num_cqs = min_t(int, rblock->max_cq,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
-               ehca_gen_warn("The requested number of CQs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_cq, rblock->max_cq);
-       }
-
-       /* query max MTU from first port -- it's the same for all ports */
-       port = (struct hipz_query_port *)rblock;
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query port properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       shca->max_mtu = port->max_mtu;
-
-sense_attributes1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int init_node_guid(struct ehca_shca *shca)
-{
-       int ret = 0;
-       struct hipz_query_hca *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto init_node_guid1;
-       }
-
-       memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
-
-init_node_guid1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
-                              struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = ehca_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-       return 0;
-}
-
-static int ehca_init_device(struct ehca_shca *shca)
-{
-       int ret;
-
-       ret = init_node_guid(shca);
-       if (ret)
-               return ret;
-
-       strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
-       shca->ib_device.owner               = THIS_MODULE;
-
-       shca->ib_device.uverbs_abi_ver      = 8;
-       shca->ib_device.uverbs_cmd_mask     =
-               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
-               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
-               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
-               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
-               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
-               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
-               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
-               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
-               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
-               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
-               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
-
-       shca->ib_device.node_type           = RDMA_NODE_IB_CA;
-       shca->ib_device.phys_port_cnt       = shca->num_ports;
-       shca->ib_device.num_comp_vectors    = 1;
-       shca->ib_device.dma_device          = &shca->ofdev->dev;
-       shca->ib_device.query_device        = ehca_query_device;
-       shca->ib_device.query_port          = ehca_query_port;
-       shca->ib_device.query_gid           = ehca_query_gid;
-       shca->ib_device.query_pkey          = ehca_query_pkey;
-       /* shca->in_device.modify_device    = ehca_modify_device    */
-       shca->ib_device.modify_port         = ehca_modify_port;
-       shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
-       shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
-       shca->ib_device.alloc_pd            = ehca_alloc_pd;
-       shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
-       shca->ib_device.create_ah           = ehca_create_ah;
-       /* shca->ib_device.modify_ah        = ehca_modify_ah;       */
-       shca->ib_device.query_ah            = ehca_query_ah;
-       shca->ib_device.destroy_ah          = ehca_destroy_ah;
-       shca->ib_device.create_qp           = ehca_create_qp;
-       shca->ib_device.modify_qp           = ehca_modify_qp;
-       shca->ib_device.query_qp            = ehca_query_qp;
-       shca->ib_device.destroy_qp          = ehca_destroy_qp;
-       shca->ib_device.post_send           = ehca_post_send;
-       shca->ib_device.post_recv           = ehca_post_recv;
-       shca->ib_device.create_cq           = ehca_create_cq;
-       shca->ib_device.destroy_cq          = ehca_destroy_cq;
-       shca->ib_device.resize_cq           = ehca_resize_cq;
-       shca->ib_device.poll_cq             = ehca_poll_cq;
-       /* shca->ib_device.peek_cq          = ehca_peek_cq;         */
-       shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
-       /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
-       shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
-       shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
-       shca->ib_device.dereg_mr            = ehca_dereg_mr;
-       shca->ib_device.alloc_mw            = ehca_alloc_mw;
-       shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
-       shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
-       shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
-       shca->ib_device.unmap_fmr           = ehca_unmap_fmr;
-       shca->ib_device.dealloc_fmr         = ehca_dealloc_fmr;
-       shca->ib_device.attach_mcast        = ehca_attach_mcast;
-       shca->ib_device.detach_mcast        = ehca_detach_mcast;
-       shca->ib_device.process_mad         = ehca_process_mad;
-       shca->ib_device.mmap                = ehca_mmap;
-       shca->ib_device.dma_ops             = &ehca_dma_mapping_ops;
-       shca->ib_device.get_port_immutable  = ehca_port_immutable;
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               shca->ib_device.uverbs_cmd_mask |=
-                       (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
-
-               shca->ib_device.create_srq          = ehca_create_srq;
-               shca->ib_device.modify_srq          = ehca_modify_srq;
-               shca->ib_device.query_srq           = ehca_query_srq;
-               shca->ib_device.destroy_srq         = ehca_destroy_srq;
-               shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
-       }
-
-       return ret;
-}
-
-static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
-{
-       struct ehca_sport *sport = &shca->sport[port - 1];
-       struct ib_cq *ibcq;
-       struct ib_qp *ibqp;
-       struct ib_qp_init_attr qp_init_attr;
-       struct ib_cq_init_attr cq_attr = {};
-       int ret;
-
-       if (sport->ibcq_aqp1) {
-               ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
-               return -EPERM;
-       }
-
-       cq_attr.cqe = 10;
-       ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
-                           &cq_attr);
-       if (IS_ERR(ibcq)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
-               return PTR_ERR(ibcq);
-       }
-       sport->ibcq_aqp1 = ibcq;
-
-       if (sport->ibqp_sqp[IB_QPT_GSI]) {
-               ehca_err(&shca->ib_device, "AQP1 QP is already created.");
-               ret = -EPERM;
-               goto create_aqp1;
-       }
-
-       memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
-       qp_init_attr.send_cq          = ibcq;
-       qp_init_attr.recv_cq          = ibcq;
-       qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
-       qp_init_attr.cap.max_send_wr  = 100;
-       qp_init_attr.cap.max_recv_wr  = 100;
-       qp_init_attr.cap.max_send_sge = 2;
-       qp_init_attr.cap.max_recv_sge = 1;
-       qp_init_attr.qp_type          = IB_QPT_GSI;
-       qp_init_attr.port_num         = port;
-       qp_init_attr.qp_context       = NULL;
-       qp_init_attr.event_handler    = NULL;
-       qp_init_attr.srq              = NULL;
-
-       ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
-       if (IS_ERR(ibqp)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
-               ret = PTR_ERR(ibqp);
-               goto create_aqp1;
-       }
-       sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
-
-       return 0;
-
-create_aqp1:
-       ib_destroy_cq(sport->ibcq_aqp1);
-       return ret;
-}
-
-static int ehca_destroy_aqp1(struct ehca_sport *sport)
-{
-       int ret;
-
-       ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
-       if (ret) {
-               ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
-               return ret;
-       }
-
-       ret = ib_destroy_cq(sport->ibcq_aqp1);
-       if (ret)
-               ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
-
-       return ret;
-}
-
-static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
-{
-       return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
-}
-
-static ssize_t ehca_store_debug_level(struct device_driver *ddp,
-                                     const char *buf, size_t count)
-{
-       int value = (*buf) - '0';
-       if (value >= 0 && value <= 9)
-               ehca_debug_level = value;
-       return 1;
-}
-
-static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
-                  ehca_show_debug_level, ehca_store_debug_level);
-
-static struct attribute *ehca_drv_attrs[] = {
-       &driver_attr_debug_level.attr,
-       NULL
-};
-
-static struct attribute_group ehca_drv_attr_grp = {
-       .attrs = ehca_drv_attrs
-};
-
-static const struct attribute_group *ehca_drv_attr_groups[] = {
-       &ehca_drv_attr_grp,
-       NULL,
-};
-
-#define EHCA_RESOURCE_ATTR(name)                                           \
-static ssize_t  ehca_show_##name(struct device *dev,                       \
-                                struct device_attribute *attr,            \
-                                char *buf)                                \
-{                                                                         \
-       struct ehca_shca *shca;                                            \
-       struct hipz_query_hca *rblock;                                     \
-       int data;                                                          \
-                                                                          \
-       shca = dev_get_drvdata(dev);                                       \
-                                                                          \
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);                      \
-       if (!rblock) {                                                     \
-               dev_err(dev, "Can't allocate rblock memory.\n");           \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
-               dev_err(dev, "Can't query device properties\n");           \
-               ehca_free_fw_ctrlblock(rblock);                            \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       data = rblock->name;                                               \
-       ehca_free_fw_ctrlblock(rblock);                                    \
-                                                                          \
-       if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))     \
-               return snprintf(buf, 256, "1\n");                          \
-       else                                                               \
-               return snprintf(buf, 256, "%d\n", data);                   \
-                                                                          \
-}                                                                         \
-static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
-
-EHCA_RESOURCE_ATTR(num_ports);
-EHCA_RESOURCE_ATTR(hw_ver);
-EHCA_RESOURCE_ATTR(max_eq);
-EHCA_RESOURCE_ATTR(cur_eq);
-EHCA_RESOURCE_ATTR(max_cq);
-EHCA_RESOURCE_ATTR(cur_cq);
-EHCA_RESOURCE_ATTR(max_qp);
-EHCA_RESOURCE_ATTR(cur_qp);
-EHCA_RESOURCE_ATTR(max_mr);
-EHCA_RESOURCE_ATTR(cur_mr);
-EHCA_RESOURCE_ATTR(max_mw);
-EHCA_RESOURCE_ATTR(cur_mw);
-EHCA_RESOURCE_ATTR(max_pd);
-EHCA_RESOURCE_ATTR(max_ah);
-
-static ssize_t ehca_show_adapter_handle(struct device *dev,
-                                       struct device_attribute *attr,
-                                       char *buf)
-{
-       struct ehca_shca *shca = dev_get_drvdata(dev);
-
-       return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
-
-}
-static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
-
-static struct attribute *ehca_dev_attrs[] = {
-       &dev_attr_adapter_handle.attr,
-       &dev_attr_num_ports.attr,
-       &dev_attr_hw_ver.attr,
-       &dev_attr_max_eq.attr,
-       &dev_attr_cur_eq.attr,
-       &dev_attr_max_cq.attr,
-       &dev_attr_cur_cq.attr,
-       &dev_attr_max_qp.attr,
-       &dev_attr_cur_qp.attr,
-       &dev_attr_max_mr.attr,
-       &dev_attr_cur_mr.attr,
-       &dev_attr_max_mw.attr,
-       &dev_attr_cur_mw.attr,
-       &dev_attr_max_pd.attr,
-       &dev_attr_max_ah.attr,
-       NULL
-};
-
-static struct attribute_group ehca_dev_attr_grp = {
-       .attrs = ehca_dev_attrs
-};
-
-static int ehca_probe(struct platform_device *dev)
-{
-       struct ehca_shca *shca;
-       const u64 *handle;
-       struct ib_pd *ibpd;
-       int ret, i, eq_size;
-       unsigned long flags;
-
-       handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
-       if (!handle) {
-               ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       if (!(*handle)) {
-               ehca_gen_err("Wrong eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
-       if (!shca) {
-               ehca_gen_err("Cannot allocate shca memory.");
-               return -ENOMEM;
-       }
-
-       mutex_init(&shca->modify_mutex);
-       atomic_set(&shca->num_cqs, 0);
-       atomic_set(&shca->num_qps, 0);
-       shca->max_num_qps = ehca_max_qp;
-       shca->max_num_cqs = ehca_max_cq;
-
-       for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
-               spin_lock_init(&shca->sport[i].mod_sqp_lock);
-
-       shca->ofdev = dev;
-       shca->ipz_hca_handle.handle = *handle;
-       dev_set_drvdata(&dev->dev, shca);
-
-       ret = ehca_sense_attributes(shca);
-       if (ret < 0) {
-               ehca_gen_err("Cannot sense eHCA attributes.");
-               goto probe1;
-       }
-
-       ret = ehca_init_device(shca);
-       if (ret) {
-               ehca_gen_err("Cannot init ehca  device struct");
-               goto probe1;
-       }
-
-       eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
-       /* create event queues */
-       ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create EQ.");
-               goto probe1;
-       }
-
-       ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create NEQ.");
-               goto probe3;
-       }
-
-       /* create internal protection domain */
-       ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
-       if (IS_ERR(ibpd)) {
-               ehca_err(&shca->ib_device, "Cannot create internal PD.");
-               ret = PTR_ERR(ibpd);
-               goto probe4;
-       }
-
-       shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
-       shca->pd->ib_pd.device = &shca->ib_device;
-
-       /* create internal max MR */
-       ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
-
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
-                        ret);
-               goto probe5;
-       }
-
-       ret = ib_register_device(&shca->ib_device, NULL);
-       if (ret) {
-               ehca_err(&shca->ib_device,
-                        "ib_register_device() failed ret=%i", ret);
-               goto probe6;
-       }
-
-       /* create AQP1 for port 1 */
-       if (ehca_open_aqp1 == 1) {
-               shca->sport[0].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 1);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 1.");
-                       goto probe7;
-               }
-       }
-
-       /* create AQP1 for port 2 */
-       if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
-               shca->sport[1].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 2);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 2.");
-                       goto probe8;
-               }
-       }
-
-       ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-       if (ret) /* only complain; we can live without attributes */
-               ehca_err(&shca->ib_device,
-                        "Cannot create device attributes  ret=%d", ret);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_add(&shca->shca_list, &shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return 0;
-
-probe8:
-       ret = ehca_destroy_aqp1(&shca->sport[0]);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy AQP1 for port 1. ret=%i", ret);
-
-probe7:
-       ib_unregister_device(&shca->ib_device);
-
-probe6:
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%x", ret);
-
-probe5:
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%x", ret);
-
-probe4:
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy NEQ. ret=%x", ret);
-
-probe3:
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy EQ. ret=%x", ret);
-
-probe1:
-       ib_dealloc_device(&shca->ib_device);
-
-       return -EINVAL;
-}
-
-static int ehca_remove(struct platform_device *dev)
-{
-       struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
-       unsigned long flags;
-       int ret;
-
-       sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-
-       if (ehca_open_aqp1 == 1) {
-               int i;
-               for (i = 0; i < shca->num_ports; i++) {
-                       ret = ehca_destroy_aqp1(&shca->sport[i]);
-                       if (ret)
-                               ehca_err(&shca->ib_device,
-                                        "Cannot destroy AQP1 for port %x "
-                                        "ret=%i", ret, i);
-               }
-       }
-
-       ib_unregister_device(&shca->ib_device);
-
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%i", ret);
-
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
-
-       ib_dealloc_device(&shca->ib_device);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_del(&shca->shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return ret;
-}
-
-static struct of_device_id ehca_device_table[] =
-{
-       {
-               .name       = "lhca",
-               .compatible = "IBM,lhca",
-       },
-       {},
-};
-MODULE_DEVICE_TABLE(of, ehca_device_table);
-
-static struct platform_driver ehca_driver = {
-       .probe       = ehca_probe,
-       .remove      = ehca_remove,
-       .driver = {
-               .name = "ehca",
-               .owner = THIS_MODULE,
-               .groups = ehca_drv_attr_groups,
-               .of_match_table = ehca_device_table,
-       },
-};
-
-void ehca_poll_eqs(unsigned long data)
-{
-       struct ehca_shca *shca;
-
-       spin_lock(&shca_list_lock);
-       list_for_each_entry(shca, &shca_list, shca_list) {
-               if (shca->eq.is_initialized) {
-                       /* call deadman proc only if eq ptr does not change */
-                       struct ehca_eq *eq = &shca->eq;
-                       int max = 3;
-                       volatile u64 q_ofs, q_ofs2;
-                       unsigned long flags;
-                       spin_lock_irqsave(&eq->spinlock, flags);
-                       q_ofs = eq->ipz_queue.current_q_offset;
-                       spin_unlock_irqrestore(&eq->spinlock, flags);
-                       do {
-                               spin_lock_irqsave(&eq->spinlock, flags);
-                               q_ofs2 = eq->ipz_queue.current_q_offset;
-                               spin_unlock_irqrestore(&eq->spinlock, flags);
-                               max--;
-                       } while (q_ofs == q_ofs2 && max > 0);
-                       if (q_ofs == q_ofs2)
-                               ehca_process_eq(shca, 0);
-               }
-       }
-       mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
-       spin_unlock(&shca_list_lock);
-}
-
-static int ehca_mem_notifier(struct notifier_block *nb,
-                            unsigned long action, void *data)
-{
-       static unsigned long ehca_dmem_warn_time;
-       unsigned long flags;
-
-       switch (action) {
-       case MEM_CANCEL_OFFLINE:
-       case MEM_CANCEL_ONLINE:
-       case MEM_ONLINE:
-       case MEM_OFFLINE:
-               return NOTIFY_OK;
-       case MEM_GOING_ONLINE:
-       case MEM_GOING_OFFLINE:
-               /* only ok if no hca is attached to the lpar */
-               spin_lock_irqsave(&shca_list_lock, flags);
-               if (list_empty(&shca_list)) {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       return NOTIFY_OK;
-               } else {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       if (printk_timed_ratelimit(&ehca_dmem_warn_time,
-                                                  30 * 1000))
-                               ehca_gen_err("DMEM operations are not allowed"
-                                            "in conjunction with eHCA");
-                       return NOTIFY_BAD;
-               }
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block ehca_mem_nb = {
-       .notifier_call = ehca_mem_notifier,
-};
-
-static int __init ehca_module_init(void)
-{
-       int ret;
-
-       printk(KERN_INFO "eHCA Infiniband Device Driver "
-              "(Version " HCAD_VERSION ")\n");
-
-       ret = ehca_create_comp_pool();
-       if (ret) {
-               ehca_gen_err("Cannot create comp pool.");
-               return ret;
-       }
-
-       ret = ehca_create_slab_caches();
-       if (ret) {
-               ehca_gen_err("Cannot create SLAB caches");
-               ret = -ENOMEM;
-               goto module_init1;
-       }
-
-       ret = ehca_create_busmap();
-       if (ret) {
-               ehca_gen_err("Cannot create busmap.");
-               goto module_init2;
-       }
-
-       ret = ibmebus_register_driver(&ehca_driver);
-       if (ret) {
-               ehca_gen_err("Cannot register eHCA device driver");
-               ret = -EINVAL;
-               goto module_init3;
-       }
-
-       ret = register_memory_notifier(&ehca_mem_nb);
-       if (ret) {
-               ehca_gen_err("Failed registering memory add/remove notifier");
-               goto module_init4;
-       }
-
-       if (ehca_poll_all_eqs != 1) {
-               ehca_gen_err("WARNING!!!");
-               ehca_gen_err("It is possible to lose interrupts.");
-       } else {
-               init_timer(&poll_eqs_timer);
-               poll_eqs_timer.function = ehca_poll_eqs;
-               poll_eqs_timer.expires = jiffies + HZ;
-               add_timer(&poll_eqs_timer);
-       }
-
-       return 0;
-
-module_init4:
-       ibmebus_unregister_driver(&ehca_driver);
-
-module_init3:
-       ehca_destroy_busmap();
-
-module_init2:
-       ehca_destroy_slab_caches();
-
-module_init1:
-       ehca_destroy_comp_pool();
-       return ret;
-};
-
-static void __exit ehca_module_exit(void)
-{
-       if (ehca_poll_all_eqs == 1)
-               del_timer_sync(&poll_eqs_timer);
-
-       ibmebus_unregister_driver(&ehca_driver);
-
-       unregister_memory_notifier(&ehca_mem_nb);
-
-       ehca_destroy_busmap();
-
-       ehca_destroy_slab_caches();
-
-       ehca_destroy_comp_pool();
-
-       idr_destroy(&ehca_cq_idr);
-       idr_destroy(&ehca_qp_idr);
-};
-
-module_init(ehca_module_init);
-module_exit(ehca_module_exit);
diff --git a/drivers/staging/rdma/ehca/ehca_mcast.c b/drivers/staging/rdma/ehca/ehca_mcast.c
deleted file mode 100644 (file)
index cec1815..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  mcast  functions
- *
- *  Authors: Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define MAX_MC_LID 0xFFFE
-#define MIN_MC_LID 0xC000      /* Multicast limits */
-#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
-#define EHCA_VALID_MULTICAST_LID(lid) \
-       (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
-
-int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid mulitcast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
-
-int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->pd->device,
-                                             struct ehca_shca, ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid mulitcast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c
deleted file mode 100644 (file)
index 3367205..0000000
+++ /dev/null
@@ -1,2202 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <rdma/ib_umem.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "hcp_if.h"
-#include "hipz_hw.h"
-
-#define NUM_CHUNKS(length, chunk_size) \
-       (((length) + (chunk_size - 1)) / (chunk_size))
-
-/* max number of rpages (per hcall register_rpages) */
-#define MAX_RPAGES 512
-
-/* DMEM toleration management */
-#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
-#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
-#define EHCA_HUGEPAGESHIFT     34
-#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
-#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
-#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
-#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
-#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
-#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
-#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
-#define EHCA_DIR_MAP_SIZE (0x10000)
-#define EHCA_ENT_MAP_SIZE (0x10000)
-#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
-
-static unsigned long ehca_mr_len;
-
-/*
- * Memory map data structures
- */
-struct ehca_dir_bmap {
-       u64 ent[EHCA_MAP_ENTRIES];
-};
-struct ehca_top_bmap {
-       struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
-};
-struct ehca_bmap {
-       struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
-};
-
-static struct ehca_bmap *ehca_bmap;
-
-static struct kmem_cache *mr_cache;
-static struct kmem_cache *mw_cache;
-
-enum ehca_mr_pgsize {
-       EHCA_MR_PGSIZE4K  = 0x1000L,
-       EHCA_MR_PGSIZE64K = 0x10000L,
-       EHCA_MR_PGSIZE1M  = 0x100000L,
-       EHCA_MR_PGSIZE16M = 0x1000000L
-};
-
-#define EHCA_MR_PGSHIFT4K  12
-#define EHCA_MR_PGSHIFT64K 16
-#define EHCA_MR_PGSHIFT1M  20
-#define EHCA_MR_PGSHIFT16M 24
-
-static u64 ehca_map_vaddr(void *caddr);
-
-static u32 ehca_encode_hwpage_size(u32 pgsize)
-{
-       int log = ilog2(pgsize);
-       WARN_ON(log < 12 || log > 24 || log & 3);
-       return (log - 12) / 4;
-}
-
-static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
-{
-       return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
-}
-
-static struct ehca_mr *ehca_mr_new(void)
-{
-       struct ehca_mr *me;
-
-       me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mrlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mr_delete(struct ehca_mr *me)
-{
-       kmem_cache_free(mr_cache, me);
-}
-
-static struct ehca_mw *ehca_mw_new(void)
-{
-       struct ehca_mw *me;
-
-       me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mwlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mw_delete(struct ehca_mw *me)
-{
-       kmem_cache_free(mw_cache, me);
-}
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-
-       if (shca->maxmr) {
-               e_maxmr = ehca_mr_new();
-               if (!e_maxmr) {
-                       ehca_err(&shca->ib_device, "out of memory");
-                       ib_mr = ERR_PTR(-ENOMEM);
-                       goto get_dma_mr_exit0;
-               }
-
-               ret = ehca_reg_maxmr(shca, e_maxmr,
-                                    (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
-                                    mr_access_flags, e_pd,
-                                    &e_maxmr->ib.ib_mr.lkey,
-                                    &e_maxmr->ib.ib_mr.rkey);
-               if (ret) {
-                       ehca_mr_delete(e_maxmr);
-                       ib_mr = ERR_PTR(ret);
-                       goto get_dma_mr_exit0;
-               }
-               ib_mr = &e_maxmr->ib.ib_mr;
-       } else {
-               ehca_err(&shca->ib_device, "no internal max-MR exist!");
-               ib_mr = ERR_PTR(-EINVAL);
-               goto get_dma_mr_exit0;
-       }
-
-get_dma_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
-                        PTR_ERR(ib_mr), pd, mr_access_flags);
-       return ib_mr;
-} /* end ehca_get_dma_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata)
-{
-       struct ib_mr *ib_mr;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       int ret, page_shift;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hwpage_size;
-
-       if (!pd) {
-               ehca_gen_err("bad pd=%p", pd);
-               return ERR_PTR(-EFAULT);
-       }
-
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       if (length == 0 || virt + length < virt) {
-               ehca_err(pd->device, "bad input values: length=%llx "
-                        "virt_base=%llx", length, virt);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
-                                mr_access_flags, 0);
-       if (IS_ERR(e_mr->umem)) {
-               ib_mr = (void *)e_mr->umem;
-               goto reg_user_mr_exit1;
-       }
-
-       if (e_mr->umem->page_size != PAGE_SIZE) {
-               ehca_err(pd->device, "page size not supported, "
-                        "e_mr->umem->page_size=%x", e_mr->umem->page_size);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit2;
-       }
-
-       /* determine number of MR pages */
-       num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
-       /* select proper hw_pgsize */
-       page_shift = PAGE_SHIFT;
-       if (e_mr->umem->hugetlb) {
-               /* determine page_shift, clamp between 4K and 16M */
-               page_shift = (fls64(length - 1) + 3) & ~3;
-               page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
-                                EHCA_MR_PGSHIFT16M);
-       }
-       hwpage_size = 1UL << page_shift;
-
-       /* now that we have the desired page size, shift until it's
-        * supported, too. 4K is always supported, so this terminates.
-        */
-       while (!(hwpage_size & shca->hca_cap_mr_pgsize))
-               hwpage_size >>= 4;
-
-reg_user_mr_fallback:
-       num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_USER;
-       pginfo.hwpage_size = hwpage_size;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.u.usr.region = e_mr->umem;
-       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
-       pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
-       ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
-                         e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-       if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
-               ehca_warn(pd->device, "failed to register mr "
-                         "with hwpage_size=%llx", hwpage_size);
-               ehca_info(pd->device, "try to register mr with "
-                         "kpage_size=%lx", PAGE_SIZE);
-               /*
-                * this means kpages are not contiguous for a hw page
-                * try kernel page size as fallback solution
-                */
-               hwpage_size = PAGE_SIZE;
-               goto reg_user_mr_fallback;
-       }
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_user_mr_exit2;
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_user_mr_exit2:
-       ib_umem_release(e_mr->umem);
-reg_user_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_user_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
-                        PTR_ERR(ib_mr), pd, mr_access_flags, udata);
-       return ib_mr;
-} /* end ehca_reg_user_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_mr(struct ib_mr *mr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       } else if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       }
-
-       /* TODO: BUSY: MR still has bound window(s) */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
-                        "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
-                        h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto dereg_mr_exit0;
-       }
-
-       if (e_mr->umem)
-               ib_umem_release(e_mr->umem);
-
-       /* successful deregistration */
-       ehca_mr_delete(e_mr);
-
-dereg_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
-       return ret;
-} /* end ehca_dereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
-       struct ib_mw *ib_mw;
-       u64 h_ret;
-       struct ehca_mw *e_mw;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_mw_hipzout_parms hipzout;
-
-       if (type != IB_MW_TYPE_1)
-               return ERR_PTR(-EINVAL);
-
-       e_mw = ehca_mw_new();
-       if (!e_mw) {
-               ib_mw = ERR_PTR(-ENOMEM);
-               goto alloc_mw_exit0;
-       }
-
-       h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
-                        "shca=%p hca_hndl=%llx mw=%p",
-                        h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
-               ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto alloc_mw_exit1;
-       }
-       /* successful MW allocation */
-       e_mw->ipz_mw_handle = hipzout.handle;
-       e_mw->ib_mw.rkey    = hipzout.rkey;
-       return &e_mw->ib_mw;
-
-alloc_mw_exit1:
-       ehca_mw_delete(e_mw);
-alloc_mw_exit0:
-       if (IS_ERR(ib_mw))
-               ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
-       return ib_mw;
-} /* end ehca_alloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_mw(struct ib_mw *mw)
-{
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mw->device, struct ehca_shca, ib_device);
-       struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
-
-       h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
-                        "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
-                        h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
-                        e_mw->ipz_mw_handle.handle);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful deallocation */
-       ehca_mw_delete(e_mw);
-       return 0;
-} /* end ehca_dealloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr)
-{
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr *e_fmr;
-       int ret;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       u64 hw_pgsize;
-
-       /* check other parameters */
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if (mr_access_flags & IB_ACCESS_MW_BIND) {
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
-               ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
-                        "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
-                        fmr_attr->max_pages, fmr_attr->max_maps,
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       hw_pgsize = 1 << fmr_attr->page_shift;
-       if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
-               ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       e_fmr = ehca_mr_new();
-       if (!e_fmr) {
-               ib_fmr = ERR_PTR(-ENOMEM);
-               goto alloc_fmr_exit0;
-       }
-       e_fmr->flags |= EHCA_MR_FLAG_FMR;
-
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.hwpage_size = hw_pgsize;
-       /*
-        * pginfo.num_hwpages==0, ie register_rpages() will not be called
-        * but deferred to map_phys_fmr()
-        */
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         fmr_attr->max_pages * (1 << fmr_attr->page_shift),
-                         mr_access_flags, e_pd, &pginfo,
-                         &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               ib_fmr = ERR_PTR(ret);
-               goto alloc_fmr_exit1;
-       }
-
-       /* successful */
-       e_fmr->hwpage_size = hw_pgsize;
-       e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
-       e_fmr->fmr_max_pages = fmr_attr->max_pages;
-       e_fmr->fmr_max_maps = fmr_attr->max_maps;
-       e_fmr->fmr_map_cnt = 0;
-       return &e_fmr->ib.ib_fmr;
-
-alloc_fmr_exit1:
-       ehca_mr_delete(e_fmr);
-alloc_fmr_exit0:
-       return ib_fmr;
-} /* end ehca_alloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list,
-                     int list_len,
-                     u64 iova)
-{
-       int ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-       struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       u32 tmp_lkey, tmp_rkey;
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
-       if (ret)
-               goto map_phys_fmr_exit0;
-       if (iova % e_fmr->fmr_page_size) {
-               /* only whole-numbered pages */
-               ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
-                        iova, e_fmr->fmr_page_size);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
-               /* HCAD does not limit the maps, however trace this anyway */
-               ehca_info(fmr->device, "map limit exceeded, fmr=%p "
-                         "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
-                         fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
-       }
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       pginfo.num_kpages = list_len;
-       pginfo.hwpage_size = e_fmr->hwpage_size;
-       pginfo.num_hwpages =
-               list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
-       pginfo.u.fmr.page_list = page_list;
-       pginfo.next_hwpage =
-               (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
-       pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
-
-       ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
-                           list_len * e_fmr->fmr_page_size,
-                           e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto map_phys_fmr_exit0;
-
-       /* successful reregistration */
-       e_fmr->fmr_map_cnt++;
-       e_fmr->ib.ib_fmr.lkey = tmp_lkey;
-       e_fmr->ib.ib_fmr.rkey = tmp_rkey;
-       return 0;
-
-map_phys_fmr_exit0:
-       if (ret)
-               ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
-                        "iova=%llx", ret, fmr, page_list, list_len, iova);
-       return ret;
-} /* end ehca_map_phys_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_fmr(struct list_head *fmr_list)
-{
-       int ret = 0;
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca = NULL;
-       struct ehca_shca *prev_shca;
-       struct ehca_mr *e_fmr;
-       u32 num_fmr = 0;
-       u32 unmap_fmr_cnt = 0;
-
-       /* check all FMR belong to same SHCA, and check internal flag */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               prev_shca = shca;
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               if ((shca != prev_shca) && prev_shca) {
-                       ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
-                                "prev_shca=%p e_fmr=%p",
-                                shca, prev_shca, e_fmr);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-                       ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
-                                "e_fmr->flags=%x", e_fmr, e_fmr->flags);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               num_fmr++;
-       }
-
-       /* loop over all FMRs to unmap */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               unmap_fmr_cnt++;
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               ret = ehca_unmap_one_fmr(shca, e_fmr);
-               if (ret) {
-                       /* unmap failed, stop unmapping of rest of FMRs */
-                       ehca_err(&shca->ib_device, "unmap of one FMR failed, "
-                                "stop rest, e_fmr=%p num_fmr=%x "
-                                "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
-                                unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
-                       goto unmap_fmr_exit0;
-               }
-       }
-
-unmap_fmr_exit0:
-       if (ret)
-               ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
-                            ret, fmr_list, num_fmr, unmap_fmr_cnt);
-       return ret;
-} /* end ehca_unmap_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr)
-{
-       int ret;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto free_fmr_exit0;
-       }
-
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
-                        "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle, fmr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto free_fmr_exit0;
-       }
-       /* successful deregistration */
-       ehca_mr_delete(e_fmr);
-       return 0;
-
-free_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
-       return ret;
-} /* end ehca_dealloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo);
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey, /*OUT*/
-               u32 *rkey, /*OUT*/
-               enum ehca_reg_type reg_type)
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-       if (ehca_use_hp_mr == 1)
-               hipz_acl |= 0x00000001;
-
-       h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
-                                        (u64)iova_start, size, hipz_acl,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
-                        "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_mr_exit0;
-       }
-
-       e_mr->ipz_mr_handle = hipzout.handle;
-
-       if (reg_type == EHCA_REG_BUSMAP_MR)
-               ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
-       else if (reg_type == EHCA_REG_MR)
-               ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
-       else
-               ret = -EINVAL;
-
-       if (ret)
-               goto ehca_reg_mr_exit1;
-
-       /* successful registration */
-       e_mr->num_kpages = pginfo->num_kpages;
-       e_mr->num_hwpages = pginfo->num_hwpages;
-       e_mr->hwpage_size = pginfo->hwpage_size;
-       e_mr->start = iova_start;
-       e_mr->size = size;
-       e_mr->acl = acl;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_mr_exit1:
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
-                        h_ret, shca, e_mr, iova_start, size, acl, e_pd,
-                        hipzout.lkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages, ret);
-               ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
-                        "not recoverable");
-       }
-ehca_reg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx",
-                        ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
-                        pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo)
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 rnum;
-       u64 rpage;
-       u32 i;
-       u64 *kpage;
-
-       if (!pginfo->num_hwpages) /* in case of fmr */
-               return 0;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_reg_mr_rpages_exit0;
-       }
-
-       /* max MAX_RPAGES ehca mr pages per register call */
-       for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
-                       if (rnum == 0)
-                               rnum = MAX_RPAGES;      /* last shot is full */
-               } else
-                       rnum = MAX_RPAGES;
-
-               ret = ehca_set_pagebuf(pginfo, rnum, kpage);
-               if (ret) {
-                       ehca_err(&shca->ib_device, "ehca_set_pagebuf "
-                                "bad rc, ret=%i rnum=%x kpage=%p",
-                                ret, rnum, kpage);
-                       goto ehca_reg_mr_rpages_exit1;
-               }
-
-               if (rnum > 1) {
-                       rpage = __pa(kpage);
-                       if (!rpage) {
-                               ehca_err(&shca->ib_device, "kpage=%p i=%x",
-                                        kpage, i);
-                               ret = -EFAULT;
-                               goto ehca_reg_mr_rpages_exit1;
-                       }
-               } else
-                       rpage = *kpage;
-
-               h_ret = hipz_h_register_rpage_mr(
-                       shca->ipz_hca_handle, e_mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       /*
-                        * check for 'registration complete'==H_SUCCESS
-                        * and for 'page registered'==H_PAGE_REGISTERED
-                        */
-                       if (h_ret != H_SUCCESS) {
-                               ehca_err(&shca->ib_device, "last "
-                                        "hipz_reg_rpage_mr failed, h_ret=%lli "
-                                        "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
-                                        " lkey=%x", h_ret, e_mr, i,
-                                        shca->ipz_hca_handle.handle,
-                                        e_mr->ipz_mr_handle.handle,
-                                        e_mr->ib.ib_mr.lkey);
-                               ret = ehca2ib_return_code(h_ret);
-                               break;
-                       } else
-                               ret = 0;
-               } else if (h_ret != H_PAGE_REGISTERED) {
-                       ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
-                                "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
-                                "mr_hndl=%llx", h_ret, e_mr, i,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-                       ret = ehca2ib_return_code(h_ret);
-                       break;
-               } else
-                       ret = 0;
-       } /* end for(i) */
-
-
-ehca_reg_mr_rpages_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_reg_mr_rpages_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
-                        pginfo, pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr_rpages() */
-
-/*----------------------------------------------------------------------*/
-
-inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
-                               struct ehca_mr *e_mr,
-                               u64 *iova_start,
-                               u64 size,
-                               u32 acl,
-                               struct ehca_pd *e_pd,
-                               struct ehca_mr_pginfo *pginfo,
-                               u32 *lkey, /*OUT*/
-                               u32 *rkey) /*OUT*/
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       u64 *kpage;
-       u64 rpage;
-       struct ehca_mr_pginfo pginfo_save;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_rereg_mr_rereg1_exit0;
-       }
-
-       pginfo_save = *pginfo;
-       ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
-       if (ret) {
-               ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
-                        "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
-                        "kpage=%p", e_mr, pginfo, pginfo->type,
-                        pginfo->num_kpages, pginfo->num_hwpages, kpage);
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       rpage = __pa(kpage);
-       if (!rpage) {
-               ehca_err(&shca->ib_device, "kpage=%p", kpage);
-               ret = -EFAULT;
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
-                                     (u64)iova_start, size, hipz_acl,
-                                     e_pd->fw_pd, rpage, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               /*
-                * reregistration unsuccessful, try it again with the 3 hCalls,
-                * e.g. this is required in case H_MR_CONDITION
-                * (MW bound or MR is shared)
-                */
-               ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
-                         "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
-               *pginfo = pginfo_save;
-               ret = -EAGAIN;
-       } else if ((u64 *)hipzout.vaddr != iova_start) {
-               ehca_err(&shca->ib_device, "PHYP changed iova_start in "
-                        "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
-                        "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
-                        hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
-                        e_mr->ib.ib_mr.lkey, hipzout.lkey);
-               ret = -EFAULT;
-       } else {
-               /*
-                * successful reregistration
-                * note: start and start_out are identical for eServer HCAs
-                */
-               e_mr->num_kpages = pginfo->num_kpages;
-               e_mr->num_hwpages = pginfo->num_hwpages;
-               e_mr->hwpage_size = pginfo->hwpage_size;
-               e_mr->start = iova_start;
-               e_mr->size = size;
-               e_mr->acl = acl;
-               *lkey = hipzout.lkey;
-               *rkey = hipzout.rkey;
-       }
-
-ehca_rereg_mr_rereg1_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_rereg_mr_rereg1_exit0:
-       if ( ret && (ret != -EAGAIN) )
-               ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx",
-                        ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages);
-       return ret;
-} /* end ehca_rereg_mr_rereg1() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int acl,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
-       int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
-
-       /* first determine reregistration hCall(s) */
-       if ((pginfo->num_hwpages > MAX_RPAGES) ||
-           (e_mr->num_hwpages > MAX_RPAGES) ||
-           (pginfo->num_hwpages > e_mr->num_hwpages)) {
-               ehca_dbg(&shca->ib_device, "Rereg3 case, "
-                        "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
-                        pginfo->num_hwpages, e_mr->num_hwpages);
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-       }
-
-       if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-               e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
-               ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
-                        e_mr);
-       }
-
-       if (rereg_1_hcall) {
-               ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
-                                          acl, e_pd, pginfo, lkey, rkey);
-               if (ret) {
-                       if (ret == -EAGAIN)
-                               rereg_3_hcall = 1;
-                       else
-                               goto ehca_rereg_mr_exit0;
-               }
-       }
-
-       if (rereg_3_hcall) {
-               struct ehca_mr save_mr;
-
-               /* first deregister old MR */
-               h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                                "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
-                                "mr->lkey=%x",
-                                h_ret, e_mr, shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle,
-                                e_mr->ib.ib_mr.lkey);
-                       ret = ehca2ib_return_code(h_ret);
-                       goto ehca_rereg_mr_exit0;
-               }
-               /* clean ehca_mr_t, without changing struct ib_mr and lock */
-               save_mr = *e_mr;
-               ehca_mr_deletenew(e_mr);
-
-               /* set some MR values */
-               e_mr->flags = save_mr.flags;
-               e_mr->hwpage_size = save_mr.hwpage_size;
-               e_mr->fmr_page_size = save_mr.fmr_page_size;
-               e_mr->fmr_max_pages = save_mr.fmr_max_pages;
-               e_mr->fmr_max_maps = save_mr.fmr_max_maps;
-               e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
-                                 e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
-               if (ret) {
-                       u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
-                       memcpy(&e_mr->flags, &(save_mr.flags),
-                              sizeof(struct ehca_mr) - offset);
-                       goto ehca_rereg_mr_exit0;
-               }
-       }
-
-ehca_rereg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
-                        "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
-                        acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
-                        rereg_1_hcall, rereg_3_hcall);
-       return ret;
-} /* end ehca_rereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_pd *e_pd =
-               container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
-       struct ehca_mr save_fmr;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       struct ehca_mr_hipzout_parms hipzout;
-       struct ehca_mr save_mr;
-
-       if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
-               /*
-                * note: after using rereg hcall with len=0,
-                * rereg hcall must be used again for registering pages
-                */
-               h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
-                                             0, 0, e_pd->fw_pd, 0, &hipzout);
-               if (h_ret == H_SUCCESS) {
-                       /* successful reregistration */
-                       e_fmr->start = NULL;
-                       e_fmr->size = 0;
-                       tmp_lkey = hipzout.lkey;
-                       tmp_rkey = hipzout.rkey;
-                       return 0;
-               }
-               /*
-                * should not happen, because length checked above,
-                * FMRs are not shared and no MW bound to FMRs
-                */
-               ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
-                        "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
-                        "mr_hndl=%llx lkey=%x lkey_out=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
-               /* try free and rereg */
-       }
-
-       /* first free old FMR */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                        "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
-                        "lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_unmap_one_fmr_exit0;
-       }
-       /* clean ehca_mr_t, without changing lock */
-       save_fmr = *e_fmr;
-       ehca_mr_deletenew(e_fmr);
-
-       /* set some MR values */
-       e_fmr->flags = save_fmr.flags;
-       e_fmr->hwpage_size = save_fmr.hwpage_size;
-       e_fmr->fmr_page_size = save_fmr.fmr_page_size;
-       e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
-       e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
-       e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
-       e_fmr->acl = save_fmr.acl;
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
-                         e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
-                         &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
-               memcpy(&e_fmr->flags, &(save_mr.flags),
-                      sizeof(struct ehca_mr) - offset);
-       }
-
-ehca_unmap_one_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
-                        "fmr_max_pages=%x",
-                        ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
-       return ret;
-} /* end ehca_unmap_one_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey, /*OUT*/
-                u32 *rkey) /*OUT*/
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
-                        "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
-                        shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_smr_exit0;
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size   = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_smr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
-                        "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
-                        ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
-       return ret;
-} /* end ehca_reg_smr() */
-
-/*----------------------------------------------------------------------*/
-static inline void *ehca_calc_sectbase(int top, int dir, int idx)
-{
-       unsigned long ret = idx;
-       ret |= dir << EHCA_DIR_INDEX_SHIFT;
-       ret |= top << EHCA_TOP_INDEX_SHIFT;
-       return __va(ret << SECTION_SIZE_BITS);
-}
-
-#define ehca_bmap_valid(entry) \
-       ((u64)entry != (u64)EHCA_INVAL_ADDR)
-
-static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
-                              struct ehca_shca *shca, struct ehca_mr *mr,
-                              struct ehca_mr_pginfo *pginfo)
-{
-       u64 h_ret = 0;
-       unsigned long page = 0;
-       u64 rpage = __pa(kpage);
-       int page_count;
-
-       void *sectbase = ehca_calc_sectbase(top, dir, idx);
-       if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
-               ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
-                                          "hwpage_size does not fit to "
-                                          "section start address");
-       }
-       page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
-
-       while (page < page_count) {
-               u64 rnum;
-               for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
-                    rnum++) {
-                       void *pg = sectbase + ((page++) * pginfo->hwpage_size);
-                       kpage[rnum] = __pa(pg);
-               }
-
-               h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
-                       ehca_err(&shca->ib_device, "register_rpage_mr failed");
-                       return h_ret;
-               }
-       }
-       return h_ret;
-}
-
-static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
-                               struct ehca_shca *shca, struct ehca_mr *mr,
-                               struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int idx;
-
-       for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
-                       continue;
-
-               hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
-                                          pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
-
-static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
-                                   struct ehca_mr *mr,
-                                   struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int dir;
-
-       for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                       continue;
-
-               hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
-
-/* register internal max-MR to internal SHCA */
-int ehca_reg_internal_maxmr(
-       struct ehca_shca *shca,
-       struct ehca_pd *e_pd,
-       struct ehca_mr **e_maxmr)  /*OUT*/
-{
-       int ret;
-       struct ehca_mr *e_mr;
-       u64 *iova_start;
-       u64 size_maxmr;
-       struct ehca_mr_pginfo pginfo;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hw_pgsize;
-
-       if (!ehca_bmap) {
-               ret = -EFAULT;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(&shca->ib_device, "out of memory");
-               ret = -ENOMEM;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-       e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-
-       /* register internal max-MR on HCA */
-       size_maxmr = ehca_mr_len;
-       iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
-       num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
-                               PAGE_SIZE);
-       hw_pgsize = ehca_get_max_hwpage_size(shca);
-       num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
-                                hw_pgsize);
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_PHYS;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.hwpage_size = hw_pgsize;
-       pginfo.u.phy.addr = 0;
-       pginfo.u.phy.size = size_maxmr;
-
-       ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
-                         &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
-       if (ret) {
-               ehca_err(&shca->ib_device, "reg of internal max MR failed, "
-                        "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
-                        "num_hwpages=%x", e_mr, iova_start, size_maxmr,
-                        num_kpages, num_hwpages);
-               goto ehca_reg_internal_maxmr_exit1;
-       }
-
-       /* successful registration of all pages */
-       e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
-       e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
-       e_mr->ib.ib_mr.uobject = NULL;
-       atomic_inc(&(e_pd->ib_pd.usecnt));
-       *e_maxmr = e_mr;
-       return 0;
-
-ehca_reg_internal_maxmr_exit1:
-       ehca_mr_delete(e_mr);
-ehca_reg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
-                        ret, shca, e_pd, e_maxmr);
-       return ret;
-} /* end ehca_reg_internal_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey)
-{
-       u64 h_ret;
-       struct ehca_mr *e_origmr = shca->maxmr;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, e_origmr, shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-} /* end ehca_reg_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
-{
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ib_pd *ib_pd;
-
-       if (!shca->maxmr) {
-               ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
-               ret = -EINVAL;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       e_maxmr = shca->maxmr;
-       ib_pd = e_maxmr->ib.ib_mr.pd;
-       shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
-
-       ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
-       if (ret) {
-               ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
-                        "ret=%i e_maxmr=%p shca=%p lkey=%x",
-                        ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
-               shca->maxmr = e_maxmr;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       atomic_dec(&ib_pd->usecnt);
-
-ehca_dereg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
-                        ret, shca, shca->maxmr);
-       return ret;
-} /* end ehca_dereg_internal_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/* check page list of map FMR verb for validness */
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len)
-{
-       u32 i;
-       u64 *page;
-
-       if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
-               ehca_gen_err("bad list_len, list_len=%x "
-                            "e_fmr->fmr_max_pages=%x fmr=%p",
-                            list_len, e_fmr->fmr_max_pages, e_fmr);
-               return -EINVAL;
-       }
-
-       /* each page must be aligned */
-       page = page_list;
-       for (i = 0; i < list_len; i++) {
-               if (*page % e_fmr->fmr_page_size) {
-                       ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
-                                    "fmr_page_size=%x", i, *page, page, e_fmr,
-                                    e_fmr->fmr_page_size);
-                       return -EINVAL;
-               }
-               page++;
-       }
-
-       return 0;
-} /* end ehca_fmr_check_page_list() */
-
-/*----------------------------------------------------------------------*/
-
-/* PAGE_SIZE >= pginfo->hwpage_size */
-static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr;
-       u32 j = 0;
-       int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-               pgaddr = page_to_pfn(sg_page(*sg))
-                       << PAGE_SHIFT;
-               *kpage = pgaddr + (pginfo->next_hwpage *
-                                  pginfo->hwpage_size);
-               if (!(*kpage)) {
-                       ehca_gen_err("pgaddr=%llx "
-                                    "sg_dma_address=%llx "
-                                    "entry=%llx next_hwpage=%llx",
-                                    pgaddr, (u64)sg_dma_address(*sg),
-                                    pginfo->u.usr.next_nmap,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               (pginfo->next_hwpage)++;
-               kpage++;
-               if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
-                       (pginfo->kpage_cnt)++;
-                       (pginfo->u.usr.next_nmap)++;
-                       pginfo->next_hwpage = 0;
-                       *sg = sg_next(*sg);
-               }
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
-
-/*
- * check given pages for contiguous layout
- * last page addr is returned in prev_pgaddr for further check
- */
-static int ehca_check_kpages_per_ate(struct scatterlist **sg,
-                                    int num_pages,
-                                    u64 *prev_pgaddr)
-{
-       for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
-               u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
-               if (ehca_debug_level >= 3)
-                       ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
-                                    *(u64 *)__va(pgaddr));
-               if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
-                       ehca_gen_err("uncontiguous page found pgaddr=%llx "
-                                    "prev_pgaddr=%llx entries_left_in_hwpage=%x",
-                                    pgaddr, *prev_pgaddr, num_pages);
-                       return -EINVAL;
-               }
-               *prev_pgaddr = pgaddr;
-       }
-       return 0;
-}
-
-/* PAGE_SIZE < pginfo->hwpage_size */
-static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr, prev_pgaddr;
-       u32 j = 0;
-       int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
-       int nr_kpages = kpages_per_hwpage;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-
-               if (nr_kpages == kpages_per_hwpage) {
-                       pgaddr = (page_to_pfn(sg_page(*sg))
-                                  << PAGE_SHIFT);
-                       *kpage = pgaddr;
-                       if (!(*kpage)) {
-                               ehca_gen_err("pgaddr=%llx entry=%llx",
-                                            pgaddr, pginfo->u.usr.next_nmap);
-                               ret = -EFAULT;
-                               return ret;
-                       }
-                       /*
-                        * The first page in a hwpage must be aligned;
-                        * the first MR page is exempt from this rule.
-                        */
-                       if (pgaddr & (pginfo->hwpage_size - 1)) {
-                               if (pginfo->hwpage_cnt) {
-                                       ehca_gen_err(
-                                               "invalid alignment "
-                                               "pgaddr=%llx entry=%llx "
-                                               "mr_pgsize=%llx",
-                                               pgaddr, pginfo->u.usr.next_nmap,
-                                               pginfo->hwpage_size);
-                                       ret = -EFAULT;
-                                       return ret;
-                               }
-                               /* first MR page */
-                               pginfo->kpage_cnt =
-                                       (pgaddr &
-                                        (pginfo->hwpage_size - 1)) >>
-                                       PAGE_SHIFT;
-                               nr_kpages -= pginfo->kpage_cnt;
-                               *kpage = pgaddr &
-                                        ~(pginfo->hwpage_size - 1);
-                       }
-                       if (ehca_debug_level >= 3) {
-                               u64 val = *(u64 *)__va(pgaddr);
-                               ehca_gen_dbg("kpage=%llx page=%llx "
-                                            "value=%016llx",
-                                            *kpage, pgaddr, val);
-                       }
-                       prev_pgaddr = pgaddr;
-                       *sg = sg_next(*sg);
-                       pginfo->kpage_cnt++;
-                       pginfo->u.usr.next_nmap++;
-                       nr_kpages--;
-                       if (!nr_kpages)
-                               goto next_kpage;
-                       continue;
-               }
-
-               ret = ehca_check_kpages_per_ate(sg, nr_kpages,
-                                               &prev_pgaddr);
-               if (ret)
-                       return ret;
-               pginfo->kpage_cnt += nr_kpages;
-               pginfo->u.usr.next_nmap += nr_kpages;
-
-next_kpage:
-               nr_kpages = kpages_per_hwpage;
-               (pginfo->hwpage_cnt)++;
-               kpage++;
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
-
-static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
-                                u32 number, u64 *kpage)
-{
-       int ret = 0;
-       u64 addr = pginfo->u.phy.addr;
-       u64 size = pginfo->u.phy.size;
-       u64 num_hw, offs_hw;
-       u32 i = 0;
-
-       num_hw  = NUM_CHUNKS((addr % pginfo->hwpage_size) + size,
-                               pginfo->hwpage_size);
-       offs_hw = (addr & ~(pginfo->hwpage_size - 1)) / pginfo->hwpage_size;
-
-       while (pginfo->next_hwpage < offs_hw + num_hw) {
-               /* sanity check */
-               if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
-                   (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
-                       ehca_gen_err("kpage_cnt >= num_kpages, "
-                                    "kpage_cnt=%llx num_kpages=%llx "
-                                    "hwpage_cnt=%llx "
-                                    "num_hwpages=%llx i=%x",
-                                    pginfo->kpage_cnt,
-                                    pginfo->num_kpages,
-                                    pginfo->hwpage_cnt,
-                                    pginfo->num_hwpages, i);
-                       return -EFAULT;
-               }
-               *kpage = (addr & ~(pginfo->hwpage_size - 1)) +
-                        (pginfo->next_hwpage * pginfo->hwpage_size);
-               if ( !(*kpage) && addr ) {
-                       ehca_gen_err("addr=%llx size=%llx "
-                                    "next_hwpage=%llx", addr,
-                                    size, pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               (pginfo->next_hwpage)++;
-               if (PAGE_SIZE >= pginfo->hwpage_size) {
-                       if (pginfo->next_hwpage %
-                           (PAGE_SIZE / pginfo->hwpage_size) == 0)
-                               (pginfo->kpage_cnt)++;
-               } else
-                       pginfo->kpage_cnt += pginfo->hwpage_size /
-                               PAGE_SIZE;
-               kpage++;
-               i++;
-               if (i >= number) break;
-       }
-       if (pginfo->next_hwpage >= offs_hw + num_hw) {
-               pginfo->next_hwpage = 0;
-       }
-
-       return ret;
-}
-
-static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
-                               u32 number, u64 *kpage)
-{
-       int ret = 0;
-       u64 *fmrlist;
-       u32 i;
-
-       /* loop over desired page_list entries */
-       fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
-       for (i = 0; i < number; i++) {
-               *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
-                          pginfo->next_hwpage * pginfo->hwpage_size;
-               if ( !(*kpage) ) {
-                       ehca_gen_err("*fmrlist=%llx fmrlist=%p "
-                                    "next_listelem=%llx next_hwpage=%llx",
-                                    *fmrlist, fmrlist,
-                                    pginfo->u.fmr.next_listelem,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
-                       if (pginfo->next_hwpage %
-                           (pginfo->u.fmr.fmr_pgsize /
-                            pginfo->hwpage_size) == 0) {
-                               (pginfo->kpage_cnt)++;
-                               (pginfo->u.fmr.next_listelem)++;
-                               fmrlist++;
-                               pginfo->next_hwpage = 0;
-                       } else
-                               (pginfo->next_hwpage)++;
-               } else {
-                       unsigned int cnt_per_hwpage = pginfo->hwpage_size /
-                               pginfo->u.fmr.fmr_pgsize;
-                       unsigned int j;
-                       u64 prev = *kpage;
-                       /* check if adrs are contiguous */
-                       for (j = 1; j < cnt_per_hwpage; j++) {
-                               u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
-                               if (prev + pginfo->u.fmr.fmr_pgsize != p) {
-                                       ehca_gen_err("uncontiguous fmr pages "
-                                                    "found prev=%llx p=%llx "
-                                                    "idx=%x", prev, p, i + j);
-                                       return -EINVAL;
-                               }
-                               prev = p;
-                       }
-                       pginfo->kpage_cnt += cnt_per_hwpage;
-                       pginfo->u.fmr.next_listelem += cnt_per_hwpage;
-                       fmrlist += cnt_per_hwpage;
-               }
-               kpage++;
-       }
-       return ret;
-}
-
-/* setup page buffer from page info */
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage)
-{
-       int ret;
-
-       switch (pginfo->type) {
-       case EHCA_MR_PGI_PHYS:
-               ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_USER:
-               ret = PAGE_SIZE >= pginfo->hwpage_size ?
-                       ehca_set_pagebuf_user1(pginfo, number, kpage) :
-                       ehca_set_pagebuf_user2(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_FMR:
-               ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
-               break;
-       default:
-               ehca_gen_err("bad pginfo->type=%x", pginfo->type);
-               ret = -EFAULT;
-               break;
-       }
-       return ret;
-} /* end ehca_set_pagebuf() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * check MR if it is a max-MR, i.e. uses whole memory
- * in case it's a max-MR 1 is returned, else 0
- */
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start)
-{
-       /* a MR is treated as max-MR only if it fits following: */
-       if ((size == ehca_mr_len) &&
-           (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
-               ehca_gen_dbg("this is a max-MR");
-               return 1;
-       } else
-               return 0;
-} /* end ehca_mr_is_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/* map access control for MR/MW. This routine is used for MR and MW. */
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl)
-{
-       *hipz_acl = 0;
-       if (ib_acl & IB_ACCESS_REMOTE_READ)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
-       if (ib_acl & IB_ACCESS_REMOTE_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
-       if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
-       if (ib_acl & IB_ACCESS_LOCAL_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
-       if (ib_acl & IB_ACCESS_MW_BIND)
-               *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
-} /* end ehca_mrmw_map_acl() */
-
-/*----------------------------------------------------------------------*/
-
-/* sets page size in hipz access control for MR/MW. */
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
-{
-       *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
-} /* end ehca_mrmw_set_pgsize_hipz_acl() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * reverse map access control for MR/MW.
- * This routine is used for MR and MW.
- */
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl) /*OUT*/
-{
-       *ib_acl = 0;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
-               *ib_acl |= IB_ACCESS_REMOTE_READ;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
-               *ib_acl |= IB_ACCESS_REMOTE_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
-               *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
-               *ib_acl |= IB_ACCESS_LOCAL_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
-               *ib_acl |= IB_ACCESS_MW_BIND;
-} /* end ehca_mrmw_reverse_map_acl() */
-
-
-/*----------------------------------------------------------------------*/
-
-/*
- * MR destructor and constructor
- * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
- * except struct ib_mr and spinlock
- */
-void ehca_mr_deletenew(struct ehca_mr *mr)
-{
-       mr->flags = 0;
-       mr->num_kpages = 0;
-       mr->num_hwpages = 0;
-       mr->acl = 0;
-       mr->start = NULL;
-       mr->fmr_page_size = 0;
-       mr->fmr_max_pages = 0;
-       mr->fmr_max_maps = 0;
-       mr->fmr_map_cnt = 0;
-       memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
-       memset(&mr->galpas, 0, sizeof(mr->galpas));
-} /* end ehca_mr_deletenew() */
-
-int ehca_init_mrmw_cache(void)
-{
-       mr_cache = kmem_cache_create("ehca_cache_mr",
-                                    sizeof(struct ehca_mr), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mr_cache)
-               return -ENOMEM;
-       mw_cache = kmem_cache_create("ehca_cache_mw",
-                                    sizeof(struct ehca_mw), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mw_cache) {
-               kmem_cache_destroy(mr_cache);
-               mr_cache = NULL;
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-void ehca_cleanup_mrmw_cache(void)
-{
-       kmem_cache_destroy(mr_cache);
-       kmem_cache_destroy(mw_cache);
-}
-
-static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
-                                    int dir)
-{
-       if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
-               ehca_top_bmap->dir[dir] =
-                       kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
-               if (!ehca_top_bmap->dir[dir])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
-       }
-       return 0;
-}
-
-static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
-{
-       if (!ehca_bmap_valid(ehca_bmap->top[top])) {
-               ehca_bmap->top[top] =
-                       kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
-               if (!ehca_bmap->top[top])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
-       }
-       return ehca_init_top_bmap(ehca_bmap->top[top], dir);
-}
-
-static inline int ehca_calc_index(unsigned long i, unsigned long s)
-{
-       return (i >> s) & EHCA_INDEX_MASK;
-}
-
-void ehca_destroy_busmap(void)
-{
-       int top, dir;
-
-       if (!ehca_bmap)
-               return;
-
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-                       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                               continue;
-
-                       kfree(ehca_bmap->top[top]->dir[dir]);
-               }
-
-               kfree(ehca_bmap->top[top]);
-       }
-
-       kfree(ehca_bmap);
-       ehca_bmap = NULL;
-}
-
-static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
-{
-       unsigned long i, start_section, end_section;
-       int top, dir, idx;
-
-       if (!nr_pages)
-               return 0;
-
-       if (!ehca_bmap) {
-               ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
-               if (!ehca_bmap)
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
-       }
-
-       start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
-       end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
-       for (i = start_section; i < end_section; i++) {
-               int ret;
-               top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
-               dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
-               idx = i & EHCA_INDEX_MASK;
-
-               ret = ehca_init_bmap(ehca_bmap, top, dir);
-               if (ret) {
-                       ehca_destroy_busmap();
-                       return ret;
-               }
-               ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
-               ehca_mr_len += EHCA_SECTSIZE;
-       }
-       return 0;
-}
-
-static int ehca_is_hugepage(unsigned long pfn)
-{
-       int page_order;
-
-       if (pfn & EHCA_HUGEPAGE_PFN_MASK)
-               return 0;
-
-       page_order = compound_order(pfn_to_page(pfn));
-       if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
-               return 0;
-
-       return 1;
-}
-
-static int ehca_create_busmap_callback(unsigned long initial_pfn,
-                                      unsigned long total_nr_pages, void *arg)
-{
-       int ret;
-       unsigned long pfn, start_pfn, end_pfn, nr_pages;
-
-       if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
-               return ehca_update_busmap(initial_pfn, total_nr_pages);
-
-       /* Given chunk is >= 16GB -> check for hugepages */
-       start_pfn = initial_pfn;
-       end_pfn = initial_pfn + total_nr_pages;
-       pfn = start_pfn;
-
-       while (pfn < end_pfn) {
-               if (ehca_is_hugepage(pfn)) {
-                       /* Add mem found in front of the hugepage */
-                       nr_pages = pfn - start_pfn;
-                       ret = ehca_update_busmap(start_pfn, nr_pages);
-                       if (ret)
-                               return ret;
-                       /* Skip the hugepage */
-                       pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
-                       start_pfn = pfn;
-               } else
-                       pfn += (EHCA_SECTSIZE / PAGE_SIZE);
-       }
-
-       /* Add mem found behind the hugepage(s)  */
-       nr_pages = pfn - start_pfn;
-       return ehca_update_busmap(start_pfn, nr_pages);
-}
-
-int ehca_create_busmap(void)
-{
-       int ret;
-
-       ehca_mr_len = 0;
-       ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
-                                  ehca_create_busmap_callback);
-       return ret;
-}
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo)
-{
-       int top;
-       u64 hret, *kpage;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               return -ENOMEM;
-       }
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
-               if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
-                       break;
-       }
-
-       ehca_free_fw_ctrlblock(kpage);
-
-       if (hret == H_SUCCESS)
-               return 0; /* Everything is fine */
-       else {
-               ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
-                                "h_ret=%lli e_mr=%p top=%x lkey=%x "
-                                "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-               return ehca2ib_return_code(hret);
-       }
-}
-
-static u64 ehca_map_vaddr(void *caddr)
-{
-       int top, dir, idx;
-       unsigned long abs_addr, offset;
-       u64 entry;
-
-       if (!ehca_bmap)
-               return EHCA_INVAL_ADDR;
-
-       abs_addr = __pa(caddr);
-       top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]))
-               return EHCA_INVAL_ADDR;
-
-       dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-               return EHCA_INVAL_ADDR;
-
-       idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
-
-       entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
-       if (ehca_bmap_valid(entry)) {
-               offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
-               return entry | offset;
-       } else
-               return EHCA_INVAL_ADDR;
-}
-
-static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == EHCA_INVAL_ADDR;
-}
-
-static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
-                              size_t size, enum dma_data_direction direction)
-{
-       if (cpu_addr)
-               return ehca_map_vaddr(cpu_addr);
-       else
-               return EHCA_INVAL_ADDR;
-}
-
-static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
-                                 enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
-                            unsigned long offset, size_t size,
-                            enum dma_data_direction direction)
-{
-       u64 addr;
-
-       if (offset + size > PAGE_SIZE)
-               return EHCA_INVAL_ADDR;
-
-       addr = ehca_map_vaddr(page_address(page));
-       if (!ehca_dma_mapping_error(dev, addr))
-               addr += offset;
-
-       return addr;
-}
-
-static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                          int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i) {
-               u64 addr;
-               addr = ehca_map_vaddr(sg_virt(sg));
-               if (ehca_dma_mapping_error(dev, addr))
-                       return 0;
-
-               sg->dma_address = addr;
-               sg->dma_length = sg->length;
-       }
-       return nents;
-}
-
-static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
-                             int nents, enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
-                                        size_t size,
-                                        enum dma_data_direction dir)
-{
-       dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
-}
-
-static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
-                                           size_t size,
-                                           enum dma_data_direction dir)
-{
-       dma_sync_single_for_device(dev->dma_device, addr, size, dir);
-}
-
-static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                    u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-       u64 dma_addr;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p) {
-               addr = page_address(p);
-               dma_addr = ehca_map_vaddr(addr);
-               if (ehca_dma_mapping_error(dev, dma_addr)) {
-                       free_pages((unsigned long)addr, get_order(size));
-                       return NULL;
-               }
-               if (dma_handle)
-                       *dma_handle = dma_addr;
-               return addr;
-       }
-       return NULL;
-}
-
-static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
-                                  void *cpu_addr, u64 dma_handle)
-{
-       if (cpu_addr && size)
-               free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-
-struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
-       .mapping_error          = ehca_dma_mapping_error,
-       .map_single             = ehca_dma_map_single,
-       .unmap_single           = ehca_dma_unmap_single,
-       .map_page               = ehca_dma_map_page,
-       .unmap_page             = ehca_dma_unmap_page,
-       .map_sg                 = ehca_dma_map_sg,
-       .unmap_sg               = ehca_dma_unmap_sg,
-       .sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
-       .sync_single_for_device = ehca_dma_sync_single_for_device,
-       .alloc_coherent         = ehca_dma_alloc_coherent,
-       .free_coherent          = ehca_dma_free_coherent,
-};
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h
deleted file mode 100644 (file)
index 52bfa95..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW declarations and inline functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _EHCA_MRMW_H_
-#define _EHCA_MRMW_H_
-
-enum ehca_reg_type {
-       EHCA_REG_MR,
-       EHCA_REG_BUSMAP_MR
-};
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey,
-               u32 *rkey,
-               enum ehca_reg_type reg_type);
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo);
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int mr_access_flags,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey);
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr);
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey,
-                u32 *rkey);
-
-int ehca_reg_internal_maxmr(struct ehca_shca *shca,
-                           struct ehca_pd *e_pd,
-                           struct ehca_mr **maxmr);
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey);
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
-
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len);
-
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage);
-
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start);
-
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl);
-
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
-
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl);
-
-void ehca_mr_deletenew(struct ehca_mr *mr);
-
-int ehca_create_busmap(void);
-
-void ehca_destroy_busmap(void);
-
-extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
-#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_pd.c b/drivers/staging/rdma/ehca/ehca_pd.c
deleted file mode 100644 (file)
index 2a8aae4..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  PD functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-
-static struct kmem_cache *pd_cache;
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context, struct ib_udata *udata)
-{
-       struct ehca_pd *pd;
-       int i;
-
-       pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
-       if (!pd) {
-               ehca_err(device, "device=%p context=%p out of memory",
-                        device, context);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       for (i = 0; i < 2; i++) {
-               INIT_LIST_HEAD(&pd->free[i]);
-               INIT_LIST_HEAD(&pd->full[i]);
-       }
-       mutex_init(&pd->lock);
-
-       /*
-        * Kernel PD: when device = -1, 0
-        * User   PD: when context != -1
-        */
-       if (!context) {
-               /*
-                * Kernel PDs after init reuses always
-                * the one created in ehca_shca_reopen()
-                */
-               struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                                     ib_device);
-               pd->fw_pd.value = shca->pd->fw_pd.value;
-       } else
-               pd->fw_pd.value = (u64)pd;
-
-       return &pd->ib_pd;
-}
-
-int ehca_dealloc_pd(struct ib_pd *pd)
-{
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       int i, leftovers = 0;
-       struct ipz_small_queue_page *page, *tmp;
-
-       for (i = 0; i < 2; i++) {
-               list_splice(&my_pd->full[i], &my_pd->free[i]);
-               list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
-                       leftovers = 1;
-                       free_page(page->page);
-                       kmem_cache_free(small_qp_cache, page);
-               }
-       }
-
-       if (leftovers)
-               ehca_warn(pd->device,
-                         "Some small queue pages were not freed");
-
-       kmem_cache_free(pd_cache, my_pd);
-
-       return 0;
-}
-
-int ehca_init_pd_cache(void)
-{
-       pd_cache = kmem_cache_create("ehca_cache_pd",
-                                    sizeof(struct ehca_pd), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!pd_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_pd_cache(void)
-{
-       kmem_cache_destroy(pd_cache);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_qes.h b/drivers/staging/rdma/ehca/ehca_qes.h
deleted file mode 100644 (file)
index 90c4efa..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Hardware request structures
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef _EHCA_QES_H_
-#define _EHCA_QES_H_
-
-#include "ehca_tools.h"
-
-/* virtual scatter gather entry to specify remote addresses with length */
-struct ehca_vsgentry {
-       u64 vaddr;
-       u32 lkey;
-       u32 length;
-};
-
-#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
-#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
-#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
-#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
-#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
-#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
-#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
-
-/*
- * Unreliable Datagram Address Vector Format
- * see IBTA Vol1 chapter 8.3 Global Routing Header
- */
-struct ehca_ud_av {
-       u8 sl;
-       u8 lnh;
-       u16 dlid;
-       u8 reserved1;
-       u8 reserved2;
-       u8 reserved3;
-       u8 slid_path_bits;
-       u8 reserved4;
-       u8 ipd;
-       u8 reserved5;
-       u8 pmtu;
-       u32 reserved6;
-       u64 reserved7;
-       union {
-               struct {
-                       u64 word_0; /* always set to 6  */
-                       /*should be 0x1B for IB transport */
-                       u64 word_1;
-                       u64 word_2;
-                       u64 word_3;
-                       u64 word_4;
-               } grh;
-               struct {
-                       u32 wd_0;
-                       u32 wd_1;
-                       /* DWord_1 --> SGID */
-
-                       u32 sgid_wd3;
-                       u32 sgid_wd2;
-
-                       u32 sgid_wd1;
-                       u32 sgid_wd0;
-                       /* DWord_3 --> DGID */
-
-                       u32 dgid_wd3;
-                       u32 dgid_wd2;
-
-                       u32 dgid_wd1;
-                       u32 dgid_wd0;
-               } grh_l;
-       };
-};
-
-/* maximum number of sg entries allowed in a WQE */
-#define MAX_WQE_SG_ENTRIES 252
-
-#define WQE_OPTYPE_SEND             0x80
-#define WQE_OPTYPE_RDMAREAD         0x40
-#define WQE_OPTYPE_RDMAWRITE        0x20
-#define WQE_OPTYPE_CMPSWAP          0x10
-#define WQE_OPTYPE_FETCHADD         0x08
-#define WQE_OPTYPE_BIND             0x04
-
-#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
-#define WQE_WRFLAG_FENCE            0x40
-#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
-#define WQE_WRFLAG_SOLIC_EVENT      0x10
-
-#define WQEF_CACHE_HINT             0x80
-#define WQEF_CACHE_HINT_RD_WR       0x40
-#define WQEF_TIMED_WQE              0x20
-#define WQEF_PURGE                  0x08
-#define WQEF_HIGH_NIBBLE            0xF0
-
-#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
-#define MW_BIND_ACCESSCTRL_R_READ    0x20
-#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
-
-struct ehca_wqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 wr_flag;
-       u16 pkeyi;
-       u8 wqef;
-       u8 nr_of_data_seg;
-       u16 wqe_provided_slid;
-       u32 destination_qp_number;
-       u32 resync_psn_sqp;
-       u32 local_ee_context_qkey;
-       u32 immediate_data;
-       union {
-               struct {
-                       u64 remote_virtual_address;
-                       u32 rkey;
-                       u32 reserved;
-                       u64 atomic_1st_op_dma_len;
-                       u64 atomic_2nd_op;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-
-               } nud;
-               struct {
-                       u64 ehca_ud_av_ptr;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } ud_avp;
-               struct {
-                       struct ehca_ud_av ud_av;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
-                                                    2];
-               } ud_av;
-               struct {
-                       u64 reserved0;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } all_rcv;
-
-               struct {
-                       u64 reserved;
-                       u32 rkey;
-                       u32 old_rkey;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 virtual_address;
-                       u32 reserved3;
-                       u32 length;
-                       u32 reserved4;
-                       u16 reserved5;
-                       u8 reserved6;
-                       u8 lr_ctl;
-                       u32 lkey;
-                       u32 reserved7;
-                       u64 reserved8;
-                       u64 reserved9;
-                       u64 reserved10;
-                       u64 reserved11;
-               } bind;
-               struct {
-                       u64 reserved12;
-                       u64 reserved13;
-                       u32 size;
-                       u32 start;
-               } inline_data;
-       } u;
-
-};
-
-#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
-#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
-#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
-#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
-#define WC_STATUS_ERROR_BIT 0x80000000
-#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
-#define WC_STATUS_PURGE_BIT 0x10
-#define WC_SEND_RECEIVE_BIT 0x80
-
-struct ehca_cqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 w_completion_flags;
-       u16 reserved1;
-       u32 nr_bytes_transferred;
-       u32 immediate_data;
-       u32 local_qp_number;
-       u8 freed_resource_count;
-       u8 service_level;
-       u16 wqe_count;
-       u32 qp_token;
-       u32 qkey_ee_token;
-       u32 remote_qp_number;
-       u16 dlid;
-       u16 rlid;
-       u16 reserved2;
-       u16 pkey_index;
-       u32 cqe_timestamp;
-       u32 wqe_timestamp;
-       u8 wqe_timestamp_valid;
-       u8 reserved3;
-       u8 reserved4;
-       u8 cqe_flags;
-       u32 status;
-};
-
-struct ehca_eqe {
-       u64 entry;
-};
-
-struct ehca_mrte {
-       u64 starting_va;
-       u64 length; /* length of memory region in bytes*/
-       u32 pd;
-       u8 key_instance;
-       u8 pagesize;
-       u8 mr_control;
-       u8 local_remote_access_ctrl;
-       u8 reserved[0x20 - 0x18];
-       u64 at_pointer[4];
-};
-#endif /*_EHCA_QES_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_qp.c b/drivers/staging/rdma/ehca/ehca_qp.c
deleted file mode 100644 (file)
index 896c01f..0000000
+++ /dev/null
@@ -1,2256 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  QP functions
- *
- *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
- *           Stefan Roscher <stefan.roscher@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-static struct kmem_cache *qp_cache;
-
-/*
- * attributes not supported by query qp
- */
-#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
-                                    IB_QP_EN_SQD_ASYNC_NOTIFY)
-
-/*
- * ehca (internal) qp state values
- */
-enum ehca_qp_state {
-       EHCA_QPS_RESET = 1,
-       EHCA_QPS_INIT = 2,
-       EHCA_QPS_RTR = 3,
-       EHCA_QPS_RTS = 5,
-       EHCA_QPS_SQD = 6,
-       EHCA_QPS_SQE = 8,
-       EHCA_QPS_ERR = 128
-};
-
-/*
- * qp state transitions as defined by IB Arch Rel 1.1 page 431
- */
-enum ib_qp_statetrans {
-       IB_QPST_ANY2RESET,
-       IB_QPST_ANY2ERR,
-       IB_QPST_RESET2INIT,
-       IB_QPST_INIT2RTR,
-       IB_QPST_INIT2INIT,
-       IB_QPST_RTR2RTS,
-       IB_QPST_RTS2SQD,
-       IB_QPST_RTS2RTS,
-       IB_QPST_SQD2RTS,
-       IB_QPST_SQE2RTS,
-       IB_QPST_SQD2SQD,
-       IB_QPST_MAX     /* nr of transitions, this must be last!!! */
-};
-
-/*
- * ib2ehca_qp_state maps IB to ehca qp_state
- * returns ehca qp state corresponding to given ib qp state
- */
-static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
-{
-       switch (ib_qp_state) {
-       case IB_QPS_RESET:
-               return EHCA_QPS_RESET;
-       case IB_QPS_INIT:
-               return EHCA_QPS_INIT;
-       case IB_QPS_RTR:
-               return EHCA_QPS_RTR;
-       case IB_QPS_RTS:
-               return EHCA_QPS_RTS;
-       case IB_QPS_SQD:
-               return EHCA_QPS_SQD;
-       case IB_QPS_SQE:
-               return EHCA_QPS_SQE;
-       case IB_QPS_ERR:
-               return EHCA_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca2ib_qp_state maps ehca to IB qp_state
- * returns ib qp state corresponding to given ehca qp state
- */
-static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
-                                               ehca_qp_state)
-{
-       switch (ehca_qp_state) {
-       case EHCA_QPS_RESET:
-               return IB_QPS_RESET;
-       case EHCA_QPS_INIT:
-               return IB_QPS_INIT;
-       case EHCA_QPS_RTR:
-               return IB_QPS_RTR;
-       case EHCA_QPS_RTS:
-               return IB_QPS_RTS;
-       case EHCA_QPS_SQD:
-               return IB_QPS_SQD;
-       case EHCA_QPS_SQE:
-               return IB_QPS_SQE;
-       case EHCA_QPS_ERR:
-               return IB_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca_qp_type used as index for req_attr and opt_attr of
- * struct ehca_modqp_statetrans
- */
-enum ehca_qp_type {
-       QPT_RC = 0,
-       QPT_UC = 1,
-       QPT_UD = 2,
-       QPT_SQP = 3,
-       QPT_MAX
-};
-
-/*
- * ib2ehcaqptype maps Ib to ehca qp_type
- * returns ehca qp type corresponding to ib qp type
- */
-static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return QPT_SQP;
-       case IB_QPT_RC:
-               return QPT_RC;
-       case IB_QPT_UC:
-               return QPT_UC;
-       case IB_QPT_UD:
-               return QPT_UD;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
-                                                        int ib_tostate)
-{
-       int index = -EINVAL;
-       switch (ib_tostate) {
-       case IB_QPS_RESET:
-               index = IB_QPST_ANY2RESET;
-               break;
-       case IB_QPS_INIT:
-               switch (ib_fromstate) {
-               case IB_QPS_RESET:
-                       index = IB_QPST_RESET2INIT;
-                       break;
-               case IB_QPS_INIT:
-                       index = IB_QPST_INIT2INIT;
-                       break;
-               }
-               break;
-       case IB_QPS_RTR:
-               if (ib_fromstate == IB_QPS_INIT)
-                       index = IB_QPST_INIT2RTR;
-               break;
-       case IB_QPS_RTS:
-               switch (ib_fromstate) {
-               case IB_QPS_RTR:
-                       index = IB_QPST_RTR2RTS;
-                       break;
-               case IB_QPS_RTS:
-                       index = IB_QPST_RTS2RTS;
-                       break;
-               case IB_QPS_SQD:
-                       index = IB_QPST_SQD2RTS;
-                       break;
-               case IB_QPS_SQE:
-                       index = IB_QPST_SQE2RTS;
-                       break;
-               }
-               break;
-       case IB_QPS_SQD:
-               if (ib_fromstate == IB_QPS_RTS)
-                       index = IB_QPST_RTS2SQD;
-               break;
-       case IB_QPS_SQE:
-               break;
-       case IB_QPS_ERR:
-               index = IB_QPST_ANY2ERR;
-               break;
-       default:
-               break;
-       }
-       return index;
-}
-
-/*
- * ibqptype2servicetype returns hcp service type corresponding to given
- * ib qp type used by create_qp()
- */
-static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return ST_UD;
-       case IB_QPT_RC:
-               return ST_RC;
-       case IB_QPT_UC:
-               return ST_UC;
-       case IB_QPT_UD:
-               return ST_UD;
-       case IB_QPT_RAW_IPV6:
-               return -EINVAL;
-       case IB_QPT_RAW_ETHERTYPE:
-               return -EINVAL;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-/*
- * init userspace queue info from ipz_queue data
- */
-static inline void queue2resp(struct ipzu_queue_resp *resp,
-                             struct ipz_queue *queue)
-{
-       resp->qe_size = queue->qe_size;
-       resp->act_nr_of_sg = queue->act_nr_of_sg;
-       resp->queue_length = queue->queue_length;
-       resp->pagesize = queue->pagesize;
-       resp->toggle_state = queue->toggle_state;
-       resp->offset = queue->offset;
-}
-
-/*
- * init_qp_queue initializes/constructs r/squeue and registers queue pages.
- */
-static inline int init_qp_queue(struct ehca_shca *shca,
-                               struct ehca_pd *pd,
-                               struct ehca_qp *my_qp,
-                               struct ipz_queue *queue,
-                               int q_type,
-                               u64 expected_hret,
-                               struct ehca_alloc_queue_parms *parms,
-                               int wqe_size)
-{
-       int ret, cnt, ipz_rc, nr_q_pages;
-       void *vpage;
-       u64 rpage, h_ret;
-       struct ib_device *ib_dev = &shca->ib_device;
-       struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
-
-       if (!parms->queue_size)
-               return 0;
-
-       if (parms->is_small) {
-               nr_q_pages = 1;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       128 << parms->page_size,
-                                       wqe_size, parms->act_nr_sges, 1);
-       } else {
-               nr_q_pages = parms->queue_size;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       EHCA_PAGESIZE, wqe_size,
-                                       parms->act_nr_sges, 0);
-       }
-
-       if (!ipz_rc) {
-               ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
-                        ipz_rc);
-               return -EBUSY;
-       }
-
-       /* register queue pages */
-       for (cnt = 0; cnt < nr_q_pages; cnt++) {
-               vpage = ipz_qpageit_get_inc(queue);
-               if (!vpage) {
-                       ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                "failed p_vpage= %p", vpage);
-                       ret = -EINVAL;
-                       goto init_qp_queue1;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
-                                                my_qp->ipz_qp_handle,
-                                                NULL, 0, q_type,
-                                                rpage, parms->is_small ? 0 : 1,
-                                                my_qp->galpas.kernel);
-               if (cnt == (nr_q_pages - 1)) {  /* last page! */
-                       if (h_ret != expected_hret) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-                       vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
-                       if (vpage) {
-                               ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                        "should not succeed vpage=%p", vpage);
-                               ret = -EINVAL;
-                               goto init_qp_queue1;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(queue);
-
-       return 0;
-
-init_qp_queue1:
-       ipz_queue_dtor(pd, queue);
-       return ret;
-}
-
-static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
-{
-       if (is_llqp)
-               return 128 << act_nr_sge;
-       else
-               return offsetof(struct ehca_wqe,
-                               u.nud.sg_list[act_nr_sge]);
-}
-
-static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
-                                      int req_nr_sge, int is_llqp)
-{
-       u32 wqe_size, q_size;
-       int act_nr_sge = req_nr_sge;
-
-       if (!is_llqp)
-               /* round up #SGEs so WQE size is a power of 2 */
-               for (act_nr_sge = 4; act_nr_sge <= 252;
-                    act_nr_sge = 4 + 2 * act_nr_sge)
-                       if (act_nr_sge >= req_nr_sge)
-                               break;
-
-       wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
-       q_size = wqe_size * (queue->max_wr + 1);
-
-       if (q_size <= 512)
-               queue->page_size = 2;
-       else if (q_size <= 1024)
-               queue->page_size = 3;
-       else
-               queue->page_size = 0;
-
-       queue->is_small = (queue->page_size != 0);
-}
-
-/* needs to be called with cq->spinlock held */
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
-{
-       struct list_head *list, *node;
-
-       /* TODO: support low latency QPs */
-       if (qp->ext_type == EQPT_LLQP)
-               return;
-
-       if (on_sq) {
-               list = &qp->send_cq->sqp_err_list;
-               node = &qp->sq_err_node;
-       } else {
-               list = &qp->recv_cq->rqp_err_list;
-               node = &qp->rq_err_node;
-       }
-
-       if (list_empty(node))
-               list_add_tail(node, list);
-
-       return;
-}
-
-static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-
-       if (!list_empty(node))
-               list_del_init(node);
-
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-}
-
-static void reset_queue_map(struct ehca_queue_map *qmap)
-{
-       int i;
-
-       qmap->tail = qmap->entries - 1;
-       qmap->left_to_poll = 0;
-       qmap->next_wqe_idx = 0;
-       for (i = 0; i < qmap->entries; i++) {
-               qmap->map[i].reported = 1;
-               qmap->map[i].cqe_req = 0;
-       }
-}
-
-/*
- * Create an ib_qp struct that is either a QP or an SRQ, depending on
- * the value of the is_srq parameter. If init_attr and srq_init_attr share
- * fields, the field out of init_attr is used.
- */
-static struct ehca_qp *internal_create_qp(
-       struct ib_pd *pd,
-       struct ib_qp_init_attr *init_attr,
-       struct ib_srq_init_attr *srq_init_attr,
-       struct ib_udata *udata, int is_srq)
-{
-       struct ehca_qp *my_qp, *my_srq = NULL;
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct ib_ucontext *context = NULL;
-       u64 h_ret;
-       int is_llqp = 0, has_srq = 0, is_user = 0;
-       int qp_type, max_send_sge, max_recv_sge, ret;
-
-       /* h_call's out parameters */
-       struct ehca_alloc_qp_parms parms;
-       u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
-       unsigned long flags;
-
-       if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
-               ehca_err(pd->device, "Unable to create QP, max number of %i "
-                        "QPs reached.", shca->max_num_qps);
-               ehca_err(pd->device, "To increase the maximum number of QPs "
-                        "use the number_of_qps module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       if (init_attr->create_flags) {
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       memset(&parms, 0, sizeof(parms));
-       qp_type = init_attr->qp_type;
-
-       if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
-               init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
-               ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
-                        init_attr->sq_sig_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* save LLQP info */
-       if (qp_type & 0x80) {
-               is_llqp = 1;
-               parms.ext_type = EQPT_LLQP;
-               parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
-       }
-       qp_type &= 0x1F;
-       init_attr->qp_type &= 0x1F;
-
-       /* handle SRQ base QPs */
-       if (init_attr->srq) {
-               my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
-
-               if (qp_type == IB_QPT_UC) {
-                       ehca_err(pd->device, "UC with SRQ not supported");
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-
-               has_srq = 1;
-               parms.ext_type = EQPT_SRQBASE;
-               parms.srq_qpn = my_srq->real_qp_num;
-       }
-
-       if (is_llqp && has_srq) {
-               ehca_err(pd->device, "LLQPs can't have an SRQ");
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* handle SRQs */
-       if (is_srq) {
-               parms.ext_type = EQPT_SRQ;
-               parms.srq_limit = srq_init_attr->attr.srq_limit;
-               if (init_attr->cap.max_recv_sge > 3) {
-                       ehca_err(pd->device, "no more than three SGEs "
-                                "supported for SRQ  pd=%p  max_sge=%x",
-                                pd, init_attr->cap.max_recv_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       /* check QP type */
-       if (qp_type != IB_QPT_UD &&
-           qp_type != IB_QPT_UC &&
-           qp_type != IB_QPT_RC &&
-           qp_type != IB_QPT_SMI &&
-           qp_type != IB_QPT_GSI) {
-               ehca_err(pd->device, "wrong QP Type=%x", qp_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       if (is_llqp) {
-               switch (qp_type) {
-               case IB_QPT_RC:
-                       if ((init_attr->cap.max_send_wr > 255) ||
-                           (init_attr->cap.max_recv_wr > 255)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_sq_wr=%x "
-                                        "or max_rq_wr=%x for RC LLQP",
-                                        init_attr->cap.max_send_wr,
-                                        init_attr->cap.max_recv_wr);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               case IB_QPT_UD:
-                       if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
-                               ehca_err(pd->device, "UD LLQP not supported "
-                                        "by this adapter");
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-ENOSYS);
-                       }
-                       if (!(init_attr->cap.max_send_sge <= 5
-                           && init_attr->cap.max_send_sge >= 1
-                           && init_attr->cap.max_recv_sge <= 5
-                           && init_attr->cap.max_recv_sge >= 1)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_send_sge=%x "
-                                        "or max_recv_sge=%x for UD LLQP",
-                                        init_attr->cap.max_send_sge,
-                                        init_attr->cap.max_recv_sge);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       } else if (init_attr->cap.max_send_wr > 255) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of "
-                                        "max_send_wr=%x for UD QP_TYPE=%x",
-                                        init_attr->cap.max_send_wr, qp_type);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               default:
-                       ehca_err(pd->device, "unsupported LL QP Type=%x",
-                                qp_type);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       } else {
-               int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
-                              || qp_type == IB_QPT_GSI) ? 250 : 252;
-
-               if (init_attr->cap.max_send_sge > max_sge
-                   || init_attr->cap.max_recv_sge > max_sge) {
-                       ehca_err(pd->device, "Invalid number of SGEs requested "
-                                "send_sge=%x recv_sge=%x max_sge=%x",
-                                init_attr->cap.max_send_sge,
-                                init_attr->cap.max_recv_sge, max_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
-       if (!my_qp) {
-               ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       if (pd->uobject && udata) {
-               is_user = 1;
-               context = pd->uobject->context;
-       }
-
-       atomic_set(&my_qp->nr_events, 0);
-       init_waitqueue_head(&my_qp->wait_completion);
-       spin_lock_init(&my_qp->spinlock_s);
-       spin_lock_init(&my_qp->spinlock_r);
-       my_qp->qp_type = qp_type;
-       my_qp->ext_type = parms.ext_type;
-       my_qp->state = IB_QPS_RESET;
-
-       if (init_attr->recv_cq)
-               my_qp->recv_cq =
-                       container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
-       if (init_attr->send_cq)
-               my_qp->send_cq =
-                       container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-
-       ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
-       if (ret >= 0)
-               my_qp->token = ret;
-
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-       idr_preload_end();
-       if (ret < 0) {
-               if (ret == -ENOSPC) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid number of qp");
-               } else {
-                       ret = -ENOMEM;
-                       ehca_err(pd->device, "Can't allocate new idr entry.");
-               }
-               goto create_qp_exit0;
-       }
-
-       if (has_srq)
-               parms.srq_token = my_qp->token;
-
-       parms.servicetype = ibqptype2servicetype(qp_type);
-       if (parms.servicetype < 0) {
-               ret = -EINVAL;
-               ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
-               goto create_qp_exit1;
-       }
-
-       /* Always signal by WQE so we can hide circ. WQEs */
-       parms.sigtype = HCALL_SIGT_BY_WQE;
-
-       /* UD_AV CIRCUMVENTION */
-       max_send_sge = init_attr->cap.max_send_sge;
-       max_recv_sge = init_attr->cap.max_recv_sge;
-       if (parms.servicetype == ST_UD && !is_llqp) {
-               max_send_sge += 2;
-               max_recv_sge += 2;
-       }
-
-       parms.token = my_qp->token;
-       parms.eq_handle = shca->eq.ipz_eq_handle;
-       parms.pd = my_pd->fw_pd;
-       if (my_qp->send_cq)
-               parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
-       if (my_qp->recv_cq)
-               parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
-
-       parms.squeue.max_wr = init_attr->cap.max_send_wr;
-       parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
-       parms.squeue.max_sge = max_send_sge;
-       parms.rqueue.max_sge = max_recv_sge;
-
-       /* RC QPs need one more SWQE for unsolicited ack circumvention */
-       if (qp_type == IB_QPT_RC)
-               parms.squeue.max_wr++;
-
-       if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
-               if (HAS_SQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.squeue, max_send_sge, is_llqp);
-               if (HAS_RQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.rqueue, max_recv_sge, is_llqp);
-               parms.qp_storage =
-                       (parms.squeue.is_small || parms.rqueue.is_small);
-       }
-
-       h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
-                        h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto create_qp_exit1;
-       }
-
-       ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
-       my_qp->ipz_qp_handle = parms.qp_handle;
-       my_qp->galpas = parms.galpas;
-
-       swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
-       rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
-
-       switch (qp_type) {
-       case IB_QPT_RC:
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               }
-               /* hide the extra WQE */
-               parms.squeue.act_nr_wqes--;
-               break;
-       case IB_QPT_UD:
-       case IB_QPT_GSI:
-       case IB_QPT_SMI:
-               /* UD circumvention */
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               } else {
-                       parms.squeue.act_nr_sges -= 2;
-                       parms.rqueue.act_nr_sges -= 2;
-               }
-
-               if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
-                       parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
-                       parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
-                       parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
-                       parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
-                       ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
-               }
-
-               break;
-
-       default:
-               break;
-       }
-
-       /* initialize r/squeue and register queue pages */
-       if (HAS_SQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
-                       HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
-                       &parms.squeue, swqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize squeue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit2;
-               }
-
-               if (!is_user) {
-                       my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
-                               my_qp->ipz_squeue.qe_size;
-                       my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->sq_map.map) {
-                               ehca_err(pd->device, "Couldn't allocate squeue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit3;
-                       }
-                       INIT_LIST_HEAD(&my_qp->sq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->sq_map);
-               }
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
-                       H_SUCCESS, &parms.rqueue, rwqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize rqueue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit4;
-               }
-               if (!is_user) {
-                       my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
-                               my_qp->ipz_rqueue.qe_size;
-                       my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->rq_map.map) {
-                               ehca_err(pd->device, "Couldn't allocate squeue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit5;
-                       }
-                       INIT_LIST_HEAD(&my_qp->rq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->rq_map);
-               }
-       } else if (init_attr->srq && !is_user) {
-               /* this is a base QP, use the queue map of the SRQ */
-               my_qp->rq_map = my_srq->rq_map;
-               INIT_LIST_HEAD(&my_qp->rq_err_node);
-
-               my_qp->ipz_rqueue = my_srq->ipz_rqueue;
-       }
-
-       if (is_srq) {
-               my_qp->ib_srq.pd = &my_pd->ib_pd;
-               my_qp->ib_srq.device = my_pd->ib_pd.device;
-
-               my_qp->ib_srq.srq_context = init_attr->qp_context;
-               my_qp->ib_srq.event_handler = init_attr->event_handler;
-       } else {
-               my_qp->ib_qp.qp_num = ib_qp_num;
-               my_qp->ib_qp.pd = &my_pd->ib_pd;
-               my_qp->ib_qp.device = my_pd->ib_pd.device;
-
-               my_qp->ib_qp.recv_cq = init_attr->recv_cq;
-               my_qp->ib_qp.send_cq = init_attr->send_cq;
-
-               my_qp->ib_qp.qp_type = qp_type;
-               my_qp->ib_qp.srq = init_attr->srq;
-
-               my_qp->ib_qp.qp_context = init_attr->qp_context;
-               my_qp->ib_qp.event_handler = init_attr->event_handler;
-       }
-
-       init_attr->cap.max_inline_data = 0; /* not supported yet */
-       init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
-       init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
-       init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
-       init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
-       my_qp->init_attr = *init_attr;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                       &my_qp->ib_qp;
-               if (ehca_nr_ports < 0) {
-                       /* alloc array to cache subsequent modify qp parms
-                        * for autodetect mode
-                        */
-                       my_qp->mod_qp_parm =
-                               kzalloc(EHCA_MOD_QP_PARM_MAX *
-                                       sizeof(*my_qp->mod_qp_parm),
-                                       GFP_KERNEL);
-                       if (!my_qp->mod_qp_parm) {
-                               ehca_err(pd->device,
-                                        "Could not alloc mod_qp_parm");
-                               goto create_qp_exit5;
-                       }
-               }
-       }
-
-       /* NOTE: define_apq0() not supported yet */
-       if (qp_type == IB_QPT_GSI) {
-               h_ret = ehca_define_sqp(shca, my_qp, init_attr);
-               if (h_ret != H_SUCCESS) {
-                       kfree(my_qp->mod_qp_parm);
-                       my_qp->mod_qp_parm = NULL;
-                       /* the QP pointer is no longer valid */
-                       shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                               NULL;
-                       ret = ehca2ib_return_code(h_ret);
-                       goto create_qp_exit6;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
-               if (ret) {
-                       ehca_err(pd->device,
-                                "Couldn't assign qp to send_cq ret=%i", ret);
-                       goto create_qp_exit7;
-               }
-       }
-
-       /* copy queues, galpa data to user space */
-       if (context && udata) {
-               struct ehca_create_qp_resp resp;
-               memset(&resp, 0, sizeof(resp));
-
-               resp.qp_num = my_qp->real_qp_num;
-               resp.token = my_qp->token;
-               resp.qp_type = my_qp->qp_type;
-               resp.ext_type = my_qp->ext_type;
-               resp.qkey = my_qp->qkey;
-               resp.real_qp_num = my_qp->real_qp_num;
-
-               if (HAS_SQ(my_qp))
-                       queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
-               if (HAS_RQ(my_qp))
-                       queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
-               resp.fw_handle_ofs = (u32)
-                       (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
-
-               if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
-                       ehca_err(pd->device, "Copy to udata failed");
-                       ret = -EINVAL;
-                       goto create_qp_exit8;
-               }
-       }
-
-       return my_qp;
-
-create_qp_exit8:
-       ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
-
-create_qp_exit7:
-       kfree(my_qp->mod_qp_parm);
-
-create_qp_exit6:
-       if (HAS_RQ(my_qp) && !is_user)
-               vfree(my_qp->rq_map.map);
-
-create_qp_exit5:
-       if (HAS_RQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-
-create_qp_exit4:
-       if (HAS_SQ(my_qp) && !is_user)
-               vfree(my_qp->sq_map.map);
-
-create_qp_exit3:
-       if (HAS_SQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-
-create_qp_exit2:
-       hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-
-create_qp_exit1:
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-create_qp_exit0:
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return ERR_PTR(ret);
-}
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *qp_init_attr,
-                            struct ib_udata *udata)
-{
-       struct ehca_qp *ret;
-
-       ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
-       return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *srq_init_attr,
-                              struct ib_udata *udata)
-{
-       struct ib_qp_init_attr qp_init_attr;
-       struct ehca_qp *my_qp;
-       struct ib_srq *ret;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 hret, update_mask;
-
-       if (srq_init_attr->srq_type != IB_SRQT_BASIC)
-               return ERR_PTR(-ENOSYS);
-
-       /* For common attributes, internal_create_qp() takes its info
-        * out of qp_init_attr, so copy all common attrs there.
-        */
-       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
-       qp_init_attr.event_handler = srq_init_attr->event_handler;
-       qp_init_attr.qp_context = srq_init_attr->srq_context;
-       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-       qp_init_attr.qp_type = IB_QPT_RC;
-       qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
-       qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
-
-       my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
-       if (IS_ERR(my_qp))
-               return (struct ib_srq *)my_qp;
-
-       /* copy back return values */
-       srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
-       srq_init_attr->attr.max_sge = 3;
-
-       /* drive SRQ into RTR state */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(pd->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               ret = ERR_PTR(-ENOMEM);
-               goto create_srq1;
-       }
-
-       mqpcb->qp_state = EHCA_QPS_INIT;
-       mqpcb->prim_phys_port = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to INIT "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_enable = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not enable SRQ "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_state  = EHCA_QPS_RTR;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to RTR "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return &my_qp->ib_srq;
-
-create_srq2:
-       ret = ERR_PTR(ehca2ib_return_code(hret));
-       ehca_free_fw_ctrlblock(mqpcb);
-
-create_srq1:
-       internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
-
-       return ret;
-}
-
-/*
- * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts
- * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe
- * returns total number of bad wqes in bad_wqe_cnt
- */
-static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
-                          int *bad_wqe_cnt)
-{
-       u64 h_ret;
-       struct ipz_queue *squeue;
-       void *bad_send_wqe_p, *bad_send_wqe_v;
-       u64 q_ofs;
-       struct ehca_wqe *wqe;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* get send wqe pointer */
-       h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                                          my_qp->ipz_qp_handle, &my_qp->pf,
-                                          &bad_send_wqe_p, NULL, 2);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
-                        " ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp_num, h_ret);
-               return ehca2ib_return_code(h_ret);
-       }
-       bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
-       ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
-                qp_num, bad_send_wqe_p);
-       /* convert wqe pointer to vadr */
-       bad_send_wqe_v = __va((u64)bad_send_wqe_p);
-       if (ehca_debug_level >= 2)
-               ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
-       squeue = &my_qp->ipz_squeue;
-       if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
-               ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
-                        " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
-               return -EFAULT;
-       }
-
-       /* loop sets wqe's purge bit */
-       wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-       *bad_wqe_cnt = 0;
-       while (wqe->optype != 0xff && wqe->wqef != 0xff) {
-               if (ehca_debug_level >= 2)
-                       ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
-               wqe->nr_of_data_seg = 0; /* suppress data access */
-               wqe->wqef = WQEF_PURGE; /* WQE to be purged */
-               q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-               *bad_wqe_cnt = (*bad_wqe_cnt)+1;
-       }
-       /*
-        * bad wqe will be reprocessed and ignored when pol_cq() is called,
-        *  i.e. nr of wqes with flush error status is one less
-        */
-       ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
-                qp_num, (*bad_wqe_cnt)-1);
-       wqe->wqef = 0;
-
-       return 0;
-}
-
-static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
-                         struct ehca_queue_map *qmap)
-{
-       void *wqe_v;
-       u64 q_ofs;
-       u32 wqe_idx;
-       unsigned int tail_idx;
-
-       /* convert real to abs address */
-       wqe_p = wqe_p & (~(1UL << 63));
-
-       wqe_v = __va(wqe_p);
-
-       if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
-               ehca_gen_err("Invalid offset for calculating left cqes "
-                               "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
-               return -EFAULT;
-       }
-
-       tail_idx = next_index(qmap->tail, qmap->entries);
-       wqe_idx = q_ofs / ipz_queue->qe_size;
-
-       /* check all processed wqes, whether a cqe is requested or not */
-       while (tail_idx != wqe_idx) {
-               if (qmap->map[tail_idx].cqe_req)
-                       qmap->left_to_poll++;
-               tail_idx = next_index(tail_idx, qmap->entries);
-       }
-       /* save index in queue, where we have to start flushing */
-       qmap->next_wqe_idx = wqe_idx;
-       return 0;
-}
-
-static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
-{
-       u64 h_ret;
-       void *send_wqe_p, *recv_wqe_p;
-       int ret;
-       unsigned long flags;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* this hcall is not supported on base QPs */
-       if (my_qp->ext_type != EQPT_SRQBASE) {
-               /* get send and receive wqe pointer */
-               h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle, &my_qp->pf,
-                               &send_wqe_p, &recv_wqe_p, 4);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "disable_and_get_wqe() "
-                                "failed ehca_qp=%p qp_num=%x h_ret=%lli",
-                                my_qp, qp_num, h_ret);
-                       return ehca2ib_return_code(h_ret);
-               }
-
-               /*
-                * acquire lock to ensure that nobody is polling the cq which
-                * could mean that the qmap->tail pointer is in an
-                * inconsistent state.
-                */
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
-                               &my_qp->sq_map);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
-                               &my_qp->rq_map);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-       } else {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               my_qp->sq_map.left_to_poll = 0;
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               my_qp->rq_map.left_to_poll = 0;
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-       }
-
-       /* this assures flush cqes being generated only for pending wqes */
-       if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ehca_add_to_err_list(my_qp, 1);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               if (HAS_RQ(my_qp)) {
-                       spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-                       ehca_add_to_err_list(my_qp, 0);
-                       spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
-                                       flags);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * internal_modify_qp with circumvention to handle aqp0 properly
- * smi_reset2init indicates if this is an internal reset-to-init-call for
- * smi. This flag must always be zero if called from ehca_modify_qp()!
- * This internal func was intorduced to avoid recursion of ehca_modify_qp()!
- */
-static int internal_modify_qp(struct ib_qp *ibqp,
-                             struct ib_qp_attr *attr,
-                             int attr_mask, int smi_reset2init)
-{
-       enum ib_qp_state qp_cur_state, qp_new_state;
-       int cnt, qp_attr_idx, ret = 0;
-       enum ib_qp_statetrans statetrans;
-       struct hcp_modify_qp_control_block *mqpcb;
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca =
-               container_of(ibqp->pd->device, struct ehca_shca, ib_device);
-       u64 update_mask;
-       u64 h_ret;
-       int bad_wqe_cnt = 0;
-       int is_user = 0;
-       int squeue_locked = 0;
-       unsigned long flags = 0;
-
-       /* do query_qp to obtain current attr values */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!mqpcb) {
-               ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               mqpcb, my_qp->galpas.kernel);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ibqp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, ibqp->qp_num, h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto modify_qp_exit1;
-       }
-       if (ibqp->uobject)
-               is_user = 1;
-
-       qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
-
-       if (qp_cur_state == -EINVAL) {  /* invalid qp state */
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        mqpcb->qp_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-       /*
-        * circumvention to set aqp0 initial state to init
-        * as expected by IB spec
-        */
-       if (smi_reset2init == 0 &&
-           ibqp->qp_type == IB_QPT_SMI &&
-           qp_cur_state == IB_QPS_RESET &&
-           (attr_mask & IB_QP_STATE) &&
-           attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
-               struct ib_qp_attr smiqp_attr = {
-                       .qp_state = IB_QPS_INIT,
-                       .port_num = my_qp->init_attr.port_num,
-                       .pkey_index = 0,
-                       .qkey = 0
-               };
-               int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
-                       IB_QP_PKEY_INDEX | IB_QP_QKEY;
-               int smirc = internal_modify_qp(
-                       ibqp, &smiqp_attr, smiqp_attr_mask, 1);
-               if (smirc) {
-                       ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
-                                "ehca_modify_qp() rc=%i", smirc);
-                       ret = H_PARAMETER;
-                       goto modify_qp_exit1;
-               }
-               qp_cur_state = IB_QPS_INIT;
-               ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
-       }
-       /* is transmitted current state  equal to "real" current state */
-       if ((attr_mask & IB_QP_CUR_STATE) &&
-           qp_cur_state != attr->cur_qp_state) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
-                        " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
-                        attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
-                "new qp_state=%x attribute_mask=%x",
-                my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
-
-       qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
-       if (!smi_reset2init &&
-           !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
-                               attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid qp transition new_state=%x cur_state=%x "
-                        "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
-                        qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
-               goto modify_qp_exit1;
-       }
-
-       mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
-       if (mqpcb->qp_state)
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       else {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid new qp state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qp_new_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       /* retrieve state transition struct to get req and opt attrs */
-       statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
-       if (statetrans < 0) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
-                        "new_qp_state=%x State_xsition=%x ehca_qp=%p "
-                        "qp_num=%x", qp_cur_state, qp_new_state,
-                        statetrans, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
-
-       if (qp_attr_idx < 0) {
-               ret = qp_attr_idx;
-               ehca_err(ibqp->device,
-                        "Invalid QP type=%x ehca_qp=%p qp_num=%x",
-                        ibqp->qp_type, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device,
-                "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
-                my_qp, ibqp->qp_num, statetrans);
-
-       /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
-        * in non-LL UD QPs.
-        */
-       if ((my_qp->qp_type == IB_QPT_UD) &&
-           (my_qp->ext_type != EQPT_LLQP) &&
-           (statetrans == IB_QPST_INIT2RTR) &&
-           (shca->hw_level >= 0x22)) {
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-               mqpcb->send_grh_flag = 1;
-       }
-
-       /* sqe -> rts: set purge bit of bad wqe before actual trans */
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* mark next free wqe if kernel */
-               if (!ibqp->uobject) {
-                       struct ehca_wqe *wqe;
-                       /* lock send queue */
-                       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-                       squeue_locked = 1;
-                       /* mark next free wqe */
-                       wqe = (struct ehca_wqe *)
-                               ipz_qeit_get(&my_qp->ipz_squeue);
-                       wqe->optype = wqe->wqef = 0xff;
-                       ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
-                                ibqp->qp_num, wqe);
-               }
-               ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
-               if (ret) {
-                       ehca_err(ibqp->device, "prepare_sqe_rts() failed "
-                                "ehca_qp=%p qp_num=%x ret=%i",
-                                my_qp, ibqp->qp_num, ret);
-                       goto modify_qp_exit2;
-               }
-       }
-
-       /*
-        * enable RDMA_Atomic_Control if reset->init und reliable con
-        * this is necessary since gen2 does not provide that flag,
-        * but pHyp requires it
-        */
-       if (statetrans == IB_QPST_RESET2INIT &&
-           (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
-               mqpcb->rdma_atomic_ctrl = 3;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
-       }
-       /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
-       if (statetrans == IB_QPST_INIT2RTR &&
-           (ibqp->qp_type == IB_QPT_UC) &&
-           !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
-               mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX) {
-               if (attr->pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_p_key_idx = attr->pkey_index;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
-       }
-       if (attr_mask & IB_QP_PORT) {
-               struct ehca_sport *sport;
-               struct ehca_qp *aqp1;
-               if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               sport = &shca->sport[attr->port_num - 1];
-               if (!sport->ibqp_sqp[IB_QPT_GSI]) {
-                       /* should not occur */
-                       ret = -EFAULT;
-                       ehca_err(ibqp->device, "AQP1 was not created for "
-                                "port=%x", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
-                                   struct ehca_qp, ib_qp);
-               if (ibqp->qp_type != IB_QPT_GSI &&
-                   ibqp->qp_type != IB_QPT_SMI &&
-                   aqp1->mod_qp_parm) {
-                       /*
-                        * firmware will reject this modify_qp() because
-                        * port is not activated/initialized fully
-                        */
-                       ret = -EFAULT;
-                       ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
-                                 "either port is being activated (try again) "
-                                 "or cabling issue", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_phys_port = attr->port_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
-       }
-       if (attr_mask & IB_QP_QKEY) {
-               mqpcb->qkey = attr->qkey;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
-       }
-       if (attr_mask & IB_QP_AV) {
-               mqpcb->dlid = attr->ah_attr.dlid;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
-               mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
-               mqpcb->service_level = attr->ah_attr.sl;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
-
-               if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
-                                 attr->ah_attr.static_rate,
-                                 &mqpcb->max_static_rate)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-
-               /*
-                * only if GRH is TRUE we might consider SOURCE_GID_IDX
-                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
-                */
-               if (attr->ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag = 1;
-
-                       mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid.byte[cnt] =
-                                       attr->ah_attr.grh.dgid.raw[cnt];
-
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
-                       mqpcb->flow_label = attr->ah_attr.grh.flow_label;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
-                       mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
-                       mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_PATH_MTU) {
-               /* store ld(MTU) */
-               my_qp->mtu_shift = attr->path_mtu + 7;
-               mqpcb->path_mtu = attr->path_mtu;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
-       }
-       if (attr_mask & IB_QP_TIMEOUT) {
-               mqpcb->timeout = attr->timeout;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
-       }
-       if (attr_mask & IB_QP_RETRY_CNT) {
-               mqpcb->retry_count = attr->retry_cnt;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RNR_RETRY) {
-               mqpcb->rnr_retry_count = attr->rnr_retry;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RQ_PSN) {
-               mqpcb->receive_psn = attr->rq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
-       }
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
-               mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
-                       attr->max_dest_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
-               mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
-                       attr->max_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET
-                       (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
-       }
-       if (attr_mask & IB_QP_ALT_PATH) {
-               if (attr->alt_port_num < 1
-                   || attr->alt_port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->alt_port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_phys_port = attr->alt_port_num;
-
-               if (attr->alt_pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_p_key_idx = attr->alt_pkey_index;
-
-               mqpcb->timeout_al = attr->alt_timeout;
-               mqpcb->dlid_al = attr->alt_ah_attr.dlid;
-               mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
-               mqpcb->service_level_al = attr->alt_ah_attr.sl;
-
-               if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
-                                 attr->alt_ah_attr.static_rate,
-                                 &mqpcb->max_static_rate_al)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-
-               /* OpenIB doesn't support alternate retry counts - copy them */
-               mqpcb->retry_count_al = mqpcb->retry_count;
-               mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
-
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
-
-               /*
-                * only if GRH is TRUE we might consider SOURCE_GID_IDX
-                * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
-                */
-               if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag_al = 1;
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid_al.byte[cnt] =
-                                       attr->alt_ah_attr.grh.dgid.raw[cnt];
-                       mqpcb->source_gid_idx_al =
-                               attr->alt_ah_attr.grh.sgid_index;
-                       mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
-                       mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
-                       mqpcb->traffic_class_al =
-                               attr->alt_ah_attr.grh.traffic_class;
-
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
-                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
-               mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
-       }
-
-       if (attr_mask & IB_QP_SQ_PSN) {
-               mqpcb->send_psn = attr->sq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
-       }
-
-       if (attr_mask & IB_QP_DEST_QPN) {
-               mqpcb->dest_qp_nr = attr->dest_qp_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
-       }
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE) {
-               if (attr->path_mig_state != IB_MIG_REARM
-                   && attr->path_mig_state != IB_MIG_MIGRATED) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid mig_state=%x",
-                                attr->path_mig_state);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->path_migration_state = attr->path_mig_state + 1;
-               if (attr->path_mig_state == IB_MIG_REARM)
-                       my_qp->mig_armed = 1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
-       }
-
-       if (attr_mask & IB_QP_CAP) {
-               mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
-               mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
-               /* no support for max_send/recv_sge yet */
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                my_qp->ipz_qp_handle,
-                                &my_qp->pf,
-                                update_mask,
-                                mqpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
-               goto modify_qp_exit2;
-       }
-
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* doorbell to reprocessing wqes */
-               iosync(); /* serialize GAL register access */
-               hipz_update_sqa(my_qp, bad_wqe_cnt-1);
-               ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
-       }
-
-       if (statetrans == IB_QPST_RESET2INIT ||
-           statetrans == IB_QPST_INIT2INIT) {
-               mqpcb->qp_enable = 1;
-               mqpcb->qp_state = EHCA_QPS_INIT;
-               update_mask = 0;
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-
-               h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                        my_qp->ipz_qp_handle,
-                                        &my_qp->pf,
-                                        update_mask,
-                                        mqpcb,
-                                        my_qp->galpas.kernel);
-
-               if (h_ret != H_SUCCESS) {
-                       ret = ehca2ib_return_code(h_ret);
-                       ehca_err(ibqp->device, "ENABLE in context of "
-                                "RESET_2_INIT failed! Maybe you didn't get "
-                                "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
-                                h_ret, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-       }
-       if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
-           && !is_user) {
-               ret = check_for_left_cqes(my_qp, shca);
-               if (ret)
-                       goto modify_qp_exit2;
-       }
-
-       if (statetrans == IB_QPST_ANY2RESET) {
-               ipz_qeit_reset(&my_qp->ipz_rqueue);
-               ipz_qeit_reset(&my_qp->ipz_squeue);
-
-               if (qp_cur_state == IB_QPS_ERR && !is_user) {
-                       del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-                       if (HAS_RQ(my_qp))
-                               del_from_err_list(my_qp->recv_cq,
-                                                 &my_qp->rq_err_node);
-               }
-               if (!is_user)
-                       reset_queue_map(&my_qp->sq_map);
-
-               if (HAS_RQ(my_qp) && !is_user)
-                       reset_queue_map(&my_qp->rq_map);
-       }
-
-       if (attr_mask & IB_QP_QKEY)
-               my_qp->qkey = attr->qkey;
-
-modify_qp_exit2:
-       if (squeue_locked) { /* this means: sqe -> rts */
-               spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-               my_qp->sqerr_purgeflag = 1;
-       }
-
-modify_qp_exit1:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata)
-{
-       int ret = 0;
-
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-
-       /* The if-block below caches qp_attr to be modified for GSI and SMI
-        * qps during the initialization by ib_mad. When the respective port
-        * is activated, ie we got an event PORT_ACTIVE, we'll replay the
-        * cached modify calls sequence, see ehca_recover_sqs() below.
-        * Why that is required:
-        * 1) If one port is connected, older code requires that port one
-        *    to be connected and module option nr_ports=1 to be given by
-        *    user, which is very inconvenient for end user.
-        * 2) Firmware accepts modify_qp() only if respective port has become
-        *    active. Older code had a wait loop of 30sec create_qp()/
-        *    define_aqp1(), which is not appropriate in practice. This
-        *    code now removes that wait loop, see define_aqp1(), and always
-        *    reports all ports to ib_mad resp. users. Only activated ports
-        *    will then usable for the users.
-        */
-       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
-               int port = my_qp->init_attr.port_num;
-               struct ehca_sport *sport = &shca->sport[port - 1];
-               unsigned long flags;
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               /* cache qp_attr only during init */
-               if (my_qp->mod_qp_parm) {
-                       struct ehca_mod_qp_parm *p;
-                       if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
-                               ehca_err(&shca->ib_device,
-                                        "mod_qp_parm overflow state=%x port=%x"
-                                        " type=%x", attr->qp_state,
-                                        my_qp->init_attr.port_num,
-                                        ibqp->qp_type);
-                               spin_unlock_irqrestore(&sport->mod_sqp_lock,
-                                                      flags);
-                               return -EINVAL;
-                       }
-                       p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
-                       p->mask = attr_mask;
-                       p->attr = *attr;
-                       my_qp->mod_qp_parm_idx++;
-                       ehca_dbg(&shca->ib_device,
-                                "Saved qp_attr for state=%x port=%x type=%x",
-                                attr->qp_state, my_qp->init_attr.port_num,
-                                ibqp->qp_type);
-                       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-                       goto out;
-               }
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
-
-out:
-       if ((ret == 0) && (attr_mask & IB_QP_STATE))
-               my_qp->state = attr->qp_state;
-
-       return ret;
-}
-
-void ehca_recover_sqp(struct ib_qp *sqp)
-{
-       struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
-       int port = my_sqp->init_attr.port_num;
-       struct ib_qp_attr attr;
-       struct ehca_mod_qp_parm *qp_parm;
-       int i, qp_parm_idx, ret;
-       unsigned long flags, wr_cnt;
-
-       if (!my_sqp->mod_qp_parm)
-               return;
-       ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
-
-       qp_parm = my_sqp->mod_qp_parm;
-       qp_parm_idx = my_sqp->mod_qp_parm_idx;
-       for (i = 0; i < qp_parm_idx; i++) {
-               attr = qp_parm[i].attr;
-               ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
-               if (ret) {
-                       ehca_err(sqp->device, "Could not modify SQP port=%x "
-                                "qp_num=%x ret=%x", port, sqp->qp_num, ret);
-                       goto free_qp_parm;
-               }
-               ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
-                        port, sqp->qp_num, attr.qp_state);
-       }
-
-       /* re-trigger posted recv wrs */
-       wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
-               my_sqp->ipz_rqueue.qe_size;
-       if (wr_cnt) {
-               spin_lock_irqsave(&my_sqp->spinlock_r, flags);
-               hipz_update_rqa(my_sqp, wr_cnt);
-               spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
-               ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
-                        port, sqp->qp_num, wr_cnt);
-       }
-
-free_qp_parm:
-       kfree(qp_parm);
-       /* this prevents subsequent calls to modify_qp() to cache qp_attr */
-       my_sqp->mod_qp_parm = NULL;
-}
-
-int ehca_query_qp(struct ib_qp *qp,
-                 struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int cnt, ret = 0;
-       u64 h_ret;
-
-       if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
-               ehca_err(qp->device, "Invalid attribute mask "
-                        "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
-                        my_qp, qp->qp_num, qp_attr_mask);
-               return -EINVAL;
-       }
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(qp->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(qp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp->qp_num, h_ret);
-               goto query_qp_exit1;
-       }
-
-       qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
-       qp_attr->qp_state = qp_attr->cur_qp_state;
-
-       if (qp_attr->cur_qp_state == -EINVAL) {
-               ret = -EINVAL;
-               ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qpcb->qp_state, my_qp, qp->qp_num);
-               goto query_qp_exit1;
-       }
-
-       if (qp_attr->qp_state == IB_QPS_SQD)
-               qp_attr->sq_draining = 1;
-
-       qp_attr->qkey = qpcb->qkey;
-       qp_attr->path_mtu = qpcb->path_mtu;
-       qp_attr->path_mig_state = qpcb->path_migration_state - 1;
-       qp_attr->rq_psn = qpcb->receive_psn;
-       qp_attr->sq_psn = qpcb->send_psn;
-       qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
-       qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
-       qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
-       /* UD_AV CIRCUMVENTION */
-       if (my_qp->qp_type == IB_QPT_UD) {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe - 2;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe - 2;
-       } else {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe;
-       }
-
-       qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
-       qp_attr->dest_qp_num = qpcb->dest_qp_nr;
-
-       qp_attr->pkey_index = qpcb->prim_p_key_idx;
-       qp_attr->port_num = qpcb->prim_phys_port;
-       qp_attr->timeout = qpcb->timeout;
-       qp_attr->retry_cnt = qpcb->retry_count;
-       qp_attr->rnr_retry = qpcb->rnr_retry_count;
-
-       qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
-       qp_attr->alt_port_num = qpcb->alt_phys_port;
-       qp_attr->alt_timeout = qpcb->timeout_al;
-
-       qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
-       qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
-
-       /* primary av */
-       qp_attr->ah_attr.sl = qpcb->service_level;
-
-       if (qpcb->send_grh_flag) {
-               qp_attr->ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
-       qp_attr->ah_attr.dlid = qpcb->dlid;
-       qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
-       qp_attr->ah_attr.port_num = qp_attr->port_num;
-
-       /* primary GRH */
-       qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
-       qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
-       qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
-       qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid.byte[cnt];
-
-       /* alternate AV */
-       qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
-       if (qpcb->send_grh_flag_al) {
-               qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
-       qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
-       qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
-
-       /* alternate GRH */
-       qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
-       qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
-       qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
-       qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid_al.byte[cnt];
-
-       /* return init attributes given in ehca_create_qp */
-       if (qp_init_attr)
-               *qp_init_attr = my_qp->init_attr;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
-
-query_qp_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
-{
-       struct ehca_qp *my_qp =
-               container_of(ibsrq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca =
-               container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 update_mask;
-       u64 h_ret;
-       int ret = 0;
-
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       update_mask = 0;
-       if (attr_mask & IB_SRQ_LIMIT) {
-               attr_mask &= ~IB_SRQ_LIMIT;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
-               mqpcb->curr_srq_limit = attr->srq_limit;
-               mqpcb->qp_aff_asyn_ev_log_reg =
-                       EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
-       }
-
-       /* by now, all bits in attr_mask should have been cleared */
-       if (attr_mask) {
-               ehca_err(ibsrq->device, "invalid attribute mask bits set  "
-                        "attr_mask=%x", attr_mask);
-               ret = -EINVAL;
-               goto modify_srq_exit0;
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
-                                NULL, update_mask, mqpcb,
-                                my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x",
-                        h_ret, my_qp, my_qp->real_qp_num);
-       }
-
-modify_srq_exit0:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
-{
-       struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int ret = 0;
-       u64 h_ret;
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(srq->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
-                               NULL, qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(srq->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, h_ret);
-               goto query_srq_exit1;
-       }
-
-       srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
-       srq_attr->max_sge = 3;
-       srq_attr->srq_limit = qpcb->curr_srq_limit;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-query_srq_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject)
-{
-       struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
-       struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
-                                            ib_pd);
-       struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
-       u32 qp_num = my_qp->real_qp_num;
-       int ret;
-       u64 h_ret;
-       u8 port_num;
-       int is_user = 0;
-       enum ib_qp_type qp_type;
-       unsigned long flags;
-
-       if (uobject) {
-               is_user = 1;
-               if (my_qp->mm_count_galpa ||
-                   my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
-                       ehca_err(dev, "Resources still referenced in "
-                                "user space qp_num=%x", qp_num);
-                       return -EINVAL;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
-               if (ret) {
-                       ehca_err(dev, "Couldn't unassign qp from "
-                                "send_cq ret=%i qp_num=%x cq_num=%x", ret,
-                                qp_num, my_qp->send_cq->cq_number);
-                       return ret;
-               }
-       }
-
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-       /*
-        * SRQs will never get into an error list and do not have a recv_cq,
-        * so we need to skip them here.
-        */
-       if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
-
-       if (HAS_SQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
-
-       h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
-               return ehca2ib_return_code(h_ret);
-       }
-
-       port_num = my_qp->init_attr.port_num;
-       qp_type  = my_qp->init_attr.qp_type;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               kfree(my_qp->mod_qp_parm);
-               my_qp->mod_qp_parm = NULL;
-               shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       /* no support for IB_QPT_SMI yet */
-       if (qp_type == IB_QPT_GSI) {
-               struct ib_event event;
-               ehca_info(dev, "device %s: port %x is inactive.",
-                               shca->ib_device.name, port_num);
-               event.device = &shca->ib_device;
-               event.event = IB_EVENT_PORT_ERR;
-               event.element.port_num = port_num;
-               shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
-               ib_dispatch_event(&event);
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-               if (!is_user)
-                       vfree(my_qp->rq_map.map);
-       }
-       if (HAS_SQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-               if (!is_user)
-                       vfree(my_qp->sq_map.map);
-       }
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return 0;
-}
-
-int ehca_destroy_qp(struct ib_qp *qp)
-{
-       return internal_destroy_qp(qp->device,
-                                  container_of(qp, struct ehca_qp, ib_qp),
-                                  qp->uobject);
-}
-
-int ehca_destroy_srq(struct ib_srq *srq)
-{
-       return internal_destroy_qp(srq->device,
-                                  container_of(srq, struct ehca_qp, ib_srq),
-                                  srq->uobject);
-}
-
-int ehca_init_qp_cache(void)
-{
-       qp_cache = kmem_cache_create("ehca_cache_qp",
-                                    sizeof(struct ehca_qp), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!qp_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_qp_cache(void)
-{
-       kmem_cache_destroy(qp_cache);
-}
diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c
deleted file mode 100644 (file)
index 11813b8..0000000
+++ /dev/null
@@ -1,953 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  post_send/recv, poll_cq, req_notify
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-/* in RC traffic, insert an empty RDMA READ every this many packets */
-#define ACK_CIRC_THRESHOLD 2000000
-
-static u64 replace_wr_id(u64 wr_id, u16 idx)
-{
-       u64 ret;
-
-       ret = wr_id & ~QMAP_IDX_MASK;
-       ret |= idx & QMAP_IDX_MASK;
-
-       return ret;
-}
-
-static u16 get_app_wr_id(u64 wr_id)
-{
-       return wr_id & QMAP_IDX_MASK;
-}
-
-static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
-                                 struct ehca_wqe *wqe_p,
-                                 struct ib_recv_wr *recv_wr,
-                                 u32 rq_map_idx)
-{
-       u8 cnt_ds;
-       if (unlikely((recv_wr->num_sge < 0) ||
-                    (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sqe=%x max_nr_of_sg=%x",
-                        recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
-       wqe_p->nr_of_data_seg = recv_wr->num_sge;
-
-       for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
-               wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
-                       recv_wr->sg_list[cnt_ds].addr;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
-                       recv_wr->sg_list[cnt_ds].lkey;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].length =
-                       recv_wr->sg_list[cnt_ds].length;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
-                            ipz_rqueue);
-               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
-       }
-
-       return 0;
-}
-
-#if defined(DEBUG_GSI_SEND_WR)
-
-/* need ib_mad struct */
-#include <rdma/ib_mad.h>
-
-static void trace_ud_wr(const struct ib_ud_wr *ud_wr)
-{
-       int idx;
-       int j;
-       while (ud_wr) {
-               struct ib_mad_hdr *mad_hdr = ud_wrmad_hdr;
-               struct ib_sge *sge = ud_wr->wr.sg_list;
-               ehca_gen_dbg("ud_wr#%x wr_id=%lx num_sge=%x "
-                            "send_flags=%x opcode=%x", idx, ud_wr->wr.wr_id,
-                            ud_wr->wr.num_sge, ud_wr->wr.send_flags,
-                            ud_wr->.wr.opcode);
-               if (mad_hdr) {
-                       ehca_gen_dbg("ud_wr#%x mad_hdr base_version=%x "
-                                    "mgmt_class=%x class_version=%x method=%x "
-                                    "status=%x class_specific=%x tid=%lx "
-                                    "attr_id=%x resv=%x attr_mod=%x",
-                                    idx, mad_hdr->base_version,
-                                    mad_hdr->mgmt_class,
-                                    mad_hdr->class_version, mad_hdr->method,
-                                    mad_hdr->status, mad_hdr->class_specific,
-                                    mad_hdr->tid, mad_hdr->attr_id,
-                                    mad_hdr->resv,
-                                    mad_hdr->attr_mod);
-               }
-               for (j = 0; j < ud_wr->wr.num_sge; j++) {
-                       u8 *data = __va(sge->addr);
-                       ehca_gen_dbg("ud_wr#%x sge#%x addr=%p length=%x "
-                                    "lkey=%x",
-                                    idx, j, data, sge->length, sge->lkey);
-                       /* assume length is n*16 */
-                       ehca_dmp(data, sge->length, "ud_wr#%x sge#%x",
-                                idx, j);
-                       sge++;
-               } /* eof for j */
-               idx++;
-               ud_wr = ud_wr(ud_wr->wr.next);
-       } /* eof while ud_wr */
-}
-
-#endif /* DEBUG_GSI_SEND_WR */
-
-static inline int ehca_write_swqe(struct ehca_qp *qp,
-                                 struct ehca_wqe *wqe_p,
-                                 struct ib_send_wr *send_wr,
-                                 u32 sq_map_idx,
-                                 int hidden)
-{
-       u32 idx;
-       u64 dma_length;
-       struct ehca_av *my_av;
-       u32 remote_qkey;
-       struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
-
-       if (unlikely((send_wr->num_sge < 0) ||
-                    (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sqe=%x max_nr_of_sg=%x",
-                        send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
-
-       qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
-       qmap_entry->reported = 0;
-       qmap_entry->cqe_req = 0;
-
-       switch (send_wr->opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_SEND;
-               break;
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
-               break;
-       case IB_WR_RDMA_READ:
-               wqe_p->optype = WQE_OPTYPE_RDMAREAD;
-               break;
-       default:
-               ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
-               return -EINVAL; /* invalid opcode */
-       }
-
-       wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
-
-       wqe_p->wr_flag = 0;
-
-       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
-           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
-           && !hidden) {
-               wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
-               qmap_entry->cqe_req = 1;
-       }
-
-       if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
-           send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
-               /* this might not work as long as HW does not support it */
-               wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
-               wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
-       }
-
-       wqe_p->nr_of_data_seg = send_wr->num_sge;
-
-       switch (qp->qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               /* no break is intential here */
-       case IB_QPT_UD:
-               /* IB 1.2 spec C10-15 compliance */
-               remote_qkey = ud_wr(send_wr)->remote_qkey;
-               if (remote_qkey & 0x80000000)
-                       remote_qkey = qp->qkey;
-
-               wqe_p->destination_qp_number = ud_wr(send_wr)->remote_qpn << 8;
-               wqe_p->local_ee_context_qkey = remote_qkey;
-               if (unlikely(!ud_wr(send_wr)->ah)) {
-                       ehca_gen_err("ud_wr(send_wr) is NULL. qp=%p", qp);
-                       return -EINVAL;
-               }
-               if (unlikely(ud_wr(send_wr)->remote_qpn == 0)) {
-                       ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
-                       return -EINVAL;
-               }
-               my_av = container_of(ud_wr(send_wr)->ah, struct ehca_av, ib_ah);
-               wqe_p->u.ud_av.ud_av = my_av->av;
-
-               /*
-                * omitted check of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.ud_av.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.ud_av.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.ud_av.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-               } /* eof for idx */
-               if (qp->qp_type == IB_QPT_SMI ||
-                   qp->qp_type == IB_QPT_GSI)
-                       wqe_p->u.ud_av.ud_av.pmtu = 1;
-               if (qp->qp_type == IB_QPT_GSI) {
-                       wqe_p->pkeyi = ud_wr(send_wr)->pkey_index;
-#ifdef DEBUG_GSI_SEND_WR
-                       trace_ud_wr(ud_wr(send_wr));
-#endif /* DEBUG_GSI_SEND_WR */
-               }
-               break;
-
-       case IB_QPT_UC:
-               if (send_wr->send_flags & IB_SEND_FENCE)
-                       wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
-               /* no break is intentional here */
-       case IB_QPT_RC:
-               /* TODO: atomic not implemented */
-               wqe_p->u.nud.remote_virtual_address =
-                       rdma_wr(send_wr)->remote_addr;
-               wqe_p->u.nud.rkey = rdma_wr(send_wr)->rkey;
-
-               /*
-                * omitted checking of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               dma_length = 0;
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.nud.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.nud.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.nud.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-                       dma_length += send_wr->sg_list[idx].length;
-               } /* eof idx */
-               wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
-
-               /* unsolicited ack circumvention */
-               if (send_wr->opcode == IB_WR_RDMA_READ) {
-                       /* on RDMA read, switch on and reset counters */
-                       qp->message_count = qp->packet_count = 0;
-                       qp->unsol_ack_circ = 1;
-               } else
-                       /* else estimate #packets */
-                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
-
-               break;
-
-       default:
-               ehca_gen_err("Invalid qptype=%x", qp->qp_type);
-               return -EINVAL;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
-               ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
-       }
-       return 0;
-}
-
-/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
-static inline void map_ib_wc_status(u32 cqe_status,
-                                   enum ib_wc_status *wc_status)
-{
-       if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
-               switch (cqe_status & 0x3F) {
-               case 0x01:
-               case 0x21:
-                       *wc_status = IB_WC_LOC_LEN_ERR;
-                       break;
-               case 0x02:
-               case 0x22:
-                       *wc_status = IB_WC_LOC_QP_OP_ERR;
-                       break;
-               case 0x03:
-               case 0x23:
-                       *wc_status = IB_WC_LOC_EEC_OP_ERR;
-                       break;
-               case 0x04:
-               case 0x24:
-                       *wc_status = IB_WC_LOC_PROT_ERR;
-                       break;
-               case 0x05:
-               case 0x25:
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               case 0x06:
-                       *wc_status = IB_WC_MW_BIND_ERR;
-                       break;
-               case 0x07: /* remote error - look into bits 20:24 */
-                       switch ((cqe_status
-                                & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
-                       case 0x0:
-                               /*
-                                * PSN Sequence Error!
-                                * couldn't find a matching status!
-                                */
-                               *wc_status = IB_WC_GENERAL_ERR;
-                               break;
-                       case 0x1:
-                               *wc_status = IB_WC_REM_INV_REQ_ERR;
-                               break;
-                       case 0x2:
-                               *wc_status = IB_WC_REM_ACCESS_ERR;
-                               break;
-                       case 0x3:
-                               *wc_status = IB_WC_REM_OP_ERR;
-                               break;
-                       case 0x4:
-                               *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
-                               break;
-                       }
-                       break;
-               case 0x08:
-                       *wc_status = IB_WC_RETRY_EXC_ERR;
-                       break;
-               case 0x09:
-                       *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
-                       break;
-               case 0x0A:
-               case 0x2D:
-                       *wc_status = IB_WC_REM_ABORT_ERR;
-                       break;
-               case 0x0B:
-               case 0x2E:
-                       *wc_status = IB_WC_INV_EECN_ERR;
-                       break;
-               case 0x0C:
-               case 0x2F:
-                       *wc_status = IB_WC_INV_EEC_STATE_ERR;
-                       break;
-               case 0x0D:
-                       *wc_status = IB_WC_BAD_RESP_ERR;
-                       break;
-               case 0x10:
-                       /* WQE purged */
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               default:
-                       *wc_status = IB_WC_FATAL_ERR;
-
-               }
-       } else
-               *wc_status = IB_WC_SUCCESS;
-}
-
-static inline int post_one_send(struct ehca_qp *my_qp,
-                        struct ib_send_wr *cur_send_wr,
-                        int hidden)
-{
-       struct ehca_wqe *wqe_p;
-       int ret;
-       u32 sq_map_idx;
-       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-
-       /* get pointer next to free WQE */
-       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-       if (unlikely(!wqe_p)) {
-               /* too many posted work requests: queue overflow */
-               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -ENOMEM;
-       }
-
-       /*
-        * Get the index of the WQE in the send queue. The same index is used
-        * for writing into the sq_map.
-        */
-       sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
-
-       /* write a SEND WQE into the QUEUE */
-       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
-       /*
-        * if something failed,
-        * reset the free entry pointer to the start value
-        */
-       if (unlikely(ret)) {
-               my_qp->ipz_squeue.current_q_offset = start_offset;
-               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_post_send(struct ib_qp *qp,
-                  struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       int wqe_cnt = 0;
-       int ret = 0;
-       unsigned long flags;
-
-       /* Reject WR if QP is in RESET, INIT or RTR state */
-       if (unlikely(my_qp->state < IB_QPS_RTS)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-
-       /* Send an empty extra RDMA read if:
-        *  1) there has been an RDMA read on this connection before
-        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
-        *  3) we can be sure that any previous extra RDMA read has been
-        *     processed so we don't overflow the SQ
-        */
-       if (unlikely(my_qp->unsol_ack_circ &&
-                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
-                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
-               /* insert an empty RDMA READ to fix up the remote QP state */
-               struct ib_send_wr circ_wr;
-               memset(&circ_wr, 0, sizeof(circ_wr));
-               circ_wr.opcode = IB_WR_RDMA_READ;
-               post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
-               wqe_cnt++;
-               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
-               my_qp->message_count = my_qp->packet_count = 0;
-       }
-
-       /* loop processes list of send reqs */
-       while (send_wr) {
-               ret = post_one_send(my_qp, send_wr, 0);
-               if (unlikely(ret)) {
-                       goto post_send_exit0;
-               }
-               wqe_cnt++;
-               send_wr = send_wr->next;
-       }
-
-post_send_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_sqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-               ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                        my_qp, qp->qp_num, wqe_cnt, ret);
-       my_qp->message_count += wqe_cnt;
-       spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-
-out:
-       if (ret)
-               *bad_send_wr = send_wr;
-       return ret;
-}
-
-static int internal_post_recv(struct ehca_qp *my_qp,
-                             struct ib_device *dev,
-                             struct ib_recv_wr *recv_wr,
-                             struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_wqe *wqe_p;
-       int wqe_cnt = 0;
-       int ret = 0;
-       u32 rq_map_idx;
-       unsigned long flags;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (unlikely(!HAS_RQ(my_qp))) {
-               ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
-                        my_qp, my_qp->real_qp_num, my_qp->ext_type);
-               ret = -ENODEV;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_r, flags);
-
-       /* loop processes list of recv reqs */
-       while (recv_wr) {
-               u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
-               /* get pointer next to free WQE */
-               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
-               if (unlikely(!wqe_p)) {
-                       /* too many posted work requests: queue overflow */
-                       ret = -ENOMEM;
-                       ehca_err(dev, "Too many posted WQEs "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-               /*
-                * Get the index of the WQE in the recv queue. The same index
-                * is used for writing into the rq_map.
-                */
-               rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
-
-               /* write a RECV WQE into the QUEUE */
-               ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
-                               rq_map_idx);
-               /*
-                * if something failed,
-                * reset the free entry pointer to the start value
-                */
-               if (unlikely(ret)) {
-                       my_qp->ipz_rqueue.current_q_offset = start_offset;
-                       ret = -EINVAL;
-                       ehca_err(dev, "Could not write WQE "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-
-               qmap_entry = &my_qp->rq_map.map[rq_map_idx];
-               qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
-               qmap_entry->reported = 0;
-               qmap_entry->cqe_req = 1;
-
-               wqe_cnt++;
-               recv_wr = recv_wr->next;
-       } /* eof for recv_wr */
-
-post_recv_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_rqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-           ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                    my_qp, my_qp->real_qp_num, wqe_cnt, ret);
-       spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
-
-out:
-       if (ret)
-               *bad_recv_wr = recv_wr;
-
-       return ret;
-}
-
-int ehca_post_recv(struct ib_qp *qp,
-                  struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-
-       /* Reject WR if QP is in RESET state */
-       if (unlikely(my_qp->state == IB_QPS_RESET)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               *bad_recv_wr = recv_wr;
-               return -EINVAL;
-       }
-
-       return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
-}
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr)
-{
-       return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
-                                 srq->device, recv_wr, bad_recv_wr);
-}
-
-/*
- * ib_wc_opcode table converts ehca wc opcode to ib
- * Since we use zero to indicate invalid opcode, the actual ib opcode must
- * be decremented!!!
- */
-static const u8 ib_wc_opcode[255] = {
-       [0x01] = IB_WC_RECV+1,
-       [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
-       [0x08] = IB_WC_FETCH_ADD+1,
-       [0x10] = IB_WC_COMP_SWAP+1,
-       [0x20] = IB_WC_RDMA_WRITE+1,
-       [0x40] = IB_WC_RDMA_READ+1,
-       [0x80] = IB_WC_SEND+1
-};
-
-/* internal function to poll one entry of cq */
-static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
-{
-       int ret = 0, qmap_tail_idx;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       struct ehca_cqe *cqe;
-       struct ehca_qp *my_qp;
-       struct ehca_qmap_entry *qmap_entry;
-       struct ehca_queue_map *qmap;
-       int cqe_count = 0, is_error;
-
-repoll:
-       cqe = (struct ehca_cqe *)
-               ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
-       if (!cqe) {
-               ret = -EAGAIN;
-               if (ehca_debug_level >= 3)
-                       ehca_dbg(cq->device, "Completion queue is empty  "
-                                "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
-               goto poll_cq_one_exit0;
-       }
-
-       /* prevents loads being reordered across this point */
-       rmb();
-
-       cqe_count++;
-       if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
-               struct ehca_qp *qp;
-               int purgeflag;
-               unsigned long flags;
-
-               qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
-               if (!qp) {
-                       ehca_err(cq->device, "cq_num=%x qp_num=%x "
-                                "could not find qp -> ignore cqe",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       /* ignore this purged cqe */
-                       goto repoll;
-               }
-               spin_lock_irqsave(&qp->spinlock_s, flags);
-               purgeflag = qp->sqerr_purgeflag;
-               spin_unlock_irqrestore(&qp->spinlock_s, flags);
-
-               if (purgeflag) {
-                       ehca_dbg(cq->device,
-                                "Got CQE with purged bit qp_num=%x src_qp=%x",
-                                cqe->local_qp_number, cqe->remote_qp_number);
-                       if (ehca_debug_level >= 2)
-                               ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
-                                        cqe->local_qp_number,
-                                        cqe->remote_qp_number);
-                       /*
-                        * ignore this to avoid double cqes of bad wqe
-                        * that caused sqe and turn off purge flag
-                        */
-                       qp->sqerr_purgeflag = 0;
-                       goto repoll;
-               }
-       }
-
-       is_error = cqe->status & WC_STATUS_ERROR_BIT;
-
-       /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
-       if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
-               ehca_dbg(cq->device,
-                        "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
-                        is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               ehca_dbg(cq->device,
-                        "ehca_cq=%p cq_num=%x -------------------------",
-                        my_cq, my_cq->cq_number);
-       }
-
-       read_lock(&ehca_qp_idr_lock);
-       my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
-       read_unlock(&ehca_qp_idr_lock);
-       if (!my_qp)
-               goto repoll;
-       wc->qp = &my_qp->ib_qp;
-
-       qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
-       if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
-               /* We got a send completion. */
-               qmap = &my_qp->sq_map;
-       else
-               /* We got a receive completion. */
-               qmap = &my_qp->rq_map;
-
-       /* advance the tail pointer */
-       qmap->tail = qmap_tail_idx;
-
-       if (is_error) {
-               /*
-                * set left_to_poll to 0 because in error state, we will not
-                * get any additional CQEs
-                */
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               my_qp->sq_map.left_to_poll = 0;
-               ehca_add_to_err_list(my_qp, 1);
-
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               my_qp->rq_map.left_to_poll = 0;
-               if (HAS_RQ(my_qp))
-                       ehca_add_to_err_list(my_qp, 0);
-       }
-
-       qmap_entry = &qmap->map[qmap_tail_idx];
-       if (qmap_entry->reported) {
-               ehca_warn(cq->device, "Double cqe on qp_num=%#x",
-                               my_qp->real_qp_num);
-               /* found a double cqe, discard it and read next one */
-               goto repoll;
-       }
-
-       wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
-       qmap_entry->reported = 1;
-
-       /* if left_to_poll is decremented to 0, add the QP to the error list */
-       if (qmap->left_to_poll > 0) {
-               qmap->left_to_poll--;
-               if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-                       ehca_add_to_err_list(my_qp, 1);
-                       if (HAS_RQ(my_qp))
-                               ehca_add_to_err_list(my_qp, 0);
-               }
-       }
-
-       /* eval ib_wc_opcode */
-       wc->opcode = ib_wc_opcode[cqe->optype]-1;
-       if (unlikely(wc->opcode == -1)) {
-               ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
-                        "ehca_cq=%p cq_num=%x",
-                        cqe->optype, cqe->status, my_cq, my_cq->cq_number);
-               /* dump cqe for other infos */
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               /* update also queue adder to throw away this entry!!! */
-               goto repoll;
-       }
-
-       /* eval ib_wc_status */
-       if (unlikely(is_error)) {
-               /* complete with errors */
-               map_ib_wc_status(cqe->status, &wc->status);
-               wc->vendor_err = wc->status;
-       } else
-               wc->status = IB_WC_SUCCESS;
-
-       wc->byte_len = cqe->nr_bytes_transferred;
-       wc->pkey_index = cqe->pkey_index;
-       wc->slid = cqe->rlid;
-       wc->dlid_path_bits = cqe->dlid;
-       wc->src_qp = cqe->remote_qp_number;
-       /*
-        * HW has "Immed data present" and "GRH present" in bits 6 and 5.
-        * SW defines those in bits 1 and 0, so we can just shift and mask.
-        */
-       wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
-       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
-       wc->sl = cqe->service_level;
-
-poll_cq_one_exit0:
-       if (cqe_count > 0)
-               hipz_update_feca(my_cq, cqe_count);
-
-       return ret;
-}
-
-static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
-                              struct ib_wc *wc, int num_entries,
-                              struct ipz_queue *ipz_queue, int on_sq)
-{
-       int nr = 0;
-       struct ehca_wqe *wqe;
-       u64 offset;
-       struct ehca_queue_map *qmap;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (on_sq)
-               qmap = &my_qp->sq_map;
-       else
-               qmap = &my_qp->rq_map;
-
-       qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-       while ((nr < num_entries) && (qmap_entry->reported == 0)) {
-               /* generate flush CQE */
-
-               memset(wc, 0, sizeof(*wc));
-
-               offset = qmap->next_wqe_idx * ipz_queue->qe_size;
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
-               if (!wqe) {
-                       ehca_err(cq->device, "Invalid wqe offset=%#llx on "
-                                "qp_num=%#x", offset, my_qp->real_qp_num);
-                       return nr;
-               }
-
-               wc->wr_id = replace_wr_id(wqe->work_request_id,
-                                         qmap_entry->app_wr_id);
-
-               if (on_sq) {
-                       switch (wqe->optype) {
-                       case WQE_OPTYPE_SEND:
-                               wc->opcode = IB_WC_SEND;
-                               break;
-                       case WQE_OPTYPE_RDMAWRITE:
-                               wc->opcode = IB_WC_RDMA_WRITE;
-                               break;
-                       case WQE_OPTYPE_RDMAREAD:
-                               wc->opcode = IB_WC_RDMA_READ;
-                               break;
-                       default:
-                               ehca_err(cq->device, "Invalid optype=%x",
-                                               wqe->optype);
-                               return nr;
-                       }
-               } else
-                       wc->opcode = IB_WC_RECV;
-
-               if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
-                       wc->ex.imm_data = wqe->immediate_data;
-                       wc->wc_flags |= IB_WC_WITH_IMM;
-               }
-
-               wc->status = IB_WC_WR_FLUSH_ERR;
-
-               wc->qp = &my_qp->ib_qp;
-
-               /* mark as reported and advance next_wqe pointer */
-               qmap_entry->reported = 1;
-               qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
-                                               qmap->entries);
-               qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-               wc++; nr++;
-       }
-
-       return nr;
-
-}
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int nr;
-       struct ehca_qp *err_qp;
-       struct ib_wc *current_wc = wc;
-       int ret = 0;
-       unsigned long flags;
-       int entries_left = num_entries;
-
-       if (num_entries < 1) {
-               ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
-                        "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
-               ret = -EINVAL;
-               goto poll_cq_exit0;
-       }
-
-       spin_lock_irqsave(&my_cq->spinlock, flags);
-
-       /* generate flush cqes for send queues */
-       list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_squeue, 1);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       /* generate flush cqes for receive queues */
-       list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_rqueue, 0);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       for (nr = 0; nr < entries_left; nr++) {
-               ret = ehca_poll_cq_one(cq, current_wc);
-               if (ret)
-                       break;
-               current_wc++;
-       } /* eof for nr */
-       entries_left -= nr;
-
-       spin_unlock_irqrestore(&my_cq->spinlock, flags);
-       if (ret == -EAGAIN  || !ret)
-               ret = num_entries - entries_left;
-
-poll_cq_exit0:
-       return ret;
-}
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int ret = 0;
-
-       switch (notify_flags & IB_CQ_SOLICITED_MASK) {
-       case IB_CQ_SOLICITED:
-               hipz_set_cqx_n0(my_cq, 1);
-               break;
-       case IB_CQ_NEXT_COMP:
-               hipz_set_cqx_n1(my_cq, 1);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-               unsigned long spl_flags;
-               spin_lock_irqsave(&my_cq->spinlock, spl_flags);
-               ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
-               spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
-       }
-
-       return ret;
-}
diff --git a/drivers/staging/rdma/ehca/ehca_sqp.c b/drivers/staging/rdma/ehca/ehca_sqp.c
deleted file mode 100644 (file)
index 376b031..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  SQP functions
- *
- *  Authors: Khadija Souissi <souissi@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <rdma/ib_mad.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define IB_MAD_STATUS_REDIRECT         cpu_to_be16(0x0002)
-#define IB_MAD_STATUS_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_MAD_STATUS_UNSUP_METHOD     cpu_to_be16(0x0008)
-
-#define IB_PMA_CLASS_PORT_INFO         cpu_to_be16(0x0001)
-
-/**
- * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
- * pair is created successfully, the corresponding port gets active.
- *
- * Define Special Queue pair 0 (SMI QP) is still not supported.
- *
- * @qp_init_attr: Queue pair init attributes with port and queue pair type
- */
-
-u64 ehca_define_sqp(struct ehca_shca *shca,
-                   struct ehca_qp *ehca_qp,
-                   struct ib_qp_init_attr *qp_init_attr)
-{
-       u32 pma_qp_nr, bma_qp_nr;
-       u64 ret;
-       u8 port = qp_init_attr->port_num;
-       int counter;
-
-       shca->sport[port - 1].port_state = IB_PORT_DOWN;
-
-       switch (qp_init_attr->qp_type) {
-       case IB_QPT_SMI:
-               /* function not supported yet */
-               break;
-       case IB_QPT_GSI:
-               ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
-                                        ehca_qp->ipz_qp_handle,
-                                        ehca_qp->galpas.kernel,
-                                        (u32) qp_init_attr->port_num,
-                                        &pma_qp_nr, &bma_qp_nr);
-
-               if (ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device,
-                                "Can't define AQP1 for port %x. h_ret=%lli",
-                                port, ret);
-                       return ret;
-               }
-               shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
-               ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
-                        port, pma_qp_nr);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "invalid qp_type=%x",
-                        qp_init_attr->qp_type);
-               return H_PARAMETER;
-       }
-
-       if (ehca_nr_ports < 0) /* autodetect mode */
-               return H_SUCCESS;
-
-       for (counter = 0;
-            shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
-                    counter < ehca_port_act_time;
-            counter++) {
-               ehca_dbg(&shca->ib_device, "... wait until port %x is active",
-                        port);
-               msleep_interruptible(1000);
-       }
-
-       if (counter == ehca_port_act_time) {
-               ehca_err(&shca->ib_device, "Port %x is not active.", port);
-               return H_HARDWARE;
-       }
-
-       return H_SUCCESS;
-}
-
-struct ib_perf {
-       struct ib_mad_hdr mad_hdr;
-       u8 reserved[40];
-       u8 data[192];
-} __attribute__ ((packed));
-
-/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
-struct tcslfl {
-       u32 tc:8;
-       u32 sl:4;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-/* IP Version/TC/FL packed into 32 bits, as in GRH */
-struct vertcfl {
-       u32 ver:4;
-       u32 tc:8;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
-                            const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                            const struct ib_mad *in_mad, struct ib_mad *out_mad)
-{
-       const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
-       struct ib_perf *out_perf = (struct ib_perf *)out_mad;
-       struct ib_class_port_info *poi =
-               (struct ib_class_port_info *)out_perf->data;
-       struct tcslfl *tcslfl =
-               (struct tcslfl *)&poi->redirect_tcslfl;
-       struct ehca_shca *shca =
-               container_of(ibdev, struct ehca_shca, ib_device);
-       struct ehca_sport *sport = &shca->sport[port_num - 1];
-
-       ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
-
-       *out_mad = *in_mad;
-
-       if (in_perf->mad_hdr.class_version != 1) {
-               ehca_warn(ibdev, "Unsupported class_version=%x",
-                         in_perf->mad_hdr.class_version);
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
-               goto perf_reply;
-       }
-
-       switch (in_perf->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-       case IB_MGMT_METHOD_SET:
-               /* set class port info for redirection */
-               out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
-               out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
-               memset(poi, 0, sizeof(*poi));
-               poi->base_version = 1;
-               poi->class_version = 1;
-               poi->resp_time_value = 18;
-
-               /* copy local routing information from WC where applicable */
-               tcslfl->sl         = in_wc->sl;
-               poi->redirect_lid  =
-                       sport->saved_attr.lid | in_wc->dlid_path_bits;
-               poi->redirect_qp   = sport->pma_qp_nr;
-               poi->redirect_qkey = IB_QP1_QKEY;
-
-               ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
-                               &poi->redirect_pkey);
-
-               /* if request was globally routed, copy route info */
-               if (in_grh) {
-                       const struct vertcfl *vertcfl =
-                               (const struct vertcfl *)&in_grh->version_tclass_flow;
-                       memcpy(poi->redirect_gid, in_grh->dgid.raw,
-                              sizeof(poi->redirect_gid));
-                       tcslfl->tc        = vertcfl->tc;
-                       tcslfl->fl        = vertcfl->fl;
-               } else
-                       /* else only fill in default GID */
-                       ehca_query_gid(ibdev, port_num, 0,
-                                      (union ib_gid *)&poi->redirect_gid);
-
-               ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
-                        sport->saved_attr.lid, sport->pma_qp_nr);
-               break;
-
-       case IB_MGMT_METHOD_GET_RESP:
-               return IB_MAD_RESULT_FAILURE;
-
-       default:
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
-               break;
-       }
-
-perf_reply:
-       out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
-
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index)
-{
-       int ret;
-       const struct ib_mad *in_mad = (const struct ib_mad *)in;
-       struct ib_mad *out_mad = (struct ib_mad *)out;
-
-       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-                        *out_mad_size != sizeof(*out_mad)))
-               return IB_MAD_RESULT_FAILURE;
-
-       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
-               return IB_MAD_RESULT_FAILURE;
-
-       /* accept only pma request */
-       if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
-               return IB_MAD_RESULT_SUCCESS;
-
-       ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
-       ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
-                               in_mad, out_mad);
-
-       return ret;
-}
diff --git a/drivers/staging/rdma/ehca/ehca_tools.h b/drivers/staging/rdma/ehca/ehca_tools.h
deleted file mode 100644 (file)
index d280b12..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  auxiliary functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef EHCA_TOOLS_H
-#define EHCA_TOOLS_H
-
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-#include <linux/idr.h>
-#include <linux/kthread.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/vmalloc.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/device.h>
-
-#include <linux/atomic.h>
-#include <asm/ibmebus.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/hvcall.h>
-
-extern int ehca_debug_level;
-
-#define ehca_dbg(ib_dev, format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
-                                  "PU%04x EHCA_DBG:%s " format "\n", \
-                                  raw_smp_processor_id(), __func__, \
-                                  ## arg); \
-       } while (0)
-
-#define ehca_info(ib_dev, format, arg...) \
-       dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_warn(ib_dev, format, arg...) \
-       dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_err(ib_dev, format, arg...) \
-       dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
-               raw_smp_processor_id(), __func__, ## arg)
-
-/* use this one only if no ib_dev available */
-#define ehca_gen_dbg(format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
-                              raw_smp_processor_id(), __func__, ## arg); \
-       } while (0)
-
-#define ehca_gen_warn(format, arg...) \
-       printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_gen_err(format, arg...) \
-       printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-/**
- * ehca_dmp - printk a memory block, whose length is n*8 bytes.
- * Each line has the following layout:
- * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
- */
-#define ehca_dmp(adr, len, format, args...) \
-       do { \
-               unsigned int x; \
-               unsigned int l = (unsigned int)(len); \
-               unsigned char *deb = (unsigned char *)(adr); \
-               for (x = 0; x < l; x += 16) { \
-                       printk(KERN_INFO "EHCA_DMP:%s " format \
-                              " adr=%p ofs=%04x %016llx %016llx\n", \
-                              __func__, ##args, deb, x, \
-                              *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
-                       deb += 16; \
-               } \
-       } while (0)
-
-/* define a bitmask, little endian version */
-#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
-
-/* define a bitmask, the ibm way... */
-#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
-
-/* internal function, don't use */
-#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
-
-/* internal function, don't use */
-#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
-
-/**
- * EHCA_BMASK_SET - return value shifted and masked by mask
- * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
- * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
- * in variable
- */
-#define EHCA_BMASK_SET(mask, value) \
-       ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
-
-/**
- * EHCA_BMASK_GET - extract a parameter from value by mask
- */
-#define EHCA_BMASK_GET(mask, value) \
-       (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
-
-/* Converts ehca to ib return code */
-int ehca2ib_return_code(u64 ehca_rc);
-
-#endif /* EHCA_TOOLS_H */
diff --git a/drivers/staging/rdma/ehca/ehca_uverbs.c b/drivers/staging/rdma/ehca/ehca_uverbs.c
deleted file mode 100644 (file)
index 1a1d5d9..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  userspace support verbs
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata)
-{
-       struct ehca_ucontext *my_context;
-
-       my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
-       if (!my_context) {
-               ehca_err(device, "Out of memory device=%p", device);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       return &my_context->ib_ucontext;
-}
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context)
-{
-       kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
-       return 0;
-}
-
-static void ehca_mm_open(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)++;
-       if (!(*count))
-               ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static void ehca_mm_close(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)--;
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static const struct vm_operations_struct vm_ops = {
-       .open = ehca_mm_open,
-       .close = ehca_mm_close,
-};
-
-static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
-                       u32 *mm_count)
-{
-       int ret;
-       u64 vsize, physical;
-
-       vsize = vma->vm_end - vma->vm_start;
-       if (vsize < EHCA_PAGESIZE) {
-               ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
-               return -EINVAL;
-       }
-
-       physical = galpas->user.fw_handle;
-       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-       ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
-       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
-       ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
-                          vma->vm_page_prot);
-       if (unlikely(ret)) {
-               ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
-               return -ENOMEM;
-       }
-
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
-                          u32 *mm_count)
-{
-       int ret;
-       u64 start, ofs;
-       struct page *page;
-
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-       start = vma->vm_start;
-       for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
-               u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
-               page = virt_to_page(virt_addr);
-               ret = vm_insert_page(vma, start, page);
-               if (unlikely(ret)) {
-                       ehca_gen_err("vm_insert_page() failed rc=%i", ret);
-                       return ret;
-               }
-               start += PAGE_SIZE;
-       }
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
-               ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_fw() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* cq queue_addr */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
-               ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_queue() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
-                        rsrc_type, cq->cq_number);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
-               ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "remap_pfn_range() failed ret=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return -ENOMEM;
-               }
-               break;
-
-       case 1: /* qp rqueue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
-                                     &qp->mm_count_rqueue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       case 2: /* qp squeue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
-                                     &qp->mm_count_squeue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
-                        rsrc_type, qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       u64 fileoffset = vma->vm_pgoff;
-       u32 idr_handle = fileoffset & 0x1FFFFFF;
-       u32 q_type = (fileoffset >> 27) & 0x1;    /* CQ, QP,...        */
-       u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
-       u32 ret;
-       struct ehca_cq *cq;
-       struct ehca_qp *qp;
-       struct ib_uobject *uobject;
-
-       switch (q_type) {
-       case  0: /* CQ */
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, idr_handle);
-               read_unlock(&ehca_cq_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!cq)
-                       return -EINVAL;
-
-               if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_cq(vma, cq, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_cq() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* QP */
-               read_lock(&ehca_qp_idr_lock);
-               qp = idr_find(&ehca_qp_idr, idr_handle);
-               read_unlock(&ehca_qp_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!qp)
-                       return -EINVAL;
-
-               uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
-               if (!uobject || uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_qp(vma, qp, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_qp() failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_gen_err("bad queue type %x", q_type);
-               return -EINVAL;
-       }
-
-       return 0;
-}
diff --git a/drivers/staging/rdma/ehca/hcp_if.c b/drivers/staging/rdma/ehca/hcp_if.c
deleted file mode 100644 (file)
index 89517ff..0000000
+++ /dev/null
@@ -1,949 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <asm/hvcall.h>
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hcp_phyp.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
-#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
-#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
-#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
-#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
-#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
-#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
-#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
-#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
-#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
-#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
-
-#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
-#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
-#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
-
-#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
-#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
-#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
-
-#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
-#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
-
-#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
-
-#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
-#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
-#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
-
-#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
-#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
-#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
-
-static DEFINE_SPINLOCK(hcall_lock);
-
-static long ehca_plpar_hcall_norets(unsigned long opcode,
-                                   unsigned long arg1,
-                                   unsigned long arg2,
-                                   unsigned long arg3,
-                                   unsigned long arg4,
-                                   unsigned long arg5,
-                                   unsigned long arg6,
-                                   unsigned long arg7)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
-                            opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
-                                        arg5, arg6, arg7);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS)
-                       ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
-                                    opcode, ret, arg1, arg2, arg3,
-                                    arg4, arg5, arg6, arg7);
-               else
-                       if (unlikely(ehca_debug_level >= 2))
-                               ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
-
-               return ret;
-       }
-
-       return H_BUSY;
-}
-
-static long ehca_plpar_hcall9(unsigned long opcode,
-                             unsigned long *outs, /* array of 9 outputs */
-                             unsigned long arg1,
-                             unsigned long arg2,
-                             unsigned long arg3,
-                             unsigned long arg4,
-                             unsigned long arg5,
-                             unsigned long arg6,
-                             unsigned long arg7,
-                             unsigned long arg8,
-                             unsigned long arg9)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
-                            arg1, arg2, arg3, arg4, arg5,
-                            arg6, arg7, arg8, arg9);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall9(opcode, outs,
-                                  arg1, arg2, arg3, arg4, arg5,
-                                  arg6, arg7, arg8, arg9);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS) {
-                       ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
-                                    opcode, arg1, arg2, arg3, arg4, arg5,
-                                    arg6, arg7, arg8, arg9);
-                       ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               } else if (unlikely(ehca_debug_level >= 2))
-                       ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               return ret;
-       }
-
-       return H_BUSY;
-}
-
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 *act_nr_of_entries,
-                            u32 *act_pages,
-                            u32 *eq_ist)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       u64 allocate_controls;
-
-       /* resource type */
-       allocate_controls = 3ULL;
-
-       /* ISN is associated */
-       if (neq_control != 1)
-               allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
-       else /* notification event queue */
-               allocate_controls = (1ULL << 63) | allocate_controls;
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,  /* r4 */
-                               allocate_controls,      /* r5 */
-                               number_of_entries,      /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       eq_handle->handle = outs[0];
-       *act_nr_of_entries = (u32)outs[3];
-       *act_pages = (u32)outs[4];
-       *eq_ist = (u32)outs[5];
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resource - ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask)
-{
-       return ehca_plpar_hcall_norets(H_RESET_EVENTS,
-                                      adapter_handle.handle, /* r4 */
-                                      eq_handle.handle,      /* r5 */
-                                      event_mask,            /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param)
-{
-       int rc;
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,   /* r4  */
-                               2,                       /* r5  */
-                               param->eq_handle.handle, /* r6  */
-                               cq->token,               /* r7  */
-                               param->nr_cqe,           /* r8  */
-                               0, 0, 0, 0);
-       cq->ipz_cq_handle.handle = outs[0];
-       param->act_nr_of_entries = (u32)outs[3];
-       param->act_pages = (u32)outs[4];
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[5]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               cq->ipz_cq_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user)
-{
-       int rc;
-       u64 ret;
-       u64 allocate_controls, max_r10_reg, r11, r12;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       allocate_controls =
-               EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
-                                parms->squeue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
-                                parms->rqueue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_RECV_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_SEND_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
-                                parms->ud_av_l_key_ctl)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
-
-       max_r10_reg =
-               EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
-                              parms->squeue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
-                                parms->rqueue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
-                                parms->squeue.max_sge)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
-                                parms->rqueue.max_sge);
-
-       r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
-
-       if (parms->ext_type == EQPT_SRQ)
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
-       else
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,             /* r4  */
-                               allocate_controls,                 /* r5  */
-                               parms->send_cq_handle.handle,
-                               parms->recv_cq_handle.handle,
-                               parms->eq_handle.handle,
-                               ((u64)parms->token << 32) | parms->pd.value,
-                               max_r10_reg, r11, r12);
-
-       parms->qp_handle.handle = outs[0];
-       parms->real_qp_num = (u32)outs[1];
-       parms->squeue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
-       parms->rqueue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
-       parms->squeue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
-       parms->rqueue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
-       parms->squeue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
-       parms->rqueue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[6]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               parms->qp_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block)
-{
-       u64 ret;
-       u64 r_cb = __pa(query_port_response_block);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response block not page aligned");
-               return H_PARAMETER;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
-                                     adapter_handle.handle, /* r4 */
-                                     port_id,               /* r5 */
-                                     r_cb,                  /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(query_port_response_block, 64, "response_block");
-
-       return ret;
-}
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask)
-{
-       u64 port_attributes = port_cap;
-
-       if (modify_mask & IB_PORT_SHUTDOWN)
-               port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
-       if (modify_mask & IB_PORT_INIT_TYPE)
-               port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
-       if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
-               port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
-
-       return ehca_plpar_hcall_norets(H_MODIFY_PORT,
-                                      adapter_handle.handle, /* r4 */
-                                      port_id,               /* r5 */
-                                      port_attributes,       /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock)
-{
-       u64 r_cb = __pa(query_hca_rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response_block=%p not page aligned",
-                            query_hca_rblock);
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_QUERY_HCA,
-                                      adapter_handle.handle, /* r4 */
-                                      r_cb,                  /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count)
-{
-       return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
-                                      adapter_handle.handle,      /* r4  */
-                                      (u64)queue_type | ((u64)pagesize) << 8,
-                                      /* r5  */
-                                      resource_handle,            /* r6  */
-                                      logical_address_of_page,    /* r7  */
-                                      count,                      /* r8  */
-                                      0, 0);
-}
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       if (count != 1) {
-               ehca_gen_err("Ppage counter=%llx", count);
-               return H_PARAMETER;
-       }
-       return hipz_h_register_rpage(adapter_handle,
-                                    pagesize,
-                                    queue_type,
-                                    eq_handle.handle,
-                                    logical_address_of_page, count);
-}
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
-                          u32 ist)
-{
-       u64 ret;
-       ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
-                                     adapter_handle.handle, /* r4 */
-                                     ist,                   /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret != H_SUCCESS && ret != H_BUSY)
-               ehca_gen_err("Could not query interrupt state.");
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal)
-{
-       if (count != 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    cq_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa)
-{
-       if (count > 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    qp_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe2processed,
-                              void **log_addr_next_rq_wqe2processed,
-                              int dis_and_get_function_code)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               dis_and_get_function_code, /* r5 */
-                               qp_handle.handle,          /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (log_addr_next_sq_wqe2processed)
-               *log_addr_next_sq_wqe2processed = (void *)outs[0];
-       if (log_addr_next_rq_wqe2processed)
-               *log_addr_next_rq_wqe2processed = (void *)outs[1];
-
-       return ret;
-}
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               update_mask,           /* r6 */
-                               __pa(mqpcb),           /* r7 */
-                               0, 0, 0, 0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Insufficient resources ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal)
-{
-       return ehca_plpar_hcall_norets(H_QUERY_QP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      __pa(qqpcb),           /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = hcp_galpas_dtor(&qp->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct qp->galpas");
-               return H_RESOURCE;
-       }
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               /* function code */
-                               1,                         /* r5 */
-                               qp->ipz_qp_handle.handle,  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (ret == H_HARDWARE)
-               ehca_gen_err("HCA not operational. ret=%lli", ret);
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     qp->ipz_qp_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource still in use. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port)
-{
-       return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      port,                  /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               port,                  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       *pma_qp_nr = (u32)outs[0];
-       *bma_qp_nr = (u32)outs[1];
-
-       if (ret == H_ALIAS_EXIST)
-               ehca_gen_err("AQP1 already exists. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       u64 ret;
-
-       ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
-                                     adapter_handle.handle,  /* r4 */
-                                     qp_handle.handle,       /* r5 */
-                                     mcg_dlid,               /* r6 */
-                                     interface_id,           /* r7 */
-                                     subnet_prefix,          /* r8 */
-                                     0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       return ehca_plpar_hcall_norets(H_DETACH_MCQP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      mcg_dlid,              /* r6 */
-                                      interface_id,          /* r7 */
-                                      subnet_prefix,         /* r8 */
-                                      0, 0);
-}
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&cq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct cp->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     cq->ipz_cq_handle.handle,  /* r5 */
-                                     force_flag != 0 ? 1L : 0L, /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&eq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct eq->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     eq->ipz_eq_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource in use. ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,            /* r4 */
-                               5,                                /* r5 */
-                               vaddr,                            /* r6 */
-                               length,                           /* r7 */
-                               (((u64)access_ctrl) << 32ULL),    /* r8 */
-                               pd.value,                         /* r9 */
-                               0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       u64 ret;
-
-       if (unlikely(ehca_debug_level >= 3)) {
-               if (count > 1) {
-                       u64 *kpage;
-                       int i;
-                       kpage = __va(logical_address_of_page);
-                       for (i = 0; i < count; i++)
-                               ehca_gen_dbg("kpage[%d]=%p",
-                                            i, (void *)kpage[i]);
-               } else
-                       ehca_gen_dbg("kpage=%p",
-                                    (void *)logical_address_of_page);
-       }
-
-       if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
-               ehca_gen_err("logical_address_of_page not on a 4k boundary "
-                            "adapter_handle=%llx mr=%p mr_handle=%llx "
-                            "pagesize=%x queue_type=%x "
-                            "logical_address_of_page=%llx count=%llx",
-                            adapter_handle.handle, mr,
-                            mr->ipz_mr_handle.handle, pagesize, queue_type,
-                            logical_address_of_page, count);
-               ret = H_PARAMETER;
-       } else
-               ret = hipz_h_register_rpage(adapter_handle, pagesize,
-                                           queue_type,
-                                           mr->ipz_mr_handle.handle,
-                                           logical_address_of_page, count);
-       return ret;
-}
-
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
-                               adapter_handle.handle,     /* r4 */
-                               mr->ipz_mr_handle.handle,  /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->len = outs[0];
-       outparms->vaddr = outs[1];
-       outparms->acl  = outs[4] >> 32;
-       outparms->lkey = (u32)(outs[5] >> 32);
-       outparms->rkey = (u32)(outs[5] & (0xffffffff));
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mr->ipz_mr_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mr->ipz_mr_handle.handle, /* r5 */
-                               vaddr_in,                 /* r6 */
-                               length,                   /* r7 */
-                               /* r8 */
-                               ((((u64)access_ctrl) << 32ULL) | pd.value),
-                               mr_addr_cb,               /* r9 */
-                               0, 0, 0);
-       outparms->vaddr = outs[1];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
-                               adapter_handle.handle,            /* r4 */
-                               orig_mr->ipz_mr_handle.handle,    /* r5 */
-                               vaddr_in,                         /* r6 */
-                               (((u64)access_ctrl) << 32ULL),    /* r7 */
-                               pd.value,                         /* r8 */
-                               0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,      /* r4 */
-                               6,                          /* r5 */
-                               pd.value,                   /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mw->ipz_mw_handle.handle, /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mw->ipz_mw_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count)
-{
-       u64 r_cb = __pa(rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("rblock not page aligned.");
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_ERROR_DATA,
-                                      adapter_handle.handle,
-                                      ressource_handle,
-                                      r_cb,
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_eoi(int irq)
-{
-       unsigned long xirr;
-
-       iosync();
-       xirr = (0xffULL << 24) | irq;
-
-       return plpar_hcall_norets(H_EOI, xirr);
-}
diff --git a/drivers/staging/rdma/ehca/hcp_if.h b/drivers/staging/rdma/ehca/hcp_if.h
deleted file mode 100644 (file)
index a46e514..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_IF_H__
-#define __HCP_IF_H__
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "hipz_hw.h"
-
-/*
- * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
- * resources, create the empty EQPT (ring).
- */
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 * act_nr_of_entries,
-                            u32 * act_pages,
-                            u32 * eq_ist);
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask);
-/*
- * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize
- * resources, create the empty CQPT (ring).
- */
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param);
-
-
-/*
- * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
- * initialize resources, create empty QPPTs (2 rings).
- */
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user);
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block);
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask);
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock);
-
-/*
- * hipz_h_register_rpage internal function in hcp_if.h for all
- * hcp_H_REGISTER_RPAGE calls.
- */
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count);
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle
-                          hcp_adapter_handle,
-                          u32 ist);
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal);
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa);
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe_tb_processed,
-                              void **log_addr_next_rq_wqe_tb_processed,
-                              int dis_and_get_function_code);
-enum hcall_sigt {
-       HCALL_SIGT_NO_CQE = 0,
-       HCALL_SIGT_BY_WQE = 1,
-       HCALL_SIGT_EVERY = 2
-};
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal);
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal);
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp);
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port);
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr);
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag);
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq);
-
-/*
- * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize
- * resources.
- */
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-/* hipz_h_query_mr queries MR in HW and FW */
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mr frees MR resources in HW and FW */
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr);
-
-/* hipz_h_reregister_pmr reregisters MR in HW and FW */
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_smr register shared MR in HW and FW */
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms);
-
-/*
- * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize
- * resources.
- */
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_query_mw queries MW in HW and FW */
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mw frees MW resources in HW and FW */
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw);
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count);
-u64 hipz_h_eoi(int irq);
-
-#endif /* __HCP_IF_H__ */
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.c b/drivers/staging/rdma/ehca/hcp_phyp.c
deleted file mode 100644 (file)
index 077376f..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *   load store abstraction for ehca register access with tracing
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-u64 hcall_map_page(u64 physaddr)
-{
-       return (u64)ioremap(physaddr, EHCA_PAGESIZE);
-}
-
-int hcall_unmap_page(u64 mapaddr)
-{
-       iounmap((volatile void __iomem *) mapaddr);
-       return 0;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user)
-{
-       if (!is_user) {
-               galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
-               if (!galpas->kernel.fw_handle)
-                       return -ENOMEM;
-       } else
-               galpas->kernel.fw_handle = 0;
-
-       galpas->user.fw_handle = paddr_user;
-
-       return 0;
-}
-
-int hcp_galpas_dtor(struct h_galpas *galpas)
-{
-       if (galpas->kernel.fw_handle) {
-               int ret = hcall_unmap_page(galpas->kernel.fw_handle);
-               if (ret)
-                       return ret;
-       }
-
-       galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
-
-       return 0;
-}
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.h b/drivers/staging/rdma/ehca/hcp_phyp.h
deleted file mode 100644 (file)
index d1b0299..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware calls
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_PHYP_H__
-#define __HCP_PHYP_H__
-
-
-/*
- * eHCA page (mapped into memory)
- * resource to access eHCA register pages in CPU address space
-*/
-struct h_galpa {
-       u64 fw_handle;
-       /* for pSeries this is a 64bit memory address where
-          I/O memory is mapped into CPU address space (kv) */
-};
-
-/*
- * resource to access eHCA address space registers, all types
- */
-struct h_galpas {
-       u32 pid;                /*PID of userspace galpa checking */
-       struct h_galpa user;    /* user space accessible resource,
-                                  set to 0 if unused */
-       struct h_galpa kernel;  /* kernel space accessible resource,
-                                  set to 0 if unused */
-};
-
-static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
-{
-       u64 addr = galpa.fw_handle + offset;
-       return *(volatile u64 __force *)addr;
-}
-
-static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
-{
-       u64 addr = galpa.fw_handle + offset;
-       *(volatile u64 __force *)addr = value;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user);
-
-int hcp_galpas_dtor(struct h_galpas *galpas);
-
-u64 hcall_map_page(u64 physaddr);
-
-int hcall_unmap_page(u64 mapaddr);
-
-#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns.h b/drivers/staging/rdma/ehca/hipz_fns.h
deleted file mode 100644 (file)
index 9dac93d..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_H__
-#define __HIPZ_FNS_H__
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-#include "hipz_fns_core.h"
-
-#define hipz_galpa_store_eq(gal, offset, value) \
-       hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_eq(gal, offset) \
-       hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qped(gal, offset, value) \
-       hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_qped(gal, offset) \
-       hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
-
-#define hipz_galpa_store_mrmw(gal, offset, value) \
-       hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_mrmw(gal, offset) \
-       hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
-
-#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns_core.h b/drivers/staging/rdma/ehca/hipz_fns_core.h
deleted file mode 100644 (file)
index 868735f..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_CORE_H__
-#define __HIPZ_FNS_CORE_H__
-
-#include "hcp_phyp.h"
-#include "hipz_hw.h"
-
-#define hipz_galpa_store_cq(gal, offset, value) \
-       hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_cq(gal, offset) \
-       hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qp(gal, offset, value) \
-       hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
-#define hipz_galpa_load_qp(gal, offset) \
-       hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
-
-static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
-                           EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
-}
-
-static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
-                           EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
-}
-
-static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
-{
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
-                           EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
-}
-
-static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n0_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
-                           EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
-                                          value));
-       cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
-}
-
-static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n1_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
-                           EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
-       cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
-}
-
-#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/staging/rdma/ehca/hipz_hw.h b/drivers/staging/rdma/ehca/hipz_hw.h
deleted file mode 100644 (file)
index bf996c7..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  eHCA register definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_HW_H__
-#define __HIPZ_HW_H__
-
-#include "ehca_tools.h"
-
-#define EHCA_MAX_MTU 4
-
-/* QP Table Entry Memory Map */
-struct hipz_qptemm {
-       u64 qpx_hcr;
-       u64 qpx_c;
-       u64 qpx_herr;
-       u64 qpx_aer;
-/* 0x20*/
-       u64 qpx_sqa;
-       u64 qpx_sqc;
-       u64 qpx_rqa;
-       u64 qpx_rqc;
-/* 0x40*/
-       u64 qpx_st;
-       u64 qpx_pmstate;
-       u64 qpx_pmfa;
-       u64 qpx_pkey;
-/* 0x60*/
-       u64 qpx_pkeya;
-       u64 qpx_pkeyb;
-       u64 qpx_pkeyc;
-       u64 qpx_pkeyd;
-/* 0x80*/
-       u64 qpx_qkey;
-       u64 qpx_dqp;
-       u64 qpx_dlidp;
-       u64 qpx_portp;
-/* 0xa0*/
-       u64 qpx_slidp;
-       u64 qpx_slidpp;
-       u64 qpx_dlida;
-       u64 qpx_porta;
-/* 0xc0*/
-       u64 qpx_slida;
-       u64 qpx_slidpa;
-       u64 qpx_slvl;
-       u64 qpx_ipd;
-/* 0xe0*/
-       u64 qpx_mtu;
-       u64 qpx_lato;
-       u64 qpx_rlimit;
-       u64 qpx_rnrlimit;
-/* 0x100*/
-       u64 qpx_t;
-       u64 qpx_sqhp;
-       u64 qpx_sqptp;
-       u64 qpx_nspsn;
-/* 0x120*/
-       u64 qpx_nspsnhwm;
-       u64 reserved1;
-       u64 qpx_sdsi;
-       u64 qpx_sdsbc;
-/* 0x140*/
-       u64 qpx_sqwsize;
-       u64 qpx_sqwts;
-       u64 qpx_lsn;
-       u64 qpx_nssn;
-/* 0x160 */
-       u64 qpx_mor;
-       u64 qpx_cor;
-       u64 qpx_sqsize;
-       u64 qpx_erc;
-/* 0x180*/
-       u64 qpx_rnrrc;
-       u64 qpx_ernrwt;
-       u64 qpx_rnrresp;
-       u64 qpx_lmsna;
-/* 0x1a0 */
-       u64 qpx_sqhpc;
-       u64 qpx_sqcptp;
-       u64 qpx_sigt;
-       u64 qpx_wqecnt;
-/* 0x1c0*/
-       u64 qpx_rqhp;
-       u64 qpx_rqptp;
-       u64 qpx_rqsize;
-       u64 qpx_nrr;
-/* 0x1e0*/
-       u64 qpx_rdmac;
-       u64 qpx_nrpsn;
-       u64 qpx_lapsn;
-       u64 qpx_lcr;
-/* 0x200*/
-       u64 qpx_rwc;
-       u64 qpx_rwva;
-       u64 qpx_rdsi;
-       u64 qpx_rdsbc;
-/* 0x220*/
-       u64 qpx_rqwsize;
-       u64 qpx_crmsn;
-       u64 qpx_rdd;
-       u64 qpx_larpsn;
-/* 0x240*/
-       u64 qpx_pd;
-       u64 qpx_scqn;
-       u64 qpx_rcqn;
-       u64 qpx_aeqn;
-/* 0x260*/
-       u64 qpx_aaelog;
-       u64 qpx_ram;
-       u64 qpx_rdmaqe0;
-       u64 qpx_rdmaqe1;
-/* 0x280*/
-       u64 qpx_rdmaqe2;
-       u64 qpx_rdmaqe3;
-       u64 qpx_nrpsnhwm;
-/* 0x298*/
-       u64 reserved[(0x400 - 0x298) / 8];
-/* 0x400 extended data */
-       u64 reserved_ext[(0x500 - 0x400) / 8];
-/* 0x500 */
-       u64 reserved2[(0x1000 - 0x500) / 8];
-/* 0x1000      */
-};
-
-#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
-
-#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
-
-/* MRMWPT Entry Memory Map */
-struct hipz_mrmwmm {
-       /* 0x00 */
-       u64 mrx_hcr;
-
-       u64 mrx_c;
-       u64 mrx_herr;
-       u64 mrx_aer;
-       /* 0x20 */
-       u64 mrx_pp;
-       u64 reserved1;
-       u64 reserved2;
-       u64 reserved3;
-       /* 0x40 */
-       u64 reserved4[(0x200 - 0x40) / 8];
-       /* 0x200 */
-       u64 mrx_ctl[64];
-
-};
-
-#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
-
-struct hipz_qpedmm {
-       /* 0x00 */
-       u64 reserved0[(0x400) / 8];
-       /* 0x400 */
-       u64 qpedx_phh;
-       u64 qpedx_ppsgp;
-       /* 0x410 */
-       u64 qpedx_ppsgu;
-       u64 qpedx_ppdgp;
-       /* 0x420 */
-       u64 qpedx_ppdgu;
-       u64 qpedx_aph;
-       /* 0x430 */
-       u64 qpedx_apsgp;
-       u64 qpedx_apsgu;
-       /* 0x440 */
-       u64 qpedx_apdgp;
-       u64 qpedx_apdgu;
-       /* 0x450 */
-       u64 qpedx_apav;
-       u64 qpedx_apsav;
-       /* 0x460  */
-       u64 qpedx_hcr;
-       u64 reserved1[4];
-       /* 0x488 */
-       u64 qpedx_rrl0;
-       /* 0x490 */
-       u64 qpedx_rrrkey0;
-       u64 qpedx_rrva0;
-       /* 0x4a0 */
-       u64 reserved2;
-       u64 qpedx_rrl1;
-       /* 0x4b0 */
-       u64 qpedx_rrrkey1;
-       u64 qpedx_rrva1;
-       /* 0x4c0 */
-       u64 reserved3;
-       u64 qpedx_rrl2;
-       /* 0x4d0 */
-       u64 qpedx_rrrkey2;
-       u64 qpedx_rrva2;
-       /* 0x4e0 */
-       u64 reserved4;
-       u64 qpedx_rrl3;
-       /* 0x4f0 */
-       u64 qpedx_rrrkey3;
-       u64 qpedx_rrva3;
-};
-
-#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
-
-/* CQ Table Entry Memory Map */
-struct hipz_cqtemm {
-       u64 cqx_hcr;
-       u64 cqx_c;
-       u64 cqx_herr;
-       u64 cqx_aer;
-/* 0x20  */
-       u64 cqx_ptp;
-       u64 cqx_tp;
-       u64 cqx_fec;
-       u64 cqx_feca;
-/* 0x40  */
-       u64 cqx_ep;
-       u64 cqx_eq;
-/* 0x50  */
-       u64 reserved1;
-       u64 cqx_n0;
-/* 0x60  */
-       u64 cqx_n1;
-       u64 reserved2[(0x1000 - 0x60) / 8];
-/* 0x1000 */
-};
-
-#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
-#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
-#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-
-#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
-
-/* EQ Table Entry Memory Map */
-struct hipz_eqtemm {
-       u64 eqx_hcr;
-       u64 eqx_c;
-
-       u64 eqx_herr;
-       u64 eqx_aer;
-/* 0x20 */
-       u64 eqx_ptp;
-       u64 eqx_tp;
-       u64 eqx_ssba;
-       u64 eqx_psba;
-
-/* 0x40 */
-       u64 eqx_cec;
-       u64 eqx_meql;
-       u64 eqx_xisbi;
-       u64 eqx_xisc;
-/* 0x60 */
-       u64 eqx_it;
-
-};
-
-#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
-
-/* access control defines for MR/MW */
-#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
-#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
-#define HIPZ_ACCESSCTRL_R_READ   0x00200000
-#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
-#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
-
-/* query hca response block */
-struct hipz_query_hca {
-       u32 cur_reliable_dg;
-       u32 cur_qp;
-       u32 cur_cq;
-       u32 cur_eq;
-       u32 cur_mr;
-       u32 cur_mw;
-       u32 cur_ee_context;
-       u32 cur_mcast_grp;
-       u32 cur_qp_attached_mcast_grp;
-       u32 reserved1;
-       u32 cur_ipv6_qp;
-       u32 cur_eth_qp;
-       u32 cur_hp_mr;
-       u32 reserved2[3];
-       u32 max_rd_domain;
-       u32 max_qp;
-       u32 max_cq;
-       u32 max_eq;
-       u32 max_mr;
-       u32 max_hp_mr;
-       u32 max_mw;
-       u32 max_mrwpte;
-       u32 max_special_mrwpte;
-       u32 max_rd_ee_context;
-       u32 max_mcast_grp;
-       u32 max_total_mcast_qp_attach;
-       u32 max_mcast_qp_attach;
-       u32 max_raw_ipv6_qp;
-       u32 max_raw_ethy_qp;
-       u32 internal_clock_frequency;
-       u32 max_pd;
-       u32 max_ah;
-       u32 max_cqe;
-       u32 max_wqes_wq;
-       u32 max_partitions;
-       u32 max_rr_ee_context;
-       u32 max_rr_qp;
-       u32 max_rr_hca;
-       u32 max_act_wqs_ee_context;
-       u32 max_act_wqs_qp;
-       u32 max_sge;
-       u32 max_sge_rd;
-       u32 memory_page_size_supported;
-       u64 max_mr_size;
-       u32 local_ca_ack_delay;
-       u32 num_ports;
-       u32 vendor_id;
-       u32 vendor_part_id;
-       u32 hw_ver;
-       u64 node_guid;
-       u64 hca_cap_indicators;
-       u32 data_counter_register_size;
-       u32 max_shared_rq;
-       u32 max_isns_eq;
-       u32 max_neq;
-} __attribute__ ((packed));
-
-#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
-#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
-#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
-#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
-#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
-#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
-#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
-#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
-#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
-#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
-#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
-#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
-#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
-#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
-#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
-#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
-#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
-#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
-
-/* query port response block */
-struct hipz_query_port {
-       u32 state;
-       u32 bad_pkey_cntr;
-       u32 lmc;
-       u32 lid;
-       u32 subnet_timeout;
-       u32 qkey_viol_cntr;
-       u32 sm_sl;
-       u32 sm_lid;
-       u32 capability_mask;
-       u32 init_type_reply;
-       u32 pkey_tbl_len;
-       u32 gid_tbl_len;
-       u64 gid_prefix;
-       u32 port_nr;
-       u16 pkey_entries[16];
-       u8  reserved1[32];
-       u32 trent_size;
-       u32 trbuf_size;
-       u64 max_msg_sz;
-       u32 max_mtu;
-       u32 vl_cap;
-       u32 phys_pstate;
-       u32 phys_state;
-       u32 phys_speed;
-       u32 phys_width;
-       u8  reserved2[1884];
-       u64 guid_entries[255];
-} __attribute__ ((packed));
-
-#endif
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.c b/drivers/staging/rdma/ehca/ipz_pt_fn.c
deleted file mode 100644 (file)
index 7ffc748..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ipz_pt_fn.h"
-#include "ehca_classes.h"
-
-#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
-
-struct kmem_cache *small_qp_cache;
-
-void *ipz_qpageit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->pagesize;
-       if (queue->current_q_offset > queue->queue_length) {
-               queue->current_q_offset -= queue->pagesize;
-               ret = NULL;
-       }
-       if (((u64)ret) % queue->pagesize) {
-               ehca_gen_err("ERROR!! not at PAGE-Boundary");
-               return NULL;
-       }
-       return ret;
-}
-
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u64 last_entry_in_q = queue->queue_length - queue->qe_size;
-
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset > last_entry_in_q) {
-               queue->current_q_offset = 0;
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
-{
-       int i;
-       for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
-               u64 page = __pa(queue->queue_pages[i]);
-               if (addr >= page && addr < page + queue->pagesize) {
-                       *q_offset = addr - page + i * queue->pagesize;
-                       return 0;
-               }
-       }
-       return -EINVAL;
-}
-
-#if PAGE_SHIFT < EHCA_PAGESHIFT
-#error Kernel pages must be at least as large than eHCA pages (4K) !
-#endif
-
-/*
- * allocate pages for queue:
- * outer loop allocates whole kernel pages (page aligned) and
- * inner loop divides a kernel page into smaller hca queue pages
- */
-static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
-{
-       int k, f = 0;
-       u8 *kpage;
-
-       while (f < nr_of_pages) {
-               kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
-               if (!kpage)
-                       goto out;
-
-               for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
-                       queue->queue_pages[f] = (struct ipz_page *)kpage;
-                       kpage += EHCA_PAGESIZE;
-                       f++;
-               }
-       }
-       return 1;
-
-out:
-       for (f = 0; f < nr_of_pages && queue->queue_pages[f];
-            f += PAGES_PER_KPAGE)
-               free_page((unsigned long)(queue->queue_pages)[f]);
-       return 0;
-}
-
-static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page;
-       unsigned long bit;
-
-       mutex_lock(&pd->lock);
-
-       if (!list_empty(&pd->free[order]))
-               page = list_entry(pd->free[order].next,
-                                 struct ipz_small_queue_page, list);
-       else {
-               page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
-               if (!page)
-                       goto out;
-
-               page->page = get_zeroed_page(GFP_KERNEL);
-               if (!page->page) {
-                       kmem_cache_free(small_qp_cache, page);
-                       goto out;
-               }
-
-               list_add(&page->list, &pd->free[order]);
-       }
-
-       bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
-       __set_bit(bit, page->bitmap);
-       page->fill++;
-
-       if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
-               list_move(&page->list, &pd->full[order]);
-
-       mutex_unlock(&pd->lock);
-
-       queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
-       queue->small_page = page;
-       queue->offset = bit << (order + 9);
-       return 1;
-
-out:
-       ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
-       mutex_unlock(&pd->lock);
-       return 0;
-}
-
-static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page = queue->small_page;
-       unsigned long bit;
-       int free_page = 0;
-
-       bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
-               >> (order + 9);
-
-       mutex_lock(&pd->lock);
-
-       __clear_bit(bit, page->bitmap);
-       page->fill--;
-
-       if (page->fill == 0) {
-               list_del(&page->list);
-               free_page = 1;
-       }
-
-       if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
-               /* the page was full until we freed the chunk */
-               list_move_tail(&page->list, &pd->free[order]);
-
-       mutex_unlock(&pd->lock);
-
-       if (free_page) {
-               free_page(page->page);
-               kmem_cache_free(small_qp_cache, page);
-       }
-}
-
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small)
-{
-       if (pagesize > PAGE_SIZE) {
-               ehca_gen_err("FATAL ERROR: pagesize=%x "
-                            "is greater than kernel page size", pagesize);
-               return 0;
-       }
-
-       /* init queue fields */
-       queue->queue_length = nr_of_pages * pagesize;
-       queue->pagesize = pagesize;
-       queue->qe_size = qe_size;
-       queue->act_nr_of_sg = nr_of_sg;
-       queue->current_q_offset = 0;
-       queue->toggle_state = 1;
-       queue->small_page = NULL;
-
-       /* allocate queue page pointers */
-       queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
-                                    GFP_KERNEL | __GFP_NOWARN);
-       if (!queue->queue_pages) {
-               queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
-               if (!queue->queue_pages) {
-                       ehca_gen_err("Couldn't allocate queue page list");
-                       return 0;
-               }
-       }
-
-       /* allocate actual queue pages */
-       if (is_small) {
-               if (!alloc_small_queue_page(queue, pd))
-                       goto ipz_queue_ctor_exit0;
-       } else
-               if (!alloc_queue_pages(queue, nr_of_pages))
-                       goto ipz_queue_ctor_exit0;
-
-       return 1;
-
-ipz_queue_ctor_exit0:
-       ehca_gen_err("Couldn't alloc pages queue=%p "
-                "nr_of_pages=%x",  queue, nr_of_pages);
-       kvfree(queue->queue_pages);
-
-       return 0;
-}
-
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
-{
-       int i, nr_pages;
-
-       if (!queue || !queue->queue_pages) {
-               ehca_gen_dbg("queue or queue_pages is NULL");
-               return 0;
-       }
-
-       if (queue->small_page)
-               free_small_queue_page(queue, pd);
-       else {
-               nr_pages = queue->queue_length / queue->pagesize;
-               for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
-                       free_page((unsigned long)queue->queue_pages[i]);
-       }
-
-       kvfree(queue->queue_pages);
-
-       return 1;
-}
-
-int ehca_init_small_qp_cache(void)
-{
-       small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
-                                          sizeof(struct ipz_small_queue_page),
-                                          0, SLAB_HWCACHE_ALIGN, NULL);
-       if (!small_qp_cache)
-               return -ENOMEM;
-
-       return 0;
-}
-
-void ehca_cleanup_small_qp_cache(void)
-{
-       kmem_cache_destroy(small_qp_cache);
-}
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.h b/drivers/staging/rdma/ehca/ipz_pt_fn.h
deleted file mode 100644 (file)
index a801274..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __IPZ_PT_FN_H__
-#define __IPZ_PT_FN_H__
-
-#define EHCA_PAGESHIFT   12
-#define EHCA_PAGESIZE   4096UL
-#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
-#define EHCA_PT_ENTRIES 512UL
-
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-
-struct ehca_pd;
-struct ipz_small_queue_page;
-
-extern struct kmem_cache *small_qp_cache;
-
-/* struct generic ehca page */
-struct ipz_page {
-       u8 entries[EHCA_PAGESIZE];
-};
-
-#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
-
-struct ipz_small_queue_page {
-       unsigned long page;
-       unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
-       int fill;
-       void *mapped_addr;
-       u32 mmap_count;
-       struct list_head list;
-};
-
-/* struct generic queue in linux kernel virtual memory (kv) */
-struct ipz_queue {
-       u64 current_q_offset;   /* current queue entry */
-
-       struct ipz_page **queue_pages;  /* array of pages belonging to queue */
-       u32 qe_size;            /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length;       /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;       /* toggle flag - per page */
-       u32 offset; /* save offset within page for small_qp */
-       struct ipz_small_queue_page *small_page;
-};
-
-/*
- * return current Queue Entry for a certain q_offset
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
-{
-       struct ipz_page *current_page;
-       if (q_offset >= queue->queue_length)
-               return NULL;
-       current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
-       return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
-}
-
-/*
- * return current Queue Entry
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_get(struct ipz_queue *queue)
-{
-       return ipz_qeit_calc(queue, queue->current_q_offset);
-}
-
-/*
- * return current Queue Page , increment Queue Page iterator from
- * page to page in struct ipz_queue, last increment will return 0! and
- * NOT wrap
- * returns address (kv) of Queue Page
- * warning don't use in parallel with ipz_QE_get_inc()
- */
-void *ipz_qpageit_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset >= queue->queue_length) {
-               queue->current_q_offset = 0;
-               /* toggle the valid flag */
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-/*
- * return a bool indicating whether current Queue Entry is valid
- */
-static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
-{
-       struct ehca_cqe *cqe = ipz_qeit_get(queue);
-       return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
-}
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
-{
-       return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
-}
-
-/*
- * returns and resets Queue Entry iterator
- * returns address (kv) of first Queue Entry
- */
-static inline void *ipz_qeit_reset(struct ipz_queue *queue)
-{
-       queue->current_q_offset = 0;
-       return ipz_qeit_get(queue);
-}
-
-/*
- * return the q_offset corresponding to an absolute address
- */
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
-
-/*
- * return the next queue offset. don't modify the queue.
- */
-static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
-{
-       offset += queue->qe_size;
-       if (offset >= queue->queue_length) offset = 0;
-       return offset;
-}
-
-/* struct generic page table */
-struct ipz_pt {
-       u64 entries[EHCA_PT_ENTRIES];
-};
-
-/* struct page table for a queue, only to be used in pf */
-struct ipz_qpt {
-       /* queue page tables (kv), use u64 because we know the element length */
-       u64 *qpts;
-       u32 n_qpts;
-       u32 n_ptes;       /*  number of page table entries */
-       u64 *current_pte_addr;
-};
-
-/*
- * constructor for a ipz_queue_t, placement new for ipz_queue_t,
- * new for all dependent datastructors
- * all QP Tables are the same
- * flow:
- *    allocate+pin queue
- * see ipz_qpt_ctor()
- * returns true if ok, false if out of memory
- */
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small);
-
-/*
- * destructor for a ipz_queue_t
- *  -# free queue
- *  see ipz_queue_ctor()
- *  returns true if ok, false if queue was NULL-ptr of free failed
- */
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
-
-/*
- * constructor for a ipz_qpt_t,
- * placement new for struct ipz_queue, new for all dependent datastructors
- * all QP Tables are the same,
- * flow:
- * -# allocate+pin queue
- * -# initialise ptcb
- * -# allocate+pin PTs
- * -# link PTs to a ring, according to HCA Arch, set bit62 id needed
- * -# the ring must have room for exactly nr_of_PTEs
- * see ipz_qpt_ctor()
- */
-void ipz_qpt_ctor(struct ipz_qpt *qpt,
-                 const u32 nr_of_qes,
-                 const u32 pagesize,
-                 const u32 qe_size,
-                 const u8 lowbyte, const u8 toggle,
-                 u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- * fix EQ page problems
- */
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Event Queue Entry, increment Queue Entry iterator
- * by one step in struct ipz_queue if valid, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_queue_QPageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- */
-static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       ipz_qeit_eq_get_inc(queue); /* this is a good one */
-       return ret;
-}
-
-static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       return ret;
-}
-
-/* returns address (GX) of first queue entry */
-static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
-{
-       return be64_to_cpu(qpt->qpts[0]);
-}
-
-/* returns address (kv) of first page of queue page table */
-static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
-{
-       return qpt->qpts;
-}
-
-#endif                         /* __IPZ_PT_FN_H__ */
diff --git a/drivers/staging/rdma/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig
deleted file mode 100644 (file)
index 041ce06..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-config INFINIBAND_IPATH
-       tristate "QLogic HTX HCA support"
-       depends on 64BIT && NET && HT_IRQ
-       ---help---
-       This is a driver for the deprecated QLogic Hyper-Transport
-       IB host channel adapter (model QHT7140),
-       including InfiniBand verbs support.  This driver allows these
-       devices to be used with both kernel upper level protocols such
-       as IP-over-InfiniBand as well as with userspace applications
-       (in conjunction with InfiniBand userspace access).
-       For QLogic PCIe QLE based cards, use the QIB driver instead.
-
-       If you have this hardware you will need to boot with PAT disabled
-       on your x86-64 systems, use the nopat kernel parameter.
-
-       Note that this driver will soon be removed entirely from the kernel.
diff --git a/drivers/staging/rdma/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile
deleted file mode 100644 (file)
index 4496f28..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
-       -DIPATH_KERN_TYPE=0
-
-obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
-
-ib_ipath-y := \
-       ipath_cq.o \
-       ipath_diag.o \
-       ipath_dma.o \
-       ipath_driver.o \
-       ipath_eeprom.o \
-       ipath_file_ops.o \
-       ipath_fs.o \
-       ipath_init_chip.o \
-       ipath_intr.o \
-       ipath_keys.o \
-       ipath_mad.o \
-       ipath_mmap.o \
-       ipath_mr.o \
-       ipath_qp.o \
-       ipath_rc.o \
-       ipath_ruc.o \
-       ipath_sdma.o \
-       ipath_srq.o \
-       ipath_stats.o \
-       ipath_sysfs.o \
-       ipath_uc.o \
-       ipath_ud.o \
-       ipath_user_pages.o \
-       ipath_user_sdma.o \
-       ipath_verbs_mcast.o \
-       ipath_verbs.o
-
-ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
-
-ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
-ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO
deleted file mode 100644 (file)
index cb00158..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-The ipath driver has been moved to staging in preparation for its removal in a
-few releases. The driver will be deleted during the 4.6 merge window.
-
-Contact Dennis Dalessandro <dennis.dalessandro@intel.com> and
-Cc: linux-rdma@vger.kernel.org
diff --git a/drivers/staging/rdma/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h
deleted file mode 100644 (file)
index 28cfe97..0000000
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_COMMON_H
-#define _IPATH_COMMON_H
-
-/*
- * This file contains defines, structures, etc. that are used
- * to communicate between kernel and user code.
- */
-
-
-/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
-#define IPATH_SRC_OUI_1 0x00
-#define IPATH_SRC_OUI_2 0x11
-#define IPATH_SRC_OUI_3 0x75
-
-/* version of protocol header (known to chip also). In the long run,
- * we should be able to generate and accept a range of version numbers;
- * for now we only accept one, and it's compiled in.
- */
-#define IPS_PROTO_VERSION 2
-
-/*
- * These are compile time constants that you may want to enable or disable
- * if you are trying to debug problems with code or performance.
- * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
- * fastpath code
- * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in faspath code
- * _IPATH_TRACING define as 0 if you want to remove all tracing in a
- * compilation unit
- * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
- */
-
-/*
- * The value in the BTH QP field that InfiniPath uses to differentiate
- * an infinipath protocol IB packet vs standard IB transport
- */
-#define IPATH_KD_QP 0x656b79
-
-/*
- * valid states passed to ipath_set_linkstate() user call
- */
-#define IPATH_IB_LINKDOWN              0
-#define IPATH_IB_LINKARM               1
-#define IPATH_IB_LINKACTIVE            2
-#define IPATH_IB_LINKDOWN_ONLY         3
-#define IPATH_IB_LINKDOWN_SLEEP                4
-#define IPATH_IB_LINKDOWN_DISABLE      5
-#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */
-#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */
-#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */
-#define IPATH_IB_LINK_HRTBT    9 /* enable heartbeat, normal, non-loopback */
-
-/*
- * These 3 values (SDR and DDR may be ORed for auto-speed
- * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
- * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
- * are also the the possible values for ipath_link_speed_enabled and active
- * The values were chosen to match values used within the IB spec.
- */
-#define IPATH_IB_SDR 1
-#define IPATH_IB_DDR 2
-
-/*
- * stats maintained by the driver.  For now, at least, this is global
- * to all minor devices.
- */
-struct infinipath_stats {
-       /* number of interrupts taken */
-       __u64 sps_ints;
-       /* number of interrupts for errors */
-       __u64 sps_errints;
-       /* number of errors from chip (not incl. packet errors or CRC) */
-       __u64 sps_errs;
-       /* number of packet errors from chip other than CRC */
-       __u64 sps_pkterrs;
-       /* number of packets with CRC errors (ICRC and VCRC) */
-       __u64 sps_crcerrs;
-       /* number of hardware errors reported (parity, etc.) */
-       __u64 sps_hwerrs;
-       /* number of times IB link changed state unexpectedly */
-       __u64 sps_iblink;
-       __u64 sps_unused; /* was fastrcvint, no longer implemented */
-       /* number of kernel (port0) packets received */
-       __u64 sps_port0pkts;
-       /* number of "ethernet" packets sent by driver */
-       __u64 sps_ether_spkts;
-       /* number of "ethernet" packets received by driver */
-       __u64 sps_ether_rpkts;
-       /* number of SMA packets sent by driver. Obsolete. */
-       __u64 sps_sma_spkts;
-       /* number of SMA packets received by driver. Obsolete. */
-       __u64 sps_sma_rpkts;
-       /* number of times all ports rcvhdrq was full and packet dropped */
-       __u64 sps_hdrqfull;
-       /* number of times all ports egrtid was full and packet dropped */
-       __u64 sps_etidfull;
-       /*
-        * number of times we tried to send from driver, but no pio buffers
-        * avail
-        */
-       __u64 sps_nopiobufs;
-       /* number of ports currently open */
-       __u64 sps_ports;
-       /* list of pkeys (other than default) accepted (0 means not set) */
-       __u16 sps_pkeys[4];
-       __u16 sps_unused16[4]; /* available; maintaining compatible layout */
-       /* number of user ports per chip (not IB ports) */
-       __u32 sps_nports;
-       /* not our interrupt, or already handled */
-       __u32 sps_nullintr;
-       /* max number of packets handled per receive call */
-       __u32 sps_maxpkts_call;
-       /* avg number of packets handled per receive call */
-       __u32 sps_avgpkts_call;
-       /* total number of pages locked */
-       __u64 sps_pagelocks;
-       /* total number of pages unlocked */
-       __u64 sps_pageunlocks;
-       /*
-        * Number of packets dropped in kernel other than errors (ether
-        * packets if ipath not configured, etc.)
-        */
-       __u64 sps_krdrops;
-       __u64 sps_txeparity; /* PIO buffer parity error, recovered */
-       /* pad for future growth */
-       __u64 __sps_pad[45];
-};
-
-/*
- * These are the status bits readable (in ascii form, 64bit value)
- * from the "status" sysfs file.
- */
-#define IPATH_STATUS_INITTED       0x1 /* basic initialization done */
-#define IPATH_STATUS_DISABLED      0x2 /* hardware disabled */
-/* Device has been disabled via admin request */
-#define IPATH_STATUS_ADMIN_DISABLED    0x4
-/* Chip has been found and initted */
-#define IPATH_STATUS_CHIP_PRESENT 0x20
-/* IB link is at ACTIVE, usable for data traffic */
-#define IPATH_STATUS_IB_READY     0x40
-/* link is configured, LID, MTU, etc. have been set */
-#define IPATH_STATUS_IB_CONF      0x80
-/* no link established, probably no cable */
-#define IPATH_STATUS_IB_NOCABLE  0x100
-/* A Fatal hardware error has occurred. */
-#define IPATH_STATUS_HWERROR     0x200
-
-/*
- * The list of usermode accessible registers.  Also see Reg_* later in file.
- */
-typedef enum _ipath_ureg {
-       /* (RO)  DMA RcvHdr to be used next. */
-       ur_rcvhdrtail = 0,
-       /* (RW)  RcvHdr entry to be processed next by host. */
-       ur_rcvhdrhead = 1,
-       /* (RO)  Index of next Eager index to use. */
-       ur_rcvegrindextail = 2,
-       /* (RW)  Eager TID to be processed next */
-       ur_rcvegrindexhead = 3,
-       /* For internal use only; max register number. */
-       _IPATH_UregMax
-} ipath_ureg;
-
-/* bit values for spi_runtime_flags */
-#define IPATH_RUNTIME_HT       0x1
-#define IPATH_RUNTIME_PCIE     0x2
-#define IPATH_RUNTIME_FORCE_WC_ORDER   0x4
-#define IPATH_RUNTIME_RCVHDR_COPY      0x8
-#define IPATH_RUNTIME_MASTER   0x10
-#define IPATH_RUNTIME_NODMA_RTAIL 0x80
-#define IPATH_RUNTIME_SDMA           0x200
-#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
-#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
-
-/*
- * This structure is returned by ipath_userinit() immediately after
- * open to get implementation-specific info, and info specific to this
- * instance.
- *
- * This struct must have explict pad fields where type sizes
- * may result in different alignments between 32 and 64 bit
- * programs, since the 64 bit * bit kernel requires the user code
- * to have matching offsets
- */
-struct ipath_base_info {
-       /* version of hardware, for feature checking. */
-       __u32 spi_hw_version;
-       /* version of software, for feature checking. */
-       __u32 spi_sw_version;
-       /* InfiniPath port assigned, goes into sent packets */
-       __u16 spi_port;
-       __u16 spi_subport;
-       /*
-        * IB MTU, packets IB data must be less than this.
-        * The MTU is in bytes, and will be a multiple of 4 bytes.
-        */
-       __u32 spi_mtu;
-       /*
-        * Size of a PIO buffer.  Any given packet's total size must be less
-        * than this (in words).  Included is the starting control word, so
-        * if 513 is returned, then total pkt size is 512 words or less.
-        */
-       __u32 spi_piosize;
-       /* size of the TID cache in infinipath, in entries */
-       __u32 spi_tidcnt;
-       /* size of the TID Eager list in infinipath, in entries */
-       __u32 spi_tidegrcnt;
-       /* size of a single receive header queue entry in words. */
-       __u32 spi_rcvhdrent_size;
-       /*
-        * Count of receive header queue entries allocated.
-        * This may be less than the spu_rcvhdrcnt passed in!.
-        */
-       __u32 spi_rcvhdr_cnt;
-
-       /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
-       __u32 spi_runtime_flags;
-
-       /* address where receive buffer queue is mapped into */
-       __u64 spi_rcvhdr_base;
-
-       /* user program. */
-
-       /* base address of eager TID receive buffers. */
-       __u64 spi_rcv_egrbufs;
-
-       /* Allocated by initialization code, not by protocol. */
-
-       /*
-        * Size of each TID buffer in host memory, starting at
-        * spi_rcv_egrbufs.  The buffers are virtually contiguous.
-        */
-       __u32 spi_rcv_egrbufsize;
-       /*
-        * The special QP (queue pair) value that identifies an infinipath
-        * protocol packet from standard IB packets.  More, probably much
-        * more, to be added.
-        */
-       __u32 spi_qpair;
-
-       /*
-        * User register base for init code, not to be used directly by
-        * protocol or applications.
-        */
-       __u64 __spi_uregbase;
-       /*
-        * Maximum buffer size in bytes that can be used in a single TID
-        * entry (assuming the buffer is aligned to this boundary).  This is
-        * the minimum of what the hardware and software support Guaranteed
-        * to be a power of 2.
-        */
-       __u32 spi_tid_maxsize;
-       /*
-        * alignment of each pio send buffer (byte count
-        * to add to spi_piobufbase to get to second buffer)
-        */
-       __u32 spi_pioalign;
-       /*
-        * The index of the first pio buffer available to this process;
-        * needed to do lookup in spi_pioavailaddr; not added to
-        * spi_piobufbase.
-        */
-       __u32 spi_pioindex;
-        /* number of buffers mapped for this process */
-       __u32 spi_piocnt;
-
-       /*
-        * Base address of writeonly pio buffers for this process.
-        * Each buffer has spi_piosize words, and is aligned on spi_pioalign
-        * boundaries.  spi_piocnt buffers are mapped from this address
-        */
-       __u64 spi_piobufbase;
-
-       /*
-        * Base address of readonly memory copy of the pioavail registers.
-        * There are 2 bits for each buffer.
-        */
-       __u64 spi_pioavailaddr;
-
-       /*
-        * Address where driver updates a copy of the interface and driver
-        * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
-        * string indicating hardware error, if there was one.
-        */
-       __u64 spi_status;
-
-       /* number of chip ports available to user processes */
-       __u32 spi_nports;
-       /* unit number of chip we are using */
-       __u32 spi_unit;
-       /* num bufs in each contiguous set */
-       __u32 spi_rcv_egrperchunk;
-       /* size in bytes of each contiguous set */
-       __u32 spi_rcv_egrchunksize;
-       /* total size of mmap to cover full rcvegrbuffers */
-       __u32 spi_rcv_egrbuftotlen;
-       __u32 spi_filler_for_align;
-       /* address of readonly memory copy of the rcvhdrq tail register. */
-       __u64 spi_rcvhdr_tailaddr;
-
-       /* shared memory pages for subports if port is shared */
-       __u64 spi_subport_uregbase;
-       __u64 spi_subport_rcvegrbuf;
-       __u64 spi_subport_rcvhdr_base;
-
-       /* shared memory page for hardware port if it is shared */
-       __u64 spi_port_uregbase;
-       __u64 spi_port_rcvegrbuf;
-       __u64 spi_port_rcvhdr_base;
-       __u64 spi_port_rcvhdr_tailaddr;
-
-} __attribute__ ((aligned(8)));
-
-
-/*
- * This version number is given to the driver by the user code during
- * initialization in the spu_userversion field of ipath_user_info, so
- * the driver can check for compatibility with user code.
- *
- * The major version changes when data structures
- * change in an incompatible way.  The driver must be the same or higher
- * for initialization to succeed.  In some cases, a higher version
- * driver will not interoperate with older software, and initialization
- * will return an error.
- */
-#define IPATH_USER_SWMAJOR 1
-
-/*
- * Minor version differences are always compatible
- * a within a major version, however if user software is larger
- * than driver software, some new features and/or structure fields
- * may not be implemented; the user code must deal with this if it
- * cares, or it must abort after initialization reports the difference.
- */
-#define IPATH_USER_SWMINOR 6
-
-#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
-
-#define IPATH_KERN_TYPE 0
-
-/*
- * Similarly, this is the kernel version going back to the user.  It's
- * slightly different, in that we want to tell if the driver was built as
- * part of a QLogic release, or from the driver from openfabrics.org,
- * kernel.org, or a standard distribution, for support reasons.
- * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
- *
- * It's returned by the driver to the user code during initialization in the
- * spi_sw_version field of ipath_base_info, so the user code can in turn
- * check for compatibility with the kernel.
-*/
-#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
-
-/*
- * This structure is passed to ipath_userinit() to tell the driver where
- * user code buffers are, sizes, etc.   The offsets and sizes of the
- * fields must remain unchanged, for binary compatibility.  It can
- * be extended, if userversion is changed so user code can tell, if needed
- */
-struct ipath_user_info {
-       /*
-        * version of user software, to detect compatibility issues.
-        * Should be set to IPATH_USER_SWVERSION.
-        */
-       __u32 spu_userversion;
-
-       /* desired number of receive header queue entries */
-       __u32 spu_rcvhdrcnt;
-
-       /* size of struct base_info to write to */
-       __u32 spu_base_info_size;
-
-       /*
-        * number of words in KD protocol header
-        * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
-        * kernel uses a default.  Once set, attempts to set any other value
-        * are an error (EAGAIN) until driver is reloaded.
-        */
-       __u32 spu_rcvhdrsize;
-
-       /*
-        * If two or more processes wish to share a port, each process
-        * must set the spu_subport_cnt and spu_subport_id to the same
-        * values.  The only restriction on the spu_subport_id is that
-        * it be unique for a given node.
-        */
-       __u16 spu_subport_cnt;
-       __u16 spu_subport_id;
-
-       __u32 spu_unused; /* kept for compatible layout */
-
-       /*
-        * address of struct base_info to write to
-        */
-       __u64 spu_base_info;
-
-} __attribute__ ((aligned(8)));
-
-/* User commands. */
-
-#define IPATH_CMD_MIN          16
-
-#define __IPATH_CMD_USER_INIT  16      /* old set up userspace (for old user code) */
-#define IPATH_CMD_PORT_INFO    17      /* find out what resources we got */
-#define IPATH_CMD_RECV_CTRL    18      /* control receipt of packets */
-#define IPATH_CMD_TID_UPDATE   19      /* update expected TID entries */
-#define IPATH_CMD_TID_FREE     20      /* free expected TID entries */
-#define IPATH_CMD_SET_PART_KEY 21      /* add partition key */
-#define __IPATH_CMD_SLAVE_INFO 22      /* return info on slave processes (for old user code) */
-#define IPATH_CMD_ASSIGN_PORT  23      /* allocate HCA and port */
-#define IPATH_CMD_USER_INIT    24      /* set up userspace */
-#define IPATH_CMD_UNUSED_1     25
-#define IPATH_CMD_UNUSED_2     26
-#define IPATH_CMD_PIOAVAILUPD  27      /* force an update of PIOAvail reg */
-#define IPATH_CMD_POLL_TYPE    28      /* set the kind of polling we want */
-#define IPATH_CMD_ARMLAUNCH_CTRL       29 /* armlaunch detection control */
-/* 30 is unused */
-#define IPATH_CMD_SDMA_INFLIGHT 31     /* sdma inflight counter request */
-#define IPATH_CMD_SDMA_COMPLETE 32     /* sdma completion counter request */
-
-/*
- * Poll types
- */
-#define IPATH_POLL_TYPE_URGENT  0x01
-#define IPATH_POLL_TYPE_OVERFLOW 0x02
-
-struct ipath_port_info {
-       __u32 num_active;       /* number of active units */
-       __u32 unit;             /* unit (chip) assigned to caller */
-       __u16 port;             /* port on unit assigned to caller */
-       __u16 subport;          /* subport on unit assigned to caller */
-       __u16 num_ports;        /* number of ports available on unit */
-       __u16 num_subports;     /* number of subports opened on port */
-};
-
-struct ipath_tid_info {
-       __u32 tidcnt;
-       /* make structure same size in 32 and 64 bit */
-       __u32 tid__unused;
-       /* virtual address of first page in transfer */
-       __u64 tidvaddr;
-       /* pointer (same size 32/64 bit) to __u16 tid array */
-       __u64 tidlist;
-
-       /*
-        * pointer (same size 32/64 bit) to bitmap of TIDs used
-        * for this call; checked for being large enough at open
-        */
-       __u64 tidmap;
-};
-
-struct ipath_cmd {
-       __u32 type;                     /* command type */
-       union {
-               struct ipath_tid_info tid_info;
-               struct ipath_user_info user_info;
-
-               /*
-                * address in userspace where we should put the sdma
-                * inflight counter
-                */
-               __u64 sdma_inflight;
-               /*
-                * address in userspace where we should put the sdma
-                * completion counter
-                */
-               __u64 sdma_complete;
-               /* address in userspace of struct ipath_port_info to
-                  write result to */
-               __u64 port_info;
-               /* enable/disable receipt of packets */
-               __u32 recv_ctrl;
-               /* enable/disable armlaunch errors (non-zero to enable) */
-               __u32 armlaunch_ctrl;
-               /* partition key to set */
-               __u16 part_key;
-               /* user address of __u32 bitmask of active slaves */
-               __u64 slave_mask_addr;
-               /* type of polling we want */
-               __u16 poll_type;
-       } cmd;
-};
-
-struct ipath_iovec {
-       /* Pointer to data, but same size 32 and 64 bit */
-       __u64 iov_base;
-
-       /*
-        * Length of data; don't need 64 bits, but want
-        * ipath_sendpkt to remain same size as before 32 bit changes, so...
-        */
-       __u64 iov_len;
-};
-
-/*
- * Describes a single packet for send.  Each packet can have one or more
- * buffers, but the total length (exclusive of IB headers) must be less
- * than the MTU, and if using the PIO method, entire packet length,
- * including IB headers, must be less than the ipath_piosize value (words).
- * Use of this necessitates including sys/uio.h
- */
-struct __ipath_sendpkt {
-       __u32 sps_flags;        /* flags for packet (TBD) */
-       __u32 sps_cnt;          /* number of entries to use in sps_iov */
-       /* array of iov's describing packet. TEMPORARY */
-       struct ipath_iovec sps_iov[4];
-};
-
-/*
- * diagnostics can send a packet by "writing" one of the following
- * two structs to diag data special file
- * The first is the legacy version for backward compatibility
- */
-struct ipath_diag_pkt {
-       __u32 unit;
-       __u64 data;
-       __u32 len;
-};
-
-/* The second diag_pkt struct is the expanded version that allows
- * more control over the packet, specifically, by allowing a custom
- * pbc (+ static rate) qword, so that special modes and deliberate
- * changes to CRCs can be used. The elements were also re-ordered
- * for better alignment and to avoid padding issues.
- */
-struct ipath_diag_xpkt {
-       __u64 data;
-       __u64 pbc_wd;
-       __u32 unit;
-       __u32 len;
-};
-
-/*
- * Data layout in I2C flash (for GUID, etc.)
- * All fields are little-endian binary unless otherwise stated
- */
-#define IPATH_FLASH_VERSION 2
-struct ipath_flash {
-       /* flash layout version (IPATH_FLASH_VERSION) */
-       __u8 if_fversion;
-       /* checksum protecting if_length bytes */
-       __u8 if_csum;
-       /*
-        * valid length (in use, protected by if_csum), including
-        * if_fversion and if_csum themselves)
-        */
-       __u8 if_length;
-       /* the GUID, in network order */
-       __u8 if_guid[8];
-       /* number of GUIDs to use, starting from if_guid */
-       __u8 if_numguid;
-       /* the (last 10 characters of) board serial number, in ASCII */
-       char if_serial[12];
-       /* board mfg date (YYYYMMDD ASCII) */
-       char if_mfgdate[8];
-       /* last board rework/test date (YYYYMMDD ASCII) */
-       char if_testdate[8];
-       /* logging of error counts, TBD */
-       __u8 if_errcntp[4];
-       /* powered on hours, updated at driver unload */
-       __u8 if_powerhour[2];
-       /* ASCII free-form comment field */
-       char if_comment[32];
-       /* Backwards compatible prefix for longer QLogic Serial Numbers */
-       char if_sprefix[4];
-       /* 82 bytes used, min flash size is 128 bytes */
-       __u8 if_future[46];
-};
-
-/*
- * These are the counters implemented in the chip, and are listed in order.
- * The InterCaps naming is taken straight from the chip spec.
- */
-struct infinipath_counters {
-       __u64 LBIntCnt;
-       __u64 LBFlowStallCnt;
-       __u64 TxSDmaDescCnt;    /* was Reserved1 */
-       __u64 TxUnsupVLErrCnt;
-       __u64 TxDataPktCnt;
-       __u64 TxFlowPktCnt;
-       __u64 TxDwordCnt;
-       __u64 TxLenErrCnt;
-       __u64 TxMaxMinLenErrCnt;
-       __u64 TxUnderrunCnt;
-       __u64 TxFlowStallCnt;
-       __u64 TxDroppedPktCnt;
-       __u64 RxDroppedPktCnt;
-       __u64 RxDataPktCnt;
-       __u64 RxFlowPktCnt;
-       __u64 RxDwordCnt;
-       __u64 RxLenErrCnt;
-       __u64 RxMaxMinLenErrCnt;
-       __u64 RxICRCErrCnt;
-       __u64 RxVCRCErrCnt;
-       __u64 RxFlowCtrlErrCnt;
-       __u64 RxBadFormatCnt;
-       __u64 RxLinkProblemCnt;
-       __u64 RxEBPCnt;
-       __u64 RxLPCRCErrCnt;
-       __u64 RxBufOvflCnt;
-       __u64 RxTIDFullErrCnt;
-       __u64 RxTIDValidErrCnt;
-       __u64 RxPKeyMismatchCnt;
-       __u64 RxP0HdrEgrOvflCnt;
-       __u64 RxP1HdrEgrOvflCnt;
-       __u64 RxP2HdrEgrOvflCnt;
-       __u64 RxP3HdrEgrOvflCnt;
-       __u64 RxP4HdrEgrOvflCnt;
-       __u64 RxP5HdrEgrOvflCnt;
-       __u64 RxP6HdrEgrOvflCnt;
-       __u64 RxP7HdrEgrOvflCnt;
-       __u64 RxP8HdrEgrOvflCnt;
-       __u64 RxP9HdrEgrOvflCnt;        /* was Reserved6 */
-       __u64 RxP10HdrEgrOvflCnt;       /* was Reserved7 */
-       __u64 RxP11HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP12HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP13HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP14HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP15HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP16HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 IBStatusChangeCnt;
-       __u64 IBLinkErrRecoveryCnt;
-       __u64 IBLinkDownedCnt;
-       __u64 IBSymbolErrCnt;
-       /* The following are new for IBA7220 */
-       __u64 RxVL15DroppedPktCnt;
-       __u64 RxOtherLocalPhyErrCnt;
-       __u64 PcieRetryBufDiagQwordCnt;
-       __u64 ExcessBufferOvflCnt;
-       __u64 LocalLinkIntegrityErrCnt;
-       __u64 RxVlErrCnt;
-       __u64 RxDlidFltrCnt;
-};
-
-/*
- * The next set of defines are for packet headers, and chip register
- * and memory bits that are visible to and/or used by user-mode software
- * The other bits that are used only by the driver or diags are in
- * ipath_registers.h
- */
-
-/* RcvHdrFlags bits */
-#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
-#define INFINIPATH_RHF_LENGTH_SHIFT 0
-#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
-#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
-#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
-#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
-#define INFINIPATH_RHF_SEQ_MASK 0xF
-#define INFINIPATH_RHF_SEQ_SHIFT 0
-#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
-#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
-#define INFINIPATH_RHF_H_ICRCERR   0x80000000
-#define INFINIPATH_RHF_H_VCRCERR   0x40000000
-#define INFINIPATH_RHF_H_PARITYERR 0x20000000
-#define INFINIPATH_RHF_H_LENERR    0x10000000
-#define INFINIPATH_RHF_H_MTUERR    0x08000000
-#define INFINIPATH_RHF_H_IHDRERR   0x04000000
-#define INFINIPATH_RHF_H_TIDERR    0x02000000
-#define INFINIPATH_RHF_H_MKERR     0x01000000
-#define INFINIPATH_RHF_H_IBERR     0x00800000
-#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
-#define INFINIPATH_RHF_L_USE_EGR   0x80000000
-#define INFINIPATH_RHF_L_SWA       0x00008000
-#define INFINIPATH_RHF_L_SWB       0x00004000
-
-/* infinipath header fields */
-#define INFINIPATH_I_VERS_MASK 0xF
-#define INFINIPATH_I_VERS_SHIFT 28
-#define INFINIPATH_I_PORT_MASK 0xF
-#define INFINIPATH_I_PORT_SHIFT 24
-#define INFINIPATH_I_TID_MASK 0x7FF
-#define INFINIPATH_I_TID_SHIFT 13
-#define INFINIPATH_I_OFFSET_MASK 0x1FFF
-#define INFINIPATH_I_OFFSET_SHIFT 0
-
-/* K_PktFlags bits */
-#define INFINIPATH_KPF_INTR 0x1
-#define INFINIPATH_KPF_SUBPORT_MASK 0x3
-#define INFINIPATH_KPF_SUBPORT_SHIFT 1
-
-#define INFINIPATH_MAX_SUBPORT 4
-
-/* SendPIO per-buffer control */
-#define INFINIPATH_SP_TEST    0x40
-#define INFINIPATH_SP_TESTEBP 0x20
-#define INFINIPATH_SP_TRIGGER_SHIFT  15
-
-/* SendPIOAvail bits */
-#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
-#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
-
-/* infinipath header format */
-struct ipath_header {
-       /*
-        * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
-        * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
-        * Port 4, TID 11, offset 13.
-        */
-       __le32 ver_port_tid_offset;
-       __le16 chksum;
-       __le16 pkt_flags;
-};
-
-/* infinipath user message header format.
- * This structure contains the first 4 fields common to all protocols
- * that employ infinipath.
- */
-struct ipath_message_header {
-       __be16 lrh[4];
-       __be32 bth[3];
-       /* fields below this point are in host byte order */
-       struct ipath_header iph;
-       __u8 sub_opcode;
-};
-
-/* infinipath ethernet header format */
-struct ether_header {
-       __be16 lrh[4];
-       __be32 bth[3];
-       struct ipath_header iph;
-       __u8 sub_opcode;
-       __u8 cmd;
-       __be16 lid;
-       __u16 mac[3];
-       __u8 frag_num;
-       __u8 seq_num;
-       __le32 len;
-       /* MUST be of word size due to PIO write requirements */
-       __le32 csum;
-       __le16 csum_offset;
-       __le16 flags;
-       __u16 first_2_bytes;
-       __u8 unused[2];         /* currently unused */
-};
-
-
-/* IB - LRH header consts */
-#define IPATH_LRH_GRH 0x0003   /* 1. word of IB LRH - next header: GRH */
-#define IPATH_LRH_BTH 0x0002   /* 1. word of IB LRH - next header: BTH */
-
-/* misc. */
-#define SIZE_OF_CRC 1
-
-#define IPATH_DEFAULT_P_KEY 0xFFFF
-#define IPATH_PERMISSIVE_LID 0xFFFF
-#define IPATH_AETH_CREDIT_SHIFT 24
-#define IPATH_AETH_CREDIT_MASK 0x1F
-#define IPATH_AETH_CREDIT_INVAL 0x1F
-#define IPATH_PSN_MASK 0xFFFFFF
-#define IPATH_MSN_MASK 0xFFFFFF
-#define IPATH_QPN_MASK 0xFFFFFF
-#define IPATH_MULTICAST_LID_BASE 0xC000
-#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
-#define IPATH_MULTICAST_QPN 0xFFFFFF
-
-/* Receive Header Queue: receive type (from infinipath) */
-#define RCVHQ_RCV_TYPE_EXPECTED  0
-#define RCVHQ_RCV_TYPE_EAGER     1
-#define RCVHQ_RCV_TYPE_NON_KD    2
-#define RCVHQ_RCV_TYPE_ERROR     3
-
-
-/* sub OpCodes - ith4x  */
-#define IPATH_ITH4X_OPCODE_ENCAP 0x81
-#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
-
-#define IPATH_HEADER_QUEUE_WORDS 9
-
-/* functions for extracting fields from rcvhdrq entries for the driver.
- */
-static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
-{
-       return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
-}
-
-static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
-{
-       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
-           & INFINIPATH_RHF_RCVTYPE_MASK;
-}
-
-static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
-{
-       return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
-               & INFINIPATH_RHF_LENGTH_MASK) << 2;
-}
-
-static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
-{
-       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
-           & INFINIPATH_RHF_EGRINDEX_MASK;
-}
-
-static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
-{
-       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
-               & INFINIPATH_RHF_SEQ_MASK;
-}
-
-static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
-{
-       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
-               & INFINIPATH_RHF_HDRQ_OFFSET_MASK;
-}
-
-static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
-{
-       return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
-}
-
-static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
-{
-       return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
-           & INFINIPATH_I_VERS_MASK;
-}
-
-#endif                         /* _IPATH_COMMON_H */
diff --git a/drivers/staging/rdma/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c
deleted file mode 100644 (file)
index e9dd911..0000000
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_cq_enter - add a new entry to the completion queue
- * @cq: completion queue
- * @entry: work completion entry to add
- * @sig: true if @entry is a solicitated entry
- *
- * This may be called with qp->s_lock held.
- */
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
-{
-       struct ipath_cq_wc *wc;
-       unsigned long flags;
-       u32 head;
-       u32 next;
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       /*
-        * Note that the head pointer might be writable by user processes.
-        * Take care to verify it is a sane value.
-        */
-       wc = cq->queue;
-       head = wc->head;
-       if (head >= (unsigned) cq->ibcq.cqe) {
-               head = cq->ibcq.cqe;
-               next = 0;
-       } else
-               next = head + 1;
-       if (unlikely(next == wc->tail)) {
-               spin_unlock_irqrestore(&cq->lock, flags);
-               if (cq->ibcq.event_handler) {
-                       struct ib_event ev;
-
-                       ev.device = cq->ibcq.device;
-                       ev.element.cq = &cq->ibcq;
-                       ev.event = IB_EVENT_CQ_ERR;
-                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
-               }
-               return;
-       }
-       if (cq->ip) {
-               wc->uqueue[head].wr_id = entry->wr_id;
-               wc->uqueue[head].status = entry->status;
-               wc->uqueue[head].opcode = entry->opcode;
-               wc->uqueue[head].vendor_err = entry->vendor_err;
-               wc->uqueue[head].byte_len = entry->byte_len;
-               wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
-               wc->uqueue[head].qp_num = entry->qp->qp_num;
-               wc->uqueue[head].src_qp = entry->src_qp;
-               wc->uqueue[head].wc_flags = entry->wc_flags;
-               wc->uqueue[head].pkey_index = entry->pkey_index;
-               wc->uqueue[head].slid = entry->slid;
-               wc->uqueue[head].sl = entry->sl;
-               wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-               wc->uqueue[head].port_num = entry->port_num;
-               /* Make sure entry is written before the head index. */
-               smp_wmb();
-       } else
-               wc->kqueue[head] = *entry;
-       wc->head = next;
-
-       if (cq->notify == IB_CQ_NEXT_COMP ||
-           (cq->notify == IB_CQ_SOLICITED && solicited)) {
-               cq->notify = IB_CQ_NONE;
-               cq->triggered++;
-               /*
-                * This will cause send_complete() to be called in
-                * another thread.
-                */
-               tasklet_hi_schedule(&cq->comptask);
-       }
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       if (entry->status != IB_WC_SUCCESS)
-               to_idev(cq->ibcq.device)->n_wqe_errs++;
-}
-
-/**
- * ipath_poll_cq - poll for work completion entries
- * @ibcq: the completion queue to poll
- * @num_entries: the maximum number of entries to return
- * @entry: pointer to array where work completions are placed
- *
- * Returns the number of completion entries polled.
- *
- * This may be called from interrupt context.  Also called by ib_poll_cq()
- * in the generic verbs code.
- */
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *wc;
-       unsigned long flags;
-       int npolled;
-       u32 tail;
-
-       /* The kernel can only poll a kernel completion queue */
-       if (cq->ip) {
-               npolled = -EINVAL;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       wc = cq->queue;
-       tail = wc->tail;
-       if (tail > (u32) cq->ibcq.cqe)
-               tail = (u32) cq->ibcq.cqe;
-       for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
-               if (tail == wc->head)
-                       break;
-               /* The kernel doesn't need a RMB since it has the lock. */
-               *entry = wc->kqueue[tail];
-               if (tail >= cq->ibcq.cqe)
-                       tail = 0;
-               else
-                       tail++;
-       }
-       wc->tail = tail;
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-bail:
-       return npolled;
-}
-
-static void send_complete(unsigned long data)
-{
-       struct ipath_cq *cq = (struct ipath_cq *)data;
-
-       /*
-        * The completion handler will most likely rearm the notification
-        * and poll for all pending entries.  If a new completion entry
-        * is added while we are in this routine, tasklet_hi_schedule()
-        * won't call us again until we return so we check triggered to
-        * see if we need to call the handler again.
-        */
-       for (;;) {
-               u8 triggered = cq->triggered;
-
-               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
-
-               if (cq->triggered == triggered)
-                       return;
-       }
-}
-
-/**
- * ipath_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
- * @attr: creation attributes
- * @context: unused by the InfiniPath driver
- * @udata: unused by the InfiniPath driver
- *
- * Returns a pointer to the completion queue or negative errno values
- * for failure.
- *
- * Called by ib_create_cq() in the generic verbs code.
- */
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-                             const struct ib_cq_init_attr *attr,
-                             struct ib_ucontext *context,
-                             struct ib_udata *udata)
-{
-       int entries = attr->cqe;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cq *cq;
-       struct ipath_cq_wc *wc;
-       struct ib_cq *ret;
-       u32 sz;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (entries < 1 || entries > ib_ipath_max_cqes) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       /* Allocate the completion queue structure. */
-       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
-       /*
-        * Allocate the completion queue entries and head/tail pointers.
-        * This is allocated separately so that it can be resized and
-        * also mapped into user space.
-        * We need to use vmalloc() in order to support mmap and large
-        * numbers of entries.
-        */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-       else
-               sz += sizeof(struct ib_wc) * (entries + 1);
-       wc = vmalloc_user(sz);
-       if (!wc) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_cq;
-       }
-
-       /*
-        * Return the address of the WC as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
-
-               cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
-               if (!cq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_wc;
-               }
-
-               err = ib_copy_to_udata(udata, &cq->ip->offset,
-                                      sizeof(cq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
-                       goto bail_ip;
-               }
-       } else
-               cq->ip = NULL;
-
-       spin_lock(&dev->n_cqs_lock);
-       if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
-               spin_unlock(&dev->n_cqs_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_cqs_allocated++;
-       spin_unlock(&dev->n_cqs_lock);
-
-       if (cq->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       /*
-        * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
-        * The number of entries should be >= the number requested or return
-        * an error.
-        */
-       cq->ibcq.cqe = entries;
-       cq->notify = IB_CQ_NONE;
-       cq->triggered = 0;
-       spin_lock_init(&cq->lock);
-       tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
-       wc->head = 0;
-       wc->tail = 0;
-       cq->queue = wc;
-
-       ret = &cq->ibcq;
-
-       goto done;
-
-bail_ip:
-       kfree(cq->ip);
-bail_wc:
-       vfree(wc);
-bail_cq:
-       kfree(cq);
-done:
-       return ret;
-}
-
-/**
- * ipath_destroy_cq - destroy a completion queue
- * @ibcq: the completion queue to destroy.
- *
- * Returns 0 for success.
- *
- * Called by ib_destroy_cq() in the generic verbs code.
- */
-int ipath_destroy_cq(struct ib_cq *ibcq)
-{
-       struct ipath_ibdev *dev = to_idev(ibcq->device);
-       struct ipath_cq *cq = to_icq(ibcq);
-
-       tasklet_kill(&cq->comptask);
-       spin_lock(&dev->n_cqs_lock);
-       dev->n_cqs_allocated--;
-       spin_unlock(&dev->n_cqs_lock);
-       if (cq->ip)
-               kref_put(&cq->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(cq->queue);
-       kfree(cq);
-
-       return 0;
-}
-
-/**
- * ipath_req_notify_cq - change the notification type for a completion queue
- * @ibcq: the completion queue
- * @notify_flags: the type of notification to request
- *
- * Returns 0 for success.
- *
- * This may be called from interrupt context.  Also called by
- * ib_req_notify_cq() in the generic verbs code.
- */
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       unsigned long flags;
-       int ret = 0;
-
-       spin_lock_irqsave(&cq->lock, flags);
-       /*
-        * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
-        * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
-        */
-       if (cq->notify != IB_CQ_NEXT_COMP)
-               cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
-
-       if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-           cq->queue->head != cq->queue->tail)
-               ret = 1;
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       return ret;
-}
-
-/**
- * ipath_resize_cq - change the size of the CQ
- * @ibcq: the completion queue
- *
- * Returns 0 for success.
- */
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *old_wc;
-       struct ipath_cq_wc *wc;
-       u32 head, tail, n;
-       int ret;
-       u32 sz;
-
-       if (cqe < 1 || cqe > ib_ipath_max_cqes) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * Need to use vmalloc() if we want to support large #s of entries.
-        */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-       else
-               sz += sizeof(struct ib_wc) * (cqe + 1);
-       wc = vmalloc_user(sz);
-       if (!wc) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       /* Check that we can write the offset to mmap. */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               __u64 offset = 0;
-
-               ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
-               if (ret)
-                       goto bail_free;
-       }
-
-       spin_lock_irq(&cq->lock);
-       /*
-        * Make sure head and tail are sane since they
-        * might be user writable.
-        */
-       old_wc = cq->queue;
-       head = old_wc->head;
-       if (head > (u32) cq->ibcq.cqe)
-               head = (u32) cq->ibcq.cqe;
-       tail = old_wc->tail;
-       if (tail > (u32) cq->ibcq.cqe)
-               tail = (u32) cq->ibcq.cqe;
-       if (head < tail)
-               n = cq->ibcq.cqe + 1 + head - tail;
-       else
-               n = head - tail;
-       if (unlikely((u32)cqe < n)) {
-               ret = -EINVAL;
-               goto bail_unlock;
-       }
-       for (n = 0; tail != head; n++) {
-               if (cq->ip)
-                       wc->uqueue[n] = old_wc->uqueue[tail];
-               else
-                       wc->kqueue[n] = old_wc->kqueue[tail];
-               if (tail == (u32) cq->ibcq.cqe)
-                       tail = 0;
-               else
-                       tail++;
-       }
-       cq->ibcq.cqe = cqe;
-       wc->head = n;
-       wc->tail = 0;
-       cq->queue = wc;
-       spin_unlock_irq(&cq->lock);
-
-       vfree(old_wc);
-
-       if (cq->ip) {
-               struct ipath_ibdev *dev = to_idev(ibcq->device);
-               struct ipath_mmap_info *ip = cq->ip;
-
-               ipath_update_mmap_info(dev, ip, sz, wc);
-
-               /*
-                * Return the offset to mmap.
-                * See ipath_mmap() for details.
-                */
-               if (udata && udata->outlen >= sizeof(__u64)) {
-                       ret = ib_copy_to_udata(udata, &ip->offset,
-                                              sizeof(ip->offset));
-                       if (ret)
-                               goto bail;
-               }
-
-               spin_lock_irq(&dev->pending_lock);
-               if (list_empty(&ip->pending_mmaps))
-                       list_add(&ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = 0;
-       goto bail;
-
-bail_unlock:
-       spin_unlock_irq(&cq->lock);
-bail_free:
-       vfree(wc);
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h
deleted file mode 100644 (file)
index 65926cd..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_DEBUG_H
-#define _IPATH_DEBUG_H
-
-#ifndef _IPATH_DEBUGGING       /* debugging enabled or not */
-#define _IPATH_DEBUGGING 1
-#endif
-
-#if _IPATH_DEBUGGING
-
-/*
- * Mask values for debugging.  The scheme allows us to compile out any
- * of the debug tracing stuff, and if compiled in, to enable or disable
- * dynamically.  This can be set at modprobe time also:
- *      modprobe infinipath.ko infinipath_debug=7
- */
-
-#define __IPATH_INFO        0x1        /* generic low verbosity stuff */
-#define __IPATH_DBG         0x2        /* generic debug */
-#define __IPATH_TRSAMPLE    0x8        /* generate trace buffer sample entries */
-/* leave some low verbosity spots open */
-#define __IPATH_VERBDBG     0x40       /* very verbose debug */
-#define __IPATH_PKTDBG      0x80       /* print packet data */
-/* print process startup (init)/exit messages */
-#define __IPATH_PROCDBG     0x100
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG       0x200
-#define __IPATH_ERRPKTDBG   0x400
-#define __IPATH_USER_SEND   0x1000     /* use user mode send */
-#define __IPATH_KERNEL_SEND 0x2000     /* use kernel mode send */
-#define __IPATH_EPKTDBG     0x4000     /* print ethernet packet data */
-#define __IPATH_IPATHDBG    0x10000    /* Ethernet (IPATH) gen debug */
-#define __IPATH_IPATHWARN   0x20000    /* Ethernet (IPATH) warnings */
-#define __IPATH_IPATHERR    0x40000    /* Ethernet (IPATH) errors */
-#define __IPATH_IPATHPD     0x80000    /* Ethernet (IPATH) packet dump */
-#define __IPATH_IPATHTABLE  0x100000   /* Ethernet (IPATH) table dump */
-#define __IPATH_LINKVERBDBG 0x200000   /* very verbose linkchange debug */
-
-#else                          /* _IPATH_DEBUGGING */
-
-/*
- * define all of these even with debugging off, for the few places that do
- * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
- * compiler eliminate the code
- */
-
-#define __IPATH_INFO      0x0  /* generic low verbosity stuff */
-#define __IPATH_DBG       0x0  /* generic debug */
-#define __IPATH_TRSAMPLE  0x0  /* generate trace buffer sample entries */
-#define __IPATH_VERBDBG   0x0  /* very verbose debug */
-#define __IPATH_PKTDBG    0x0  /* print packet data */
-#define __IPATH_PROCDBG   0x0  /* process startup (init)/exit messages */
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG     0x0
-#define __IPATH_EPKTDBG   0x0  /* print ethernet packet data */
-#define __IPATH_IPATHDBG  0x0  /* Ethernet (IPATH) table dump on */
-#define __IPATH_IPATHWARN 0x0  /* Ethernet (IPATH) warnings on   */
-#define __IPATH_IPATHERR  0x0  /* Ethernet (IPATH) errors on   */
-#define __IPATH_IPATHPD   0x0  /* Ethernet (IPATH) packet dump on   */
-#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on   */
-#define __IPATH_LINKVERBDBG 0x0        /* very verbose linkchange debug */
-
-#endif                         /* _IPATH_DEBUGGING */
-
-#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
-
-#endif                         /* _IPATH_DEBUG_H */
diff --git a/drivers/staging/rdma/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c
deleted file mode 100644 (file)
index 45802e9..0000000
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the ipath_diag device, normally minor number 129.  Diagnostic use
- * of the InfiniPath chip may render the chip or board unusable until the
- * driver is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/export.h>
-#include <asm/uaccess.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-int ipath_diag_inuse;
-static int diag_set_link;
-
-static int ipath_diag_open(struct inode *in, struct file *fp);
-static int ipath_diag_release(struct inode *in, struct file *fp);
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-                              size_t count, loff_t *off);
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off);
-
-static const struct file_operations diag_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_diag_write,
-       .read = ipath_diag_read,
-       .open = ipath_diag_open,
-       .release = ipath_diag_release,
-       .llseek = default_llseek,
-};
-
-static ssize_t ipath_diagpkt_write(struct file *fp,
-                                  const char __user *data,
-                                  size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_diagpkt_write,
-       .llseek = noop_llseek,
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev *diagpkt_cdev;
-static struct device *diagpkt_dev;
-
-int ipath_diag_add(struct ipath_devdata *dd)
-{
-       char name[16];
-       int ret = 0;
-
-       if (atomic_inc_return(&diagpkt_count) == 1) {
-               ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
-                                     "ipath_diagpkt", &diagpkt_file_ops,
-                                     &diagpkt_cdev, &diagpkt_dev);
-
-               if (ret) {
-                       ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
-                                     "device: %d", ret);
-                       goto done;
-               }
-       }
-
-       snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
-
-       ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
-                             &diag_file_ops, &dd->diag_cdev,
-                             &dd->diag_dev);
-       if (ret)
-               ipath_dev_err(dd, "Couldn't create %s device: %d",
-                             name, ret);
-
-done:
-       return ret;
-}
-
-void ipath_diag_remove(struct ipath_devdata *dd)
-{
-       if (atomic_dec_and_test(&diagpkt_count))
-               ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
-
-       ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
-}
-
-/**
- * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy (multiple of 32 bits)
- *
- * This function also localizes all chip memory accesses.
- * The copy should be written such that we read full cacheline packets
- * from the chip.  This is usually used for a single qword
- *
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
-                            const void __iomem *caddr, size_t count)
-{
-       const u64 __iomem *reg_addr = caddr;
-       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-       int ret;
-
-       /* not very efficient, but it works for now */
-       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u64 data = readq(reg_addr);
-               if (copy_to_user(uaddr, &data, sizeof(u64))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               reg_addr++;
-               uaddr += sizeof(u64);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: the number of bytes to copy (multiple of 32 bits)
- *
- * This is usually used for a single qword
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-
-static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
-                             const void __user *uaddr, size_t count)
-{
-       u64 __iomem *reg_addr = caddr;
-       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-       int ret;
-
-       /* not very efficient, but it works for now */
-       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u64 data;
-               if (copy_from_user(&data, uaddr, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               writeq(data, reg_addr);
-
-               reg_addr++;
-               uaddr += sizeof(u64);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy
- *
- * read 32 bit values, not 64 bit; for memories that only
- * support 32 bit reads; usually a single dword.
- */
-static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
-                            const void __iomem *caddr, size_t count)
-{
-       const u32 __iomem *reg_addr = caddr;
-       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-       int ret;
-
-       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-           reg_end > (u32 __iomem *) dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* not very efficient, but it works for now */
-       while (reg_addr < reg_end) {
-               u32 data = readl(reg_addr);
-               if (copy_to_user(uaddr, &data, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-
-               reg_addr++;
-               uaddr += sizeof(u32);
-
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: number of bytes to copy
- *
- * write 32 bit values, not 64 bit; for memories that only
- * support 32 bit write; usually a single dword.
- */
-
-static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
-                             const void __user *uaddr, size_t count)
-{
-       u32 __iomem *reg_addr = caddr;
-       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-       int ret;
-
-       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-           reg_end > (u32 __iomem *) dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u32 data;
-               if (copy_from_user(&data, uaddr, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               writel(data, reg_addr);
-
-               reg_addr++;
-               uaddr += sizeof(u32);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-static int ipath_diag_open(struct inode *in, struct file *fp)
-{
-       int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
-       struct ipath_devdata *dd;
-       int ret;
-
-       mutex_lock(&ipath_mutex);
-
-       if (ipath_diag_inuse) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       dd = ipath_lookup(unit);
-
-       if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
-           !dd->ipath_kregbase) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       fp->private_data = dd;
-       ipath_diag_inuse = -2;
-       diag_set_link = 0;
-       ret = 0;
-
-       /* Only expose a way to reset the device if we
-          make it into diag mode. */
-       ipath_expose_reset(&dd->pcidev->dev);
-
-bail:
-       mutex_unlock(&ipath_mutex);
-
-       return ret;
-}
-
-/**
- * ipath_diagpkt_write - write an IB packet
- * @fp: the diag data device file pointer
- * @data: ipath_diag_pkt structure saying where to get the packet
- * @count: size of data to write
- * @off: unused by this code
- */
-static ssize_t ipath_diagpkt_write(struct file *fp,
-                                  const char __user *data,
-                                  size_t count, loff_t *off)
-{
-       u32 __iomem *piobuf;
-       u32 plen, pbufn, maxlen_reserve;
-       struct ipath_diag_pkt odp;
-       struct ipath_diag_xpkt dp;
-       u32 *tmpbuf = NULL;
-       struct ipath_devdata *dd;
-       ssize_t ret = 0;
-       u64 val;
-       u32 l_state, lt_state; /* LinkState, LinkTrainingState */
-
-
-       if (count == sizeof(dp)) {
-               if (copy_from_user(&dp, data, sizeof(dp))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       } else if (count == sizeof(odp)) {
-               if (copy_from_user(&odp, data, sizeof(odp))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               dp.len = odp.len;
-               dp.unit = odp.unit;
-               dp.data = odp.data;
-               dp.pbc_wd = 0;
-       } else {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* send count must be an exact number of dwords */
-       if (dp.len & 3) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       plen = dp.len >> 2;
-
-       dd = ipath_lookup(dp.unit);
-       if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
-           !dd->ipath_kregbase) {
-               ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
-                          dp.unit);
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (ipath_diag_inuse && !diag_set_link &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               diag_set_link = 1;
-               ipath_cdbg(VERBOSE, "Trying to set to set link active for "
-                          "diag pkt\n");
-               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-       }
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
-               ret = -ENODEV;
-               goto bail;
-       }
-       /*
-        * Want to skip check for l_state if using custom PBC,
-        * because we might be trying to force an SM packet out.
-        * first-cut, skip _all_ state checking in that case.
-        */
-       val = ipath_ib_state(dd, dd->ipath_lastibcstat);
-       lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-       l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-       if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
-           (val != dd->ib_init && val != dd->ib_arm &&
-           val != dd->ib_active))) {
-               ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
-                          dd->ipath_unit, (unsigned long long) val);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * need total length before first word written, plus 2 Dwords. One Dword
-        * is for padding so we get the full user data when not aligned on
-        * a word boundary. The other Dword is to make sure we have room for the
-        * ICRC which gets tacked on later.
-        */
-       maxlen_reserve = 2 * sizeof(u32);
-       if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
-               ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
-                         dp.len, dd->ipath_ibmaxlen);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       plen = sizeof(u32) + dp.len;
-
-       tmpbuf = vmalloc(plen);
-       if (!tmpbuf) {
-               dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
-                        "failing\n");
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       if (copy_from_user(tmpbuf,
-                          (const void __user *) (unsigned long) dp.data,
-                          dp.len)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       plen >>= 2;             /* in dwords */
-
-       piobuf = ipath_getpiobuf(dd, plen, &pbufn);
-       if (!piobuf) {
-               ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
-                          dd->ipath_unit);
-               ret = -EBUSY;
-               goto bail;
-       }
-       /* disarm it just to be extra sure */
-       ipath_disarm_piobufs(dd, pbufn, 1);
-
-       if (ipath_debug & __IPATH_PKTDBG)
-               ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
-                          dd->ipath_unit, plen - 1, pbufn);
-
-       if (dp.pbc_wd == 0)
-               dp.pbc_wd = plen;
-       writeq(dp.pbc_wd, piobuf);
-       /*
-        * Copy all by the trigger word, then flush, so it's written
-        * to chip before trigger word, then write trigger word, then
-        * flush again, so packet is sent.
-        */
-       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-               ipath_flush_wc();
-               __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
-               ipath_flush_wc();
-               __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
-       } else
-               __iowrite32_copy(piobuf + 2, tmpbuf, plen);
-
-       ipath_flush_wc();
-
-       ret = sizeof(dp);
-
-bail:
-       vfree(tmpbuf);
-       return ret;
-}
-
-static int ipath_diag_release(struct inode *in, struct file *fp)
-{
-       mutex_lock(&ipath_mutex);
-       ipath_diag_inuse = 0;
-       fp->private_data = NULL;
-       mutex_unlock(&ipath_mutex);
-       return 0;
-}
-
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-                              size_t count, loff_t *off)
-{
-       struct ipath_devdata *dd = fp->private_data;
-       void __iomem *kreg_base;
-       ssize_t ret;
-
-       kreg_base = dd->ipath_kregbase;
-
-       if (count == 0)
-               ret = 0;
-       else if ((count % 4) || (*off % 4))
-               /* address or length is not 32-bit aligned, hence invalid */
-               ret = -EINVAL;
-       else if (ipath_diag_inuse < 1 && (*off || count != 8))
-               ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
-       else if ((count % 8) || (*off % 8))
-               /* address or length not 64-bit aligned; do 32-bit reads */
-               ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
-       else
-               ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
-
-       if (ret >= 0) {
-               *off += count;
-               ret = count;
-               if (ipath_diag_inuse == -2)
-                       ipath_diag_inuse++;
-       }
-
-       return ret;
-}
-
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off)
-{
-       struct ipath_devdata *dd = fp->private_data;
-       void __iomem *kreg_base;
-       ssize_t ret;
-
-       kreg_base = dd->ipath_kregbase;
-
-       if (count == 0)
-               ret = 0;
-       else if ((count % 4) || (*off % 4))
-               /* address or length is not 32-bit aligned, hence invalid */
-               ret = -EINVAL;
-       else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
-                ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
-               ret = -EINVAL;  /* before any other write allowed */
-       else if ((count % 8) || (*off % 8))
-               /* address or length not 64-bit aligned; do 32-bit writes */
-               ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
-       else
-               ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
-
-       if (ret >= 0) {
-               *off += count;
-               ret = count;
-               if (ipath_diag_inuse == -1)
-                       ipath_diag_inuse = 1; /* all read/write OK now */
-       }
-
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c
deleted file mode 100644 (file)
index 123a8c0..0000000
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/scatterlist.h>
-#include <linux/gfp.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_verbs.h"
-
-#define BAD_DMA_ADDRESS ((u64) 0)
-
-/*
- * The following functions implement driver specific replacements
- * for the ib_dma_*() functions.
- *
- * These functions return kernel virtual addresses instead of
- * device bus addresses since the driver uses the CPU to copy
- * data instead of using hardware DMA.
- */
-
-static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == BAD_DMA_ADDRESS;
-}
-
-static u64 ipath_dma_map_single(struct ib_device *dev,
-                               void *cpu_addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-       return (u64) cpu_addr;
-}
-
-static void ipath_dma_unmap_single(struct ib_device *dev,
-                                  u64 addr, size_t size,
-                                  enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static u64 ipath_dma_map_page(struct ib_device *dev,
-                             struct page *page,
-                             unsigned long offset,
-                             size_t size,
-                             enum dma_data_direction direction)
-{
-       u64 addr;
-
-       BUG_ON(!valid_dma_direction(direction));
-
-       if (offset + size > PAGE_SIZE) {
-               addr = BAD_DMA_ADDRESS;
-               goto done;
-       }
-
-       addr = (u64) page_address(page);
-       if (addr)
-               addr += offset;
-       /* TODO: handle highmem pages */
-
-done:
-       return addr;
-}
-
-static void ipath_dma_unmap_page(struct ib_device *dev,
-                                u64 addr, size_t size,
-                                enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                       int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       u64 addr;
-       int i;
-       int ret = nents;
-
-       BUG_ON(!valid_dma_direction(direction));
-
-       for_each_sg(sgl, sg, nents, i) {
-               addr = (u64) page_address(sg_page(sg));
-               /* TODO: handle highmem pages */
-               if (!addr) {
-                       ret = 0;
-                       break;
-               }
-               sg->dma_address = addr + sg->offset;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-               sg->dma_length = sg->length;
-#endif
-       }
-       return ret;
-}
-
-static void ipath_unmap_sg(struct ib_device *dev,
-                          struct scatterlist *sg, int nents,
-                          enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static void ipath_sync_single_for_cpu(struct ib_device *dev,
-                                     u64 addr,
-                                     size_t size,
-                                     enum dma_data_direction dir)
-{
-}
-
-static void ipath_sync_single_for_device(struct ib_device *dev,
-                                        u64 addr,
-                                        size_t size,
-                                        enum dma_data_direction dir)
-{
-}
-
-static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                     u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p)
-               addr = page_address(p);
-       if (dma_handle)
-               *dma_handle = (u64) addr;
-       return addr;
-}
-
-static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
-                                   void *cpu_addr, u64 dma_handle)
-{
-       free_pages((unsigned long) cpu_addr, get_order(size));
-}
-
-struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
-       .mapping_error = ipath_mapping_error,
-       .map_single = ipath_dma_map_single,
-       .unmap_single = ipath_dma_unmap_single,
-       .map_page = ipath_dma_map_page,
-       .unmap_page = ipath_dma_unmap_page,
-       .map_sg = ipath_map_sg,
-       .unmap_sg = ipath_unmap_sg,
-       .sync_single_for_cpu = ipath_sync_single_for_cpu,
-       .sync_single_for_device = ipath_sync_single_for_device,
-       .alloc_coherent = ipath_dma_alloc_coherent,
-       .free_coherent = ipath_dma_free_coherent
-};
diff --git a/drivers/staging/rdma/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c
deleted file mode 100644 (file)
index 2ab22f9..0000000
+++ /dev/null
@@ -1,2784 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#ifdef CONFIG_X86_64
-#include <asm/pat.h>
-#endif
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-
-static void ipath_update_pio_bufs(struct ipath_devdata *);
-
-const char *ipath_get_unit_name(int unit)
-{
-       static char iname[16];
-       snprintf(iname, sizeof iname, "infinipath%u", unit);
-       return iname;
-}
-
-#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
-#define PFX IPATH_DRV_NAME ": "
-
-/*
- * The size has to be longer than this string, so we can append
- * board/chip information to it in the init code.
- */
-const char ib_ipath_version[] = IPATH_IDSTR "\n";
-
-static struct idr unit_table;
-DEFINE_SPINLOCK(ipath_devs_lock);
-LIST_HEAD(ipath_dev_list);
-
-wait_queue_head_t ipath_state_wait;
-
-unsigned ipath_debug = __IPATH_INFO;
-
-module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(debug, "mask for debug prints");
-EXPORT_SYMBOL_GPL(ipath_debug);
-
-unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
-module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
-MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
-
-static unsigned ipath_hol_timeout_ms = 13000;
-module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
-MODULE_PARM_DESC(hol_timeout_ms,
-       "duration of user app suspension after link failure");
-
-unsigned ipath_linkrecovery = 1;
-module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("QLogic <support@qlogic.com>");
-MODULE_DESCRIPTION("QLogic InfiniPath driver");
-
-/*
- * Table to translate the LINKTRAININGSTATE portion of
- * IBCStatus to a human-readable form.
- */
-const char *ipath_ibcstatus_str[] = {
-       "Disabled",
-       "LinkUp",
-       "PollActive",
-       "PollQuiet",
-       "SleepDelay",
-       "SleepQuiet",
-       "LState6",              /* unused */
-       "LState7",              /* unused */
-       "CfgDebounce",
-       "CfgRcvfCfg",
-       "CfgWaitRmt",
-       "CfgIdle",
-       "RecovRetrain",
-       "CfgTxRevLane",         /* unused before IBA7220 */
-       "RecovWaitRmt",
-       "RecovIdle",
-       /* below were added for IBA7220 */
-       "CfgEnhanced",
-       "CfgTest",
-       "CfgWaitRmtTest",
-       "CfgWaitCfgEnhanced",
-       "SendTS_T",
-       "SendTstIdles",
-       "RcvTS_T",
-       "SendTst_TS1s",
-       "LTState18", "LTState19", "LTState1A", "LTState1B",
-       "LTState1C", "LTState1D", "LTState1E", "LTState1F"
-};
-
-static void ipath_remove_one(struct pci_dev *);
-static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
-
-/* Only needed for registration, nothing else needs this info */
-#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
-#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
-
-/* Number of seconds before our card status check...  */
-#define STATUS_TIMEOUT 60
-
-static const struct pci_device_id ipath_pci_tbl[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
-       { 0, }
-};
-
-MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
-
-static struct pci_driver ipath_driver = {
-       .name = IPATH_DRV_NAME,
-       .probe = ipath_init_one,
-       .remove = ipath_remove_one,
-       .id_table = ipath_pci_tbl,
-       .driver = {
-               .groups = ipath_driver_attr_groups,
-       },
-};
-
-static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
-                            u32 *bar0, u32 *bar1)
-{
-       int ret;
-
-       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
-       if (ret)
-               ipath_dev_err(dd, "failed to read bar0 before enable: "
-                             "error %d\n", -ret);
-
-       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
-       if (ret)
-               ipath_dev_err(dd, "failed to read bar1 before enable: "
-                             "error %d\n", -ret);
-
-       ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
-}
-
-static void ipath_free_devdata(struct pci_dev *pdev,
-                              struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       pci_set_drvdata(pdev, NULL);
-
-       if (dd->ipath_unit != -1) {
-               spin_lock_irqsave(&ipath_devs_lock, flags);
-               idr_remove(&unit_table, dd->ipath_unit);
-               list_del(&dd->ipath_list);
-               spin_unlock_irqrestore(&ipath_devs_lock, flags);
-       }
-       vfree(dd);
-}
-
-static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
-{
-       unsigned long flags;
-       struct ipath_devdata *dd;
-       int ret;
-
-       dd = vzalloc(sizeof(*dd));
-       if (!dd) {
-               dd = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-       dd->ipath_unit = -1;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate unit ID: error %d\n", -ret);
-               ipath_free_devdata(pdev, dd);
-               dd = ERR_PTR(ret);
-               goto bail_unlock;
-       }
-       dd->ipath_unit = ret;
-
-       dd->pcidev = pdev;
-       pci_set_drvdata(pdev, dd);
-
-       list_add(&dd->ipath_list, &ipath_dev_list);
-
-bail_unlock:
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-       idr_preload_end();
-bail:
-       return dd;
-}
-
-static inline struct ipath_devdata *__ipath_lookup(int unit)
-{
-       return idr_find(&unit_table, unit);
-}
-
-struct ipath_devdata *ipath_lookup(int unit)
-{
-       struct ipath_devdata *dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-       dd = __ipath_lookup(unit);
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       return dd;
-}
-
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
-{
-       int nunits, npresent, nup;
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       int maxports;
-
-       nunits = npresent = nup = maxports = 0;
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-               nunits++;
-               if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
-                       npresent++;
-               if (dd->ipath_lid &&
-                   !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
-                                        | IPATH_LINKUNK)))
-                       nup++;
-               if (dd->ipath_cfgports > maxports)
-                       maxports = dd->ipath_cfgports;
-       }
-
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       if (npresentp)
-               *npresentp = npresent;
-       if (nupp)
-               *nupp = nup;
-       if (maxportsp)
-               *maxportsp = maxports;
-
-       return nunits;
-}
-
-/*
- * These next two routines are placeholders in case we don't have per-arch
- * code for controlling write combining.  If explicit control of write
- * combining is not available, performance will probably be awful.
- */
-
-int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
-{
-       return -EOPNOTSUPP;
-}
-
-void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
-{
-}
-
-/*
- * Perform a PIO buffer bandwidth write test, to verify proper system
- * configuration.  Even when all the setup calls work, occasionally
- * BIOS or other issues can prevent write combining from working, or
- * can cause other bandwidth problems to the chip.
- *
- * This test simply writes the same buffer over and over again, and
- * measures close to the peak bandwidth to the chip (not testing
- * data bandwidth to the wire).   On chips that use an address-based
- * trigger to send packets to the wire, this is easy.  On chips that
- * use a count to trigger, we want to make sure that the packet doesn't
- * go out on the wire, or trigger flow control checks.
- */
-static void ipath_verify_pioperf(struct ipath_devdata *dd)
-{
-       u32 pbnum, cnt, lcnt;
-       u32 __iomem *piobuf;
-       u32 *addr;
-       u64 msecs, emsecs;
-
-       piobuf = ipath_getpiobuf(dd, 0, &pbnum);
-       if (!piobuf) {
-               dev_info(&dd->pcidev->dev,
-                       "No PIObufs for checking perf, skipping\n");
-               return;
-       }
-
-       /*
-        * Enough to give us a reasonable test, less than piobuf size, and
-        * likely multiple of store buffer length.
-        */
-       cnt = 1024;
-
-       addr = vmalloc(cnt);
-       if (!addr) {
-               dev_info(&dd->pcidev->dev,
-                       "Couldn't get memory for checking PIO perf,"
-                       " skipping\n");
-               goto done;
-       }
-
-       preempt_disable();  /* we want reasonably accurate elapsed time */
-       msecs = 1 + jiffies_to_msecs(jiffies);
-       for (lcnt = 0; lcnt < 10000U; lcnt++) {
-               /* wait until we cross msec boundary */
-               if (jiffies_to_msecs(jiffies) >= msecs)
-                       break;
-               udelay(1);
-       }
-
-       ipath_disable_armlaunch(dd);
-
-       /*
-        * length 0, no dwords actually sent, and mark as VL15
-        * on chips where that may matter (due to IB flowcontrol)
-        */
-       if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
-               writeq(1UL << 63, piobuf);
-       else
-               writeq(0, piobuf);
-       ipath_flush_wc();
-
-       /*
-        * this is only roughly accurate, since even with preempt we
-        * still take interrupts that could take a while.   Running for
-        * >= 5 msec seems to get us "close enough" to accurate values
-        */
-       msecs = jiffies_to_msecs(jiffies);
-       for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
-               __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
-               emsecs = jiffies_to_msecs(jiffies) - msecs;
-       }
-
-       /* 1 GiB/sec, slightly over IB SDR line rate */
-       if (lcnt < (emsecs * 1024U))
-               ipath_dev_err(dd,
-                       "Performance problem: bandwidth to PIO buffers is "
-                       "only %u MiB/sec\n",
-                       lcnt / (u32) emsecs);
-       else
-               ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
-                       lcnt / (u32) emsecs);
-
-       preempt_enable();
-
-       vfree(addr);
-
-done:
-       /* disarm piobuf, so it's available again */
-       ipath_disarm_piobufs(dd, pbnum, 1);
-       ipath_enable_armlaunch(dd);
-}
-
-static void cleanup_device(struct ipath_devdata *dd);
-
-static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       int ret, len, j;
-       struct ipath_devdata *dd;
-       unsigned long long addr;
-       u32 bar0 = 0, bar1 = 0;
-
-#ifdef CONFIG_X86_64
-       if (pat_enabled()) {
-               pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
-               ret = -ENODEV;
-               goto bail;
-       }
-#endif
-
-       dd = ipath_alloc_devdata(pdev);
-       if (IS_ERR(dd)) {
-               ret = PTR_ERR(dd);
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate devdata: error %d\n", -ret);
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
-
-       ret = pci_enable_device(pdev);
-       if (ret) {
-               /* This can happen iff:
-                *
-                * We did a chip reset, and then failed to reprogram the
-                * BAR, or the chip reset due to an internal error.  We then
-                * unloaded the driver and reloaded it.
-                *
-                * Both reset cases set the BAR back to initial state.  For
-                * the latter case, the AER sticky error bit at offset 0x718
-                * should be set, but the Linux kernel doesn't yet know
-                * about that, it appears.  If the original BAR was retained
-                * in the kernel data structures, this may be OK.
-                */
-               ipath_dev_err(dd, "enable unit %d failed: error %d\n",
-                             dd->ipath_unit, -ret);
-               goto bail_devdata;
-       }
-       addr = pci_resource_start(pdev, 0);
-       len = pci_resource_len(pdev, 0);
-       ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
-                  "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
-                  ent->device, ent->driver_data);
-
-       read_bars(dd, pdev, &bar0, &bar1);
-
-       if (!bar1 && !(bar0 & ~0xf)) {
-               if (addr) {
-                       dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
-                                "rewriting as %llx\n", addr);
-                       ret = pci_write_config_dword(
-                               pdev, PCI_BASE_ADDRESS_0, addr);
-                       if (ret) {
-                               ipath_dev_err(dd, "rewrite of BAR0 "
-                                             "failed: err %d\n", -ret);
-                               goto bail_disable;
-                       }
-                       ret = pci_write_config_dword(
-                               pdev, PCI_BASE_ADDRESS_1, addr >> 32);
-                       if (ret) {
-                               ipath_dev_err(dd, "rewrite of BAR1 "
-                                             "failed: err %d\n", -ret);
-                               goto bail_disable;
-                       }
-               } else {
-                       ipath_dev_err(dd, "BAR is 0 (probable RESET), "
-                                     "not usable until reboot\n");
-                       ret = -ENODEV;
-                       goto bail_disable;
-               }
-       }
-
-       ret = pci_request_regions(pdev, IPATH_DRV_NAME);
-       if (ret) {
-               dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
-                        "err %d\n", dd->ipath_unit, -ret);
-               goto bail_disable;
-       }
-
-       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (ret) {
-               /*
-                * if the 64 bit setup fails, try 32 bit.  Some systems
-                * do not setup 64 bit maps on systems with 2GB or less
-                * memory installed.
-                */
-               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-               if (ret) {
-                       dev_info(&pdev->dev,
-                               "Unable to set DMA mask for unit %u: %d\n",
-                               dd->ipath_unit, ret);
-                       goto bail_regions;
-               } else {
-                       ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
-                       ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-                       if (ret)
-                               dev_info(&pdev->dev,
-                                       "Unable to set DMA consistent mask "
-                                       "for unit %u: %d\n",
-                                       dd->ipath_unit, ret);
-
-               }
-       } else {
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-               if (ret)
-                       dev_info(&pdev->dev,
-                               "Unable to set DMA consistent mask "
-                               "for unit %u: %d\n",
-                               dd->ipath_unit, ret);
-       }
-
-       pci_set_master(pdev);
-
-       /*
-        * Save BARs to rewrite after device reset.  Save all 64 bits of
-        * BAR, just in case.
-        */
-       dd->ipath_pcibar0 = addr;
-       dd->ipath_pcibar1 = addr >> 32;
-       dd->ipath_deviceid = ent->device;       /* save for later use */
-       dd->ipath_vendorid = ent->vendor;
-
-       /* setup the chip-specific functions, as early as possible. */
-       switch (ent->device) {
-       case PCI_DEVICE_ID_INFINIPATH_HT:
-               ipath_init_iba6110_funcs(dd);
-               break;
-
-       default:
-               ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
-                             "failing\n", ent->device);
-               return -ENODEV;
-       }
-
-       for (j = 0; j < 6; j++) {
-               if (!pdev->resource[j].start)
-                       continue;
-               ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
-                          j, &pdev->resource[j],
-                          (unsigned long long)pci_resource_len(pdev, j));
-       }
-
-       if (!addr) {
-               ipath_dev_err(dd, "No valid address in BAR 0!\n");
-               ret = -ENODEV;
-               goto bail_regions;
-       }
-
-       dd->ipath_pcirev = pdev->revision;
-
-#if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       dd->ipath_kregbase = __ioremap(addr, len,
-               (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
-#else
-       /* XXX: split this properly to enable on PAT */
-       dd->ipath_kregbase = ioremap_nocache(addr, len);
-#endif
-
-       if (!dd->ipath_kregbase) {
-               ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
-                         addr);
-               ret = -ENOMEM;
-               goto bail_iounmap;
-       }
-       dd->ipath_kregend = (u64 __iomem *)
-               ((void __iomem *)dd->ipath_kregbase + len);
-       dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
-       /* for user mmap */
-       ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
-                  addr, dd->ipath_kregbase);
-
-       if (dd->ipath_f_bus(dd, pdev))
-               ipath_dev_err(dd, "Failed to setup config space; "
-                             "continuing anyway\n");
-
-       /*
-        * set up our interrupt handler; IRQF_SHARED probably not needed,
-        * since MSI interrupts shouldn't be shared but won't  hurt for now.
-        * check 0 irq after we return from chip-specific bus setup, since
-        * that can affect this due to setup
-        */
-       if (!dd->ipath_irq)
-               ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
-                             "work\n");
-       else {
-               ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
-                                 IPATH_DRV_NAME, dd);
-               if (ret) {
-                       ipath_dev_err(dd, "Couldn't setup irq handler, "
-                                     "irq=%d: %d\n", dd->ipath_irq, ret);
-                       goto bail_iounmap;
-               }
-       }
-
-       ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
-       if (ret)
-               goto bail_irqsetup;
-
-       ret = ipath_enable_wc(dd);
-
-       if (ret)
-               ret = 0;
-
-       ipath_verify_pioperf(dd);
-
-       ipath_device_create_group(&pdev->dev, dd);
-       ipathfs_add_device(dd);
-       ipath_user_add(dd);
-       ipath_diag_add(dd);
-       ipath_register_ib_device(dd);
-
-       goto bail;
-
-bail_irqsetup:
-       cleanup_device(dd);
-
-       if (dd->ipath_irq)
-               dd->ipath_f_free_irq(dd);
-
-       if (dd->ipath_f_cleanup)
-               dd->ipath_f_cleanup(dd);
-
-bail_iounmap:
-       iounmap((volatile void __iomem *) dd->ipath_kregbase);
-
-bail_regions:
-       pci_release_regions(pdev);
-
-bail_disable:
-       pci_disable_device(pdev);
-
-bail_devdata:
-       ipath_free_devdata(pdev, dd);
-
-bail:
-       return ret;
-}
-
-static void cleanup_device(struct ipath_devdata *dd)
-{
-       int port;
-       struct ipath_portdata **tmp;
-       unsigned long flags;
-
-       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
-               /* can't do anything more with chip; needs re-init */
-               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
-               if (dd->ipath_kregbase) {
-                       /*
-                        * if we haven't already cleaned up before these are
-                        * to ensure any register reads/writes "fail" until
-                        * re-init
-                        */
-                       dd->ipath_kregbase = NULL;
-                       dd->ipath_uregbase = 0;
-                       dd->ipath_sregbase = 0;
-                       dd->ipath_cregbase = 0;
-                       dd->ipath_kregsize = 0;
-               }
-               ipath_disable_wc(dd);
-       }
-
-       if (dd->ipath_spectriggerhit)
-               dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
-                        dd->ipath_spectriggerhit);
-
-       if (dd->ipath_pioavailregs_dma) {
-               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                 (void *) dd->ipath_pioavailregs_dma,
-                                 dd->ipath_pioavailregs_phys);
-               dd->ipath_pioavailregs_dma = NULL;
-       }
-       if (dd->ipath_dummy_hdrq) {
-               dma_free_coherent(&dd->pcidev->dev,
-                       dd->ipath_pd[0]->port_rcvhdrq_size,
-                       dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
-               dd->ipath_dummy_hdrq = NULL;
-       }
-
-       if (dd->ipath_pageshadow) {
-               struct page **tmpp = dd->ipath_pageshadow;
-               dma_addr_t *tmpd = dd->ipath_physshadow;
-               int i, cnt = 0;
-
-               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
-                          "locked\n");
-               for (port = 0; port < dd->ipath_cfgports; port++) {
-                       int port_tidbase = port * dd->ipath_rcvtidcnt;
-                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-                       for (i = port_tidbase; i < maxtid; i++) {
-                               if (!tmpp[i])
-                                       continue;
-                               pci_unmap_page(dd->pcidev, tmpd[i],
-                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                               ipath_release_user_pages(&tmpp[i], 1);
-                               tmpp[i] = NULL;
-                               cnt++;
-                       }
-               }
-               if (cnt) {
-                       ipath_stats.sps_pageunlocks += cnt;
-                       ipath_cdbg(VERBOSE, "There were still %u expTID "
-                                  "entries locked\n", cnt);
-               }
-               if (ipath_stats.sps_pagelocks ||
-                   ipath_stats.sps_pageunlocks)
-                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
-                                  "unlocked via ipath_m{un}lock\n",
-                                  (unsigned long long)
-                                  ipath_stats.sps_pagelocks,
-                                  (unsigned long long)
-                                  ipath_stats.sps_pageunlocks);
-
-               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
-                          dd->ipath_pageshadow);
-               tmpp = dd->ipath_pageshadow;
-               dd->ipath_pageshadow = NULL;
-               vfree(tmpp);
-
-               dd->ipath_egrtidbase = NULL;
-       }
-
-       /*
-        * free any resources still in use (usually just kernel ports)
-        * at unload; we do for portcnt, because that's what we allocate.
-        * We acquire lock to be really paranoid that ipath_pd isn't being
-        * accessed from some interrupt-related code (that should not happen,
-        * but best to be sure).
-        */
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       tmp = dd->ipath_pd;
-       dd->ipath_pd = NULL;
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-       for (port = 0; port < dd->ipath_portcnt; port++) {
-               struct ipath_portdata *pd = tmp[port];
-               tmp[port] = NULL; /* debugging paranoia */
-               ipath_free_pddata(dd, pd);
-       }
-       kfree(tmp);
-}
-
-static void ipath_remove_one(struct pci_dev *pdev)
-{
-       struct ipath_devdata *dd = pci_get_drvdata(pdev);
-
-       ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
-
-       /*
-        * disable the IB link early, to be sure no new packets arrive, which
-        * complicates the shutdown process
-        */
-       ipath_shutdown_device(dd);
-
-       flush_workqueue(ib_wq);
-
-       if (dd->verbs_dev)
-               ipath_unregister_ib_device(dd->verbs_dev);
-
-       ipath_diag_remove(dd);
-       ipath_user_remove(dd);
-       ipathfs_remove_device(dd);
-       ipath_device_remove_group(&pdev->dev, dd);
-
-       ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
-                  "unit %u\n", dd, (u32) dd->ipath_unit);
-
-       cleanup_device(dd);
-
-       /*
-        * turn off rcv, send, and interrupts for all ports, all drivers
-        * should also hard reset the chip here?
-        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
-        * for all versions of the driver, if they were allocated
-        */
-       if (dd->ipath_irq) {
-               ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
-                          dd->ipath_unit, dd->ipath_irq);
-               dd->ipath_f_free_irq(dd);
-       } else
-               ipath_dbg("irq is 0, not doing free_irq "
-                         "for unit %u\n", dd->ipath_unit);
-       /*
-        * we check for NULL here, because it's outside
-        * the kregbase check, and we need to call it
-        * after the free_irq.  Thus it's possible that
-        * the function pointers were never initialized.
-        */
-       if (dd->ipath_f_cleanup)
-               /* clean up chip-specific stuff */
-               dd->ipath_f_cleanup(dd);
-
-       ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
-       iounmap((volatile void __iomem *) dd->ipath_kregbase);
-       pci_release_regions(pdev);
-       ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
-       pci_disable_device(pdev);
-
-       ipath_free_devdata(pdev, dd);
-}
-
-/* general driver use */
-DEFINE_MUTEX(ipath_mutex);
-
-static DEFINE_SPINLOCK(ipath_pioavail_lock);
-
-/**
- * ipath_disarm_piobufs - cancel a range of PIO buffers
- * @dd: the infinipath device
- * @first: the first PIO buffer to cancel
- * @cnt: the number of PIO buffers to cancel
- *
- * cancel a range of PIO buffers, used when they might be armed, but
- * not triggered.  Used at init to ensure buffer state, and also user
- * process close, in case it died while writing to a PIO buffer
- * Also after errors.
- */
-void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
-                         unsigned cnt)
-{
-       unsigned i, last = first + cnt;
-       unsigned long flags;
-
-       ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
-       for (i = first; i < last; i++) {
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               /*
-                * The disarm-related bits are write-only, so it
-                * is ok to OR them in with our copy of sendctrl
-                * while we hold the lock.
-                */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl | INFINIPATH_S_DISARM |
-                       (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
-               /* can't disarm bufs back-to-back per iba7220 spec */
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-       /* on some older chips, update may not happen after cancel */
-       ipath_force_pio_avail_update(dd);
-}
-
-/**
- * ipath_wait_linkstate - wait for an IB link state change to occur
- * @dd: the infinipath device
- * @state: the state to wait for
- * @msecs: the number of milliseconds to wait
- *
- * wait up to msecs milliseconds for IB link state change to occur for
- * now, take the easy polling route.  Currently used only by
- * ipath_set_linkstate.  Returns 0 if state reached, otherwise
- * -ETIMEDOUT state can have multiple states set, for any of several
- * transitions.
- */
-int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
-{
-       dd->ipath_state_wanted = state;
-       wait_event_interruptible_timeout(ipath_state_wait,
-                                        (dd->ipath_flags & state),
-                                        msecs_to_jiffies(msecs));
-       dd->ipath_state_wanted = 0;
-
-       if (!(dd->ipath_flags & state)) {
-               u64 val;
-               ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
-                          " ms\n",
-                          /* test INIT ahead of DOWN, both can be set */
-                          (state & IPATH_LINKINIT) ? "INIT" :
-                          ((state & IPATH_LINKDOWN) ? "DOWN" :
-                           ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
-                          msecs);
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-               ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
-                          (unsigned long long) ipath_read_kreg64(
-                                  dd, dd->ipath_kregs->kr_ibcctrl),
-                          (unsigned long long) val,
-                          ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
-       }
-       return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
-}
-
-static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
-       char *buf, size_t blen)
-{
-       static const struct {
-               ipath_err_t err;
-               const char *msg;
-       } errs[] = {
-               { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
-               { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
-               { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
-               { INFINIPATH_E_SDMABASE, "SDmaBase" },
-               { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
-               { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
-               { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
-               { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
-               { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
-               { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
-               { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
-               { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
-       };
-       int i;
-       int expected;
-       size_t bidx = 0;
-
-       for (i = 0; i < ARRAY_SIZE(errs); i++) {
-               expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
-                       test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-               if ((err & errs[i].err) && !expected)
-                       bidx += snprintf(buf + bidx, blen - bidx,
-                                        "%s ", errs[i].msg);
-       }
-}
-
-/*
- * Decode the error status into strings, deciding whether to always
- * print * it or not depending on "normal packet errors" vs everything
- * else.   Return 1 if "real" errors, otherwise 0 if only packet
- * errors, so caller can decide what to print with the string.
- */
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-       ipath_err_t err)
-{
-       int iserr = 1;
-       *buf = '\0';
-       if (err & INFINIPATH_E_PKTERRS) {
-               if (!(err & ~INFINIPATH_E_PKTERRS))
-                       iserr = 0; // if only packet errors.
-               if (ipath_debug & __IPATH_ERRPKTDBG) {
-                       if (err & INFINIPATH_E_REBP)
-                               strlcat(buf, "EBP ", blen);
-                       if (err & INFINIPATH_E_RVCRC)
-                               strlcat(buf, "VCRC ", blen);
-                       if (err & INFINIPATH_E_RICRC) {
-                               strlcat(buf, "CRC ", blen);
-                               // clear for check below, so only once
-                               err &= INFINIPATH_E_RICRC;
-                       }
-                       if (err & INFINIPATH_E_RSHORTPKTLEN)
-                               strlcat(buf, "rshortpktlen ", blen);
-                       if (err & INFINIPATH_E_SDROPPEDDATAPKT)
-                               strlcat(buf, "sdroppeddatapkt ", blen);
-                       if (err & INFINIPATH_E_SPKTLEN)
-                               strlcat(buf, "spktlen ", blen);
-               }
-               if ((err & INFINIPATH_E_RICRC) &&
-                       !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
-                       strlcat(buf, "CRC ", blen);
-               if (!iserr)
-                       goto done;
-       }
-       if (err & INFINIPATH_E_RHDRLEN)
-               strlcat(buf, "rhdrlen ", blen);
-       if (err & INFINIPATH_E_RBADTID)
-               strlcat(buf, "rbadtid ", blen);
-       if (err & INFINIPATH_E_RBADVERSION)
-               strlcat(buf, "rbadversion ", blen);
-       if (err & INFINIPATH_E_RHDR)
-               strlcat(buf, "rhdr ", blen);
-       if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
-               strlcat(buf, "sendspecialtrigger ", blen);
-       if (err & INFINIPATH_E_RLONGPKTLEN)
-               strlcat(buf, "rlongpktlen ", blen);
-       if (err & INFINIPATH_E_RMAXPKTLEN)
-               strlcat(buf, "rmaxpktlen ", blen);
-       if (err & INFINIPATH_E_RMINPKTLEN)
-               strlcat(buf, "rminpktlen ", blen);
-       if (err & INFINIPATH_E_SMINPKTLEN)
-               strlcat(buf, "sminpktlen ", blen);
-       if (err & INFINIPATH_E_RFORMATERR)
-               strlcat(buf, "rformaterr ", blen);
-       if (err & INFINIPATH_E_RUNSUPVL)
-               strlcat(buf, "runsupvl ", blen);
-       if (err & INFINIPATH_E_RUNEXPCHAR)
-               strlcat(buf, "runexpchar ", blen);
-       if (err & INFINIPATH_E_RIBFLOW)
-               strlcat(buf, "ribflow ", blen);
-       if (err & INFINIPATH_E_SUNDERRUN)
-               strlcat(buf, "sunderrun ", blen);
-       if (err & INFINIPATH_E_SPIOARMLAUNCH)
-               strlcat(buf, "spioarmlaunch ", blen);
-       if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
-               strlcat(buf, "sunexperrpktnum ", blen);
-       if (err & INFINIPATH_E_SDROPPEDSMPPKT)
-               strlcat(buf, "sdroppedsmppkt ", blen);
-       if (err & INFINIPATH_E_SMAXPKTLEN)
-               strlcat(buf, "smaxpktlen ", blen);
-       if (err & INFINIPATH_E_SUNSUPVL)
-               strlcat(buf, "sunsupVL ", blen);
-       if (err & INFINIPATH_E_INVALIDADDR)
-               strlcat(buf, "invalidaddr ", blen);
-       if (err & INFINIPATH_E_RRCVEGRFULL)
-               strlcat(buf, "rcvegrfull ", blen);
-       if (err & INFINIPATH_E_RRCVHDRFULL)
-               strlcat(buf, "rcvhdrfull ", blen);
-       if (err & INFINIPATH_E_IBSTATUSCHANGED)
-               strlcat(buf, "ibcstatuschg ", blen);
-       if (err & INFINIPATH_E_RIBLOSTLINK)
-               strlcat(buf, "riblostlink ", blen);
-       if (err & INFINIPATH_E_HARDWARE)
-               strlcat(buf, "hardware ", blen);
-       if (err & INFINIPATH_E_RESET)
-               strlcat(buf, "reset ", blen);
-       if (err & INFINIPATH_E_SDMAERRS)
-               decode_sdma_errs(dd, err, buf, blen);
-       if (err & INFINIPATH_E_INVALIDEEPCMD)
-               strlcat(buf, "invalideepromcmd ", blen);
-done:
-       return iserr;
-}
-
-/**
- * get_rhf_errstring - decode RHF errors
- * @err: the err number
- * @msg: the output buffer
- * @len: the length of the output buffer
- *
- * only used one place now, may want more later
- */
-static void get_rhf_errstring(u32 err, char *msg, size_t len)
-{
-       /* if no errors, and so don't need to check what's first */
-       *msg = '\0';
-
-       if (err & INFINIPATH_RHF_H_ICRCERR)
-               strlcat(msg, "icrcerr ", len);
-       if (err & INFINIPATH_RHF_H_VCRCERR)
-               strlcat(msg, "vcrcerr ", len);
-       if (err & INFINIPATH_RHF_H_PARITYERR)
-               strlcat(msg, "parityerr ", len);
-       if (err & INFINIPATH_RHF_H_LENERR)
-               strlcat(msg, "lenerr ", len);
-       if (err & INFINIPATH_RHF_H_MTUERR)
-               strlcat(msg, "mtuerr ", len);
-       if (err & INFINIPATH_RHF_H_IHDRERR)
-               /* infinipath hdr checksum error */
-               strlcat(msg, "ipathhdrerr ", len);
-       if (err & INFINIPATH_RHF_H_TIDERR)
-               strlcat(msg, "tiderr ", len);
-       if (err & INFINIPATH_RHF_H_MKERR)
-               /* bad port, offset, etc. */
-               strlcat(msg, "invalid ipathhdr ", len);
-       if (err & INFINIPATH_RHF_H_IBERR)
-               strlcat(msg, "iberr ", len);
-       if (err & INFINIPATH_RHF_L_SWA)
-               strlcat(msg, "swA ", len);
-       if (err & INFINIPATH_RHF_L_SWB)
-               strlcat(msg, "swB ", len);
-}
-
-/**
- * ipath_get_egrbuf - get an eager buffer
- * @dd: the infinipath device
- * @bufnum: the eager buffer to get
- *
- * must only be called if ipath_pd[port] is known to be allocated
- */
-static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
-{
-       return dd->ipath_port0_skbinfo ?
-               (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
-}
-
-/**
- * ipath_alloc_skb - allocate an skb and buffer with possible constraints
- * @dd: the infinipath device
- * @gfp_mask: the sk_buff SFP mask
- */
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
-                               gfp_t gfp_mask)
-{
-       struct sk_buff *skb;
-       u32 len;
-
-       /*
-        * Only fully supported way to handle this is to allocate lots
-        * extra, align as needed, and then do skb_reserve().  That wastes
-        * a lot of memory...  I'll have to hack this into infinipath_copy
-        * also.
-        */
-
-       /*
-        * We need 2 extra bytes for ipath_ether data sent in the
-        * key header.  In order to keep everything dword aligned,
-        * we'll reserve 4 bytes.
-        */
-       len = dd->ipath_ibmaxlen + 4;
-
-       if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               /* We need a 2KB multiple alignment, and there is no way
-                * to do it except to allocate extra and then skb_reserve
-                * enough to bring it up to the right alignment.
-                */
-               len += 2047;
-       }
-
-       skb = __dev_alloc_skb(len, gfp_mask);
-       if (!skb) {
-               ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
-                             len);
-               goto bail;
-       }
-
-       skb_reserve(skb, 4);
-
-       if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               u32 una = (unsigned long)skb->data & 2047;
-               if (una)
-                       skb_reserve(skb, 2048 - una);
-       }
-
-bail:
-       return skb;
-}
-
-static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
-                            u32 eflags,
-                            u32 l,
-                            u32 etail,
-                            __le32 *rhf_addr,
-                            struct ipath_message_header *hdr)
-{
-       char emsg[128];
-
-       get_rhf_errstring(eflags, emsg, sizeof emsg);
-       ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
-                  "tlen=%x opcode=%x egridx=%x: %s\n",
-                  eflags, l,
-                  ipath_hdrget_rcv_type(rhf_addr),
-                  ipath_hdrget_length_in_bytes(rhf_addr),
-                  be32_to_cpu(hdr->bth[0]) >> 24,
-                  etail, emsg);
-
-       /* Count local link integrity errors. */
-       if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
-               u8 n = (dd->ipath_ibcctrl >>
-                       INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-                       INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-
-               if (++dd->ipath_lli_counter > n) {
-                       dd->ipath_lli_counter = 0;
-                       dd->ipath_lli_errors++;
-               }
-       }
-}
-
-/*
- * ipath_kreceive - receive a packet
- * @pd: the infinipath port
- *
- * called from interrupt handler for errors or receive interrupt
- */
-void ipath_kreceive(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       __le32 *rhf_addr;
-       void *ebuf;
-       const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
-       const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
-       u32 etail = -1, l, hdrqtail;
-       struct ipath_message_header *hdr;
-       u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
-       static u64 totcalls;    /* stats, may eventually remove */
-       int last;
-
-       l = pd->port_head;
-       rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
-       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-               u32 seq = ipath_hdrget_seq(rhf_addr);
-
-               if (seq != pd->port_seq_cnt)
-                       goto bail;
-               hdrqtail = 0;
-       } else {
-               hdrqtail = ipath_get_rcvhdrtail(pd);
-               if (l == hdrqtail)
-                       goto bail;
-               smp_rmb();
-       }
-
-reloop:
-       for (last = 0, i = 1; !last; i += !last) {
-               hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
-               eflags = ipath_hdrget_err_flags(rhf_addr);
-               etype = ipath_hdrget_rcv_type(rhf_addr);
-               /* total length */
-               tlen = ipath_hdrget_length_in_bytes(rhf_addr);
-               ebuf = NULL;
-               if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
-                   ipath_hdrget_use_egr_buf(rhf_addr) :
-                   (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
-                       /*
-                        * It turns out that the chip uses an eager buffer
-                        * for all non-expected packets, whether it "needs"
-                        * one or not.  So always get the index, but don't
-                        * set ebuf (so we try to copy data) unless the
-                        * length requires it.
-                        */
-                       etail = ipath_hdrget_index(rhf_addr);
-                       updegr = 1;
-                       if (tlen > sizeof(*hdr) ||
-                           etype == RCVHQ_RCV_TYPE_NON_KD)
-                               ebuf = ipath_get_egrbuf(dd, etail);
-               }
-
-               /*
-                * both tiderr and ipathhdrerr are set for all plain IB
-                * packets; only ipathhdrerr should be set.
-                */
-
-               if (etype != RCVHQ_RCV_TYPE_NON_KD &&
-                   etype != RCVHQ_RCV_TYPE_ERROR &&
-                   ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
-                   IPS_PROTO_VERSION)
-                       ipath_cdbg(PKT, "Bad InfiniPath protocol version "
-                                  "%x\n", etype);
-
-               if (unlikely(eflags))
-                       ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
-               else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
-                       ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
-                       if (dd->ipath_lli_counter)
-                               dd->ipath_lli_counter--;
-               } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
-                       u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
-                       u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
-                       ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
-                                  "qp=%x), len %x; ignored\n",
-                                  etype, opcode, qp, tlen);
-               } else if (etype == RCVHQ_RCV_TYPE_EXPECTED) {
-                       ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
-                                 be32_to_cpu(hdr->bth[0]) >> 24);
-               } else {
-                       /*
-                        * error packet, type of error unknown.
-                        * Probably type 3, but we don't know, so don't
-                        * even try to print the opcode, etc.
-                        * Usually caused by a "bad packet", that has no
-                        * BTH, when the LRH says it should.
-                        */
-                       ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
-                                 " %x, len %x hdrq+%x rhf: %Lx\n",
-                                 etail, tlen, l, (unsigned long long)
-                                 le64_to_cpu(*(__le64 *) rhf_addr));
-                       if (ipath_debug & __IPATH_ERRPKTDBG) {
-                               u32 j, *d, dw = rsize-2;
-                               if (rsize > (tlen>>2))
-                                       dw = tlen>>2;
-                               d = (u32 *)hdr;
-                               printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
-                                       dw);
-                               for (j = 0; j < dw; j++)
-                                       printk(KERN_DEBUG "%8x%s", d[j],
-                                               (j%8) == 7 ? "\n" : " ");
-                               printk(KERN_DEBUG ".\n");
-                       }
-               }
-               l += rsize;
-               if (l >= maxcnt)
-                       l = 0;
-               rhf_addr = (__le32 *) pd->port_rcvhdrq +
-                       l + dd->ipath_rhf_offset;
-               if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-                       u32 seq = ipath_hdrget_seq(rhf_addr);
-
-                       if (++pd->port_seq_cnt > 13)
-                               pd->port_seq_cnt = 1;
-                       if (seq != pd->port_seq_cnt)
-                               last = 1;
-               } else if (l == hdrqtail) {
-                       last = 1;
-               }
-               /*
-                * update head regs on last packet, and every 16 packets.
-                * Reduce bus traffic, while still trying to prevent
-                * rcvhdrq overflows, for when the queue is nearly full
-                */
-               if (last || !(i & 0xf)) {
-                       u64 lval = l;
-
-                       /* request IBA6120 and 7220 interrupt only on last */
-                       if (last)
-                               lval |= dd->ipath_rhdrhead_intr_off;
-                       ipath_write_ureg(dd, ur_rcvhdrhead, lval,
-                               pd->port_port);
-                       if (updegr) {
-                               ipath_write_ureg(dd, ur_rcvegrindexhead,
-                                                etail, pd->port_port);
-                               updegr = 0;
-                       }
-               }
-       }
-
-       if (!dd->ipath_rhdrhead_intr_off && !reloop &&
-           !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-               /* IBA6110 workaround; we can have a race clearing chip
-                * interrupt with another interrupt about to be delivered,
-                * and can clear it before it is delivered on the GPIO
-                * workaround.  By doing the extra check here for the
-                * in-memory tail register updating while we were doing
-                * earlier packets, we "almost" guarantee we have covered
-                * that case.
-                */
-               u32 hqtail = ipath_get_rcvhdrtail(pd);
-               if (hqtail != hdrqtail) {
-                       hdrqtail = hqtail;
-                       reloop = 1; /* loop 1 extra time at most */
-                       goto reloop;
-               }
-       }
-
-       pkttot += i;
-
-       pd->port_head = l;
-
-       if (pkttot > ipath_stats.sps_maxpkts_call)
-               ipath_stats.sps_maxpkts_call = pkttot;
-       ipath_stats.sps_port0pkts += pkttot;
-       ipath_stats.sps_avgpkts_call =
-               ipath_stats.sps_port0pkts / ++totcalls;
-
-bail:;
-}
-
-/**
- * ipath_update_pio_bufs - update shadow copy of the PIO availability map
- * @dd: the infinipath device
- *
- * called whenever our local copy indicates we have run out of send buffers
- * NOTE: This can be called from interrupt context by some code
- * and from non-interrupt context by ipath_getpiobuf().
- */
-
-static void ipath_update_pio_bufs(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int i;
-       const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
-
-       /* If the generation (check) bits have changed, then we update the
-        * busy bit for the corresponding PIO buffer.  This algorithm will
-        * modify positions to the value they already have in some cases
-        * (i.e., no change), but it's faster than changing only the bits
-        * that have changed.
-        *
-        * We would like to do this atomicly, to avoid spinlocks in the
-        * critical send path, but that's not really possible, given the
-        * type of changes, and that this routine could be called on
-        * multiple cpu's simultaneously, so we lock in this routine only,
-        * to avoid conflicting updates; all we change is the shadow, and
-        * it's a single 64 bit memory location, so by definition the update
-        * is atomic in terms of what other cpu's can see in testing the
-        * bits.  The spin_lock overhead isn't too bad, since it only
-        * happens when all buffers are in use, so only cpu overhead, not
-        * latency or bandwidth is affected.
-        */
-       if (!dd->ipath_pioavailregs_dma) {
-               ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
-               return;
-       }
-       if (ipath_debug & __IPATH_VERBDBG) {
-               /* only if packet debug and verbose */
-               volatile __le64 *dma = dd->ipath_pioavailregs_dma;
-               unsigned long *shadow = dd->ipath_pioavailshadow;
-
-               ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
-                          "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
-                          "s3=%lx\n",
-                          (unsigned long long) le64_to_cpu(dma[0]),
-                          shadow[0],
-                          (unsigned long long) le64_to_cpu(dma[1]),
-                          shadow[1],
-                          (unsigned long long) le64_to_cpu(dma[2]),
-                          shadow[2],
-                          (unsigned long long) le64_to_cpu(dma[3]),
-                          shadow[3]);
-               if (piobregs > 4)
-                       ipath_cdbg(
-                               PKT, "2nd group, dma4=%llx shad4=%lx, "
-                               "d5=%llx s5=%lx, d6=%llx s6=%lx, "
-                               "d7=%llx s7=%lx\n",
-                               (unsigned long long) le64_to_cpu(dma[4]),
-                               shadow[4],
-                               (unsigned long long) le64_to_cpu(dma[5]),
-                               shadow[5],
-                               (unsigned long long) le64_to_cpu(dma[6]),
-                               shadow[6],
-                               (unsigned long long) le64_to_cpu(dma[7]),
-                               shadow[7]);
-       }
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (i = 0; i < piobregs; i++) {
-               u64 pchbusy, pchg, piov, pnew;
-               /*
-                * Chip Errata: bug 6641; even and odd qwords>3 are swapped
-                */
-               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
-               else
-                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
-               pchg = dd->ipath_pioavailkernel[i] &
-                       ~(dd->ipath_pioavailshadow[i] ^ piov);
-               pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
-               if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
-                       pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
-                       pnew |= piov & pchbusy;
-                       dd->ipath_pioavailshadow[i] = pnew;
-               }
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/*
- * used to force update of pioavailshadow if we can't get a pio buffer.
- * Needed primarily due to exitting freeze mode after recovering
- * from errors.  Done lazily, because it's safer (known to not
- * be writing pio buffers).
- */
-static void ipath_reset_availshadow(struct ipath_devdata *dd)
-{
-       int i, im;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (i = 0; i < dd->ipath_pioavregs; i++) {
-               u64 val, oldval;
-               /* deal with 6110 chip bug on high register #s */
-               im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-                       i ^ 1 : i;
-               val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
-               /*
-                * busy out the buffers not in the kernel avail list,
-                * without changing the generation bits.
-                */
-               oldval = dd->ipath_pioavailshadow[i];
-               dd->ipath_pioavailshadow[i] = val |
-                       ((~dd->ipath_pioavailkernel[i] <<
-                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
-                       0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
-               if (oldval != dd->ipath_pioavailshadow[i])
-                       ipath_dbg("shadow[%d] was %Lx, now %lx\n",
-                               i, (unsigned long long) oldval,
-                               dd->ipath_pioavailshadow[i]);
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/**
- * ipath_setrcvhdrsize - set the receive header size
- * @dd: the infinipath device
- * @rhdrsize: the receive header size
- *
- * called from user init code, and also layered driver init
- */
-int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
-{
-       int ret = 0;
-
-       if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
-               if (dd->ipath_rcvhdrsize != rhdrsize) {
-                       dev_info(&dd->pcidev->dev,
-                                "Error: can't set protocol header "
-                                "size %u, already %u\n",
-                                rhdrsize, dd->ipath_rcvhdrsize);
-                       ret = -EAGAIN;
-               } else
-                       ipath_cdbg(VERBOSE, "Reuse same protocol header "
-                                  "size %u\n", dd->ipath_rcvhdrsize);
-       } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
-                              (sizeof(u64) / sizeof(u32)))) {
-               ipath_dbg("Error: can't set protocol header size %u "
-                         "(> max %u)\n", rhdrsize,
-                         dd->ipath_rcvhdrentsize -
-                         (u32) (sizeof(u64) / sizeof(u32)));
-               ret = -EOVERFLOW;
-       } else {
-               dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
-               dd->ipath_rcvhdrsize = rhdrsize;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-                                dd->ipath_rcvhdrsize);
-               ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
-                          dd->ipath_rcvhdrsize);
-       }
-       return ret;
-}
-
-/*
- * debugging code and stats updates if no pio buffers available.
- */
-static noinline void no_pio_bufs(struct ipath_devdata *dd)
-{
-       unsigned long *shadow = dd->ipath_pioavailshadow;
-       __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
-
-       dd->ipath_upd_pio_shadow = 1;
-
-       /*
-        * not atomic, but if we lose a stat count in a while, that's OK
-        */
-       ipath_stats.sps_nopiobufs++;
-       if (!(++dd->ipath_consec_nopiobuf % 100000)) {
-               ipath_force_pio_avail_update(dd); /* at start */
-               ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
-                       "%llx %llx %llx %llx\n"
-                       "ipath  shadow:  %lx %lx %lx %lx\n",
-                       dd->ipath_consec_nopiobuf,
-                       (unsigned long)get_cycles(),
-                       (unsigned long long) le64_to_cpu(dma[0]),
-                       (unsigned long long) le64_to_cpu(dma[1]),
-                       (unsigned long long) le64_to_cpu(dma[2]),
-                       (unsigned long long) le64_to_cpu(dma[3]),
-                       shadow[0], shadow[1], shadow[2], shadow[3]);
-               /*
-                * 4 buffers per byte, 4 registers above, cover rest
-                * below
-                */
-               if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
-                   (sizeof(shadow[0]) * 4 * 4))
-                       ipath_dbg("2nd group: dmacopy: "
-                                 "%llx %llx %llx %llx\n"
-                                 "ipath  shadow:  %lx %lx %lx %lx\n",
-                                 (unsigned long long)le64_to_cpu(dma[4]),
-                                 (unsigned long long)le64_to_cpu(dma[5]),
-                                 (unsigned long long)le64_to_cpu(dma[6]),
-                                 (unsigned long long)le64_to_cpu(dma[7]),
-                                 shadow[4], shadow[5], shadow[6], shadow[7]);
-
-               /* at end, so update likely happened */
-               ipath_reset_availshadow(dd);
-       }
-}
-
-/*
- * common code for normal driver pio buffer allocation, and reserved
- * allocation.
- *
- * do appropriate marking as busy, etc.
- * returns buffer number if one found (>=0), negative number is error.
- */
-static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
-       u32 *pbufnum, u32 first, u32 last, u32 firsti)
-{
-       int i, j, updated = 0;
-       unsigned piobcnt;
-       unsigned long flags;
-       unsigned long *shadow = dd->ipath_pioavailshadow;
-       u32 __iomem *buf;
-
-       piobcnt = last - first;
-       if (dd->ipath_upd_pio_shadow) {
-               /*
-                * Minor optimization.  If we had no buffers on last call,
-                * start out by doing the update; continue and do scan even
-                * if no buffers were updated, to be paranoid
-                */
-               ipath_update_pio_bufs(dd);
-               updated++;
-               i = first;
-       } else
-               i = firsti;
-rescan:
-       /*
-        * while test_and_set_bit() is atomic, we do that and then the
-        * change_bit(), and the pair is not.  See if this is the cause
-        * of the remaining armlaunch errors.
-        */
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (j = 0; j < piobcnt; j++, i++) {
-               if (i >= last)
-                       i = first;
-               if (__test_and_set_bit((2 * i) + 1, shadow))
-                       continue;
-               /* flip generation bit */
-               __change_bit(2 * i, shadow);
-               break;
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-       if (j == piobcnt) {
-               if (!updated) {
-                       /*
-                        * first time through; shadow exhausted, but may be
-                        * buffers available, try an update and then rescan.
-                        */
-                       ipath_update_pio_bufs(dd);
-                       updated++;
-                       i = first;
-                       goto rescan;
-               } else if (updated == 1 && piobcnt <=
-                       ((dd->ipath_sendctrl
-                       >> INFINIPATH_S_UPDTHRESH_SHIFT) &
-                       INFINIPATH_S_UPDTHRESH_MASK)) {
-                       /*
-                        * for chips supporting and using the update
-                        * threshold we need to force an update of the
-                        * in-memory copy if the count is less than the
-                        * thershold, then check one more time.
-                        */
-                       ipath_force_pio_avail_update(dd);
-                       ipath_update_pio_bufs(dd);
-                       updated++;
-                       i = first;
-                       goto rescan;
-               }
-
-               no_pio_bufs(dd);
-               buf = NULL;
-       } else {
-               if (i < dd->ipath_piobcnt2k)
-                       buf = (u32 __iomem *) (dd->ipath_pio2kbase +
-                                              i * dd->ipath_palign);
-               else
-                       buf = (u32 __iomem *)
-                               (dd->ipath_pio4kbase +
-                                (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
-               if (pbufnum)
-                       *pbufnum = i;
-       }
-
-       return buf;
-}
-
-/**
- * ipath_getpiobuf - find an available pio buffer
- * @dd: the infinipath device
- * @plen: the size of the PIO buffer needed in 32-bit words
- * @pbufnum: the buffer number is placed here
- */
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
-{
-       u32 __iomem *buf;
-       u32 pnum, nbufs;
-       u32 first, lasti;
-
-       if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
-               first = dd->ipath_piobcnt2k;
-               lasti = dd->ipath_lastpioindexl;
-       } else {
-               first = 0;
-               lasti = dd->ipath_lastpioindex;
-       }
-       nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
-
-       if (buf) {
-               /*
-                * Set next starting place.  It's just an optimization,
-                * it doesn't matter who wins on this, so no locking
-                */
-               if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-                       dd->ipath_lastpioindexl = pnum + 1;
-               else
-                       dd->ipath_lastpioindex = pnum + 1;
-               if (dd->ipath_upd_pio_shadow)
-                       dd->ipath_upd_pio_shadow = 0;
-               if (dd->ipath_consec_nopiobuf)
-                       dd->ipath_consec_nopiobuf = 0;
-               ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
-                          pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
-               if (pbufnum)
-                       *pbufnum = pnum;
-
-       }
-       return buf;
-}
-
-/**
- * ipath_chg_pioavailkernel - change which send buffers are available for kernel
- * @dd: the infinipath device
- * @start: the starting send buffer number
- * @len: the number of send buffers
- * @avail: true if the buffers are available for kernel use, false otherwise
- */
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-                             unsigned len, int avail)
-{
-       unsigned long flags;
-       unsigned end, cnt = 0;
-
-       /* There are two bits per send buffer (busy and generation) */
-       start *= 2;
-       end = start + len * 2;
-
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       /* Set or clear the busy bit in the shadow. */
-       while (start < end) {
-               if (avail) {
-                       unsigned long dma;
-                       int i, im;
-                       /*
-                        * the BUSY bit will never be set, because we disarm
-                        * the user buffers before we hand them back to the
-                        * kernel.  We do have to make sure the generation
-                        * bit is set correctly in shadow, since it could
-                        * have changed many times while allocated to user.
-                        * We can't use the bitmap functions on the full
-                        * dma array because it is always little-endian, so
-                        * we have to flip to host-order first.
-                        * BITS_PER_LONG is slightly wrong, since it's
-                        * always 64 bits per register in chip...
-                        * We only work on 64 bit kernels, so that's OK.
-                        */
-                       /* deal with 6110 chip bug on high register #s */
-                       i = start / BITS_PER_LONG;
-                       im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-                               i ^ 1 : i;
-                       __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
-                               + start, dd->ipath_pioavailshadow);
-                       dma = (unsigned long) le64_to_cpu(
-                               dd->ipath_pioavailregs_dma[im]);
-                       if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                               + start) % BITS_PER_LONG, &dma))
-                               __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                                       + start, dd->ipath_pioavailshadow);
-                       else
-                               __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                                       + start, dd->ipath_pioavailshadow);
-                       __set_bit(start, dd->ipath_pioavailkernel);
-               } else {
-                       __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
-                               dd->ipath_pioavailshadow);
-                       __clear_bit(start, dd->ipath_pioavailkernel);
-               }
-               start += 2;
-       }
-
-       if (dd->ipath_pioupd_thresh) {
-               end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-               cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-       /*
-        * When moving buffers from kernel to user, if number assigned to
-        * the user is less than the pio update threshold, and threshold
-        * is supported (cnt was computed > 0), drop the update threshold
-        * so we update at least once per allocated number of buffers.
-        * In any case, if the kernel buffers are less than the threshold,
-        * drop the threshold.  We don't bother increasing it, having once
-        * decreased it, since it would typically just cycle back and forth.
-        * If we don't decrease below buffers in use, we can wait a long
-        * time for an update, until some other context uses PIO buffers.
-        */
-       if (!avail && len < cnt)
-               cnt = len;
-       if (cnt < dd->ipath_pioupd_thresh) {
-               dd->ipath_pioupd_thresh = cnt;
-               ipath_dbg("Decreased pio update threshold to %u\n",
-                       dd->ipath_pioupd_thresh);
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
-                       << INFINIPATH_S_UPDTHRESH_SHIFT);
-               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-                       << INFINIPATH_S_UPDTHRESH_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-}
-
-/**
- * ipath_create_rcvhdrq - create a receive header queue
- * @dd: the infinipath device
- * @pd: the port data
- *
- * this must be contiguous memory (from an i/o perspective), and must be
- * DMA'able (which means for some systems, it will go through an IOMMU,
- * or be forced into a low address range).
- */
-int ipath_create_rcvhdrq(struct ipath_devdata *dd,
-                        struct ipath_portdata *pd)
-{
-       int ret = 0;
-
-       if (!pd->port_rcvhdrq) {
-               dma_addr_t phys_hdrqtail;
-               gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-               int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-                               sizeof(u32), PAGE_SIZE);
-
-               pd->port_rcvhdrq = dma_alloc_coherent(
-                       &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
-                       gfp_flags);
-
-               if (!pd->port_rcvhdrq) {
-                       ipath_dev_err(dd, "attempt to allocate %d bytes "
-                                     "for port %u rcvhdrq failed\n",
-                                     amt, pd->port_port);
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-                       pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
-                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
-                               GFP_KERNEL);
-                       if (!pd->port_rcvhdrtail_kvaddr) {
-                               ipath_dev_err(dd, "attempt to allocate 1 page "
-                                       "for port %u rcvhdrqtailaddr "
-                                       "failed\n", pd->port_port);
-                               ret = -ENOMEM;
-                               dma_free_coherent(&dd->pcidev->dev, amt,
-                                       pd->port_rcvhdrq,
-                                       pd->port_rcvhdrq_phys);
-                               pd->port_rcvhdrq = NULL;
-                               goto bail;
-                       }
-                       pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
-                       ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
-                                  "physical\n", pd->port_port,
-                                  (unsigned long long) phys_hdrqtail);
-               }
-
-               pd->port_rcvhdrq_size = amt;
-
-               ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
-                          "for port %u rcvhdr Q\n",
-                          amt >> PAGE_SHIFT, pd->port_rcvhdrq,
-                          (unsigned long) pd->port_rcvhdrq_phys,
-                          (unsigned long) pd->port_rcvhdrq_size,
-                          pd->port_port);
-       } else {
-               ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
-                          "hdrtailaddr@%p %llx physical\n",
-                          pd->port_port, pd->port_rcvhdrq,
-                          (unsigned long long) pd->port_rcvhdrq_phys,
-                          pd->port_rcvhdrtail_kvaddr, (unsigned long long)
-                          pd->port_rcvhdrqtailaddr_phys);
-       }
-       /* clear for security and sanity on each use */
-       memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
-       if (pd->port_rcvhdrtail_kvaddr)
-               memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
-
-       /*
-        * tell chip each time we init it, even if we are re-using previous
-        * memory (we zero the register at process close)
-        */
-       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
-                             pd->port_port, pd->port_rcvhdrqtailaddr_phys);
-       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-                             pd->port_port, pd->port_rcvhdrq_phys);
-
-bail:
-       return ret;
-}
-
-
-/*
- * Flush all sends that might be in the ready to send state, as well as any
- * that are in the process of being sent.   Used whenever we need to be
- * sure the send side is idle.  Cleans up all buffer state by canceling
- * all pio buffers, and issuing an abort, which cleans up anything in the
- * launch fifo.  The cancel is superfluous on some chip versions, but
- * it's safer to always do it.
- * PIOAvail bits are updated by the chip as if normal send had happened.
- */
-void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
-{
-       unsigned long flags;
-
-       if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
-               ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
-               goto bail;
-       }
-       /*
-        * If we have SDMA, and it's not disabled, we have to kick off the
-        * abort state machine, provided we aren't already aborting.
-        * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
-        * we skip the rest of this routine. It is already "in progress"
-        */
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
-               int skip_cancel;
-               unsigned long *statp = &dd->ipath_sdma_status;
-
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               skip_cancel =
-                       test_and_set_bit(IPATH_SDMA_ABORTING, statp)
-                       && !test_bit(IPATH_SDMA_DISABLED, statp);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-               if (skip_cancel)
-                       goto bail;
-       }
-
-       ipath_dbg("Cancelling all in-progress send buffers\n");
-
-       /* skip armlaunch errs for a while */
-       dd->ipath_lastcancel = jiffies + HZ / 2;
-
-       /*
-        * The abort bit is auto-clearing.  We also don't want pioavail
-        * update happening during this, and we don't want any other
-        * sends going out, so turn those off for the duration.  We read
-        * the scratch register to be sure that cancels and the abort
-        * have taken effect in the chip.  Otherwise two parts are same
-        * as ipath_force_pio_avail_update()
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
-               | INFINIPATH_S_PIOENABLE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-               dd->ipath_sendctrl | INFINIPATH_S_ABORT);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /* disarm all send buffers */
-       ipath_disarm_piobufs(dd, 0,
-               dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-
-       if (restore_sendctrl) {
-               /* else done by caller later if needed */
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
-                       INFINIPATH_S_PIOENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               /* and again, be sure all have hit the chip */
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-
-       if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
-           !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
-           test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               /* only wait so long for intr */
-               dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
-               dd->ipath_sdma_reset_wait = 200;
-               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       }
-bail:;
-}
-
-/*
- * Force an update of in-memory copy of the pioavail registers, when
- * needed for any of a variety of reasons.  We read the scratch register
- * to make it highly likely that the update will have happened by the
- * time we return.  If already off (as in cancel_sends above), this
- * routine is a nop, on the assumption that the caller will "do the
- * right thing".
- */
-void ipath_force_pio_avail_update(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
-                               int linitcmd)
-{
-       u64 mod_wd;
-       static const char *what[4] = {
-               [0] = "NOP",
-               [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
-               [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
-               [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
-       };
-
-       if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
-               /*
-                * If we are told to disable, note that so link-recovery
-                * code does not attempt to bring us back up.
-                */
-               preempt_disable();
-               dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-               preempt_enable();
-       } else if (linitcmd) {
-               /*
-                * Any other linkinitcmd will lead to LINKDOWN and then
-                * to INIT (if all is well), so clear flag to let
-                * link-recovery code attempt to bring us back up.
-                */
-               preempt_disable();
-               dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-               preempt_enable();
-       }
-
-       mod_wd = (linkcmd << dd->ibcc_lc_shift) |
-               (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
-       ipath_cdbg(VERBOSE,
-               "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
-               dd->ipath_unit, what[linkcmd], linitcmd,
-               ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
-                       ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                        dd->ipath_ibcctrl | mod_wd);
-       /* read from chip so write is flushed */
-       (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-}
-
-int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
-{
-       u32 lstate;
-       int ret;
-
-       switch (newstate) {
-       case IPATH_IB_LINKDOWN_ONLY:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_POLL);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN_SLEEP:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_SLEEP);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN_DISABLE:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKARM:
-               if (dd->ipath_flags & IPATH_LINKARMED) {
-                       ret = 0;
-                       goto bail;
-               }
-               if (!(dd->ipath_flags &
-                     (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
-
-               /*
-                * Since the port can transition to ACTIVE by receiving
-                * a non VL 15 packet, wait for either state.
-                */
-               lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
-               break;
-
-       case IPATH_IB_LINKACTIVE:
-               if (dd->ipath_flags & IPATH_LINKACTIVE) {
-                       ret = 0;
-                       goto bail;
-               }
-               if (!(dd->ipath_flags & IPATH_LINKARMED)) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
-               lstate = IPATH_LINKACTIVE;
-               break;
-
-       case IPATH_IB_LINK_LOOPBACK:
-               dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
-               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-
-               /* turn heartbeat off, as it causes loopback to fail */
-               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                                      IPATH_IB_HRTBT_OFF);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINK_EXTERNAL:
-               dev_info(&dd->pcidev->dev,
-                       "Disabling IB local loopback (normal)\n");
-               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                                      IPATH_IB_HRTBT_ON);
-               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       /*
-        * Heartbeat can be explicitly enabled by the user via
-        * "hrtbt_enable" "file", and if disabled, trying to enable here
-        * will have no effect.  Implicit changes (heartbeat off when
-        * loopback on, and vice versa) are included to ease testing.
-        */
-       case IPATH_IB_LINK_HRTBT:
-               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                       IPATH_IB_HRTBT_ON);
-               goto bail;
-
-       case IPATH_IB_LINK_NO_HRTBT:
-               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                       IPATH_IB_HRTBT_OFF);
-               goto bail;
-
-       default:
-               ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
-               ret = -EINVAL;
-               goto bail;
-       }
-       ret = ipath_wait_linkstate(dd, lstate, 2000);
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_set_mtu - set the MTU
- * @dd: the infinipath device
- * @arg: the new MTU
- *
- * we can handle "any" incoming size, the issue here is whether we
- * need to restrict our outgoing size.   For now, we don't do any
- * sanity checking on this, and we don't deal with what happens to
- * programs that are already running when the size changes.
- * NOTE: changing the MTU will usually cause the IBC to go back to
- * link INIT state...
- */
-int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
-{
-       u32 piosize;
-       int changed = 0;
-       int ret;
-
-       /*
-        * mtu is IB data payload max.  It's the largest power of 2 less
-        * than piosize (or even larger, since it only really controls the
-        * largest we can receive; we can send the max of the mtu and
-        * piosize).  We check that it's one of the valid IB sizes.
-        */
-       if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
-           (arg != 4096 || !ipath_mtu4096)) {
-               ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
-               ret = -EINVAL;
-               goto bail;
-       }
-       if (dd->ipath_ibmtu == arg) {
-               ret = 0;        /* same as current */
-               goto bail;
-       }
-
-       piosize = dd->ipath_ibmaxlen;
-       dd->ipath_ibmtu = arg;
-
-       if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
-               /* Only if it's not the initial value (or reset to it) */
-               if (piosize != dd->ipath_init_ibmaxlen) {
-                       if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
-                               piosize = dd->ipath_init_ibmaxlen;
-                       dd->ipath_ibmaxlen = piosize;
-                       changed = 1;
-               }
-       } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
-               piosize = arg + IPATH_PIO_MAXIBHDR;
-               ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
-                          "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
-                          arg);
-               dd->ipath_ibmaxlen = piosize;
-               changed = 1;
-       }
-
-       if (changed) {
-               u64 ibc = dd->ipath_ibcctrl, ibdw;
-               /*
-                * update our housekeeping variables, and set IBC max
-                * size, same as init code; max IBC is max we allow in
-                * buffer, less the qword pbc, plus 1 for ICRC, in dwords
-                */
-               dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
-               ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
-               ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
-                        dd->ibcc_mpl_shift);
-               ibc |= ibdw << dd->ibcc_mpl_shift;
-               dd->ipath_ibcctrl = ibc;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-               dd->ipath_f_tidtemplate(dd);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
-{
-       dd->ipath_lid = lid;
-       dd->ipath_lmc = lmc;
-
-       dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
-               (~((1U << lmc) - 1)) << 16);
-
-       dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
-
-       return 0;
-}
-
-
-/**
- * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
- * @dd: the infinipath device
- * @regno: the register number to write
- * @port: the port containing the register
- * @value: the value to write
- *
- * Registers that vary with the chip implementation constants (port)
- * use this routine.
- */
-void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
-                         unsigned port, u64 value)
-{
-       u16 where;
-
-       if (port < dd->ipath_portcnt &&
-           (regno == dd->ipath_kregs->kr_rcvhdraddr ||
-            regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
-               where = regno + port;
-       else
-               where = -1;
-
-       ipath_write_kreg(dd, where, value);
-}
-
-/*
- * Following deal with the "obviously simple" task of overriding the state
- * of the LEDS, which normally indicate link physical and logical status.
- * The complications arise in dealing with different hardware mappings
- * and the board-dependent routine being called from interrupts.
- * and then there's the requirement to _flash_ them.
- */
-#define LED_OVER_FREQ_SHIFT 8
-#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
-/* Below is "non-zero" to force override, but both actual LEDs are off */
-#define LED_OVER_BOTH_OFF (8)
-
-static void ipath_run_led_override(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-       int timeoff;
-       int pidx;
-       u64 lstate, ltstate, val;
-
-       if (!(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       pidx = dd->ipath_led_override_phase++ & 1;
-       dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
-       timeoff = dd->ipath_led_override_timeoff;
-
-       /*
-        * below potentially restores the LED values per current status,
-        * should also possibly setup the traffic-blink register,
-        * but leave that to per-chip functions.
-        */
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-       ltstate = ipath_ib_linktrstate(dd, val);
-       lstate = ipath_ib_linkstate(dd, val);
-
-       dd->ipath_f_setextled(dd, lstate, ltstate);
-       mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
-}
-
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
-{
-       int timeoff, freq;
-
-       if (!(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       /* First check if we are blinking. If not, use 1HZ polling */
-       timeoff = HZ;
-       freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
-
-       if (freq) {
-               /* For blink, set each phase from one nybble of val */
-               dd->ipath_led_override_vals[0] = val & 0xF;
-               dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
-               timeoff = (HZ << 4)/freq;
-       } else {
-               /* Non-blink set both phases the same. */
-               dd->ipath_led_override_vals[0] = val & 0xF;
-               dd->ipath_led_override_vals[1] = val & 0xF;
-       }
-       dd->ipath_led_override_timeoff = timeoff;
-
-       /*
-        * If the timer has not already been started, do so. Use a "quick"
-        * timeout so the function will be called soon, to look at our request.
-        */
-       if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
-               /* Need to start timer */
-               setup_timer(&dd->ipath_led_override_timer,
-                               ipath_run_led_override, (unsigned long)dd);
-
-               dd->ipath_led_override_timer.expires = jiffies + 1;
-               add_timer(&dd->ipath_led_override_timer);
-       } else
-               atomic_dec(&dd->ipath_led_override_timer_active);
-}
-
-/**
- * ipath_shutdown_device - shut down a device
- * @dd: the infinipath device
- *
- * This is called to make the device quiet when we are about to
- * unload the driver, and also when the device is administratively
- * disabled.   It does not free any data structures.
- * Everything it does has to be setup again by ipath_init_chip(dd,1)
- */
-void ipath_shutdown_device(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       ipath_dbg("Shutting down the device\n");
-
-       ipath_hol_up(dd); /* make sure user processes aren't suspended */
-
-       dd->ipath_flags |= IPATH_LINKUNK;
-       dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
-                            IPATH_LINKINIT | IPATH_LINKARMED |
-                            IPATH_LINKACTIVE);
-       *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
-                               IPATH_STATUS_IB_READY);
-
-       /* mask interrupts, but not errors */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-       dd->ipath_rcvctrl = 0;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               teardown_sdma(dd);
-
-       /*
-        * gracefully stop all sends allowing any in progress to trickle out
-        * first.
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl = 0;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       /* flush it */
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /*
-        * enough for anything that's going to trickle out to have actually
-        * done so.
-        */
-       udelay(5);
-
-       dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
-
-       ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-       ipath_cancel_sends(dd, 0);
-
-       /*
-        * we are shutting down, so tell components that care.  We don't do
-        * this on just a link state change, much like ethernet, a cable
-        * unplug, etc. doesn't change driver state
-        */
-       signal_ib_event(dd, IB_EVENT_PORT_ERR);
-
-       /* disable IBC */
-       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control | INFINIPATH_C_FREEZEMODE);
-
-       /*
-        * clear SerdesEnable and turn the leds off; do this here because
-        * we are unloading, so don't count on interrupts to move along
-        * Turn the LEDs off explicitly for the same reason.
-        */
-       dd->ipath_f_quiet_serdes(dd);
-
-       /* stop all the timers that might still be running */
-       del_timer_sync(&dd->ipath_hol_timer);
-       if (dd->ipath_stats_timer_active) {
-               del_timer_sync(&dd->ipath_stats_timer);
-               dd->ipath_stats_timer_active = 0;
-       }
-       if (dd->ipath_intrchk_timer.data) {
-               del_timer_sync(&dd->ipath_intrchk_timer);
-               dd->ipath_intrchk_timer.data = 0;
-       }
-       if (atomic_read(&dd->ipath_led_override_timer_active)) {
-               del_timer_sync(&dd->ipath_led_override_timer);
-               atomic_set(&dd->ipath_led_override_timer_active, 0);
-       }
-
-       /*
-        * clear all interrupts and errors, so that the next time the driver
-        * is loaded or device is enabled, we know that whatever is set
-        * happened while we were unloaded
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-       ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
-       ipath_update_eeprom_log(dd);
-}
-
-/**
- * ipath_free_pddata - free a port's allocated data
- * @dd: the infinipath device
- * @pd: the portdata structure
- *
- * free up any allocated data for a port
- * This should not touch anything that would affect a simultaneous
- * re-allocation of port data, because it is called after ipath_mutex
- * is released (and can be called from reinit as well).
- * It should never change any chip state, or global driver state.
- * (The only exception to global state is freeing the port0 port0_skbs.)
- */
-void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
-{
-       if (!pd)
-               return;
-
-       if (pd->port_rcvhdrq) {
-               ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
-                          "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
-                          (unsigned long) pd->port_rcvhdrq_size);
-               dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
-                                 pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
-               pd->port_rcvhdrq = NULL;
-               if (pd->port_rcvhdrtail_kvaddr) {
-                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                        pd->port_rcvhdrtail_kvaddr,
-                                        pd->port_rcvhdrqtailaddr_phys);
-                       pd->port_rcvhdrtail_kvaddr = NULL;
-               }
-       }
-       if (pd->port_port && pd->port_rcvegrbuf) {
-               unsigned e;
-
-               for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-                       void *base = pd->port_rcvegrbuf[e];
-                       size_t size = pd->port_rcvegrbuf_size;
-
-                       ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
-                                  "chunk %u/%u\n", base,
-                                  (unsigned long) size,
-                                  e, pd->port_rcvegrbuf_chunks);
-                       dma_free_coherent(&dd->pcidev->dev, size,
-                               base, pd->port_rcvegrbuf_phys[e]);
-               }
-               kfree(pd->port_rcvegrbuf);
-               pd->port_rcvegrbuf = NULL;
-               kfree(pd->port_rcvegrbuf_phys);
-               pd->port_rcvegrbuf_phys = NULL;
-               pd->port_rcvegrbuf_chunks = 0;
-       } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
-               unsigned e;
-               struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
-
-               dd->ipath_port0_skbinfo = NULL;
-               ipath_cdbg(VERBOSE, "free closed port %d "
-                          "ipath_port0_skbinfo @ %p\n", pd->port_port,
-                          skbinfo);
-               for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
-                       if (skbinfo[e].skb) {
-                               pci_unmap_single(dd->pcidev, skbinfo[e].phys,
-                                                dd->ipath_ibmaxlen,
-                                                PCI_DMA_FROMDEVICE);
-                               dev_kfree_skb(skbinfo[e].skb);
-                       }
-               vfree(skbinfo);
-       }
-       kfree(pd->port_tid_pg_list);
-       vfree(pd->subport_uregbase);
-       vfree(pd->subport_rcvegrbuf);
-       vfree(pd->subport_rcvhdr_base);
-       kfree(pd);
-}
-
-static int __init infinipath_init(void)
-{
-       int ret;
-
-       if (ipath_debug & __IPATH_DBG)
-               printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
-
-       /*
-        * These must be called before the driver is registered with
-        * the PCI subsystem.
-        */
-       idr_init(&unit_table);
-
-       ret = pci_register_driver(&ipath_driver);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Unable to register driver: error %d\n", -ret);
-               goto bail_unit;
-       }
-
-       ret = ipath_init_ipathfs();
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
-                      "ipathfs: error %d\n", -ret);
-               goto bail_pci;
-       }
-
-       goto bail;
-
-bail_pci:
-       pci_unregister_driver(&ipath_driver);
-
-bail_unit:
-       idr_destroy(&unit_table);
-
-bail:
-       return ret;
-}
-
-static void __exit infinipath_cleanup(void)
-{
-       ipath_exit_ipathfs();
-
-       ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
-       pci_unregister_driver(&ipath_driver);
-
-       idr_destroy(&unit_table);
-}
-
-/**
- * ipath_reset_device - reset the chip if possible
- * @unit: the device to reset
- *
- * Whether or not reset is successful, we attempt to re-initialize the chip
- * (that is, much like a driver unload/reload).  We clear the INITTED flag
- * so that the various entry points will fail until we reinitialize.  For
- * now, we only allow this if no user ports are open that use chip resources
- */
-int ipath_reset_device(int unit)
-{
-       int ret, i;
-       struct ipath_devdata *dd = ipath_lookup(unit);
-       unsigned long flags;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (atomic_read(&dd->ipath_led_override_timer_active)) {
-               /* Need to stop LED timer, _then_ shut off LEDs */
-               del_timer_sync(&dd->ipath_led_override_timer);
-               atomic_set(&dd->ipath_led_override_timer_active, 0);
-       }
-
-       /* Shut off LEDs after we are sure timer is not running */
-       dd->ipath_led_override = LED_OVER_BOTH_OFF;
-       dd->ipath_f_setextled(dd, 0, 0);
-
-       dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
-
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
-               dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
-                        "not initialized or not present\n", unit);
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       if (dd->ipath_pd)
-               for (i = 1; i < dd->ipath_cfgports; i++) {
-                       if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-                               continue;
-                       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-                       ipath_dbg("unit %u port %d is in use "
-                                 "(PID %u cmd %s), can't reset\n",
-                                 unit, i,
-                                 pid_nr(dd->ipath_pd[i]->port_pid),
-                                 dd->ipath_pd[i]->port_comm);
-                       ret = -EBUSY;
-                       goto bail;
-               }
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               teardown_sdma(dd);
-
-       dd->ipath_flags &= ~IPATH_INITTED;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-       ret = dd->ipath_f_reset(dd);
-       if (ret == 1) {
-               ipath_dbg("Reinitializing unit %u after reset attempt\n",
-                         unit);
-               ret = ipath_init_chip(dd, 1);
-       } else
-               ret = -EAGAIN;
-       if (ret)
-               ipath_dev_err(dd, "Reinitialize unit %u after "
-                             "reset failed with %d\n", unit, ret);
-       else
-               dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
-                        "resetting\n", unit);
-
-bail:
-       return ret;
-}
-
-/*
- * send a signal to all the processes that have the driver open
- * through the normal interfaces (i.e., everything other than diags
- * interface).  Returns number of signalled processes.
- */
-static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
-{
-       int i, sub, any = 0;
-       struct pid *pid;
-       unsigned long flags;
-
-       if (!dd->ipath_pd)
-               return 0;
-
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-                       continue;
-               pid = dd->ipath_pd[i]->port_pid;
-               if (!pid)
-                       continue;
-
-               dev_info(&dd->pcidev->dev, "context %d in use "
-                         "(PID %u), sending signal %d\n",
-                         i, pid_nr(pid), sig);
-               kill_pid(pid, sig, 1);
-               any++;
-               for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
-                       pid = dd->ipath_pd[i]->port_subpid[sub];
-                       if (!pid)
-                               continue;
-                       dev_info(&dd->pcidev->dev, "sub-context "
-                               "%d:%d in use (PID %u), sending "
-                               "signal %d\n", i, sub, pid_nr(pid), sig);
-                       kill_pid(pid, sig, 1);
-                       any++;
-               }
-       }
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-       return any;
-}
-
-static void ipath_hol_signal_down(struct ipath_devdata *dd)
-{
-       if (ipath_signal_procs(dd, SIGSTOP))
-               ipath_dbg("Stopped some processes\n");
-       ipath_cancel_sends(dd, 1);
-}
-
-
-static void ipath_hol_signal_up(struct ipath_devdata *dd)
-{
-       if (ipath_signal_procs(dd, SIGCONT))
-               ipath_dbg("Continued some processes\n");
-}
-
-/*
- * link is down, stop any users processes, and flush pending sends
- * to prevent HoL blocking, then start the HoL timer that
- * periodically continues, then stop procs, so they can detect
- * link down if they want, and do something about it.
- * Timer may already be running, so use mod_timer, not add_timer.
- */
-void ipath_hol_down(struct ipath_devdata *dd)
-{
-       dd->ipath_hol_state = IPATH_HOL_DOWN;
-       ipath_hol_signal_down(dd);
-       dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-       dd->ipath_hol_timer.expires = jiffies +
-               msecs_to_jiffies(ipath_hol_timeout_ms);
-       mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
-}
-
-/*
- * link is up, continue any user processes, and ensure timer
- * is a nop, if running.  Let timer keep running, if set; it
- * will nop when it sees the link is up
- */
-void ipath_hol_up(struct ipath_devdata *dd)
-{
-       ipath_hol_signal_up(dd);
-       dd->ipath_hol_state = IPATH_HOL_UP;
-}
-
-/*
- * toggle the running/not running state of user proceses
- * to prevent HoL blocking on chip resources, but still allow
- * user processes to do link down special case handling.
- * Should only be called via the timer
- */
-void ipath_hol_event(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
-               && dd->ipath_hol_state != IPATH_HOL_UP) {
-               dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-               ipath_dbg("Stopping processes\n");
-               ipath_hol_signal_down(dd);
-       } else { /* may do "extra" if also in ipath_hol_up() */
-               dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
-               ipath_dbg("Continuing processes\n");
-               ipath_hol_signal_up(dd);
-       }
-       if (dd->ipath_hol_state == IPATH_HOL_UP)
-               ipath_dbg("link's up, don't resched timer\n");
-       else {
-               dd->ipath_hol_timer.expires = jiffies +
-                       msecs_to_jiffies(ipath_hol_timeout_ms);
-               mod_timer(&dd->ipath_hol_timer,
-                       dd->ipath_hol_timer.expires);
-       }
-}
-
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
-{
-       u64 val;
-
-       if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
-               return -1;
-       if (dd->ipath_rx_pol_inv != new_pol_inv) {
-               dd->ipath_rx_pol_inv = new_pol_inv;
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-                        INFINIPATH_XGXS_RX_POL_SHIFT);
-               val |= ((u64)dd->ipath_rx_pol_inv) <<
-                       INFINIPATH_XGXS_RX_POL_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-       }
-       return 0;
-}
-
-/*
- * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
- * the 7220, which is count-based, rather than trigger-based.  Safe for the
- * driver check, since it's at init.   Not completely safe when used for
- * user-mode checking, since some error checking can be lost, but not
- * particularly risky, and only has problematic side-effects in the face of
- * very buggy user code.  There is no reference counting, but that's also
- * fine, given the intended use.
- */
-void ipath_enable_armlaunch(struct ipath_devdata *dd)
-{
-       dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-               INFINIPATH_E_SPIOARMLAUNCH);
-       dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-}
-
-void ipath_disable_armlaunch(struct ipath_devdata *dd)
-{
-       /* so don't re-enable if already set */
-       dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-}
-
-module_init(infinipath_init);
-module_exit(infinipath_cleanup);
diff --git a/drivers/staging/rdma/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c
deleted file mode 100644 (file)
index ef84107..0000000
+++ /dev/null
@@ -1,1183 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-
-/*
- * InfiniPath I2C driver for a serial eeprom.  This is not a generic
- * I2C interface.  For a start, the device we're using (Atmel AT24C11)
- * doesn't work like a regular I2C device.  It looks like one
- * electrically, but not logically.  Normal I2C devices have a single
- * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
- * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
- * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
- * call" address.)  The Atmel device, on the other hand, responds to ALL
- * 7-bit addresses.  It's designed to be the only device on a given I2C
- * bus.  A 7-bit address corresponds to the memory address within the
- * Atmel device itself.
- *
- * Also, the timing requirements mean more than simple software
- * bitbanging, with readbacks from chip to ensure timing (simple udelay
- * is not enough).
- *
- * This all means that accessing the device is specialized enough
- * that using the standard kernel I2C bitbanging interface would be
- * impossible.  For example, the core I2C eeprom driver expects to find
- * a device at one or more of a limited set of addresses only.  It doesn't
- * allow writing to an eeprom.  It also doesn't provide any means of
- * accessing eeprom contents from within the kernel, only via sysfs.
- */
-
-/* Added functionality for IBA7220-based cards */
-#define IPATH_EEPROM_DEV_V1 0xA0
-#define IPATH_EEPROM_DEV_V2 0xA2
-#define IPATH_TEMP_DEV 0x98
-#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
-#define IPATH_NO_DEV (0xFF)
-
-/*
- * The number of I2C chains is proliferating. Table below brings
- * some order to the madness. The basic principle is that the
- * table is scanned from the top, and a "probe" is made to the
- * device probe_dev. If that succeeds, the chain is considered
- * to be of that type, and dd->i2c_chain_type is set to the index+1
- * of the entry.
- * The +1 is so static initialization can mean "unknown, do probe."
- */
-static struct i2c_chain_desc {
-       u8 probe_dev;   /* If seen at probe, chain is this type */
-       u8 eeprom_dev;  /* Dev addr (if any) for EEPROM */
-       u8 temp_dev;    /* Dev Addr (if any) for Temp-sense */
-} i2c_chains[] = {
-       { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
-       { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
-       { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
-       { IPATH_NO_DEV }
-};
-
-enum i2c_type {
-       i2c_line_scl = 0,
-       i2c_line_sda
-};
-
-enum i2c_state {
-       i2c_line_low = 0,
-       i2c_line_high
-};
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_gpio_set - set a GPIO line
- * @dd: the infinipath device
- * @line: the line to set
- * @new_line_state: the state to set
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.
- */
-static int i2c_gpio_set(struct ipath_devdata *dd,
-                       enum i2c_type line,
-                       enum i2c_state new_line_state)
-{
-       u64 out_mask, dir_mask, *gpioval;
-       unsigned long flags = 0;
-
-       gpioval = &dd->ipath_gpio_out;
-
-       if (line == i2c_line_scl) {
-               dir_mask = dd->ipath_gpio_scl;
-               out_mask = (1UL << dd->ipath_gpio_scl_num);
-       } else {
-               dir_mask = dd->ipath_gpio_sda;
-               out_mask = (1UL << dd->ipath_gpio_sda_num);
-       }
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       if (new_line_state == i2c_line_high) {
-               /* tri-state the output rather than force high */
-               dd->ipath_extctrl &= ~dir_mask;
-       } else {
-               /* config line to be an output */
-               dd->ipath_extctrl |= dir_mask;
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-
-       /* set output as well (no real verify) */
-       if (new_line_state == i2c_line_high)
-               *gpioval |= out_mask;
-       else
-               *gpioval &= ~out_mask;
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       return 0;
-}
-
-/**
- * i2c_gpio_get - get a GPIO line state
- * @dd: the infinipath device
- * @line: the line to get
- * @curr_statep: where to put the line state
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.  curr_state is not set on error.
- */
-static int i2c_gpio_get(struct ipath_devdata *dd,
-                       enum i2c_type line,
-                       enum i2c_state *curr_statep)
-{
-       u64 read_val, mask;
-       int ret;
-       unsigned long flags = 0;
-
-       /* check args */
-       if (curr_statep == NULL) {
-               ret = 1;
-               goto bail;
-       }
-
-       /* config line to be an input */
-       if (line == i2c_line_scl)
-               mask = dd->ipath_gpio_scl;
-       else
-               mask = dd->ipath_gpio_sda;
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       dd->ipath_extctrl &= ~mask;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-       /*
-        * Below is very unlikely to reflect true input state if Output
-        * Enable actually changed.
-        */
-       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       if (read_val & mask)
-               *curr_statep = i2c_line_high;
-       else
-               *curr_statep = i2c_line_low;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the infinipath device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct ipath_devdata *dd)
-{
-       (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-       rmb();
-}
-
-static void scl_out(struct ipath_devdata *dd, u8 bit)
-{
-       udelay(1);
-       i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
-
-       i2c_wait_for_writes(dd);
-}
-
-static void sda_out(struct ipath_devdata *dd, u8 bit)
-{
-       i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
-
-       i2c_wait_for_writes(dd);
-}
-
-static u8 sda_in(struct ipath_devdata *dd, int wait)
-{
-       enum i2c_state bit;
-
-       if (i2c_gpio_get(dd, i2c_line_sda, &bit))
-               ipath_dbg("get bit failed!\n");
-
-       if (wait)
-               i2c_wait_for_writes(dd);
-
-       return bit == i2c_line_high ? 1U : 0;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the infinipath device
- */
-static int i2c_ackrcv(struct ipath_devdata *dd)
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, 1);
-       scl_out(dd, i2c_line_high);
-       ack_received = sda_in(dd, 1) == 0;
-       scl_out(dd, i2c_line_low);
-       return ack_received;
-}
-
-/**
- * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
- * @dd: the infinipath device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct ipath_devdata *dd)
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-               data <<= 1;
-               scl_out(dd, i2c_line_high);
-               data |= sda_in(dd, 0);
-               scl_out(dd, i2c_line_low);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the infinipath device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct ipath_devdata *dd, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, bit);
-               scl_out(dd, i2c_line_high);
-               scl_out(dd, i2c_line_low);
-       }
-       return (!i2c_ackrcv(dd)) ? 1 : 0;
-}
-
-static void send_ack(struct ipath_devdata *dd)
-{
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_high);
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_high);
-}
-
-/**
- * i2c_startcmd - transmit the start condition, followed by address/cmd
- * @dd: the infinipath device
- * @offset_dir: direction byte
- *
- *      (both clock/data high, clock high, data low while clock is high)
- */
-static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
-{
-       int res;
-
-       /* issue start sequence */
-       sda_out(dd, i2c_line_high);
-       scl_out(dd, i2c_line_high);
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_low);
-
-       /* issue length and direction byte */
-       res = wr_byte(dd, offset_dir);
-
-       if (res)
-               ipath_cdbg(VERBOSE, "No ack to complete start\n");
-
-       return res;
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the infinipath device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct ipath_devdata *dd)
-{
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_high);
-       sda_out(dd, i2c_line_high);
-       udelay(2);
-}
-
-/**
- * eeprom_reset - reset I2C communication
- * @dd: the infinipath device
- */
-
-static int eeprom_reset(struct ipath_devdata *dd)
-{
-       int clock_cycles_left = 9;
-       u64 *gpioval = &dd->ipath_gpio_out;
-       int ret;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       /* Make sure shadows are consistent */
-       dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
-       *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
-                  "is %llx\n", (unsigned long long) *gpioval);
-
-       /*
-        * This is to get the i2c into a known state, by first going low,
-        * then tristate sda (and then tristate scl as first thing
-        * in loop)
-        */
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_high);
-
-       /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
-       while (clock_cycles_left--) {
-               scl_out(dd, i2c_line_high);
-
-               /* SDA seen high, issue START by dropping it while SCL high */
-               if (sda_in(dd, 0)) {
-                       sda_out(dd, i2c_line_low);
-                       scl_out(dd, i2c_line_low);
-                       /* ATMEL spec says must be followed by STOP. */
-                       scl_out(dd, i2c_line_high);
-                       sda_out(dd, i2c_line_high);
-                       ret = 0;
-                       goto bail;
-               }
-
-               scl_out(dd, i2c_line_low);
-       }
-
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/*
- * Probe for I2C device at specified address. Returns 0 for "success"
- * to match rest of this file.
- * Leave bus in "reasonable" state for further commands.
- */
-static int i2c_probe(struct ipath_devdata *dd, int devaddr)
-{
-       int ret;
-
-       ret = eeprom_reset(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
-                             devaddr);
-               return ret;
-       }
-       /*
-        * Reset no longer leaves bus in start condition, so normal
-        * i2c_startcmd() will do.
-        */
-       ret = i2c_startcmd(dd, devaddr | READ_CMD);
-       if (ret)
-               ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
-                          devaddr);
-       else {
-               /*
-                * Device did respond. Complete a single-byte read, because some
-                * devices apparently cannot handle STOP immediately after they
-                * ACK the start-cmd.
-                */
-               int data;
-               data = rd_byte(dd);
-               stop_cmd(dd);
-               ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
-       }
-       return ret;
-}
-
-/*
- * Returns the "i2c type". This is a pointer to a struct that describes
- * the I2C chain on this board. To minimize impact on struct ipath_devdata,
- * the (small integer) index into the table is actually memoized, rather
- * then the pointer.
- * Memoization is because the type is determined on the first call per chip.
- * An alternative would be to move type determination to early
- * init code.
- */
-static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
-{
-       int idx;
-
-       /* Get memoized index, from previous successful probes */
-       idx = dd->ipath_i2c_chain_type - 1;
-       if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
-               goto done;
-
-       idx = 0;
-       while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
-               /* if probe succeeds, this is type */
-               if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
-                       break;
-               ++idx;
-       }
-
-       /*
-        * Old EEPROM (first entry) may require a reset after probe,
-        * rather than being able to "start" after "stop"
-        */
-       if (idx == 0)
-               eeprom_reset(dd);
-
-       if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
-               idx = -1;
-       else
-               dd->ipath_i2c_chain_type = idx + 1;
-done:
-       return (idx >= 0) ? i2c_chains + idx : NULL;
-}
-
-static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
-                                       u8 eeprom_offset, void *buffer, int len)
-{
-       int ret;
-       struct i2c_chain_desc *icd;
-       u8 *bp = buffer;
-
-       ret = 1;
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->eeprom_dev == IPATH_NO_DEV) {
-               /* legacy not-really-I2C */
-               ipath_cdbg(VERBOSE, "Start command only address\n");
-               eeprom_offset = (eeprom_offset << 1) | READ_CMD;
-               ret = i2c_startcmd(dd, eeprom_offset);
-       } else {
-               /* Actual I2C */
-               ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
-               if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-                       ipath_dbg("Failed EEPROM startcmd\n");
-                       stop_cmd(dd);
-                       ret = 1;
-                       goto bail;
-               }
-               ret = wr_byte(dd, eeprom_offset);
-               stop_cmd(dd);
-               if (ret) {
-                       ipath_dev_err(dd, "Failed to write EEPROM address\n");
-                       ret = 1;
-                       goto bail;
-               }
-               ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
-       }
-       if (ret) {
-               ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
-               stop_cmd(dd);
-               ret = 1;
-               goto bail;
-       }
-
-       /*
-        * eeprom keeps clocking data out as long as we ack, automatically
-        * incrementing the address.
-        */
-       while (len-- > 0) {
-               /* get and store data */
-               *bp++ = rd_byte(dd);
-               /* send ack if not the last byte */
-               if (len)
-                       send_ack(dd);
-       }
-
-       stop_cmd(dd);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
-                                      const void *buffer, int len)
-{
-       int sub_len;
-       const u8 *bp = buffer;
-       int max_wait_time, i;
-       int ret;
-       struct i2c_chain_desc *icd;
-
-       ret = 1;
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       while (len > 0) {
-               if (icd->eeprom_dev == IPATH_NO_DEV) {
-                       if (i2c_startcmd(dd,
-                                        (eeprom_offset << 1) | WRITE_CMD)) {
-                               ipath_dbg("Failed to start cmd offset %u\n",
-                                       eeprom_offset);
-                               goto failed_write;
-                       }
-               } else {
-                       /* Real I2C */
-                       if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-                               ipath_dbg("Failed EEPROM startcmd\n");
-                               goto failed_write;
-                       }
-                       ret = wr_byte(dd, eeprom_offset);
-                       if (ret) {
-                               ipath_dev_err(dd, "Failed to write EEPROM "
-                                             "address\n");
-                               goto failed_write;
-                       }
-               }
-
-               sub_len = min(len, 4);
-               eeprom_offset += sub_len;
-               len -= sub_len;
-
-               for (i = 0; i < sub_len; i++) {
-                       if (wr_byte(dd, *bp++)) {
-                               ipath_dbg("no ack after byte %u/%u (%u "
-                                         "total remain)\n", i, sub_len,
-                                         len + sub_len - i);
-                               goto failed_write;
-                       }
-               }
-
-               stop_cmd(dd);
-
-               /*
-                * wait for write complete by waiting for a successful
-                * read (the chip replies with a zero after the write
-                * cmd completes, and before it writes to the eeprom.
-                * The startcmd for the read will fail the ack until
-                * the writes have completed.   We do this inline to avoid
-                * the debug prints that are in the real read routine
-                * if the startcmd fails.
-                * We also use the proper device address, so it doesn't matter
-                * whether we have real eeprom_dev. legacy likes any address.
-                */
-               max_wait_time = 100;
-               while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
-                       stop_cmd(dd);
-                       if (!--max_wait_time) {
-                               ipath_dbg("Did not get successful read to "
-                                         "complete write\n");
-                               goto failed_write;
-                       }
-               }
-               /* now read (and ignore) the resulting byte */
-               rd_byte(dd);
-               stop_cmd(dd);
-       }
-
-       ret = 0;
-       goto bail;
-
-failed_write:
-       stop_cmd(dd);
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_eeprom_read - receives bytes from the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: address to read from
- * @buffer: where to store result
- * @len: number of bytes to receive
- */
-int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
-                       void *buff, int len)
-{
-       int ret;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       return ret;
-}
-
-/**
- * ipath_eeprom_write - writes data to the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: where to place data
- * @buffer: data to write
- * @len: number of bytes to write
- */
-int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
-                       const void *buff, int len)
-{
-       int ret;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       return ret;
-}
-
-static u8 flash_csum(struct ipath_flash *ifp, int adjust)
-{
-       u8 *ip = (u8 *) ifp;
-       u8 csum = 0, len;
-
-       /*
-        * Limit length checksummed to max length of actual data.
-        * Checksum of erased eeprom will still be bad, but we avoid
-        * reading past the end of the buffer we were passed.
-        */
-       len = ifp->if_length;
-       if (len > sizeof(struct ipath_flash))
-               len = sizeof(struct ipath_flash);
-       while (len--)
-               csum += *ip++;
-       csum -= ifp->if_csum;
-       csum = ~csum;
-       if (adjust)
-               ifp->if_csum = csum;
-
-       return csum;
-}
-
-/**
- * ipath_get_guid - get the GUID from the i2c device
- * @dd: the infinipath device
- *
- * We have the capability to use the ipath_nguid field, and get
- * the guid from the first chip's flash, to use for all of them.
- */
-void ipath_get_eeprom_info(struct ipath_devdata *dd)
-{
-       void *buf;
-       struct ipath_flash *ifp;
-       __be64 guid;
-       int len, eep_stat;
-       u8 csum, *bguid;
-       int t = dd->ipath_unit;
-       struct ipath_devdata *dd0 = ipath_lookup(0);
-
-       if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
-               u8 oguid;
-               dd->ipath_guid = dd0->ipath_guid;
-               bguid = (u8 *) & dd->ipath_guid;
-
-               oguid = bguid[7];
-               bguid[7] += t;
-               if (oguid > bguid[7]) {
-                       if (bguid[6] == 0xff) {
-                               if (bguid[5] == 0xff) {
-                                       ipath_dev_err(
-                                               dd,
-                                               "Can't set %s GUID from "
-                                               "base, wraps to OUI!\n",
-                                               ipath_get_unit_name(t));
-                                       dd->ipath_guid = 0;
-                                       goto bail;
-                               }
-                               bguid[5]++;
-                       }
-                       bguid[6]++;
-               }
-               dd->ipath_nguid = 1;
-
-               ipath_dbg("nguid %u, so adding %u to device 0 guid, "
-                         "for %llx\n",
-                         dd0->ipath_nguid, t,
-                         (unsigned long long) be64_to_cpu(dd->ipath_guid));
-               goto bail;
-       }
-
-       /*
-        * read full flash, not just currently used part, since it may have
-        * been written with a newer definition
-        * */
-       len = sizeof(struct ipath_flash);
-       buf = vmalloc(len);
-       if (!buf) {
-               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-                             "bytes from eeprom for GUID\n", len);
-               goto bail;
-       }
-
-       mutex_lock(&dd->ipath_eep_lock);
-       eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
-       mutex_unlock(&dd->ipath_eep_lock);
-
-       if (eep_stat) {
-               ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
-               goto done;
-       }
-       ifp = (struct ipath_flash *)buf;
-
-       csum = flash_csum(ifp, 0);
-       if (csum != ifp->if_csum) {
-               dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
-                        "0x%x, not 0x%x\n", csum, ifp->if_csum);
-               goto done;
-       }
-       if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
-           *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
-               ipath_dev_err(dd, "Invalid GUID %llx from flash; "
-                             "ignoring\n",
-                             *(unsigned long long *) ifp->if_guid);
-               /* don't allow GUID if all 0 or all 1's */
-               goto done;
-       }
-
-       /* complain, but allow it */
-       if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
-               dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
-                        "default, probably not correct!\n",
-                        *(unsigned long long *) ifp->if_guid);
-
-       bguid = ifp->if_guid;
-       if (!bguid[0] && !bguid[1] && !bguid[2]) {
-               /* original incorrect GUID format in flash; fix in
-                * core copy, by shifting up 2 octets; don't need to
-                * change top octet, since both it and shifted are
-                * 0.. */
-               bguid[1] = bguid[3];
-               bguid[2] = bguid[4];
-               bguid[3] = bguid[4] = 0;
-               guid = *(__be64 *) ifp->if_guid;
-               ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
-                          "shifting 2 octets\n");
-       } else
-               guid = *(__be64 *) ifp->if_guid;
-       dd->ipath_guid = guid;
-       dd->ipath_nguid = ifp->if_numguid;
-       /*
-        * Things are slightly complicated by the desire to transparently
-        * support both the Pathscale 10-digit serial number and the QLogic
-        * 13-character version.
-        */
-       if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
-               && ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
-               /* This board has a Serial-prefix, which is stored
-                * elsewhere for backward-compatibility.
-                */
-               char *snp = dd->ipath_serial;
-               memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
-               snp[sizeof ifp->if_sprefix] = '\0';
-               len = strlen(snp);
-               snp += len;
-               len = (sizeof dd->ipath_serial) - len;
-               if (len > sizeof ifp->if_serial) {
-                       len = sizeof ifp->if_serial;
-               }
-               memcpy(snp, ifp->if_serial, len);
-       } else
-               memcpy(dd->ipath_serial, ifp->if_serial,
-                      sizeof ifp->if_serial);
-       if (!strstr(ifp->if_comment, "Tested successfully"))
-               ipath_dev_err(dd, "Board SN %s did not pass functional "
-                       "test: %s\n", dd->ipath_serial,
-                       ifp->if_comment);
-
-       ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
-                  (unsigned long long) be64_to_cpu(dd->ipath_guid));
-
-       memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
-       /*
-        * Power-on (actually "active") hours are kept as little-endian value
-        * in EEPROM, but as seconds in a (possibly as small as 24-bit)
-        * atomic_t while running.
-        */
-       atomic_set(&dd->ipath_active_time, 0);
-       dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
-
-done:
-       vfree(buf);
-
-bail:;
-}
-
-/**
- * ipath_update_eeprom_log - copy active-time and error counters to eeprom
- * @dd: the infinipath device
- *
- * Although the time is kept as seconds in the ipath_devdata struct, it is
- * rounded to hours for re-write, as we have only 16 bits in EEPROM.
- * First-cut code reads whole (expected) struct ipath_flash, modifies,
- * re-writes. Future direction: read/write only what we need, assuming
- * that the EEPROM had to have been "good enough" for driver init, and
- * if not, we aren't making it worse.
- *
- */
-
-int ipath_update_eeprom_log(struct ipath_devdata *dd)
-{
-       void *buf;
-       struct ipath_flash *ifp;
-       int len, hi_water;
-       uint32_t new_time, new_hrs;
-       u8 csum;
-       int ret, idx;
-       unsigned long flags;
-
-       /* first, check if we actually need to do anything. */
-       ret = 0;
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               if (dd->ipath_eep_st_new_errs[idx]) {
-                       ret = 1;
-                       break;
-               }
-       }
-       new_time = atomic_read(&dd->ipath_active_time);
-
-       if (ret == 0 && new_time < 3600)
-               return 0;
-
-       /*
-        * The quick-check above determined that there is something worthy
-        * of logging, so get current contents and do a more detailed idea.
-        * read full flash, not just currently used part, since it may have
-        * been written with a newer definition
-        */
-       len = sizeof(struct ipath_flash);
-       buf = vmalloc(len);
-       ret = 1;
-       if (!buf) {
-               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-                               "bytes from eeprom for logging\n", len);
-               goto bail;
-       }
-
-       /* Grab semaphore and read current EEPROM. If we get an
-        * error, let go, but if not, keep it until we finish write.
-        */
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (ret) {
-               ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
-               goto free_bail;
-       }
-       ret = ipath_eeprom_internal_read(dd, 0, buf, len);
-       if (ret) {
-               mutex_unlock(&dd->ipath_eep_lock);
-               ipath_dev_err(dd, "Unable read EEPROM for logging\n");
-               goto free_bail;
-       }
-       ifp = (struct ipath_flash *)buf;
-
-       csum = flash_csum(ifp, 0);
-       if (csum != ifp->if_csum) {
-               mutex_unlock(&dd->ipath_eep_lock);
-               ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
-                               csum, ifp->if_csum);
-               ret = 1;
-               goto free_bail;
-       }
-       hi_water = 0;
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               int new_val = dd->ipath_eep_st_new_errs[idx];
-               if (new_val) {
-                       /*
-                        * If we have seen any errors, add to EEPROM values
-                        * We need to saturate at 0xFF (255) and we also
-                        * would need to adjust the checksum if we were
-                        * trying to minimize EEPROM traffic
-                        * Note that we add to actual current count in EEPROM,
-                        * in case it was altered while we were running.
-                        */
-                       new_val += ifp->if_errcntp[idx];
-                       if (new_val > 0xFF)
-                               new_val = 0xFF;
-                       if (ifp->if_errcntp[idx] != new_val) {
-                               ifp->if_errcntp[idx] = new_val;
-                               hi_water = offsetof(struct ipath_flash,
-                                               if_errcntp) + idx;
-                       }
-                       /*
-                        * update our shadow (used to minimize EEPROM
-                        * traffic), to match what we are about to write.
-                        */
-                       dd->ipath_eep_st_errs[idx] = new_val;
-                       dd->ipath_eep_st_new_errs[idx] = 0;
-               }
-       }
-       /*
-        * now update active-time. We would like to round to the nearest hour
-        * but unless atomic_t are sure to be proper signed ints we cannot,
-        * because we need to account for what we "transfer" to EEPROM and
-        * if we log an hour at 31 minutes, then we would need to set
-        * active_time to -29 to accurately count the _next_ hour.
-        */
-       if (new_time >= 3600) {
-               new_hrs = new_time / 3600;
-               atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
-               new_hrs += dd->ipath_eep_hrs;
-               if (new_hrs > 0xFFFF)
-                       new_hrs = 0xFFFF;
-               dd->ipath_eep_hrs = new_hrs;
-               if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
-                       ifp->if_powerhour[0] = new_hrs & 0xFF;
-                       hi_water = offsetof(struct ipath_flash, if_powerhour);
-               }
-               if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
-                       ifp->if_powerhour[1] = new_hrs >> 8;
-                       hi_water = offsetof(struct ipath_flash, if_powerhour)
-                                       + 1;
-               }
-       }
-       /*
-        * There is a tiny possibility that we could somehow fail to write
-        * the EEPROM after updating our shadows, but problems from holding
-        * the spinlock too long are a much bigger issue.
-        */
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-       if (hi_water) {
-               /* we made some change to the data, uopdate cksum and write */
-               csum = flash_csum(ifp, 1);
-               ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
-       }
-       mutex_unlock(&dd->ipath_eep_lock);
-       if (ret)
-               ipath_dev_err(dd, "Failed updating EEPROM\n");
-
-free_bail:
-       vfree(buf);
-bail:
-       return ret;
-
-}
-
-/**
- * ipath_inc_eeprom_err - increment one of the four error counters
- * that are logged to EEPROM.
- * @dd: the infinipath device
- * @eidx: 0..3, the counter to increment
- * @incr: how much to add
- *
- * Each counter is 8-bits, and saturates at 255 (0xFF). They
- * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
- * is called, but it can only be called in a context that allows sleep.
- * This function can be called even at interrupt level.
- */
-
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
-{
-       uint new_val;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
-       if (new_val > 255)
-               new_val = 255;
-       dd->ipath_eep_st_new_errs[eidx] = new_val;
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-       return;
-}
-
-static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
-{
-       int ret;
-       struct i2c_chain_desc *icd;
-
-       ret = -ENOENT;
-
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->temp_dev == IPATH_NO_DEV) {
-               /* tempsense only exists on new, real-I2C boards */
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-               ipath_dbg("Failed tempsense startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, regnum);
-       stop_cmd(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
-                             regnum);
-               ret = -ENXIO;
-               goto bail;
-       }
-       if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
-               ipath_dbg("Failed tempsense RD startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       /*
-        * We can only clock out one byte per command, sensibly
-        */
-       ret = rd_byte(dd);
-       stop_cmd(dd);
-
-bail:
-       return ret;
-}
-
-#define VALID_TS_RD_REG_MASK 0xBF
-
-/**
- * ipath_tempsense_read - read register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to read from
- *
- * returns reg contents (0..255) or < 0 for error
- */
-int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
-{
-       int ret;
-
-       if (regnum > 7)
-               return -EINVAL;
-
-       /* return a bogus value for (the one) register we do not have */
-       if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
-               return 0;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_tempsense_internal_read(dd, regnum);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       /*
-        * There are three possibilities here:
-        * ret is actual value (0..255)
-        * ret is -ENXIO or -EINVAL from code in this file
-        * ret is -EINTR from mutex_lock_interruptible.
-        */
-       return ret;
-}
-
-static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
-                                         u8 regnum, u8 data)
-{
-       int ret = -ENOENT;
-       struct i2c_chain_desc *icd;
-
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->temp_dev == IPATH_NO_DEV) {
-               /* tempsense only exists on new, real-I2C boards */
-               ret = -ENXIO;
-               goto bail;
-       }
-       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-               ipath_dbg("Failed tempsense startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, regnum);
-       if (ret) {
-               stop_cmd(dd);
-               ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
-                             regnum);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, data);
-       stop_cmd(dd);
-       ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
-       if (ret) {
-               ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
-                             regnum);
-               ret = -ENXIO;
-       }
-
-bail:
-       return ret;
-}
-
-#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
-
-/**
- * ipath_tempsense_write - write register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to write
- * @data: data to write
- *
- * returns 0 for success or < 0 for error
- */
-int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
-{
-       int ret;
-
-       if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
-               return -EINVAL;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_tempsense_internal_write(dd, regnum, data);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       /*
-        * There are three possibilities here:
-        * ret is 0 for success
-        * ret is -ENXIO or -EINVAL from code in this file
-        * ret is -EINTR from mutex_lock_interruptible.
-        */
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c
deleted file mode 100644 (file)
index 6187b84..0000000
+++ /dev/null
@@ -1,2619 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/cdev.h>
-#include <linux/swap.h>
-#include <linux/export.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/jiffies.h>
-#include <linux/cpu.h>
-#include <linux/uio.h>
-#include <asm/pgtable.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-#include "ipath_user_sdma.h"
-
-static int ipath_open(struct inode *, struct file *);
-static int ipath_close(struct inode *, struct file *);
-static ssize_t ipath_write(struct file *, const char __user *, size_t,
-                          loff_t *);
-static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
-static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
-static int ipath_mmap(struct file *, struct vm_area_struct *);
-
-/*
- * This is really, really weird shit - write() and writev() here
- * have completely unrelated semantics.  Sucky userland ABI,
- * film at 11.
- */
-static const struct file_operations ipath_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_write,
-       .write_iter = ipath_write_iter,
-       .open = ipath_open,
-       .release = ipath_close,
-       .poll = ipath_poll,
-       .mmap = ipath_mmap,
-       .llseek = noop_llseek,
-};
-
-/*
- * Convert kernel virtual addresses to physical addresses so they don't
- * potentially conflict with the chip addresses used as mmap offsets.
- * It doesn't really matter what mmap offset we use as long as we can
- * interpret it correctly.
- */
-static u64 cvt_kvaddr(void *p)
-{
-       struct page *page;
-       u64 paddr = 0;
-
-       page = vmalloc_to_page(p);
-       if (page)
-               paddr = page_to_pfn(page) << PAGE_SHIFT;
-
-       return paddr;
-}
-
-static int ipath_get_base_info(struct file *fp,
-                              void __user *ubase, size_t ubase_size)
-{
-       struct ipath_portdata *pd = port_fp(fp);
-       int ret = 0;
-       struct ipath_base_info *kinfo = NULL;
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned subport_cnt;
-       int shared, master;
-       size_t sz;
-
-       subport_cnt = pd->port_subport_cnt;
-       if (!subport_cnt) {
-               shared = 0;
-               master = 0;
-               subport_cnt = 1;
-       } else {
-               shared = 1;
-               master = !subport_fp(fp);
-       }
-
-       sz = sizeof(*kinfo);
-       /* If port sharing is not requested, allow the old size structure */
-       if (!shared)
-               sz -= 7 * sizeof(u64);
-       if (ubase_size < sz) {
-               ipath_cdbg(PROC,
-                          "Base size %zu, need %zu (version mismatch?)\n",
-                          ubase_size, sz);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
-       if (kinfo == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       ret = dd->ipath_f_get_base_info(pd, kinfo);
-       if (ret < 0)
-               goto bail;
-
-       kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
-       kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
-       kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
-       kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
-       /*
-        * have to mmap whole thing
-        */
-       kinfo->spi_rcv_egrbuftotlen =
-               pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-       kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
-       kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
-               pd->port_rcvegrbuf_chunks;
-       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
-       if (master)
-               kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
-       /*
-        * for this use, may be ipath_cfgports summed over all chips that
-        * are are configured and present
-        */
-       kinfo->spi_nports = dd->ipath_cfgports;
-       /* unit (chip/board) our port is on */
-       kinfo->spi_unit = dd->ipath_unit;
-       /* for now, only a single page */
-       kinfo->spi_tid_maxsize = PAGE_SIZE;
-
-       /*
-        * Doing this per port, and based on the skip value, etc.  This has
-        * to be the actual buffer size, since the protocol code treats it
-        * as an array.
-        *
-        * These have to be set to user addresses in the user code via mmap.
-        * These values are used on return to user code for the mmap target
-        * addresses only.  For 32 bit, same 44 bit address problem, so use
-        * the physical address, not virtual.  Before 2.6.11, using the
-        * page_address() macro worked, but in 2.6.11, even that returns the
-        * full 64 bit address (upper bits all 1's).  So far, using the
-        * physical addresses (or chip offsets, for chip mapping) works, but
-        * no doubt some future kernel release will change that, and we'll be
-        * on to yet another method of dealing with this.
-        */
-       kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
-       kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
-       kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
-       kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
-       kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
-               (void *) dd->ipath_statusp -
-               (void *) dd->ipath_pioavailregs_dma;
-       if (!shared) {
-               kinfo->spi_piocnt = pd->port_piocnt;
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs;
-               kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
-                       dd->ipath_ureg_align * pd->port_port;
-       } else if (master) {
-               kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
-                                   (pd->port_piocnt % subport_cnt);
-               /* Master's PIO buffers are after all the slave's */
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-                       dd->ipath_palign *
-                       (pd->port_piocnt - kinfo->spi_piocnt);
-       } else {
-               unsigned slave = subport_fp(fp) - 1;
-
-               kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-                       dd->ipath_palign * kinfo->spi_piocnt * slave;
-       }
-
-       if (shared) {
-               kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
-                       dd->ipath_ureg_align * pd->port_port;
-               kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
-               kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
-               kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
-
-               kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
-                       PAGE_SIZE * subport_fp(fp));
-
-               kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
-                       pd->port_rcvhdrq_size * subport_fp(fp));
-               kinfo->spi_rcvhdr_tailaddr = 0;
-               kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
-                       pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
-                       subport_fp(fp));
-
-               kinfo->spi_subport_uregbase =
-                       cvt_kvaddr(pd->subport_uregbase);
-               kinfo->spi_subport_rcvegrbuf =
-                       cvt_kvaddr(pd->subport_rcvegrbuf);
-               kinfo->spi_subport_rcvhdr_base =
-                       cvt_kvaddr(pd->subport_rcvhdr_base);
-               ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
-                       kinfo->spi_port, kinfo->spi_runtime_flags,
-                       (unsigned long long) kinfo->spi_subport_uregbase,
-                       (unsigned long long) kinfo->spi_subport_rcvegrbuf,
-                       (unsigned long long) kinfo->spi_subport_rcvhdr_base);
-       }
-
-       /*
-        * All user buffers are 2KB buffers.  If we ever support
-        * giving 4KB buffers to user processes, this will need some
-        * work.
-        */
-       kinfo->spi_pioindex = (kinfo->spi_piobufbase -
-               (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
-       kinfo->spi_pioalign = dd->ipath_palign;
-
-       kinfo->spi_qpair = IPATH_KD_QP;
-       /*
-        * user mode PIO buffers are always 2KB, even when 4KB can
-        * be received, and sent via the kernel; this is ibmaxlen
-        * for 2K MTU.
-        */
-       kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
-       kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
-       kinfo->spi_port = pd->port_port;
-       kinfo->spi_subport = subport_fp(fp);
-       kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
-       kinfo->spi_hw_version = dd->ipath_revision;
-
-       if (master) {
-               kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
-       }
-
-       sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
-       if (copy_to_user(ubase, kinfo, sz))
-               ret = -EFAULT;
-
-bail:
-       kfree(kinfo);
-       return ret;
-}
-
-/**
- * ipath_tid_update - update a port TID
- * @pd: the port
- * @fp: the ipath device file
- * @ti: the TID information
- *
- * The new implementation as of Oct 2004 is that the driver assigns
- * the tid and returns it to the caller.   To make it easier to
- * catch bugs, and to reduce search time, we keep a cursor for
- * each port, walking the shadow tid array to find one that's not
- * in use.
- *
- * For now, if we can't allocate the full list, we fail, although
- * in the long run, we'll allocate as many as we can, and the
- * caller will deal with that by trying the remaining pages later.
- * That means that when we fail, we have to mark the tids as not in
- * use again, in our shadow copy.
- *
- * It's up to the caller to free the tids when they are done.
- * We'll unlock the pages as they free them.
- *
- * Also, right now we are locking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.
- */
-static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
-                           const struct ipath_tid_info *ti)
-{
-       int ret = 0, ntids;
-       u32 tid, porttid, cnt, i, tidcnt, tidoff;
-       u16 *tidlist;
-       struct ipath_devdata *dd = pd->port_dd;
-       u64 physaddr;
-       unsigned long vaddr;
-       u64 __iomem *tidbase;
-       unsigned long tidmap[8];
-       struct page **pagep = NULL;
-       unsigned subport = subport_fp(fp);
-
-       if (!dd->ipath_pageshadow) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       cnt = ti->tidcnt;
-       if (!cnt) {
-               ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
-                         (unsigned long long) ti->tidlist);
-               /*
-                * Should we treat as success?  likely a bug
-                */
-               ret = -EFAULT;
-               goto done;
-       }
-       porttid = pd->port_port * dd->ipath_rcvtidcnt;
-       if (!pd->port_subport_cnt) {
-               tidcnt = dd->ipath_rcvtidcnt;
-               tid = pd->port_tidcursor;
-               tidoff = 0;
-       } else if (!subport) {
-               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-               tidoff = dd->ipath_rcvtidcnt - tidcnt;
-               porttid += tidoff;
-               tid = tidcursor_fp(fp);
-       } else {
-               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-               tidoff = tidcnt * (subport - 1);
-               porttid += tidoff;
-               tid = tidcursor_fp(fp);
-       }
-       if (cnt > tidcnt) {
-               /* make sure it all fits in port_tid_pg_list */
-               dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
-                        "TIDs, only trying max (%u)\n", cnt, tidcnt);
-               cnt = tidcnt;
-       }
-       pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
-       tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
-
-       memset(tidmap, 0, sizeof(tidmap));
-       /* before decrement; chip actual # */
-       ntids = tidcnt;
-       tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  porttid * sizeof(*tidbase));
-
-       ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
-                  pd->port_port, cnt, tid, tidbase);
-
-       /* virtual address of first page in transfer */
-       vaddr = ti->tidvaddr;
-       if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
-                      cnt * PAGE_SIZE)) {
-               ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
-                         (void *)vaddr, cnt);
-               ret = -EFAULT;
-               goto done;
-       }
-       ret = ipath_get_user_pages(vaddr, cnt, pagep);
-       if (ret) {
-               if (ret == -EBUSY) {
-                       ipath_dbg("Failed to lock addr %p, %u pages "
-                                 "(already locked)\n",
-                                 (void *) vaddr, cnt);
-                       /*
-                        * for now, continue, and see what happens but with
-                        * the new implementation, this should never happen,
-                        * unless perhaps the user has mpin'ed the pages
-                        * themselves (something we need to test)
-                        */
-                       ret = 0;
-               } else {
-                       dev_info(&dd->pcidev->dev,
-                                "Failed to lock addr %p, %u pages: "
-                                "errno %d\n", (void *) vaddr, cnt, -ret);
-                       goto done;
-               }
-       }
-       for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
-               for (; ntids--; tid++) {
-                       if (tid == tidcnt)
-                               tid = 0;
-                       if (!dd->ipath_pageshadow[porttid + tid])
-                               break;
-               }
-               if (ntids < 0) {
-                       /*
-                        * oops, wrapped all the way through their TIDs,
-                        * and didn't have enough free; see comments at
-                        * start of routine
-                        */
-                       ipath_dbg("Not enough free TIDs for %u pages "
-                                 "(index %d), failing\n", cnt, i);
-                       i--;    /* last tidlist[i] not filled in */
-                       ret = -ENOMEM;
-                       break;
-               }
-               tidlist[i] = tid + tidoff;
-               ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
-                          "vaddr %lx\n", i, tid + tidoff, vaddr);
-               /* we "know" system pages and TID pages are same size */
-               dd->ipath_pageshadow[porttid + tid] = pagep[i];
-               dd->ipath_physshadow[porttid + tid] = ipath_map_page(
-                       dd->pcidev, pagep[i], 0, PAGE_SIZE,
-                       PCI_DMA_FROMDEVICE);
-               /*
-                * don't need atomic or it's overhead
-                */
-               __set_bit(tid, tidmap);
-               physaddr = dd->ipath_physshadow[porttid + tid];
-               ipath_stats.sps_pagelocks++;
-               ipath_cdbg(VERBOSE,
-                          "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
-                          tid, vaddr, (unsigned long long) physaddr,
-                          pagep[i]);
-               dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
-                                   physaddr);
-               /*
-                * don't check this tid in ipath_portshadow, since we
-                * just filled it in; start with the next one.
-                */
-               tid++;
-       }
-
-       if (ret) {
-               u32 limit;
-       cleanup:
-               /* jump here if copy out of updated info failed... */
-               ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
-                         -ret, i, cnt);
-               /* same code that's in ipath_free_tid() */
-               limit = sizeof(tidmap) * BITS_PER_BYTE;
-               if (limit > tidcnt)
-                       /* just in case size changes in future */
-                       limit = tidcnt;
-               tid = find_first_bit((const unsigned long *)tidmap, limit);
-               for (; tid < limit; tid++) {
-                       if (!test_bit(tid, tidmap))
-                               continue;
-                       if (dd->ipath_pageshadow[porttid + tid]) {
-                               ipath_cdbg(VERBOSE, "Freeing TID %u\n",
-                                          tid);
-                               dd->ipath_f_put_tid(dd, &tidbase[tid],
-                                                   RCVHQ_RCV_TYPE_EXPECTED,
-                                                   dd->ipath_tidinvalid);
-                               pci_unmap_page(dd->pcidev,
-                                       dd->ipath_physshadow[porttid + tid],
-                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                               dd->ipath_pageshadow[porttid + tid] = NULL;
-                               ipath_stats.sps_pageunlocks++;
-                       }
-               }
-               ipath_release_user_pages(pagep, cnt);
-       } else {
-               /*
-                * Copy the updated array, with ipath_tid's filled in, back
-                * to user.  Since we did the copy in already, this "should
-                * never fail" If it does, we have to clean up...
-                */
-               if (copy_to_user((void __user *)
-                                (unsigned long) ti->tidlist,
-                                tidlist, cnt * sizeof(*tidlist))) {
-                       ret = -EFAULT;
-                       goto cleanup;
-               }
-               if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
-                                tidmap, sizeof tidmap)) {
-                       ret = -EFAULT;
-                       goto cleanup;
-               }
-               if (tid == tidcnt)
-                       tid = 0;
-               if (!pd->port_subport_cnt)
-                       pd->port_tidcursor = tid;
-               else
-                       tidcursor_fp(fp) = tid;
-       }
-
-done:
-       if (ret)
-               ipath_dbg("Failed to map %u TID pages, failing with %d\n",
-                         ti->tidcnt, -ret);
-       return ret;
-}
-
-/**
- * ipath_tid_free - free a port TID
- * @pd: the port
- * @subport: the subport
- * @ti: the TID info
- *
- * right now we are unlocking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.  We check that the TID is in range for this port
- * but otherwise don't check validity; if user has an error and
- * frees the wrong tid, it's only their own data that can thereby
- * be corrupted.  We do check that the TID was in use, for sanity
- * We always use our idea of the saved address, not the address that
- * they pass in to us.
- */
-
-static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
-                         const struct ipath_tid_info *ti)
-{
-       int ret = 0;
-       u32 tid, porttid, cnt, limit, tidcnt;
-       struct ipath_devdata *dd = pd->port_dd;
-       u64 __iomem *tidbase;
-       unsigned long tidmap[8];
-
-       if (!dd->ipath_pageshadow) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
-                          sizeof tidmap)) {
-               ret = -EFAULT;
-               goto done;
-       }
-
-       porttid = pd->port_port * dd->ipath_rcvtidcnt;
-       if (!pd->port_subport_cnt)
-               tidcnt = dd->ipath_rcvtidcnt;
-       else if (!subport) {
-               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-               porttid += dd->ipath_rcvtidcnt - tidcnt;
-       } else {
-               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-               porttid += tidcnt * (subport - 1);
-       }
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  porttid * sizeof(*tidbase));
-
-       limit = sizeof(tidmap) * BITS_PER_BYTE;
-       if (limit > tidcnt)
-               /* just in case size changes in future */
-               limit = tidcnt;
-       tid = find_first_bit(tidmap, limit);
-       ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
-                  "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
-                  limit, tid, porttid);
-       for (cnt = 0; tid < limit; tid++) {
-               /*
-                * small optimization; if we detect a run of 3 or so without
-                * any set, use find_first_bit again.  That's mainly to
-                * accelerate the case where we wrapped, so we have some at
-                * the beginning, and some at the end, and a big gap
-                * in the middle.
-                */
-               if (!test_bit(tid, tidmap))
-                       continue;
-               cnt++;
-               if (dd->ipath_pageshadow[porttid + tid]) {
-                       struct page *p;
-                       p = dd->ipath_pageshadow[porttid + tid];
-                       dd->ipath_pageshadow[porttid + tid] = NULL;
-                       ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
-                                  pid_nr(pd->port_pid), tid);
-                       dd->ipath_f_put_tid(dd, &tidbase[tid],
-                                           RCVHQ_RCV_TYPE_EXPECTED,
-                                           dd->ipath_tidinvalid);
-                       pci_unmap_page(dd->pcidev,
-                               dd->ipath_physshadow[porttid + tid],
-                               PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                       ipath_release_user_pages(&p, 1);
-                       ipath_stats.sps_pageunlocks++;
-               } else
-                       ipath_dbg("Unused tid %u, ignoring\n", tid);
-       }
-       if (cnt != ti->tidcnt)
-               ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
-                         ti->tidcnt, cnt);
-done:
-       if (ret)
-               ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
-                         ti->tidcnt, -ret);
-       return ret;
-}
-
-/**
- * ipath_set_part_key - set a partition key
- * @pd: the port
- * @key: the key
- *
- * We can have up to 4 active at a time (other than the default, which is
- * always allowed).  This is somewhat tricky, since multiple ports may set
- * the same key, so we reference count them, and clean up at exit.  All 4
- * partition keys are packed into a single infinipath register.  It's an
- * error for a process to set the same pkey multiple times.  We provide no
- * mechanism to de-allocate a pkey at this time, we may eventually need to
- * do that.  I've used the atomic operations, and no locking, and only make
- * a single pass through what's available.  This should be more than
- * adequate for some time. I'll think about spinlocks or the like if and as
- * it's necessary.
- */
-static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       int i, any = 0, pidx = -1;
-       u16 lkey = key & 0x7FFF;
-       int ret;
-
-       if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
-               /* nothing to do; this key always valid */
-               ret = 0;
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
-                  "%hx:%x %hx:%x %hx:%x %hx:%x\n",
-                  pd->port_port, key, dd->ipath_pkeys[0],
-                  atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
-                  atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
-                  atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
-                  atomic_read(&dd->ipath_pkeyrefs[3]));
-
-       if (!lkey) {
-               ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
-                          pd->port_port);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * Set the full membership bit, because it has to be
-        * set in the register or the packet, and it seems
-        * cleaner to set in the register than to force all
-        * callers to set it. (see bug 4331)
-        */
-       key |= 0x8000;
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               if (!pd->port_pkeys[i] && pidx == -1)
-                       pidx = i;
-               if (pd->port_pkeys[i] == key) {
-                       ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
-                                  "(%x) more than once\n",
-                                  pd->port_port, key);
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (pidx == -1) {
-               ipath_dbg("All pkeys for port %u already in use, "
-                         "can't set %x\n", pd->port_port, key);
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i]) {
-                       any++;
-                       continue;
-               }
-               if (dd->ipath_pkeys[i] == key) {
-                       atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
-
-                       if (atomic_inc_return(pkrefs) > 1) {
-                               pd->port_pkeys[pidx] = key;
-                               ipath_cdbg(VERBOSE, "p%u set key %x "
-                                          "matches #%d, count now %d\n",
-                                          pd->port_port, key, i,
-                                          atomic_read(pkrefs));
-                               ret = 0;
-                               goto bail;
-                       } else {
-                               /*
-                                * lost race, decrement count, catch below
-                                */
-                               atomic_dec(pkrefs);
-                               ipath_cdbg(VERBOSE, "Lost race, count was "
-                                          "0, after dec, it's %d\n",
-                                          atomic_read(pkrefs));
-                               any++;
-                       }
-               }
-               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-                       /*
-                        * It makes no sense to have both the limited and
-                        * full membership PKEY set at the same time since
-                        * the unlimited one will disable the limited one.
-                        */
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (!any) {
-               ipath_dbg("port %u, all pkeys already in use, "
-                         "can't set %x\n", pd->port_port, key);
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i] &&
-                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-                       u64 pkey;
-
-                       /* for ipathstats, etc. */
-                       ipath_stats.sps_pkeys[i] = lkey;
-                       pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
-                       pkey =
-                               (u64) dd->ipath_pkeys[0] |
-                               ((u64) dd->ipath_pkeys[1] << 16) |
-                               ((u64) dd->ipath_pkeys[2] << 32) |
-                               ((u64) dd->ipath_pkeys[3] << 48);
-                       ipath_cdbg(PROC, "p%u set key %x in #%d, "
-                                  "portidx %d, new pkey reg %llx\n",
-                                  pd->port_port, key, i, pidx,
-                                  (unsigned long long) pkey);
-                       ipath_write_kreg(
-                               dd, dd->ipath_kregs->kr_partitionkey, pkey);
-
-                       ret = 0;
-                       goto bail;
-               }
-       }
-       ipath_dbg("port %u, all pkeys already in use 2nd pass, "
-                 "can't set %x\n", pd->port_port, key);
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_manage_rcvq - manage a port's receive queue
- * @pd: the port
- * @subport: the subport
- * @start_stop: action to carry out
- *
- * start_stop == 0 disables receive on the port, for use in queue
- * overflow conditions.  start_stop==1 re-enables, to be used to
- * re-init the software copy of the head register
- */
-static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
-                            int start_stop)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-
-       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
-                  start_stop ? "en" : "dis", dd->ipath_unit,
-                  pd->port_port, subport);
-       if (subport)
-               goto bail;
-       /* atomically clear receive enable port. */
-       if (start_stop) {
-               /*
-                * On enable, force in-memory copy of the tail register to
-                * 0, so that protocol code doesn't have to worry about
-                * whether or not the chip has yet updated the in-memory
-                * copy or not on return from the system call. The chip
-                * always resets it's tail register back to 0 on a
-                * transition from disabled to enabled.  This could cause a
-                * problem if software was broken, and did the enable w/o
-                * the disable, but eventually the in-memory copy will be
-                * updated and correct itself, even in the face of software
-                * bugs.
-                */
-               if (pd->port_rcvhdrtail_kvaddr)
-                       ipath_clear_rcvhdrtail(pd);
-               set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-                       &dd->ipath_rcvctrl);
-       } else
-               clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
-                         &dd->ipath_rcvctrl);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-       /* now be sure chip saw it before we return */
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       if (start_stop) {
-               /*
-                * And try to be sure that tail reg update has happened too.
-                * This should in theory interlock with the RXE changes to
-                * the tail register.  Don't assign it to the tail register
-                * in memory copy, since we could overwrite an update by the
-                * chip if we did.
-                */
-               ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-       }
-       /* always; new head should be equal to new tail; see above */
-bail:
-       return 0;
-}
-
-static void ipath_clean_part_key(struct ipath_portdata *pd,
-                                struct ipath_devdata *dd)
-{
-       int i, j, pchanged = 0;
-       u64 oldpkey;
-
-       /* for debugging only */
-       oldpkey = (u64) dd->ipath_pkeys[0] |
-               ((u64) dd->ipath_pkeys[1] << 16) |
-               ((u64) dd->ipath_pkeys[2] << 32) |
-               ((u64) dd->ipath_pkeys[3] << 48);
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               if (!pd->port_pkeys[i])
-                       continue;
-               ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
-                          pd->port_pkeys[i]);
-               for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
-                       /* check for match independent of the global bit */
-                       if ((dd->ipath_pkeys[j] & 0x7fff) !=
-                           (pd->port_pkeys[i] & 0x7fff))
-                               continue;
-                       if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
-                               ipath_cdbg(VERBOSE, "p%u clear key "
-                                          "%x matches #%d\n",
-                                          pd->port_port,
-                                          pd->port_pkeys[i], j);
-                               ipath_stats.sps_pkeys[j] =
-                                       dd->ipath_pkeys[j] = 0;
-                               pchanged++;
-                       } else {
-                               ipath_cdbg(VERBOSE, "p%u key %x matches #%d, "
-                                          "but ref still %d\n", pd->port_port,
-                                          pd->port_pkeys[i], j,
-                                          atomic_read(&dd->ipath_pkeyrefs[j]));
-                               break;
-                       }
-               }
-               pd->port_pkeys[i] = 0;
-       }
-       if (pchanged) {
-               u64 pkey = (u64) dd->ipath_pkeys[0] |
-                       ((u64) dd->ipath_pkeys[1] << 16) |
-                       ((u64) dd->ipath_pkeys[2] << 32) |
-                       ((u64) dd->ipath_pkeys[3] << 48);
-               ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
-                          "new pkey reg %llx\n", pd->port_port,
-                          (unsigned long long) oldpkey,
-                          (unsigned long long) pkey);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-                                pkey);
-       }
-}
-
-/*
- * Initialize the port data with the receive buffer sizes
- * so this can be done while the master port is locked.
- * Otherwise, there is a race with a slave opening the port
- * and seeing these fields uninitialized.
- */
-static void init_user_egr_sizes(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned egrperchunk, egrcnt, size;
-
-       /*
-        * to avoid wasting a lot of memory, we allocate 32KB chunks of
-        * physically contiguous memory, advance through it until used up
-        * and then allocate more.  Of course, we need memory to store those
-        * extra pointers, now.  Started out with 256KB, but under heavy
-        * memory pressure (creating large files and then copying them over
-        * NFS while doing lots of MPI jobs), we hit some allocation
-        * failures, even though we can sleep...  (2.6.10) Still get
-        * failures at 64K.  32K is the lowest we can go without wasting
-        * additional memory.
-        */
-       size = 0x8000;
-       egrperchunk = size / dd->ipath_rcvegrbufsize;
-       egrcnt = dd->ipath_rcvegrcnt;
-       pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
-       pd->port_rcvegrbufs_perchunk = egrperchunk;
-       pd->port_rcvegrbuf_size = size;
-}
-
-/**
- * ipath_create_user_egr - allocate eager TID buffers
- * @pd: the port to allocate TID buffers for
- *
- * This routine is now quite different for user and kernel, because
- * the kernel uses skb's, for the accelerated network performance
- * This is the user port version
- *
- * Allocate the eager TID buffers and program them into infinipath
- * They are no longer completely contiguous, we do multiple allocation
- * calls.
- */
-static int ipath_create_user_egr(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
-       size_t size;
-       int ret;
-       gfp_t gfp_flags;
-
-       /*
-        * GFP_USER, but without GFP_FS, so buffer cache can be
-        * coalesced (we hope); otherwise, even at order 4,
-        * heavy filesystem activity makes these fail, and we can
-        * use compound pages.
-        */
-       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
-
-       egrcnt = dd->ipath_rcvegrcnt;
-       /* TID number offset for this port */
-       egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
-       egrsize = dd->ipath_rcvegrbufsize;
-       ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
-                  "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
-
-       chunk = pd->port_rcvegrbuf_chunks;
-       egrperchunk = pd->port_rcvegrbufs_perchunk;
-       size = pd->port_rcvegrbuf_size;
-       pd->port_rcvegrbuf = kmalloc_array(chunk, sizeof(pd->port_rcvegrbuf[0]),
-                                          GFP_KERNEL);
-       if (!pd->port_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       pd->port_rcvegrbuf_phys =
-               kmalloc_array(chunk, sizeof(pd->port_rcvegrbuf_phys[0]),
-                             GFP_KERNEL);
-       if (!pd->port_rcvegrbuf_phys) {
-               ret = -ENOMEM;
-               goto bail_rcvegrbuf;
-       }
-       for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-
-               pd->port_rcvegrbuf[e] = dma_alloc_coherent(
-                       &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
-                       gfp_flags);
-
-               if (!pd->port_rcvegrbuf[e]) {
-                       ret = -ENOMEM;
-                       goto bail_rcvegrbuf_phys;
-               }
-       }
-
-       pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
-
-       for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
-               dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
-               unsigned i;
-
-               for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
-                       dd->ipath_f_put_tid(dd, e + egroff +
-                                           (u64 __iomem *)
-                                           ((char __iomem *)
-                                            dd->ipath_kregbase +
-                                            dd->ipath_rcvegrbase),
-                                           RCVHQ_RCV_TYPE_EAGER, pa);
-                       pa += egrsize;
-               }
-               cond_resched(); /* don't hog the cpu */
-       }
-
-       ret = 0;
-       goto bail;
-
-bail_rcvegrbuf_phys:
-       for (e = 0; e < pd->port_rcvegrbuf_chunks &&
-               pd->port_rcvegrbuf[e]; e++) {
-               dma_free_coherent(&dd->pcidev->dev, size,
-                                 pd->port_rcvegrbuf[e],
-                                 pd->port_rcvegrbuf_phys[e]);
-
-       }
-       kfree(pd->port_rcvegrbuf_phys);
-       pd->port_rcvegrbuf_phys = NULL;
-bail_rcvegrbuf:
-       kfree(pd->port_rcvegrbuf);
-       pd->port_rcvegrbuf = NULL;
-bail:
-       return ret;
-}
-
-
-/* common code for the mappings on dma_alloc_coherent mem */
-static int ipath_mmap_mem(struct vm_area_struct *vma,
-       struct ipath_portdata *pd, unsigned len, int write_ok,
-       void *kvaddr, char *what)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned long pfn;
-       int ret;
-
-       if ((vma->vm_end - vma->vm_start) > len) {
-               dev_info(&dd->pcidev->dev,
-                        "FAIL on %s: len %lx > %x\n", what,
-                        vma->vm_end - vma->vm_start, len);
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       if (!write_ok) {
-               if (vma->vm_flags & VM_WRITE) {
-                       dev_info(&dd->pcidev->dev,
-                                "%s must be mapped readonly\n", what);
-                       ret = -EPERM;
-                       goto bail;
-               }
-
-               /* don't allow them to later change with mprotect */
-               vma->vm_flags &= ~VM_MAYWRITE;
-       }
-
-       pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
-       ret = remap_pfn_range(vma, vma->vm_start, pfn,
-                             len, vma->vm_page_prot);
-       if (ret)
-               dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
-                        "bytes r%c failed: %d\n", what, pd->port_port,
-                        pfn, len, write_ok?'w':'o', ret);
-       else
-               ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
-                          "r%c\n", what, pd->port_port, pfn, len,
-                          write_ok?'w':'o');
-bail:
-       return ret;
-}
-
-static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
-                    u64 ureg)
-{
-       unsigned long phys;
-       int ret;
-
-       /*
-        * This is real hardware, so use io_remap.  This is the mechanism
-        * for the user process to update the head registers for their port
-        * in the chip.
-        */
-       if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
-               dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
-                        "%lx > PAGE\n", vma->vm_end - vma->vm_start);
-               ret = -EFAULT;
-       } else {
-               phys = dd->ipath_physaddr + ureg;
-               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-               vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               ret = io_remap_pfn_range(vma, vma->vm_start,
-                                        phys >> PAGE_SHIFT,
-                                        vma->vm_end - vma->vm_start,
-                                        vma->vm_page_prot);
-       }
-       return ret;
-}
-
-static int mmap_piobufs(struct vm_area_struct *vma,
-                       struct ipath_devdata *dd,
-                       struct ipath_portdata *pd,
-                       unsigned piobufs, unsigned piocnt)
-{
-       unsigned long phys;
-       int ret;
-
-       /*
-        * When we map the PIO buffers in the chip, we want to map them as
-        * writeonly, no read possible.   This prevents access to previous
-        * process data, and catches users who might try to read the i/o
-        * space due to a bug.
-        */
-       if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
-               dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
-                        "reqlen %lx > PAGE\n",
-                        vma->vm_end - vma->vm_start);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       phys = dd->ipath_physaddr + piobufs;
-
-#if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
-       pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
-       pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
-#endif
-
-       /*
-        * don't allow them to later change to readable with mprotect (for when
-        * not initially mapped readable, as is normally the case)
-        */
-       vma->vm_flags &= ~VM_MAYREAD;
-       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-
-       ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
-                                vma->vm_end - vma->vm_start,
-                                vma->vm_page_prot);
-bail:
-       return ret;
-}
-
-static int mmap_rcvegrbufs(struct vm_area_struct *vma,
-                          struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned long start, size;
-       size_t total_size, i;
-       unsigned long pfn;
-       int ret;
-
-       size = pd->port_rcvegrbuf_size;
-       total_size = pd->port_rcvegrbuf_chunks * size;
-       if ((vma->vm_end - vma->vm_start) > total_size) {
-               dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
-                        "reqlen %lx > actual %lx\n",
-                        vma->vm_end - vma->vm_start,
-                        (unsigned long) total_size);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (vma->vm_flags & VM_WRITE) {
-               dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
-                        "writable (flags=%lx)\n", vma->vm_flags);
-               ret = -EPERM;
-               goto bail;
-       }
-       /* don't allow them to later change to writeable with mprotect */
-       vma->vm_flags &= ~VM_MAYWRITE;
-
-       start = vma->vm_start;
-
-       for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
-               pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
-               ret = remap_pfn_range(vma, start, pfn, size,
-                                     vma->vm_page_prot);
-               if (ret < 0)
-                       goto bail;
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/*
- * ipath_file_vma_fault - handle a VMA page fault.
- */
-static int ipath_file_vma_fault(struct vm_area_struct *vma,
-                                       struct vm_fault *vmf)
-{
-       struct page *page;
-
-       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
-       if (!page)
-               return VM_FAULT_SIGBUS;
-       get_page(page);
-       vmf->page = page;
-
-       return 0;
-}
-
-static const struct vm_operations_struct ipath_file_vm_ops = {
-       .fault = ipath_file_vma_fault,
-};
-
-static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
-                      struct ipath_portdata *pd, unsigned subport)
-{
-       unsigned long len;
-       struct ipath_devdata *dd;
-       void *addr;
-       size_t size;
-       int ret = 0;
-
-       /* If the port is not shared, all addresses should be physical */
-       if (!pd->port_subport_cnt)
-               goto bail;
-
-       dd = pd->port_dd;
-       size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-
-       /*
-        * Each process has all the subport uregbase, rcvhdrq, and
-        * rcvegrbufs mmapped - as an array for all the processes,
-        * and also separately for this process.
-        */
-       if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
-               addr = pd->subport_uregbase;
-               size = PAGE_SIZE * pd->port_subport_cnt;
-       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
-               addr = pd->subport_rcvhdr_base;
-               size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
-       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
-               addr = pd->subport_rcvegrbuf;
-               size *= pd->port_subport_cnt;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
-                                        PAGE_SIZE * subport)) {
-                addr = pd->subport_uregbase + PAGE_SIZE * subport;
-                size = PAGE_SIZE;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
-                                pd->port_rcvhdrq_size * subport)) {
-                addr = pd->subport_rcvhdr_base +
-                        pd->port_rcvhdrq_size * subport;
-                size = pd->port_rcvhdrq_size;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
-                               size * subport)) {
-                addr = pd->subport_rcvegrbuf + size * subport;
-                /* rcvegrbufs are read-only on the slave */
-                if (vma->vm_flags & VM_WRITE) {
-                        dev_info(&dd->pcidev->dev,
-                                 "Can't map eager buffers as "
-                                 "writable (flags=%lx)\n", vma->vm_flags);
-                        ret = -EPERM;
-                        goto bail;
-                }
-                /*
-                 * Don't allow permission to later change to writeable
-                 * with mprotect.
-                 */
-                vma->vm_flags &= ~VM_MAYWRITE;
-       } else {
-               goto bail;
-       }
-       len = vma->vm_end - vma->vm_start;
-       if (len > size) {
-               ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
-       vma->vm_ops = &ipath_file_vm_ops;
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_mmap - mmap various structures into user space
- * @fp: the file pointer
- * @vma: the VM area
- *
- * We use this to have a shared buffer between the kernel and the user code
- * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
- * buffers in the chip.  We have the open and close entries so we can bump
- * the ref count and keep the driver from being unloaded while still mapped.
- */
-static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-       struct ipath_portdata *pd;
-       struct ipath_devdata *dd;
-       u64 pgaddr, ureg;
-       unsigned piobufs, piocnt;
-       int ret;
-
-       pd = port_fp(fp);
-       if (!pd) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       dd = pd->port_dd;
-
-       /*
-        * This is the ipath_do_user_init() code, mapping the shared buffers
-        * into the user process. The address referred to by vm_pgoff is the
-        * file offset passed via mmap().  For shared ports, this is the
-        * kernel vmalloc() address of the pages to share with the master.
-        * For non-shared or master ports, this is a physical address.
-        * We only do one mmap for each space mapped.
-        */
-       pgaddr = vma->vm_pgoff << PAGE_SHIFT;
-
-       /*
-        * Check for 0 in case one of the allocations failed, but user
-        * called mmap anyway.
-        */
-       if (!pgaddr)  {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
-                  (unsigned long long) pgaddr, vma->vm_start,
-                  vma->vm_end - vma->vm_start, dd->ipath_unit,
-                  pd->port_port, subport_fp(fp));
-
-       /*
-        * Physical addresses must fit in 40 bits for our hardware.
-        * Check for kernel virtual addresses first, anything else must
-        * match a HW or memory address.
-        */
-       ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
-       if (ret) {
-               if (ret > 0)
-                       ret = 0;
-               goto bail;
-       }
-
-       ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
-       if (!pd->port_subport_cnt) {
-               /* port is not shared */
-               piocnt = pd->port_piocnt;
-               piobufs = pd->port_piobufs;
-       } else if (!subport_fp(fp)) {
-               /* caller is the master */
-               piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
-                        (pd->port_piocnt % pd->port_subport_cnt);
-               piobufs = pd->port_piobufs +
-                       dd->ipath_palign * (pd->port_piocnt - piocnt);
-       } else {
-               unsigned slave = subport_fp(fp) - 1;
-
-               /* caller is a slave */
-               piocnt = pd->port_piocnt / pd->port_subport_cnt;
-               piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
-       }
-
-       if (pgaddr == ureg)
-               ret = mmap_ureg(vma, dd, ureg);
-       else if (pgaddr == piobufs)
-               ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
-       else if (pgaddr == dd->ipath_pioavailregs_phys)
-               /* in-memory copy of pioavail registers */
-               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    (void *) dd->ipath_pioavailregs_dma,
-                                    "pioavail registers");
-       else if (pgaddr == pd->port_rcvegr_phys)
-               ret = mmap_rcvegrbufs(vma, pd);
-       else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
-               /*
-                * The rcvhdrq itself; readonly except on HT (so have
-                * to allow writable mapping), multiple pages, contiguous
-                * from an i/o perspective.
-                */
-               ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
-                                    pd->port_rcvhdrq,
-                                    "rcvhdrq");
-       else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
-               /* in-memory copy of rcvhdrq tail register */
-               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    pd->port_rcvhdrtail_kvaddr,
-                                    "rcvhdrq tail");
-       else
-               ret = -EINVAL;
-
-       vma->vm_private_data = NULL;
-
-       if (ret < 0)
-               dev_info(&dd->pcidev->dev,
-                        "Failure %d on off %llx len %lx\n",
-                        -ret, (unsigned long long)pgaddr,
-                        vma->vm_end - vma->vm_start);
-bail:
-       return ret;
-}
-
-static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
-{
-       unsigned pollflag = 0;
-
-       if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
-           pd->port_hdrqfull != pd->port_hdrqfull_poll) {
-               pollflag |= POLLIN | POLLRDNORM;
-               pd->port_hdrqfull_poll = pd->port_hdrqfull;
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
-                                     struct file *fp,
-                                     struct poll_table_struct *pt)
-{
-       unsigned pollflag = 0;
-       struct ipath_devdata *dd;
-
-       dd = pd->port_dd;
-
-       /* variable access in ipath_poll_hdrqfull() needs this */
-       rmb();
-       pollflag = ipath_poll_hdrqfull(pd);
-
-       if (pd->port_urgent != pd->port_urgent_poll) {
-               pollflag |= POLLIN | POLLRDNORM;
-               pd->port_urgent_poll = pd->port_urgent;
-       }
-
-       if (!pollflag) {
-               /* this saves a spin_lock/unlock in interrupt handler... */
-               set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
-               /* flush waiting flag so don't miss an event... */
-               wmb();
-               poll_wait(fp, &pd->port_wait, pt);
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll_next(struct ipath_portdata *pd,
-                                   struct file *fp,
-                                   struct poll_table_struct *pt)
-{
-       u32 head;
-       u32 tail;
-       unsigned pollflag = 0;
-       struct ipath_devdata *dd;
-
-       dd = pd->port_dd;
-
-       /* variable access in ipath_poll_hdrqfull() needs this */
-       rmb();
-       pollflag = ipath_poll_hdrqfull(pd);
-
-       head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
-       if (pd->port_rcvhdrtail_kvaddr)
-               tail = ipath_get_rcvhdrtail(pd);
-       else
-               tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-
-       if (head != tail)
-               pollflag |= POLLIN | POLLRDNORM;
-       else {
-               /* this saves a spin_lock/unlock in interrupt handler */
-               set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
-               /* flush waiting flag so we don't miss an event */
-               wmb();
-
-               set_bit(pd->port_port + dd->ipath_r_intravail_shift,
-                       &dd->ipath_rcvctrl);
-
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                                dd->ipath_rcvctrl);
-
-               if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
-                       ipath_write_ureg(dd, ur_rcvhdrhead,
-                                        dd->ipath_rhdrhead_intr_off | head,
-                                        pd->port_port);
-
-               poll_wait(fp, &pd->port_wait, pt);
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll(struct file *fp,
-                              struct poll_table_struct *pt)
-{
-       struct ipath_portdata *pd;
-       unsigned pollflag;
-
-       pd = port_fp(fp);
-       if (!pd)
-               pollflag = 0;
-       else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
-               pollflag = ipath_poll_urgent(pd, fp, pt);
-       else
-               pollflag = ipath_poll_next(pd, fp, pt);
-
-       return pollflag;
-}
-
-static int ipath_supports_subports(int user_swmajor, int user_swminor)
-{
-       /* no subport implementation prior to software version 1.3 */
-       return (user_swmajor > 1) || (user_swminor >= 3);
-}
-
-static int ipath_compatible_subports(int user_swmajor, int user_swminor)
-{
-       /* this code is written long-hand for clarity */
-       if (IPATH_USER_SWMAJOR != user_swmajor) {
-               /* no promise of compatibility if major mismatch */
-               return 0;
-       }
-       if (IPATH_USER_SWMAJOR == 1) {
-               switch (IPATH_USER_SWMINOR) {
-               case 0:
-               case 1:
-               case 2:
-                       /* no subport implementation so cannot be compatible */
-                       return 0;
-               case 3:
-                       /* 3 is only compatible with itself */
-                       return user_swminor == 3;
-               default:
-                       /* >= 4 are compatible (or are expected to be) */
-                       return user_swminor >= 4;
-               }
-       }
-       /* make no promises yet for future major versions */
-       return 0;
-}
-
-static int init_subports(struct ipath_devdata *dd,
-                        struct ipath_portdata *pd,
-                        const struct ipath_user_info *uinfo)
-{
-       int ret = 0;
-       unsigned num_subports;
-       size_t size;
-
-       /*
-        * If the user is requesting zero subports,
-        * skip the subport allocation.
-        */
-       if (uinfo->spu_subport_cnt <= 0)
-               goto bail;
-
-       /* Self-consistency check for ipath_compatible_subports() */
-       if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
-           !ipath_compatible_subports(IPATH_USER_SWMAJOR,
-                                      IPATH_USER_SWMINOR)) {
-               dev_info(&dd->pcidev->dev,
-                        "Inconsistent ipath_compatible_subports()\n");
-               goto bail;
-       }
-
-       /* Check for subport compatibility */
-       if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
-                                      uinfo->spu_userversion & 0xffff)) {
-               dev_info(&dd->pcidev->dev,
-                        "Mismatched user version (%d.%d) and driver "
-                        "version (%d.%d) while port sharing. Ensure "
-                         "that driver and library are from the same "
-                         "release.\n",
-                        (int) (uinfo->spu_userversion >> 16),
-                         (int) (uinfo->spu_userversion & 0xffff),
-                        IPATH_USER_SWMAJOR,
-                        IPATH_USER_SWMINOR);
-               goto bail;
-       }
-       if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       num_subports = uinfo->spu_subport_cnt;
-       pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
-       if (!pd->subport_uregbase) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
-       size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-                    sizeof(u32), PAGE_SIZE) * num_subports;
-       pd->subport_rcvhdr_base = vzalloc(size);
-       if (!pd->subport_rcvhdr_base) {
-               ret = -ENOMEM;
-               goto bail_ureg;
-       }
-
-       pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
-                                       pd->port_rcvegrbuf_size *
-                                       num_subports);
-       if (!pd->subport_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail_rhdr;
-       }
-
-       pd->port_subport_cnt = uinfo->spu_subport_cnt;
-       pd->port_subport_id = uinfo->spu_subport_id;
-       pd->active_slaves = 1;
-       set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-       goto bail;
-
-bail_rhdr:
-       vfree(pd->subport_rcvhdr_base);
-bail_ureg:
-       vfree(pd->subport_uregbase);
-       pd->subport_uregbase = NULL;
-bail:
-       return ret;
-}
-
-static int try_alloc_port(struct ipath_devdata *dd, int port,
-                         struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       struct ipath_portdata *pd;
-       int ret;
-
-       if (!(pd = dd->ipath_pd[port])) {
-               void *ptmp;
-
-               pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
-
-               /*
-                * Allocate memory for use in ipath_tid_update() just once
-                * at open, not per call.  Reduces cost of expected send
-                * setup.
-                */
-               ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
-                              dd->ipath_rcvtidcnt * sizeof(struct page **),
-                              GFP_KERNEL);
-               if (!pd || !ptmp) {
-                       ipath_dev_err(dd, "Unable to allocate portdata "
-                                     "memory, failing open\n");
-                       ret = -ENOMEM;
-                       kfree(pd);
-                       kfree(ptmp);
-                       goto bail;
-               }
-               dd->ipath_pd[port] = pd;
-               dd->ipath_pd[port]->port_port = port;
-               dd->ipath_pd[port]->port_dd = dd;
-               dd->ipath_pd[port]->port_tid_pg_list = ptmp;
-               init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
-       }
-       if (!pd->port_cnt) {
-               pd->userversion = uinfo->spu_userversion;
-               init_user_egr_sizes(pd);
-               if ((ret = init_subports(dd, pd, uinfo)) != 0)
-                       goto bail;
-               ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
-                          current->comm, current->pid, dd->ipath_unit,
-                          port);
-               pd->port_cnt = 1;
-               port_fp(fp) = pd;
-               pd->port_pid = get_pid(task_pid(current));
-               strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
-               ipath_stats.sps_ports++;
-               ret = 0;
-       } else
-               ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-static inline int usable(struct ipath_devdata *dd)
-{
-       return dd &&
-               (dd->ipath_flags & IPATH_PRESENT) &&
-               dd->ipath_kregbase &&
-               dd->ipath_lid &&
-               !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
-                                    | IPATH_LINKUNK));
-}
-
-static int find_free_port(int unit, struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       struct ipath_devdata *dd = ipath_lookup(unit);
-       int ret, i;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (!usable(dd)) {
-               ret = -ENETDOWN;
-               goto bail;
-       }
-
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               ret = try_alloc_port(dd, i, fp, uinfo);
-               if (ret != -EBUSY)
-                       goto bail;
-       }
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-static int find_best_unit(struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       int ret = 0, i, prefunit = -1, devmax;
-       int maxofallports, npresent, nup;
-       int ndev;
-
-       devmax = ipath_count_units(&npresent, &nup, &maxofallports);
-
-       /*
-        * This code is present to allow a knowledgeable person to
-        * specify the layout of processes to processors before opening
-        * this driver, and then we'll assign the process to the "closest"
-        * InfiniPath chip to that processor (we assume reasonable connectivity,
-        * for now).  This code assumes that if affinity has been set
-        * before this point, that at most one cpu is set; for now this
-        * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
-        * in case some kernel variant sets none of the bits when no
-        * affinity is set.  2.6.11 and 12 kernels have all present
-        * cpus set.  Some day we'll have to fix it up further to handle
-        * a cpu subset.  This algorithm fails for two HT chips connected
-        * in tunnel fashion.  Eventually this needs real topology
-        * information.  There may be some issues with dual core numbering
-        * as well.  This needs more work prior to release.
-        */
-       if (!cpumask_empty(tsk_cpus_allowed(current)) &&
-           !cpumask_full(tsk_cpus_allowed(current))) {
-               int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
-               get_online_cpus();
-               for_each_online_cpu(i)
-                       if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
-                               ipath_cdbg(PROC, "%s[%u] affinity set for "
-                                          "cpu %d/%d\n", current->comm,
-                                          current->pid, i, ncpus);
-                               curcpu = i;
-                               nset++;
-                       }
-               put_online_cpus();
-               if (curcpu != -1 && nset != ncpus) {
-                       if (npresent) {
-                               prefunit = curcpu / (ncpus / npresent);
-                               ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
-                                         "%d cpus/chip, select unit %d\n",
-                                         current->comm, current->pid,
-                                         npresent, ncpus, ncpus / npresent,
-                                         prefunit);
-                       }
-               }
-       }
-
-       /*
-        * user ports start at 1, kernel port is 0
-        * For now, we do round-robin access across all chips
-        */
-
-       if (prefunit != -1)
-               devmax = prefunit + 1;
-recheck:
-       for (i = 1; i < maxofallports; i++) {
-               for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
-                    ndev++) {
-                       struct ipath_devdata *dd = ipath_lookup(ndev);
-
-                       if (!usable(dd))
-                               continue; /* can't use this unit */
-                       if (i >= dd->ipath_cfgports)
-                               /*
-                                * Maxed out on users of this unit. Try
-                                * next.
-                                */
-                               continue;
-                       ret = try_alloc_port(dd, i, fp, uinfo);
-                       if (!ret)
-                               goto done;
-               }
-       }
-
-       if (npresent) {
-               if (nup == 0) {
-                       ret = -ENETDOWN;
-                       ipath_dbg("No ports available (none initialized "
-                                 "and ready)\n");
-               } else {
-                       if (prefunit > 0) {
-                               /* if started above 0, retry from 0 */
-                               ipath_cdbg(PROC,
-                                          "%s[%u] no ports on prefunit "
-                                          "%d, clear and re-check\n",
-                                          current->comm, current->pid,
-                                          prefunit);
-                               devmax = ipath_count_units(NULL, NULL,
-                                                          NULL);
-                               prefunit = -1;
-                               goto recheck;
-                       }
-                       ret = -EBUSY;
-                       ipath_dbg("No ports available\n");
-               }
-       } else {
-               ret = -ENXIO;
-               ipath_dbg("No boards found\n");
-       }
-
-done:
-       return ret;
-}
-
-static int find_shared_port(struct file *fp,
-                           const struct ipath_user_info *uinfo)
-{
-       int devmax, ndev, i;
-       int ret = 0;
-
-       devmax = ipath_count_units(NULL, NULL, NULL);
-
-       for (ndev = 0; ndev < devmax; ndev++) {
-               struct ipath_devdata *dd = ipath_lookup(ndev);
-
-               if (!usable(dd))
-                       continue;
-               for (i = 1; i < dd->ipath_cfgports; i++) {
-                       struct ipath_portdata *pd = dd->ipath_pd[i];
-
-                       /* Skip ports which are not yet open */
-                       if (!pd || !pd->port_cnt)
-                               continue;
-                       /* Skip port if it doesn't match the requested one */
-                       if (pd->port_subport_id != uinfo->spu_subport_id)
-                               continue;
-                       /* Verify the sharing process matches the master */
-                       if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
-                           pd->userversion != uinfo->spu_userversion ||
-                           pd->port_cnt >= pd->port_subport_cnt) {
-                               ret = -EINVAL;
-                               goto done;
-                       }
-                       port_fp(fp) = pd;
-                       subport_fp(fp) = pd->port_cnt++;
-                       pd->port_subpid[subport_fp(fp)] =
-                               get_pid(task_pid(current));
-                       tidcursor_fp(fp) = 0;
-                       pd->active_slaves |= 1 << subport_fp(fp);
-                       ipath_cdbg(PROC,
-                                  "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
-                                  current->comm, current->pid,
-                                  subport_fp(fp),
-                                  pd->port_comm, pid_nr(pd->port_pid),
-                                  dd->ipath_unit, pd->port_port);
-                       ret = 1;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-static int ipath_open(struct inode *in, struct file *fp)
-{
-       /* The real work is performed later in ipath_assign_port() */
-       fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
-       return fp->private_data ? 0 : -ENOMEM;
-}
-
-/* Get port early, so can set affinity prior to memory allocation */
-static int ipath_assign_port(struct file *fp,
-                             const struct ipath_user_info *uinfo)
-{
-       int ret;
-       int i_minor;
-       unsigned swmajor, swminor;
-
-       /* Check to be sure we haven't already initialized this file */
-       if (port_fp(fp)) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       /* for now, if major version is different, bail */
-       swmajor = uinfo->spu_userversion >> 16;
-       if (swmajor != IPATH_USER_SWMAJOR) {
-               ipath_dbg("User major version %d not same as driver "
-                         "major %d\n", uinfo->spu_userversion >> 16,
-                         IPATH_USER_SWMAJOR);
-               ret = -ENODEV;
-               goto done;
-       }
-
-       swminor = uinfo->spu_userversion & 0xffff;
-       if (swminor != IPATH_USER_SWMINOR)
-               ipath_dbg("User minor version %d not same as driver "
-                         "minor %d\n", swminor, IPATH_USER_SWMINOR);
-
-       mutex_lock(&ipath_mutex);
-
-       if (ipath_compatible_subports(swmajor, swminor) &&
-           uinfo->spu_subport_cnt &&
-           (ret = find_shared_port(fp, uinfo))) {
-               if (ret > 0)
-                       ret = 0;
-               goto done_chk_sdma;
-       }
-
-       i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
-       ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
-                  (long)file_inode(fp)->i_rdev, i_minor);
-
-       if (i_minor)
-               ret = find_free_port(i_minor - 1, fp, uinfo);
-       else
-               ret = find_best_unit(fp, uinfo);
-
-done_chk_sdma:
-       if (!ret) {
-               struct ipath_filedata *fd = fp->private_data;
-               const struct ipath_portdata *pd = fd->pd;
-               const struct ipath_devdata *dd = pd->port_dd;
-
-               fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
-                                                     dd->ipath_unit,
-                                                     pd->port_port,
-                                                     fd->subport);
-
-               if (!fd->pq)
-                       ret = -ENOMEM;
-       }
-
-       mutex_unlock(&ipath_mutex);
-
-done:
-       return ret;
-}
-
-
-static int ipath_do_user_init(struct file *fp,
-                             const struct ipath_user_info *uinfo)
-{
-       int ret;
-       struct ipath_portdata *pd = port_fp(fp);
-       struct ipath_devdata *dd;
-       u32 head32;
-
-       /* Subports don't need to initialize anything since master did it. */
-       if (subport_fp(fp)) {
-               ret = wait_event_interruptible(pd->port_wait,
-                       !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
-               goto done;
-       }
-
-       dd = pd->port_dd;
-
-       if (uinfo->spu_rcvhdrsize) {
-               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
-               if (ret)
-                       goto done;
-       }
-
-       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
-
-       /* some ports may get extra buffers, calculate that here */
-       if (pd->port_port <= dd->ipath_ports_extrabuf)
-               pd->port_piocnt = dd->ipath_pbufsport + 1;
-       else
-               pd->port_piocnt = dd->ipath_pbufsport;
-
-       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
-       if (pd->port_port <= dd->ipath_ports_extrabuf)
-               pd->port_pio_base = (dd->ipath_pbufsport + 1)
-                       * (pd->port_port - 1);
-       else
-               pd->port_pio_base = dd->ipath_ports_extrabuf +
-                       dd->ipath_pbufsport * (pd->port_port - 1);
-       pd->port_piobufs = dd->ipath_piobufbase +
-               pd->port_pio_base * dd->ipath_palign;
-       ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
-               " first pio %u\n", pd->port_port, pd->port_piobufs,
-               pd->port_piocnt, pd->port_pio_base);
-       ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
-
-       /*
-        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
-        * array for time being.  If pd->port_port > chip-supported,
-        * we need to do extra stuff here to handle by handling overflow
-        * through port 0, someday
-        */
-       ret = ipath_create_rcvhdrq(dd, pd);
-       if (!ret)
-               ret = ipath_create_user_egr(pd);
-       if (ret)
-               goto done;
-
-       /*
-        * set the eager head register for this port to the current values
-        * of the tail pointers, since we don't know if they were
-        * updated on last use of the port.
-        */
-       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
-       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
-       pd->port_lastrcvhdrqtail = -1;
-       ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
-               pd->port_port, head32);
-       pd->port_tidcursor = 0; /* start at beginning after open */
-
-       /* initialize poll variables... */
-       pd->port_urgent = 0;
-       pd->port_urgent_poll = 0;
-       pd->port_hdrqfull_poll = pd->port_hdrqfull;
-
-       /*
-        * Now enable the port for receive.
-        * For chips that are set to DMA the tail register to memory
-        * when they change (and when the update bit transitions from
-        * 0 to 1.  So for those chips, we turn it off and then back on.
-        * This will (very briefly) affect any other open ports, but the
-        * duration is very short, and therefore isn't an issue.  We
-        * explicitly set the in-memory tail copy to 0 beforehand, so we
-        * don't have to wait to be sure the DMA update has happened
-        * (chip resets head/tail to 0 on transition to enable).
-        */
-       set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-               &dd->ipath_rcvctrl);
-       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-               if (pd->port_rcvhdrtail_kvaddr)
-                       ipath_clear_rcvhdrtail(pd);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                       dd->ipath_rcvctrl &
-                       ~(1ULL << dd->ipath_r_tailupd_shift));
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-       /* Notify any waiting slaves */
-       if (pd->port_subport_cnt) {
-               clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-               wake_up(&pd->port_wait);
-       }
-done:
-       return ret;
-}
-
-/**
- * unlock_exptid - unlock any expected TID entries port still had in use
- * @pd: port
- *
- * We don't actually update the chip here, because we do a bulk update
- * below, using ipath_f_clear_tids.
- */
-static void unlock_expected_tids(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
-       int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-
-       ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
-                  pd->port_port);
-       for (i = port_tidbase; i < maxtid; i++) {
-               struct page *ps = dd->ipath_pageshadow[i];
-
-               if (!ps)
-                       continue;
-
-               dd->ipath_pageshadow[i] = NULL;
-               pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
-                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-               ipath_release_user_pages_on_close(&ps, 1);
-               cnt++;
-               ipath_stats.sps_pageunlocks++;
-       }
-       if (cnt)
-               ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
-                          pd->port_port, cnt);
-
-       if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
-               ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
-                          (unsigned long long) ipath_stats.sps_pagelocks,
-                          (unsigned long long)
-                          ipath_stats.sps_pageunlocks);
-}
-
-static int ipath_close(struct inode *in, struct file *fp)
-{
-       struct ipath_filedata *fd;
-       struct ipath_portdata *pd;
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       unsigned port;
-       struct pid *pid;
-
-       ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
-                  (long)in->i_rdev, fp->private_data);
-
-       mutex_lock(&ipath_mutex);
-
-       fd = fp->private_data;
-       fp->private_data = NULL;
-       pd = fd->pd;
-       if (!pd) {
-               mutex_unlock(&ipath_mutex);
-               goto bail;
-       }
-
-       dd = pd->port_dd;
-
-       /* drain user sdma queue */
-       ipath_user_sdma_queue_drain(dd, fd->pq);
-       ipath_user_sdma_queue_destroy(fd->pq);
-
-       if (--pd->port_cnt) {
-               /*
-                * XXX If the master closes the port before the slave(s),
-                * revoke the mmap for the eager receive queue so
-                * the slave(s) don't wait for receive data forever.
-                */
-               pd->active_slaves &= ~(1 << fd->subport);
-               put_pid(pd->port_subpid[fd->subport]);
-               pd->port_subpid[fd->subport] = NULL;
-               mutex_unlock(&ipath_mutex);
-               goto bail;
-       }
-       /* early; no interrupt users after this */
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       port = pd->port_port;
-       dd->ipath_pd[port] = NULL;
-       pid = pd->port_pid;
-       pd->port_pid = NULL;
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-       if (pd->port_rcvwait_to || pd->port_piowait_to
-           || pd->port_rcvnowait || pd->port_pionowait) {
-               ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
-                          "%u rcv %u, pio already\n",
-                          pd->port_port, pd->port_rcvwait_to,
-                          pd->port_piowait_to, pd->port_rcvnowait,
-                          pd->port_pionowait);
-               pd->port_rcvwait_to = pd->port_piowait_to =
-                       pd->port_rcvnowait = pd->port_pionowait = 0;
-       }
-       if (pd->port_flag) {
-               ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
-                         pd->port_port, pd->port_flag);
-               pd->port_flag = 0;
-       }
-
-       if (dd->ipath_kregbase) {
-               /* atomically clear receive enable port and intr avail. */
-               clear_bit(dd->ipath_r_portenable_shift + port,
-                         &dd->ipath_rcvctrl);
-               clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
-                         &dd->ipath_rcvctrl);
-               ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
-                       dd->ipath_rcvctrl);
-               /* and read back from chip to be sure that nothing
-                * else is in flight when we do the rest */
-               (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-               /* clean up the pkeys for this port user */
-               ipath_clean_part_key(pd, dd);
-               /*
-                * be paranoid, and never write 0's to these, just use an
-                * unused part of the port 0 tail page.  Of course,
-                * rcvhdraddr points to a large chunk of memory, so this
-                * could still trash things, but at least it won't trash
-                * page 0, and by disabling the port, it should stop "soon",
-                * even if a packet or two is in already in flight after we
-                * disabled the port.
-                */
-               ipath_write_kreg_port(dd,
-                       dd->ipath_kregs->kr_rcvhdrtailaddr, port,
-                       dd->ipath_dummy_hdrq_phys);
-               ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-                       pd->port_port, dd->ipath_dummy_hdrq_phys);
-
-               ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
-               ipath_chg_pioavailkernel(dd, pd->port_pio_base,
-                       pd->port_piocnt, 1);
-
-               dd->ipath_f_clear_tids(dd, pd->port_port);
-
-               if (dd->ipath_pageshadow)
-                       unlock_expected_tids(pd);
-               ipath_stats.sps_ports--;
-               ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
-                          pd->port_comm, pid_nr(pid),
-                          dd->ipath_unit, port);
-       }
-
-       put_pid(pid);
-       mutex_unlock(&ipath_mutex);
-       ipath_free_pddata(dd, pd); /* after releasing the mutex */
-
-bail:
-       kfree(fd);
-       return 0;
-}
-
-static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
-                          struct ipath_port_info __user *uinfo)
-{
-       struct ipath_port_info info;
-       int nup;
-       int ret;
-       size_t sz;
-
-       (void) ipath_count_units(NULL, &nup, NULL);
-       info.num_active = nup;
-       info.unit = pd->port_dd->ipath_unit;
-       info.port = pd->port_port;
-       info.subport = subport;
-       /* Don't return new fields if old library opened the port. */
-       if (ipath_supports_subports(pd->userversion >> 16,
-                                   pd->userversion & 0xffff)) {
-               /* Number of user ports available for this device. */
-               info.num_ports = pd->port_dd->ipath_cfgports - 1;
-               info.num_subports = pd->port_subport_cnt;
-               sz = sizeof(info);
-       } else
-               sz = sizeof(info) - 2 * sizeof(u16);
-
-       if (copy_to_user(uinfo, &info, sz)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int ipath_get_slave_info(struct ipath_portdata *pd,
-                               void __user *slave_mask_addr)
-{
-       int ret = 0;
-
-       if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
-               ret = -EFAULT;
-       return ret;
-}
-
-static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
-                                  u32 __user *inflightp)
-{
-       const u32 val = ipath_user_sdma_inflight_counter(pq);
-
-       if (put_user(val, inflightp))
-               return -EFAULT;
-
-       return 0;
-}
-
-static int ipath_sdma_get_complete(struct ipath_devdata *dd,
-                                  struct ipath_user_sdma_queue *pq,
-                                  u32 __user *completep)
-{
-       u32 val;
-       int err;
-
-       err = ipath_user_sdma_make_progress(dd, pq);
-       if (err < 0)
-               return err;
-
-       val = ipath_user_sdma_complete_counter(pq);
-       if (put_user(val, completep))
-               return -EFAULT;
-
-       return 0;
-}
-
-static ssize_t ipath_write(struct file *fp, const char __user *data,
-                          size_t count, loff_t *off)
-{
-       const struct ipath_cmd __user *ucmd;
-       struct ipath_portdata *pd;
-       const void __user *src;
-       size_t consumed, copy;
-       struct ipath_cmd cmd;
-       ssize_t ret = 0;
-       void *dest;
-
-       if (count < sizeof(cmd.type)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ucmd = (const struct ipath_cmd __user *) data;
-
-       if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       consumed = sizeof(cmd.type);
-
-       switch (cmd.type) {
-       case IPATH_CMD_ASSIGN_PORT:
-       case __IPATH_CMD_USER_INIT:
-       case IPATH_CMD_USER_INIT:
-               copy = sizeof(cmd.cmd.user_info);
-               dest = &cmd.cmd.user_info;
-               src = &ucmd->cmd.user_info;
-               break;
-       case IPATH_CMD_RECV_CTRL:
-               copy = sizeof(cmd.cmd.recv_ctrl);
-               dest = &cmd.cmd.recv_ctrl;
-               src = &ucmd->cmd.recv_ctrl;
-               break;
-       case IPATH_CMD_PORT_INFO:
-               copy = sizeof(cmd.cmd.port_info);
-               dest = &cmd.cmd.port_info;
-               src = &ucmd->cmd.port_info;
-               break;
-       case IPATH_CMD_TID_UPDATE:
-       case IPATH_CMD_TID_FREE:
-               copy = sizeof(cmd.cmd.tid_info);
-               dest = &cmd.cmd.tid_info;
-               src = &ucmd->cmd.tid_info;
-               break;
-       case IPATH_CMD_SET_PART_KEY:
-               copy = sizeof(cmd.cmd.part_key);
-               dest = &cmd.cmd.part_key;
-               src = &ucmd->cmd.part_key;
-               break;
-       case __IPATH_CMD_SLAVE_INFO:
-               copy = sizeof(cmd.cmd.slave_mask_addr);
-               dest = &cmd.cmd.slave_mask_addr;
-               src = &ucmd->cmd.slave_mask_addr;
-               break;
-       case IPATH_CMD_PIOAVAILUPD:     // force an update of PIOAvail reg
-               copy = 0;
-               src = NULL;
-               dest = NULL;
-               break;
-       case IPATH_CMD_POLL_TYPE:
-               copy = sizeof(cmd.cmd.poll_type);
-               dest = &cmd.cmd.poll_type;
-               src = &ucmd->cmd.poll_type;
-               break;
-       case IPATH_CMD_ARMLAUNCH_CTRL:
-               copy = sizeof(cmd.cmd.armlaunch_ctrl);
-               dest = &cmd.cmd.armlaunch_ctrl;
-               src = &ucmd->cmd.armlaunch_ctrl;
-               break;
-       case IPATH_CMD_SDMA_INFLIGHT:
-               copy = sizeof(cmd.cmd.sdma_inflight);
-               dest = &cmd.cmd.sdma_inflight;
-               src = &ucmd->cmd.sdma_inflight;
-               break;
-       case IPATH_CMD_SDMA_COMPLETE:
-               copy = sizeof(cmd.cmd.sdma_complete);
-               dest = &cmd.cmd.sdma_complete;
-               src = &ucmd->cmd.sdma_complete;
-               break;
-       default:
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (copy) {
-               if ((count - consumed) < copy) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               if (copy_from_user(dest, src, copy)) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-
-               consumed += copy;
-       }
-
-       pd = port_fp(fp);
-       if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
-               cmd.type != IPATH_CMD_ASSIGN_PORT) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       switch (cmd.type) {
-       case IPATH_CMD_ASSIGN_PORT:
-               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               break;
-       case __IPATH_CMD_USER_INIT:
-               /* backwards compatibility, get port first */
-               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               /* and fall through to current version. */
-       case IPATH_CMD_USER_INIT:
-               ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               ret = ipath_get_base_info(
-                       fp, (void __user *) (unsigned long)
-                       cmd.cmd.user_info.spu_base_info,
-                       cmd.cmd.user_info.spu_base_info_size);
-               break;
-       case IPATH_CMD_RECV_CTRL:
-               ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
-               break;
-       case IPATH_CMD_PORT_INFO:
-               ret = ipath_port_info(pd, subport_fp(fp),
-                                     (struct ipath_port_info __user *)
-                                     (unsigned long) cmd.cmd.port_info);
-               break;
-       case IPATH_CMD_TID_UPDATE:
-               ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
-               break;
-       case IPATH_CMD_TID_FREE:
-               ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
-               break;
-       case IPATH_CMD_SET_PART_KEY:
-               ret = ipath_set_part_key(pd, cmd.cmd.part_key);
-               break;
-       case __IPATH_CMD_SLAVE_INFO:
-               ret = ipath_get_slave_info(pd,
-                                          (void __user *) (unsigned long)
-                                          cmd.cmd.slave_mask_addr);
-               break;
-       case IPATH_CMD_PIOAVAILUPD:
-               ipath_force_pio_avail_update(pd->port_dd);
-               break;
-       case IPATH_CMD_POLL_TYPE:
-               pd->poll_type = cmd.cmd.poll_type;
-               break;
-       case IPATH_CMD_ARMLAUNCH_CTRL:
-               if (cmd.cmd.armlaunch_ctrl)
-                       ipath_enable_armlaunch(pd->port_dd);
-               else
-                       ipath_disable_armlaunch(pd->port_dd);
-               break;
-       case IPATH_CMD_SDMA_INFLIGHT:
-               ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
-                                             (u32 __user *) (unsigned long)
-                                             cmd.cmd.sdma_inflight);
-               break;
-       case IPATH_CMD_SDMA_COMPLETE:
-               ret = ipath_sdma_get_complete(pd->port_dd,
-                                             user_sdma_queue_fp(fp),
-                                             (u32 __user *) (unsigned long)
-                                             cmd.cmd.sdma_complete);
-               break;
-       }
-
-       if (ret >= 0)
-               ret = consumed;
-
-bail:
-       return ret;
-}
-
-static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *filp = iocb->ki_filp;
-       struct ipath_filedata *fp = filp->private_data;
-       struct ipath_portdata *pd = port_fp(filp);
-       struct ipath_user_sdma_queue *pq = fp->pq;
-
-       if (!iter_is_iovec(from) || !from->nr_segs)
-               return -EINVAL;
-
-       return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
-}
-
-static struct class *ipath_class;
-
-static int init_cdev(int minor, char *name, const struct file_operations *fops,
-                    struct cdev **cdevp, struct device **devp)
-{
-       const dev_t dev = MKDEV(IPATH_MAJOR, minor);
-       struct cdev *cdev = NULL;
-       struct device *device = NULL;
-       int ret;
-
-       cdev = cdev_alloc();
-       if (!cdev) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate cdev for minor %d, %s\n",
-                      minor, name);
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       cdev->owner = THIS_MODULE;
-       cdev->ops = fops;
-       kobject_set_name(&cdev->kobj, name);
-
-       ret = cdev_add(cdev, dev, 1);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not add cdev for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto err_cdev;
-       }
-
-       device = device_create(ipath_class, NULL, dev, NULL, name);
-
-       if (IS_ERR(device)) {
-               ret = PTR_ERR(device);
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-                      "device for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto err_cdev;
-       }
-
-       goto done;
-
-err_cdev:
-       cdev_del(cdev);
-       cdev = NULL;
-
-done:
-       if (ret >= 0) {
-               *cdevp = cdev;
-               *devp = device;
-       } else {
-               *cdevp = NULL;
-               *devp = NULL;
-       }
-
-       return ret;
-}
-
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-                   struct cdev **cdevp, struct device **devp)
-{
-       return init_cdev(minor, name, fops, cdevp, devp);
-}
-
-static void cleanup_cdev(struct cdev **cdevp,
-                        struct device **devp)
-{
-       struct device *dev = *devp;
-
-       if (dev) {
-               device_unregister(dev);
-               *devp = NULL;
-       }
-
-       if (*cdevp) {
-               cdev_del(*cdevp);
-               *cdevp = NULL;
-       }
-}
-
-void ipath_cdev_cleanup(struct cdev **cdevp,
-                       struct device **devp)
-{
-       cleanup_cdev(cdevp, devp);
-}
-
-static struct cdev *wildcard_cdev;
-static struct device *wildcard_dev;
-
-static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
-
-static int user_init(void)
-{
-       int ret;
-
-       ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
-                      "chrdev region (err %d)\n", -ret);
-               goto done;
-       }
-
-       ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
-
-       if (IS_ERR(ipath_class)) {
-               ret = PTR_ERR(ipath_class);
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-                      "device class (err %d)\n", -ret);
-               goto bail;
-       }
-
-       goto done;
-bail:
-       unregister_chrdev_region(dev, IPATH_NMINORS);
-done:
-       return ret;
-}
-
-static void user_cleanup(void)
-{
-       if (ipath_class) {
-               class_destroy(ipath_class);
-               ipath_class = NULL;
-       }
-
-       unregister_chrdev_region(dev, IPATH_NMINORS);
-}
-
-static atomic_t user_count = ATOMIC_INIT(0);
-static atomic_t user_setup = ATOMIC_INIT(0);
-
-int ipath_user_add(struct ipath_devdata *dd)
-{
-       char name[10];
-       int ret;
-
-       if (atomic_inc_return(&user_count) == 1) {
-               ret = user_init();
-               if (ret < 0) {
-                       ipath_dev_err(dd, "Unable to set up user support: "
-                                     "error %d\n", -ret);
-                       goto bail;
-               }
-               ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
-                               &wildcard_dev);
-               if (ret < 0) {
-                       ipath_dev_err(dd, "Could not create wildcard "
-                                     "minor: error %d\n", -ret);
-                       goto bail_user;
-               }
-
-               atomic_set(&user_setup, 1);
-       }
-
-       snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
-
-       ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
-                       &dd->user_cdev, &dd->user_dev);
-       if (ret < 0)
-               ipath_dev_err(dd, "Could not create user minor %d, %s\n",
-                             dd->ipath_unit + 1, name);
-
-       goto bail;
-
-bail_user:
-       user_cleanup();
-bail:
-       return ret;
-}
-
-void ipath_user_remove(struct ipath_devdata *dd)
-{
-       cleanup_cdev(&dd->user_cdev, &dd->user_dev);
-
-       if (atomic_dec_return(&user_count) == 0) {
-               if (atomic_read(&user_setup) == 0)
-                       goto bail;
-
-               cleanup_cdev(&wildcard_cdev, &wildcard_dev);
-               user_cleanup();
-
-               atomic_set(&user_setup, 0);
-       }
-bail:
-       return;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c
deleted file mode 100644 (file)
index 476fcdf..0000000
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/slab.h>
-
-#include "ipath_kernel.h"
-
-#define IPATHFS_MAGIC 0x726a77
-
-static struct super_block *ipath_super;
-
-static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
-                        umode_t mode, const struct file_operations *fops,
-                        void *data)
-{
-       int error;
-       struct inode *inode = new_inode(dir->i_sb);
-
-       if (!inode) {
-               error = -EPERM;
-               goto bail;
-       }
-
-       inode->i_ino = get_next_ino();
-       inode->i_mode = mode;
-       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       inode->i_private = data;
-       if (S_ISDIR(mode)) {
-               inode->i_op = &simple_dir_inode_operations;
-               inc_nlink(inode);
-               inc_nlink(dir);
-       }
-
-       inode->i_fop = fops;
-
-       d_instantiate(dentry, inode);
-       error = 0;
-
-bail:
-       return error;
-}
-
-static int create_file(const char *name, umode_t mode,
-                      struct dentry *parent, struct dentry **dentry,
-                      const struct file_operations *fops, void *data)
-{
-       int error;
-
-       inode_lock(d_inode(parent));
-       *dentry = lookup_one_len(name, parent, strlen(name));
-       if (!IS_ERR(*dentry))
-               error = ipathfs_mknod(d_inode(parent), *dentry,
-                                     mode, fops, data);
-       else
-               error = PTR_ERR(*dentry);
-       inode_unlock(d_inode(parent));
-
-       return error;
-}
-
-static ssize_t atomic_stats_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
-                                      sizeof ipath_stats);
-}
-
-static const struct file_operations atomic_stats_ops = {
-       .read = atomic_stats_read,
-       .llseek = default_llseek,
-};
-
-static ssize_t atomic_counters_read(struct file *file, char __user *buf,
-                                   size_t count, loff_t *ppos)
-{
-       struct infinipath_counters counters;
-       struct ipath_devdata *dd;
-
-       dd = file_inode(file)->i_private;
-       dd->ipath_f_read_counters(dd, &counters);
-
-       return simple_read_from_buffer(buf, count, ppos, &counters,
-                                      sizeof counters);
-}
-
-static const struct file_operations atomic_counters_ops = {
-       .read = atomic_counters_read,
-       .llseek = default_llseek,
-};
-
-static ssize_t flash_read(struct file *file, char __user *buf,
-                         size_t count, loff_t *ppos)
-{
-       struct ipath_devdata *dd;
-       ssize_t ret;
-       loff_t pos;
-       char *tmp;
-
-       pos = *ppos;
-
-       if ( pos < 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (pos >= sizeof(struct ipath_flash)) {
-               ret = 0;
-               goto bail;
-       }
-
-       if (count > sizeof(struct ipath_flash) - pos)
-               count = sizeof(struct ipath_flash) - pos;
-
-       tmp = kmalloc(count, GFP_KERNEL);
-       if (!tmp) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       dd = file_inode(file)->i_private;
-       if (ipath_eeprom_read(dd, pos, tmp, count)) {
-               ipath_dev_err(dd, "failed to read from flash\n");
-               ret = -ENXIO;
-               goto bail_tmp;
-       }
-
-       if (copy_to_user(buf, tmp, count)) {
-               ret = -EFAULT;
-               goto bail_tmp;
-       }
-
-       *ppos = pos + count;
-       ret = count;
-
-bail_tmp:
-       kfree(tmp);
-
-bail:
-       return ret;
-}
-
-static ssize_t flash_write(struct file *file, const char __user *buf,
-                          size_t count, loff_t *ppos)
-{
-       struct ipath_devdata *dd;
-       ssize_t ret;
-       loff_t pos;
-       char *tmp;
-
-       pos = *ppos;
-
-       if (pos != 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (count != sizeof(struct ipath_flash)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       tmp = memdup_user(buf, count);
-       if (IS_ERR(tmp))
-               return PTR_ERR(tmp);
-
-       dd = file_inode(file)->i_private;
-       if (ipath_eeprom_write(dd, pos, tmp, count)) {
-               ret = -ENXIO;
-               ipath_dev_err(dd, "failed to write to flash\n");
-               goto bail_tmp;
-       }
-
-       *ppos = pos + count;
-       ret = count;
-
-bail_tmp:
-       kfree(tmp);
-
-bail:
-       return ret;
-}
-
-static const struct file_operations flash_ops = {
-       .read = flash_read,
-       .write = flash_write,
-       .llseek = default_llseek,
-};
-
-static int create_device_files(struct super_block *sb,
-                              struct ipath_devdata *dd)
-{
-       struct dentry *dir, *tmp;
-       char unit[10];
-       int ret;
-
-       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-       ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
-                         &simple_dir_operations, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-       ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
-                         &atomic_counters_ops, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s/atomic_counters) "
-                      "failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-       ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
-                         &flash_ops, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s/flash) "
-                      "failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-static int remove_file(struct dentry *parent, char *name)
-{
-       struct dentry *tmp;
-       int ret;
-
-       tmp = lookup_one_len(name, parent, strlen(name));
-
-       if (IS_ERR(tmp)) {
-               ret = PTR_ERR(tmp);
-               goto bail;
-       }
-
-       spin_lock(&tmp->d_lock);
-       if (simple_positive(tmp)) {
-               dget_dlock(tmp);
-               __d_drop(tmp);
-               spin_unlock(&tmp->d_lock);
-               simple_unlink(d_inode(parent), tmp);
-       } else
-               spin_unlock(&tmp->d_lock);
-
-       ret = 0;
-bail:
-       /*
-        * We don't expect clients to care about the return value, but
-        * it's there if they need it.
-        */
-       return ret;
-}
-
-static int remove_device_files(struct super_block *sb,
-                              struct ipath_devdata *dd)
-{
-       struct dentry *dir, *root;
-       char unit[10];
-       int ret;
-
-       root = dget(sb->s_root);
-       inode_lock(d_inode(root));
-       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-       dir = lookup_one_len(unit, root, strlen(unit));
-
-       if (IS_ERR(dir)) {
-               ret = PTR_ERR(dir);
-               printk(KERN_ERR "Lookup of %s failed\n", unit);
-               goto bail;
-       }
-
-       remove_file(dir, "flash");
-       remove_file(dir, "atomic_counters");
-       d_delete(dir);
-       ret = simple_rmdir(d_inode(root), dir);
-
-bail:
-       inode_unlock(d_inode(root));
-       dput(root);
-       return ret;
-}
-
-static int ipathfs_fill_super(struct super_block *sb, void *data,
-                             int silent)
-{
-       struct ipath_devdata *dd, *tmp;
-       unsigned long flags;
-       int ret;
-
-       static struct tree_descr files[] = {
-               [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
-               {""},
-       };
-
-       ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
-       if (ret) {
-               printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
-               spin_unlock_irqrestore(&ipath_devs_lock, flags);
-               ret = create_device_files(sb, dd);
-               if (ret)
-                       goto bail;
-               spin_lock_irqsave(&ipath_devs_lock, flags);
-       }
-
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-bail:
-       return ret;
-}
-
-static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
-                       int flags, const char *dev_name, void *data)
-{
-       struct dentry *ret;
-       ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
-       if (!IS_ERR(ret))
-               ipath_super = ret->d_sb;
-       return ret;
-}
-
-static void ipathfs_kill_super(struct super_block *s)
-{
-       kill_litter_super(s);
-       ipath_super = NULL;
-}
-
-int ipathfs_add_device(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (ipath_super == NULL) {
-               ret = 0;
-               goto bail;
-       }
-
-       ret = create_device_files(ipath_super, dd);
-
-bail:
-       return ret;
-}
-
-int ipathfs_remove_device(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (ipath_super == NULL) {
-               ret = 0;
-               goto bail;
-       }
-
-       ret = remove_device_files(ipath_super, dd);
-
-bail:
-       return ret;
-}
-
-static struct file_system_type ipathfs_fs_type = {
-       .owner =        THIS_MODULE,
-       .name =         "ipathfs",
-       .mount =        ipathfs_mount,
-       .kill_sb =      ipathfs_kill_super,
-};
-MODULE_ALIAS_FS("ipathfs");
-
-int __init ipath_init_ipathfs(void)
-{
-       return register_filesystem(&ipathfs_fs_type);
-}
-
-void __exit ipath_exit_ipathfs(void)
-{
-       unregister_filesystem(&ipathfs_fs_type);
-}
diff --git a/drivers/staging/rdma/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c
deleted file mode 100644 (file)
index 5f13572..0000000
+++ /dev/null
@@ -1,1939 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains all of the code that is specific to the InfiniPath
- * HT chip.
- */
-
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/htirq.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_kernel.h"
-#include "ipath_registers.h"
-
-static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
-
-
-/*
- * This lists the InfiniPath registers, in the actual chip layout.
- * This structure should never be directly accessed.
- *
- * The names are in InterCap form because they're taken straight from
- * the chip specification.  Since they're only used in this file, they
- * don't pollute the rest of the source.
-*/
-
-struct _infinipath_do_not_use_kernel_regs {
-       unsigned long long Revision;
-       unsigned long long Control;
-       unsigned long long PageAlign;
-       unsigned long long PortCnt;
-       unsigned long long DebugPortSelect;
-       unsigned long long DebugPort;
-       unsigned long long SendRegBase;
-       unsigned long long UserRegBase;
-       unsigned long long CounterRegBase;
-       unsigned long long Scratch;
-       unsigned long long ReservedMisc1;
-       unsigned long long InterruptConfig;
-       unsigned long long IntBlocked;
-       unsigned long long IntMask;
-       unsigned long long IntStatus;
-       unsigned long long IntClear;
-       unsigned long long ErrorMask;
-       unsigned long long ErrorStatus;
-       unsigned long long ErrorClear;
-       unsigned long long HwErrMask;
-       unsigned long long HwErrStatus;
-       unsigned long long HwErrClear;
-       unsigned long long HwDiagCtrl;
-       unsigned long long MDIO;
-       unsigned long long IBCStatus;
-       unsigned long long IBCCtrl;
-       unsigned long long ExtStatus;
-       unsigned long long ExtCtrl;
-       unsigned long long GPIOOut;
-       unsigned long long GPIOMask;
-       unsigned long long GPIOStatus;
-       unsigned long long GPIOClear;
-       unsigned long long RcvCtrl;
-       unsigned long long RcvBTHQP;
-       unsigned long long RcvHdrSize;
-       unsigned long long RcvHdrCnt;
-       unsigned long long RcvHdrEntSize;
-       unsigned long long RcvTIDBase;
-       unsigned long long RcvTIDCnt;
-       unsigned long long RcvEgrBase;
-       unsigned long long RcvEgrCnt;
-       unsigned long long RcvBufBase;
-       unsigned long long RcvBufSize;
-       unsigned long long RxIntMemBase;
-       unsigned long long RxIntMemSize;
-       unsigned long long RcvPartitionKey;
-       unsigned long long ReservedRcv[10];
-       unsigned long long SendCtrl;
-       unsigned long long SendPIOBufBase;
-       unsigned long long SendPIOSize;
-       unsigned long long SendPIOBufCnt;
-       unsigned long long SendPIOAvailAddr;
-       unsigned long long TxIntMemBase;
-       unsigned long long TxIntMemSize;
-       unsigned long long ReservedSend[9];
-       unsigned long long SendBufferError;
-       unsigned long long SendBufferErrorCONT1;
-       unsigned long long SendBufferErrorCONT2;
-       unsigned long long SendBufferErrorCONT3;
-       unsigned long long ReservedSBE[4];
-       unsigned long long RcvHdrAddr0;
-       unsigned long long RcvHdrAddr1;
-       unsigned long long RcvHdrAddr2;
-       unsigned long long RcvHdrAddr3;
-       unsigned long long RcvHdrAddr4;
-       unsigned long long RcvHdrAddr5;
-       unsigned long long RcvHdrAddr6;
-       unsigned long long RcvHdrAddr7;
-       unsigned long long RcvHdrAddr8;
-       unsigned long long ReservedRHA[7];
-       unsigned long long RcvHdrTailAddr0;
-       unsigned long long RcvHdrTailAddr1;
-       unsigned long long RcvHdrTailAddr2;
-       unsigned long long RcvHdrTailAddr3;
-       unsigned long long RcvHdrTailAddr4;
-       unsigned long long RcvHdrTailAddr5;
-       unsigned long long RcvHdrTailAddr6;
-       unsigned long long RcvHdrTailAddr7;
-       unsigned long long RcvHdrTailAddr8;
-       unsigned long long ReservedRHTA[7];
-       unsigned long long Sync;        /* Software only */
-       unsigned long long Dump;        /* Software only */
-       unsigned long long SimVer;      /* Software only */
-       unsigned long long ReservedSW[5];
-       unsigned long long SerdesConfig0;
-       unsigned long long SerdesConfig1;
-       unsigned long long SerdesStatus;
-       unsigned long long XGXSConfig;
-       unsigned long long ReservedSW2[4];
-};
-
-struct _infinipath_do_not_use_counters {
-       __u64 LBIntCnt;
-       __u64 LBFlowStallCnt;
-       __u64 Reserved1;
-       __u64 TxUnsupVLErrCnt;
-       __u64 TxDataPktCnt;
-       __u64 TxFlowPktCnt;
-       __u64 TxDwordCnt;
-       __u64 TxLenErrCnt;
-       __u64 TxMaxMinLenErrCnt;
-       __u64 TxUnderrunCnt;
-       __u64 TxFlowStallCnt;
-       __u64 TxDroppedPktCnt;
-       __u64 RxDroppedPktCnt;
-       __u64 RxDataPktCnt;
-       __u64 RxFlowPktCnt;
-       __u64 RxDwordCnt;
-       __u64 RxLenErrCnt;
-       __u64 RxMaxMinLenErrCnt;
-       __u64 RxICRCErrCnt;
-       __u64 RxVCRCErrCnt;
-       __u64 RxFlowCtrlErrCnt;
-       __u64 RxBadFormatCnt;
-       __u64 RxLinkProblemCnt;
-       __u64 RxEBPCnt;
-       __u64 RxLPCRCErrCnt;
-       __u64 RxBufOvflCnt;
-       __u64 RxTIDFullErrCnt;
-       __u64 RxTIDValidErrCnt;
-       __u64 RxPKeyMismatchCnt;
-       __u64 RxP0HdrEgrOvflCnt;
-       __u64 RxP1HdrEgrOvflCnt;
-       __u64 RxP2HdrEgrOvflCnt;
-       __u64 RxP3HdrEgrOvflCnt;
-       __u64 RxP4HdrEgrOvflCnt;
-       __u64 RxP5HdrEgrOvflCnt;
-       __u64 RxP6HdrEgrOvflCnt;
-       __u64 RxP7HdrEgrOvflCnt;
-       __u64 RxP8HdrEgrOvflCnt;
-       __u64 Reserved6;
-       __u64 Reserved7;
-       __u64 IBStatusChangeCnt;
-       __u64 IBLinkErrRecoveryCnt;
-       __u64 IBLinkDownedCnt;
-       __u64 IBSymbolErrCnt;
-};
-
-#define IPATH_KREG_OFFSET(field) (offsetof( \
-       struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
-#define IPATH_CREG_OFFSET(field) (offsetof( \
-       struct _infinipath_do_not_use_counters, field) / sizeof(u64))
-
-static const struct ipath_kregs ipath_ht_kregs = {
-       .kr_control = IPATH_KREG_OFFSET(Control),
-       .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
-       .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
-       .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
-       .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
-       .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
-       .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
-       .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
-       .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
-       .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
-       .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
-       .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
-       .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
-       .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
-       .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
-       .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
-       .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
-       .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
-       .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
-       .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
-       .kr_intclear = IPATH_KREG_OFFSET(IntClear),
-       .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
-       .kr_intmask = IPATH_KREG_OFFSET(IntMask),
-       .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
-       .kr_mdio = IPATH_KREG_OFFSET(MDIO),
-       .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
-       .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
-       .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
-       .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
-       .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
-       .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
-       .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
-       .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
-       .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
-       .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
-       .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
-       .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
-       .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
-       .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
-       .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
-       .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
-       .kr_revision = IPATH_KREG_OFFSET(Revision),
-       .kr_scratch = IPATH_KREG_OFFSET(Scratch),
-       .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
-       .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
-       .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
-       .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
-       .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
-       .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
-       .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
-       .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
-       .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
-       .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
-       .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
-       .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
-       .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
-       .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
-       /*
-        * These should not be used directly via ipath_write_kreg64(),
-        * use them with ipath_write_kreg64_port(),
-        */
-       .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
-       .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
-};
-
-static const struct ipath_cregs ipath_ht_cregs = {
-       .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
-       .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
-       .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
-       .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
-       .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
-       .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
-       .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
-       .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
-       .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
-       .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
-       .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
-       .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
-       /* calc from Reg_CounterRegBase + offset */
-       .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
-       .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
-       .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
-       .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
-       .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
-       .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
-       .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
-       .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
-       .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
-       .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
-       .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
-       .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
-       .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
-       .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
-       .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
-       .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
-       .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
-       .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
-       .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
-       .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
-       .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
-};
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVURG_SHIFT 0
-#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVAVAIL_SHIFT 12
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
-#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
-#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
-#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
-#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
-#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
-#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
-#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
-#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
-#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
-#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
-#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
-#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
-#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
-
-#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
-#define IBA6110_IBCS_LINKSTATE_SHIFT 4
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_FREQSEL 0x2
-#define INFINIPATH_EXTS_SERDESSEL 0x4
-#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
-#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
-
-
-/* TID entries (memory), HT-only */
-#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
-#define INFINIPATH_RT_VALID 0x8000000000000000ULL
-#define INFINIPATH_RT_ADDR_SHIFT 0
-#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
-#define INFINIPATH_RT_BUFSIZE_SHIFT 48
-
-#define INFINIPATH_R_INTRAVAIL_SHIFT 16
-#define INFINIPATH_R_TAILUPD_SHIFT 31
-
-/* kr_xgxsconfig bits */
-#define INFINIPATH_XGXS_RESET          0x7ULL
-
-/*
- * masks and bits that are different in different chips, or present only
- * in one
- */
-static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
-    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
-static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
-    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
-
-static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
-
-#define _IPATH_GPIO_SDA_NUM 1
-#define _IPATH_GPIO_SCL_NUM 0
-
-#define IPATH_GPIO_SDA \
-       (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-#define IPATH_GPIO_SCL \
-       (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-
-/* keep the code below somewhat more readable; not used elsewhere */
-#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
-                               infinipath_hwe_htclnkabyte1crcerr)
-#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |    \
-                               infinipath_hwe_htclnkbbyte1crcerr)
-#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
-                               infinipath_hwe_htclnkbbyte0crcerr)
-#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |    \
-                               infinipath_hwe_htclnkbbyte1crcerr)
-
-static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
-                         char *msg, size_t msgl)
-{
-       char bitsmsg[64];
-       ipath_err_t crcbits = hwerrs &
-               (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
-       /* don't check if 8bit HT */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-               crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
-       /* don't check if 8bit HT */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-               crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
-       /*
-        * we'll want to ignore link errors on link that is
-        * not in use, if any.  For now, complain about both
-        */
-       if (crcbits) {
-               u16 ctrl0, ctrl1;
-               snprintf(bitsmsg, sizeof bitsmsg,
-                        "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
-                        !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
-                        "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
-                                   ? "1 (B)" : "0+1 (A+B)"),
-                        !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
-                        : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
-                           "0+1"), (unsigned long long) crcbits);
-               strlcat(msg, bitsmsg, msgl);
-
-               /*
-                * print extra info for debugging.  slave/primary
-                * config word 4, 8 (link control 0, 1)
-                */
-
-               if (pci_read_config_word(dd->pcidev,
-                                        dd->ipath_ht_slave_off + 0x4,
-                                        &ctrl0))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkctrl0 of slave/primary "
-                                "config block\n");
-               else if (!(ctrl0 & 1 << 6))
-                       /* not if EOC bit set */
-                       ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
-                                 ((ctrl0 >> 8) & 7) ? " CRC" : "",
-                                 ((ctrl0 >> 4) & 1) ? "linkfail" :
-                                 "");
-               if (pci_read_config_word(dd->pcidev,
-                                        dd->ipath_ht_slave_off + 0x8,
-                                        &ctrl1))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkctrl1 of slave/primary "
-                                "config block\n");
-               else if (!(ctrl1 & 1 << 6))
-                       /* not if EOC bit set */
-                       ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
-                                 ((ctrl1 >> 8) & 7) ? " CRC" : "",
-                                 ((ctrl1 >> 4) & 1) ? "linkfail" :
-                                 "");
-
-               /* disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~crcbits;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-               ipath_dbg("HT crc errs: %s\n", msg);
-       } else
-               ipath_dbg("ignoring HT crc errors 0x%llx, "
-                         "not in use\n", (unsigned long long)
-                         (hwerrs & (_IPATH_HTLINK0_CRCBITS |
-                                    _IPATH_HTLINK1_CRCBITS)));
-}
-
-/* 6110 specific hardware errors... */
-static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
-       INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
-       INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
-       INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
-       INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
-       INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
-       INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
-       INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
-       INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
-};
-
-#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
-                       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
-                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
-#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
-                         << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
-
-static void ipath_ht_txe_recover(struct ipath_devdata *dd)
-{
-       ++ipath_stats.sps_txeparity;
-       dev_info(&dd->pcidev->dev,
-               "Recovering from TXE PIO parity error\n");
-}
-
-
-/**
- * ipath_ht_handle_hwerrors - display hardware errors.
- * @dd: the infinipath device
- * @msg: the output buffer
- * @msgl: the size of the output buffer
- *
- * Use same msg buffer as regular errors to avoid excessive stack
- * use.  Most hardware errors are catastrophic, but for right now,
- * we'll print them and continue.  We reuse the same message buffer as
- * ipath_handle_errors() to avoid excessive stack usage.
- */
-static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
-                                    size_t msgl)
-{
-       ipath_err_t hwerrs;
-       u32 bits, ctrl;
-       int isfatal = 0;
-       char bitsmsg[64];
-       int log_idx;
-
-       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-
-       if (!hwerrs) {
-               ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
-               /*
-                * better than printing cofusing messages
-                * This seems to be related to clearing the crc error, or
-                * the pll error during init.
-                */
-               goto bail;
-       } else if (hwerrs == -1LL) {
-               ipath_dev_err(dd, "Read of hardware error status failed "
-                             "(all bits set); ignoring\n");
-               goto bail;
-       }
-       ipath_stats.sps_hwerrs++;
-
-       /* Always clear the error status register, except MEMBISTFAIL,
-        * regardless of whether we continue or stop using the chip.
-        * We want that set so we know it failed, even across driver reload.
-        * We'll still ignore it in the hwerrmask.  We do this partly for
-        * diagnostics, but also for support */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
-
-       hwerrs &= dd->ipath_hwerrmask;
-
-       /* We log some errors to EEPROM, check if we have any of those. */
-       for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
-               if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
-                       ipath_inc_eeprom_err(dd, log_idx, 1);
-
-       /*
-        * make sure we get this much out, unless told to be quiet,
-        * it's a parity error we may recover from,
-        * or it's occurred within the last 5 seconds
-        */
-       if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
-               RXE_EAGER_PARITY)) ||
-               (ipath_debug & __IPATH_VERBDBG))
-               dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
-                        "(cleared)\n", (unsigned long long) hwerrs);
-       dd->ipath_lasthwerror |= hwerrs;
-
-       if (hwerrs & ~dd->ipath_hwe_bitsextant)
-               ipath_dev_err(dd, "hwerror interrupt with unknown errors "
-                             "%llx set\n", (unsigned long long)
-                             (hwerrs & ~dd->ipath_hwe_bitsextant));
-
-       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-       if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
-               /*
-                * parity errors in send memory are recoverable,
-                * just cancel the send (if indicated in * sendbuffererror),
-                * count the occurrence, unfreeze (if no other handled
-                * hardware error bits are set), and continue. They can
-                * occur if a processor speculative read is done to the PIO
-                * buffer while we are sending a packet, for example.
-                */
-               if (hwerrs & TXE_PIO_PARITY) {
-                       ipath_ht_txe_recover(dd);
-                       hwerrs &= ~TXE_PIO_PARITY;
-               }
-
-               if (!hwerrs) {
-                       ipath_dbg("Clearing freezemode on ignored or "
-                                 "recovered hardware error\n");
-                       ipath_clear_freeze(dd);
-               }
-       }
-
-       *msg = '\0';
-
-       /*
-        * may someday want to decode into which bits are which
-        * functional area for parity errors, etc.
-        */
-       if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
-                     << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
-
-       ipath_format_hwerrors(hwerrs,
-                             ipath_6110_hwerror_msgs,
-                             ARRAY_SIZE(ipath_6110_hwerror_msgs),
-                             msg, msgl);
-
-       if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
-               hwerr_crcbits(dd, hwerrs, msg, msgl);
-
-       if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
-               strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
-                       msgl);
-               /* ignore from now on, so disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
-                        INFINIPATH_HWE_COREPLL_RFSLIP |        \
-                        INFINIPATH_HWE_HTBPLL_FBSLIP |         \
-                        INFINIPATH_HWE_HTBPLL_RFSLIP |         \
-                        INFINIPATH_HWE_HTAPLL_FBSLIP |         \
-                        INFINIPATH_HWE_HTAPLL_RFSLIP)
-
-       if (hwerrs & _IPATH_PLL_FAIL) {
-               snprintf(bitsmsg, sizeof bitsmsg,
-                        "[PLL failed (%llx), InfiniPath hardware unusable]",
-                        (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
-               strlcat(msg, bitsmsg, msgl);
-               /* ignore from now on, so disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-
-       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
-               /*
-                * If it occurs, it is left masked since the eternal
-                * interface is unused
-                */
-               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-
-       if (hwerrs) {
-               /*
-                * if any set that we aren't ignoring; only
-                * make the complaint once, in case it's stuck
-                * or recurring, and we get here multiple
-                * times.
-                * force link down, so switch knows, and
-                * LEDs are turned off
-                */
-               if (dd->ipath_flags & IPATH_INITTED) {
-                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-                       ipath_setup_ht_setextled(dd,
-                               INFINIPATH_IBCS_L_STATE_DOWN,
-                               INFINIPATH_IBCS_LT_STATE_DISABLED);
-                       ipath_dev_err(dd, "Fatal Hardware Error (freeze "
-                                         "mode), no longer usable, SN %.16s\n",
-                                         dd->ipath_serial);
-                       isfatal = 1;
-               }
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-               /* mark as having had error */
-               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-               /*
-                * mark as not usable, at a minimum until driver
-                * is reloaded, probably until reboot, since no
-                * other reset is possible.
-                */
-               dd->ipath_flags &= ~IPATH_INITTED;
-       } else {
-               *msg = 0; /* recovered from all of them */
-       }
-       if (*msg)
-               ipath_dev_err(dd, "%s hardware error\n", msg);
-       if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
-               /*
-                * for status file; if no trailing brace is copied,
-                * we'll know it was truncated.
-                */
-               snprintf(dd->ipath_freezemsg,
-                        dd->ipath_freezelen, "{%s}", msg);
-
-bail:;
-}
-
-/**
- * ipath_ht_boardname - fill in the board name
- * @dd: the infinipath device
- * @name: the output buffer
- * @namelen: the size of the output buffer
- *
- * fill in the board name, based on the board revision register
- */
-static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
-                             size_t namelen)
-{
-       char *n = NULL;
-       u8 boardrev = dd->ipath_boardrev;
-       int ret = 0;
-
-       switch (boardrev) {
-       case 5:
-               /*
-                * original production board; two production levels, with
-                * different serial number ranges.   See ipath_ht_early_init() for
-                * case where we enable IPATH_GPIO_INTR for later serial # range.
-                * Original 112* serial number is no longer supported.
-                */
-               n = "InfiniPath_QHT7040";
-               break;
-       case 7:
-               /* small form factor production board */
-               n = "InfiniPath_QHT7140";
-               break;
-       default:                /* don't know, just print the number */
-               ipath_dev_err(dd, "Don't yet know about board "
-                             "with ID %u\n", boardrev);
-               snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
-                        boardrev);
-               break;
-       }
-       if (n)
-               snprintf(name, namelen, "%s", n);
-
-       if (ret) {
-               ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
-               goto bail;
-       }
-       if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
-               dd->ipath_minrev > 4)) {
-               /*
-                * This version of the driver only supports Rev 3.2 - 3.4
-                */
-               ipath_dev_err(dd,
-                             "Unsupported InfiniPath hardware revision %u.%u!\n",
-                             dd->ipath_majrev, dd->ipath_minrev);
-               ret = 1;
-               goto bail;
-       }
-       /*
-        * pkt/word counters are 32 bit, and therefore wrap fast enough
-        * that we snapshot them from a timer, and maintain 64 bit shadow
-        * copies
-        */
-       dd->ipath_flags |= IPATH_32BITCOUNTERS;
-       dd->ipath_flags |= IPATH_GPIO_INTR;
-       if (dd->ipath_lbus_speed != 800)
-               ipath_dev_err(dd,
-                             "Incorrectly configured for HT @ %uMHz\n",
-                             dd->ipath_lbus_speed);
-
-       /*
-        * set here, not in ipath_init_*_funcs because we have to do
-        * it after we can read chip registers.
-        */
-       dd->ipath_ureg_align =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-
-bail:
-       return ret;
-}
-
-static void ipath_check_htlink(struct ipath_devdata *dd)
-{
-       u8 linkerr, link_off, i;
-
-       for (i = 0; i < 2; i++) {
-               link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
-               if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkerror%d of HT slave/primary block\n",
-                                i);
-               else if (linkerr & 0xf0) {
-                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-                                  "clearing\n", linkerr >> 4, i);
-                       /*
-                        * writing the linkerr bits that are set should
-                        * clear them
-                        */
-                       if (pci_write_config_byte(dd->pcidev, link_off,
-                                                 linkerr))
-                               ipath_dbg("Failed write to clear HT "
-                                         "linkerror%d\n", i);
-                       if (pci_read_config_byte(dd->pcidev, link_off,
-                                                &linkerr))
-                               dev_info(&dd->pcidev->dev,
-                                        "Couldn't reread linkerror%d of "
-                                        "HT slave/primary block\n", i);
-                       else if (linkerr & 0xf0)
-                               dev_info(&dd->pcidev->dev,
-                                        "HT linkerror%d bits 0x%x "
-                                        "couldn't be cleared\n",
-                                        i, linkerr >> 4);
-               }
-       }
-}
-
-static int ipath_setup_ht_reset(struct ipath_devdata *dd)
-{
-       ipath_dbg("No reset possible for this InfiniPath hardware\n");
-       return 0;
-}
-
-#define HT_INTR_DISC_CONFIG  0x80      /* HT interrupt and discovery cap */
-#define HT_INTR_REG_INDEX    2 /* intconfig requires indirect accesses */
-
-/*
- * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
- * errors.  We only bother to do this at load time, because it's OK if
- * it happened before we were loaded (first time after boot/reset),
- * but any time after that, it's fatal anyway.  Also need to not check
- * for upper byte errors if we are in 8 bit mode, so figure out
- * our width.  For now, at least, also complain if it's 8 bit.
- */
-static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
-                            int pos, u8 cap_type)
-{
-       u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
-       u16 linkctrl = 0;
-       int i;
-
-       dd->ipath_ht_slave_off = pos;
-       /* command word, master_host bit */
-       /* master host || slave */
-       if ((cap_type >> 2) & 1)
-               link_a_b_off = 4;
-       else
-               link_a_b_off = 0;
-       ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
-                  link_a_b_off ? 1 : 0,
-                  link_a_b_off ? 'B' : 'A');
-
-       link_a_b_off += pos;
-
-       /*
-        * check both link control registers; clear both HT CRC sets if
-        * necessary.
-        */
-       for (i = 0; i < 2; i++) {
-               link_off = pos + i * 4 + 0x4;
-               if (pci_read_config_word(pdev, link_off, &linkctrl))
-                       ipath_dev_err(dd, "Couldn't read HT link control%d "
-                                     "register\n", i);
-               else if (linkctrl & (0xf << 8)) {
-                       ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
-                                  "bits %x\n", i, linkctrl & (0xf << 8));
-                       /*
-                        * now write them back to clear the error.
-                        */
-                       pci_write_config_word(pdev, link_off,
-                                             linkctrl & (0xf << 8));
-               }
-       }
-
-       /*
-        * As with HT CRC bits, same for protocol errors that might occur
-        * during boot.
-        */
-       for (i = 0; i < 2; i++) {
-               link_off = pos + i * 4 + 0xd;
-               if (pci_read_config_byte(pdev, link_off, &linkerr))
-                       dev_info(&pdev->dev, "Couldn't read linkerror%d "
-                                "of HT slave/primary block\n", i);
-               else if (linkerr & 0xf0) {
-                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-                                  "clearing\n", linkerr >> 4, i);
-                       /*
-                        * writing the linkerr bits that are set will clear
-                        * them
-                        */
-                       if (pci_write_config_byte
-                           (pdev, link_off, linkerr))
-                               ipath_dbg("Failed write to clear HT "
-                                         "linkerror%d\n", i);
-                       if (pci_read_config_byte(pdev, link_off, &linkerr))
-                               dev_info(&pdev->dev, "Couldn't reread "
-                                        "linkerror%d of HT slave/primary "
-                                        "block\n", i);
-                       else if (linkerr & 0xf0)
-                               dev_info(&pdev->dev, "HT linkerror%d bits "
-                                        "0x%x couldn't be cleared\n",
-                                        i, linkerr >> 4);
-               }
-       }
-
-       /*
-        * this is just for our link to the host, not devices connected
-        * through tunnel.
-        */
-
-       if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
-               ipath_dev_err(dd, "Couldn't read HT link width "
-                             "config register\n");
-       else {
-               u32 width;
-               switch (linkwidth & 7) {
-               case 5:
-                       width = 4;
-                       break;
-               case 4:
-                       width = 2;
-                       break;
-               case 3:
-                       width = 32;
-                       break;
-               case 1:
-                       width = 16;
-                       break;
-               case 0:
-               default:        /* if wrong, assume 8 bit */
-                       width = 8;
-                       break;
-               }
-
-               dd->ipath_lbus_width = width;
-
-               if (linkwidth != 0x11) {
-                       ipath_dev_err(dd, "Not configured for 16 bit HT "
-                                     "(%x)\n", linkwidth);
-                       if (!(linkwidth & 0xf)) {
-                               ipath_dbg("Will ignore HT lane1 errors\n");
-                               dd->ipath_flags |= IPATH_8BIT_IN_HT0;
-                       }
-               }
-       }
-
-       /*
-        * this is just for our link to the host, not devices connected
-        * through tunnel.
-        */
-       if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
-               ipath_dev_err(dd, "Couldn't read HT link frequency "
-                             "config register\n");
-       else {
-               u32 speed;
-               switch (linkwidth & 0xf) {
-               case 6:
-                       speed = 1000;
-                       break;
-               case 5:
-                       speed = 800;
-                       break;
-               case 4:
-                       speed = 600;
-                       break;
-               case 3:
-                       speed = 500;
-                       break;
-               case 2:
-                       speed = 400;
-                       break;
-               case 1:
-                       speed = 300;
-                       break;
-               default:
-                       /*
-                        * assume reserved and vendor-specific are 200...
-                        */
-               case 0:
-                       speed = 200;
-                       break;
-               }
-               dd->ipath_lbus_speed = speed;
-       }
-
-       snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
-               "HyperTransport,%uMHz,x%u\n",
-               dd->ipath_lbus_speed,
-               dd->ipath_lbus_width);
-}
-
-static int ipath_ht_intconfig(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (dd->ipath_intconfig) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
-                                dd->ipath_intconfig);  /* interrupt address */
-               ret = 0;
-       } else {
-               ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
-                             "interrupt address\n");
-               ret = -EINVAL;
-       }
-
-       return ret;
-}
-
-static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
-                               struct ht_irq_msg *msg)
-{
-       struct ipath_devdata *dd = pci_get_drvdata(dev);
-       u64 prev_intconfig = dd->ipath_intconfig;
-
-       dd->ipath_intconfig = msg->address_lo;
-       dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
-
-       /*
-        * If the previous value of dd->ipath_intconfig is zero, we're
-        * getting configured for the first time, and must not program the
-        * intconfig register here (it will be programmed later, when the
-        * hardware is ready).  Otherwise, we should.
-        */
-       if (prev_intconfig)
-               ipath_ht_intconfig(dd);
-}
-
-/**
- * ipath_setup_ht_config - setup the interruptconfig register
- * @dd: the infinipath device
- * @pdev: the PCI device
- *
- * setup the interruptconfig register from the HT config info.
- * Also clear CRC errors in HT linkcontrol, if necessary.
- * This is done only for the real hardware.  It is done before
- * chip address space is initted, so can't touch infinipath registers
- */
-static int ipath_setup_ht_config(struct ipath_devdata *dd,
-                                struct pci_dev *pdev)
-{
-       int pos, ret;
-
-       ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
-       if (ret < 0) {
-               ipath_dev_err(dd, "Couldn't create interrupt handler: "
-                             "err %d\n", ret);
-               goto bail;
-       }
-       dd->ipath_irq = ret;
-       ret = 0;
-
-       /*
-        * Handle clearing CRC errors in linkctrl register if necessary.  We
-        * do this early, before we ever enable errors or hardware errors,
-        * mostly to avoid causing the chip to enter freeze mode.
-        */
-       pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
-       if (!pos) {
-               ipath_dev_err(dd, "Couldn't find HyperTransport "
-                             "capability; no interrupts\n");
-               ret = -ENODEV;
-               goto bail;
-       }
-       do {
-               u8 cap_type;
-
-               /*
-                * The HT capability type byte is 3 bytes after the
-                * capability byte.
-                */
-               if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
-                       dev_info(&pdev->dev, "Couldn't read config "
-                                "command @ %d\n", pos);
-                       continue;
-               }
-               if (!(cap_type & 0xE0))
-                       slave_or_pri_blk(dd, pdev, pos, cap_type);
-       } while ((pos = pci_find_next_capability(pdev, pos,
-                                                PCI_CAP_ID_HT)));
-
-       dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
- * @dd: the infinipath device
- *
- * Called during driver unload.
- * This is currently a nop for the HT chip, not for all chips
- */
-static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
-{
-}
-
-/**
- * ipath_setup_ht_setextled - set the state of the two external LEDs
- * @dd: the infinipath device
- * @lst: the L state
- * @ltst: the LT state
- *
- * Set the state of the two external LEDs, to indicate physical and
- * logical state of IB link.   For this chip (at least with recommended
- * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
- * (logical state)
- *
- * Note:  We try to match the Mellanox HCA LED behavior as best
- * we can.  Green indicates physical link state is OK (something is
- * plugged in, and we can train).
- * Amber indicates the link is logically up (ACTIVE).
- * Mellanox further blinks the amber LED to indicate data packet
- * activity, but we have no hardware support for that, so it would
- * require waking up every 10-20 msecs and checking the counters
- * on the chip, and then turning the LED off if appropriate.  That's
- * visible overhead, so not something we will do.
- *
- */
-static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
-                                    u64 lst, u64 ltst)
-{
-       u64 extctl;
-       unsigned long flags = 0;
-
-       /* the diags use the LED to indicate diag info, so we leave
-        * the external LED alone when the diags are running */
-       if (ipath_diag_inuse)
-               return;
-
-       /* Allow override of LED display for, e.g. Locating system in rack */
-       if (dd->ipath_led_override) {
-               ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
-                       ? INFINIPATH_IBCS_LT_STATE_LINKUP
-                       : INFINIPATH_IBCS_LT_STATE_DISABLED;
-               lst = (dd->ipath_led_override & IPATH_LED_LOG)
-                       ? INFINIPATH_IBCS_L_STATE_ACTIVE
-                       : INFINIPATH_IBCS_L_STATE_DOWN;
-       }
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       /*
-        * start by setting both LED control bits to off, then turn
-        * on the appropriate bit(s).
-        */
-       if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
-               /*
-                * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
-                * is inverted,  because it is normally used to indicate
-                * a hardware fault at reset, if there were errors
-                */
-               extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
-                       | INFINIPATH_EXTC_LEDGBLERR_OFF;
-               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-                       extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
-               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-                       extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
-       } else {
-               extctl = dd->ipath_extctrl &
-                       ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
-                         INFINIPATH_EXTC_LED2PRIPORT_ON);
-               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-                       extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
-               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-                       extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
-       }
-       dd->ipath_extctrl = extctl;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-}
-
-static void ipath_init_ht_variables(struct ipath_devdata *dd)
-{
-       /*
-        * setup the register offsets, since they are different for each
-        * chip
-        */
-       dd->ipath_kregs = &ipath_ht_kregs;
-       dd->ipath_cregs = &ipath_ht_cregs;
-
-       dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
-       dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
-       dd->ipath_gpio_sda = IPATH_GPIO_SDA;
-       dd->ipath_gpio_scl = IPATH_GPIO_SCL;
-
-       /*
-        * Fill in data for field-values that change in newer chips.
-        * We dynamically specify only the mask for LINKTRAININGSTATE
-        * and only the shift for LINKSTATE, as they are the only ones
-        * that change.  Also precalculate the 3 link states of interest
-        * and the combined mask.
-        */
-       dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
-       dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
-       dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
-               dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
-       dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
-       dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
-       dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
-
-       /*
-        * Fill in data for ibcc field-values that change in newer chips.
-        * We dynamically specify only the mask for LINKINITCMD
-        * and only the shift for LINKCMD and MAXPKTLEN, as they are
-        * the only ones that change.
-        */
-       dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
-       dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
-       dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
-
-       /* Fill in shifts for RcvCtrl. */
-       dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
-       dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
-       dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
-       dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
-
-       dd->ipath_i_bitsextant =
-               (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
-               (INFINIPATH_I_RCVAVAIL_MASK <<
-                INFINIPATH_I_RCVAVAIL_SHIFT) |
-               INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
-               INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
-
-       dd->ipath_e_bitsextant =
-               INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
-               INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
-               INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
-               INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
-               INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
-               INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
-               INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-               INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
-               INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
-               INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
-               INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
-               INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
-               INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
-               INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
-               INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
-               INFINIPATH_E_HARDWARE;
-
-       dd->ipath_hwe_bitsextant =
-               (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
-               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
-               (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
-               INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
-               INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
-               INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
-               INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
-               INFINIPATH_HWE_HTCMISCERR4 |
-               INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
-               INFINIPATH_HWE_HTCMISCERR7 |
-               INFINIPATH_HWE_HTCBUSTREQPARITYERR |
-               INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
-               INFINIPATH_HWE_HTCBUSIREQPARITYERR |
-               INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
-               INFINIPATH_HWE_MEMBISTFAILED |
-               INFINIPATH_HWE_COREPLL_FBSLIP |
-               INFINIPATH_HWE_COREPLL_RFSLIP |
-               INFINIPATH_HWE_HTBPLL_FBSLIP |
-               INFINIPATH_HWE_HTBPLL_RFSLIP |
-               INFINIPATH_HWE_HTAPLL_FBSLIP |
-               INFINIPATH_HWE_HTAPLL_RFSLIP |
-               INFINIPATH_HWE_SERDESPLLFAILED |
-               INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
-               INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
-
-       dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
-       dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
-       dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
-       dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
-
-       /*
-        * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
-        * 2 is Some Misc, 3 is reserved for future.
-        */
-       dd->ipath_eep_st_masks[0].hwerrs_to_log =
-               INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-               INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
-
-       dd->ipath_eep_st_masks[1].hwerrs_to_log =
-               INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-               INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
-
-       dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
-
-       dd->delay_mult = 2; /* SDR, 4X, can't change */
-
-       dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
-       dd->ipath_link_speed_supported = IPATH_IB_SDR;
-       dd->ipath_link_width_enabled = IB_WIDTH_4X;
-       dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
-       /* these can't change for this chip, so set once */
-       dd->ipath_link_width_active = dd->ipath_link_width_enabled;
-       dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
-}
-
-/**
- * ipath_ht_init_hwerrors - enable hardware errors
- * @dd: the infinipath device
- *
- * now that we have finished initializing everything that might reasonably
- * cause a hardware error, and cleared those errors bits as they occur,
- * we can enable hardware errors in the mask (potentially enabling
- * freeze mode), and enable hardware errors as errors (along with
- * everything else) in errormask
- */
-static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
-{
-       ipath_err_t val;
-       u64 extsval;
-
-       extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-
-       if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
-               ipath_dev_err(dd, "MemBIST did not complete!\n");
-       if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
-               ipath_dbg("MemBIST corrected\n");
-
-       ipath_check_htlink(dd);
-
-       /* barring bugs, all hwerrors become interrupts, which can */
-       val = -1LL;
-       /* don't look at crc lane1 if 8 bit */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-               val &= ~infinipath_hwe_htclnkabyte1crcerr;
-       /* don't look at crc lane1 if 8 bit */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-               val &= ~infinipath_hwe_htclnkbbyte1crcerr;
-
-       /*
-        * disable RXDSYNCMEMPARITY because external serdes is unused,
-        * and therefore the logic will never be used or initialized,
-        * and uninitialized state will normally result in this error
-        * being asserted.  Similarly for the external serdess pll
-        * lock signal.
-        */
-       val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
-                INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
-
-       /*
-        * Disable MISCERR4 because of an inversion in the HT core
-        * logic checking for errors that cause this bit to be set.
-        * The errata can also cause the protocol error bit to be set
-        * in the HT config space linkerror register(s).
-        */
-       val &= ~INFINIPATH_HWE_HTCMISCERR4;
-
-       /*
-        * PLL ignored because unused MDIO interface has a logic problem
-        */
-       if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
-               val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-       dd->ipath_hwerrmask = val;
-}
-
-
-
-
-/**
- * ipath_ht_bringup_serdes - bring up the serdes
- * @dd: the infinipath device
- */
-static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
-{
-       u64 val, config1;
-       int ret = 0, change = 0;
-
-       ipath_dbg("Trying to bringup serdes\n");
-
-       if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
-           INFINIPATH_HWE_SERDESPLLFAILED)
-       {
-               ipath_dbg("At start, serdes PLL failed bit set in "
-                         "hwerrstatus, clearing and continuing\n");
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                                INFINIPATH_HWE_SERDESPLLFAILED);
-       }
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-       config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
-
-       ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
-                  "config1=%llx, sstatus=%llx xgxs %llx\n",
-                  (unsigned long long) val, (unsigned long long) config1,
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-       /* force reset on */
-       val |= INFINIPATH_SERDC0_RESET_PLL
-               /* | INFINIPATH_SERDC0_RESET_MASK */
-               ;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-       udelay(15);             /* need pll reset set at least for a bit */
-
-       if (val & INFINIPATH_SERDC0_RESET_PLL) {
-               u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
-               /* set lane resets, and tx idle, during pll reset */
-               val2 |= INFINIPATH_SERDC0_RESET_MASK |
-                       INFINIPATH_SERDC0_TXIDLE;
-               ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
-                          "%llx)\n", (unsigned long long) val2);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-                                val2);
-               /*
-                * be sure chip saw it
-                */
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               /*
-                * need pll reset clear at least 11 usec before lane
-                * resets cleared; give it a few more
-                */
-               udelay(15);
-               val = val2;     /* for check below */
-       }
-
-       if (val & (INFINIPATH_SERDC0_RESET_PLL |
-                  INFINIPATH_SERDC0_RESET_MASK |
-                  INFINIPATH_SERDC0_TXIDLE)) {
-               val &= ~(INFINIPATH_SERDC0_RESET_PLL |
-                        INFINIPATH_SERDC0_RESET_MASK |
-                        INFINIPATH_SERDC0_TXIDLE);
-               /* clear them */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-                                val);
-       }
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-       if (val & INFINIPATH_XGXS_RESET) {
-               /* normally true after boot */
-               val &= ~INFINIPATH_XGXS_RESET;
-               change = 1;
-       }
-       if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
-            INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
-               /* need to compensate for Tx inversion in partner */
-               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-                        INFINIPATH_XGXS_RX_POL_SHIFT);
-               val |= dd->ipath_rx_pol_inv <<
-                       INFINIPATH_XGXS_RX_POL_SHIFT;
-               change = 1;
-       }
-       if (change)
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-       /* clear current and de-emphasis bits */
-       config1 &= ~0x0ffffffff00ULL;
-       /* set current to 20ma */
-       config1 |= 0x00000000000ULL;
-       /* set de-emphasis to -5.68dB */
-       config1 |= 0x0cccc000000ULL;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
-
-       ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
-                  "config1=%llx, sstatus=%llx xgxs %llx\n",
-                  (unsigned long long) val, (unsigned long long) config1,
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-       return ret;             /* for now, say we always succeeded */
-}
-
-/**
- * ipath_ht_quiet_serdes - set serdes to txidle
- * @dd: the infinipath device
- * driver is being unloaded
- */
-static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
-{
-       u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-       val |= INFINIPATH_SERDC0_TXIDLE;
-       ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
-                 (unsigned long long) val);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-}
-
-/**
- * ipath_pe_put_tid - write a TID in chip
- * @dd: the infinipath device
- * @tidptr: pointer to the expected TID (in chip) to update
- * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
- * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
- *
- * This exists as a separate routine to allow for special locking etc.
- * It's used for both the full cleanup on exit, as well as the normal
- * setup and teardown.
- */
-static void ipath_ht_put_tid(struct ipath_devdata *dd,
-                            u64 __iomem *tidptr, u32 type,
-                            unsigned long pa)
-{
-       if (!dd->ipath_kregbase)
-               return;
-
-       if (pa != dd->ipath_tidinvalid) {
-               if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
-                       dev_info(&dd->pcidev->dev,
-                                "physaddr %lx has more than "
-                                "40 bits, using only 40!!!\n", pa);
-                       pa &= INFINIPATH_RT_ADDR_MASK;
-               }
-               if (type == RCVHQ_RCV_TYPE_EAGER)
-                       pa |= dd->ipath_tidtemplate;
-               else {
-                       /* in words (fixed, full page).  */
-                       u64 lenvalid = PAGE_SIZE >> 2;
-                       lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-                       pa |= lenvalid | INFINIPATH_RT_VALID;
-               }
-       }
-
-       writeq(pa, tidptr);
-}
-
-
-/**
- * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
- * @dd: the infinipath device
- * @port: the port
- *
- * Used from ipath_close(), and at chip initialization.
- */
-static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
-{
-       u64 __iomem *tidbase;
-       int i;
-
-       if (!dd->ipath_kregbase)
-               return;
-
-       ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
-
-       /*
-        * need to invalidate all of the expected TID entries for this
-        * port, so we don't have valid entries that might somehow get
-        * used (early in next use of this port, or through some bug)
-        */
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  port * dd->ipath_rcvtidcnt *
-                                  sizeof(*tidbase));
-       for (i = 0; i < dd->ipath_rcvtidcnt; i++)
-               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
-                                dd->ipath_tidinvalid);
-
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvegrbase +
-                                  port * dd->ipath_rcvegrcnt *
-                                  sizeof(*tidbase));
-
-       for (i = 0; i < dd->ipath_rcvegrcnt; i++)
-               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
-                                dd->ipath_tidinvalid);
-}
-
-/**
- * ipath_ht_tidtemplate - setup constants for TID updates
- * @dd: the infinipath device
- *
- * We setup stuff that we use a lot, to avoid calculating each time
- */
-static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
-{
-       dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
-       dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-       dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
-
-       /*
-        * work around chip errata bug 7358, by marking invalid tids
-        * as having max length
-        */
-       dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
-               INFINIPATH_RT_BUFSIZE_SHIFT;
-}
-
-static int ipath_ht_early_init(struct ipath_devdata *dd)
-{
-       u32 __iomem *piobuf;
-       u32 pioincr, val32;
-       int i;
-
-       /*
-        * one cache line; long IB headers will spill over into received
-        * buffer
-        */
-       dd->ipath_rcvhdrentsize = 16;
-       dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
-
-       /*
-        * For HT, we allocate a somewhat overly large eager buffer,
-        * such that we can guarantee that we can receive the largest
-        * packet that we can send out.  To truly support a 4KB MTU,
-        * we need to bump this to a large value.  To date, other than
-        * testing, we have never encountered an HCA that can really
-        * send 4KB MTU packets, so we do not handle that (we'll get
-        * errors interrupts if we ever see one).
-        */
-       dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
-
-       /*
-        * the min() check here is currently a nop, but it may not
-        * always be, depending on just how we do ipath_rcvegrbufsize
-        */
-       dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
-                                dd->ipath_rcvegrbufsize);
-       dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
-       ipath_ht_tidtemplate(dd);
-
-       /*
-        * zero all the TID entries at startup.  We do this for sanity,
-        * in case of a previous driver crash of some kind, and also
-        * because the chip powers up with these memories in an unknown
-        * state.  Use portcnt, not cfgports, since this is for the
-        * full chip, not for current (possibly different) configuration
-        * value.
-        * Chip Errata bug 6447
-        */
-       for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
-               ipath_ht_clear_tids(dd, val32);
-
-       /*
-        * write the pbc of each buffer, to be sure it's initialized, then
-        * cancel all the buffers, and also abort any packets that might
-        * have been in flight for some reason (the latter is for driver
-        * unload/reload, but isn't a bad idea at first init).  PIO send
-        * isn't enabled at this point, so there is no danger of sending
-        * these out on the wire.
-        * Chip Errata bug 6610
-        */
-       piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
-                                 dd->ipath_piobufbase);
-       pioincr = dd->ipath_palign / sizeof(*piobuf);
-       for (i = 0; i < dd->ipath_piobcnt2k; i++) {
-               /*
-                * reasonable word count, just to init pbc
-                */
-               writel(16, piobuf);
-               piobuf += pioincr;
-       }
-
-       ipath_get_eeprom_info(dd);
-       if (dd->ipath_boardrev == 5) {
-               /*
-                * Later production QHT7040 has same changes as QHT7140, so
-                * can use GPIO interrupts.  They have serial #'s starting
-                * with 128, rather than 112.
-                */
-               if (dd->ipath_serial[0] == '1' &&
-                   dd->ipath_serial[1] == '2' &&
-                   dd->ipath_serial[2] == '8')
-                       dd->ipath_flags |= IPATH_GPIO_INTR;
-               else {
-                       ipath_dev_err(dd, "Unsupported InfiniPath board "
-                               "(serial number %.16s)!\n",
-                               dd->ipath_serial);
-                       return 1;
-               }
-       }
-
-       if (dd->ipath_minrev >= 4) {
-               /* Rev4+ reports extra errors via internal GPIO pins */
-               dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
-               dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-       }
-
-       return 0;
-}
-
-
-/**
- * ipath_init_ht_get_base_info - set chip-specific flags for user code
- * @dd: the infinipath device
- * @kbase: ipath_base_info pointer
- *
- * We set the PCIE flag because the lower bandwidth on PCIe vs
- * HyperTransport can affect some user packet algorithms.
- */
-static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
-{
-       struct ipath_base_info *kinfo = kbase;
-
-       kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
-               IPATH_RUNTIME_PIO_REGSWAPPED;
-
-       if (pd->port_dd->ipath_minrev < 4)
-               kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
-
-       return 0;
-}
-
-static void ipath_ht_free_irq(struct ipath_devdata *dd)
-{
-       free_irq(dd->ipath_irq, dd);
-       ht_destroy_irq(dd->ipath_irq);
-       dd->ipath_irq = 0;
-       dd->ipath_intconfig = 0;
-}
-
-static struct ipath_message_header *
-ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
-{
-       return (struct ipath_message_header *)
-               &rhf_addr[sizeof(u64) / sizeof(u32)];
-}
-
-static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
-{
-       dd->ipath_portcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
-       dd->ipath_p0_rcvegrcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-}
-
-static void ipath_ht_read_counters(struct ipath_devdata *dd,
-                                  struct infinipath_counters *cntrs)
-{
-       cntrs->LBIntCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
-       cntrs->LBFlowStallCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
-       cntrs->TxSDmaDescCnt = 0;
-       cntrs->TxUnsupVLErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
-       cntrs->TxDataPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
-       cntrs->TxFlowPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
-       cntrs->TxDwordCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
-       cntrs->TxLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
-       cntrs->TxMaxMinLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
-       cntrs->TxUnderrunCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
-       cntrs->TxFlowStallCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
-       cntrs->TxDroppedPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
-       cntrs->RxDroppedPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
-       cntrs->RxDataPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
-       cntrs->RxFlowPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
-       cntrs->RxDwordCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
-       cntrs->RxLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
-       cntrs->RxMaxMinLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
-       cntrs->RxICRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
-       cntrs->RxVCRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
-       cntrs->RxFlowCtrlErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
-       cntrs->RxBadFormatCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
-       cntrs->RxLinkProblemCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
-       cntrs->RxEBPCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
-       cntrs->RxLPCRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
-       cntrs->RxBufOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
-       cntrs->RxTIDFullErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
-       cntrs->RxTIDValidErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
-       cntrs->RxPKeyMismatchCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
-       cntrs->RxP0HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
-       cntrs->RxP1HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
-       cntrs->RxP2HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
-       cntrs->RxP3HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
-       cntrs->RxP4HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
-       cntrs->RxP5HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
-       cntrs->RxP6HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
-       cntrs->RxP7HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
-       cntrs->RxP8HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
-       cntrs->RxP9HdrEgrOvflCnt = 0;
-       cntrs->RxP10HdrEgrOvflCnt = 0;
-       cntrs->RxP11HdrEgrOvflCnt = 0;
-       cntrs->RxP12HdrEgrOvflCnt = 0;
-       cntrs->RxP13HdrEgrOvflCnt = 0;
-       cntrs->RxP14HdrEgrOvflCnt = 0;
-       cntrs->RxP15HdrEgrOvflCnt = 0;
-       cntrs->RxP16HdrEgrOvflCnt = 0;
-       cntrs->IBStatusChangeCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
-       cntrs->IBLinkErrRecoveryCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
-       cntrs->IBLinkDownedCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
-       cntrs->IBSymbolErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
-       cntrs->RxVL15DroppedPktCnt = 0;
-       cntrs->RxOtherLocalPhyErrCnt = 0;
-       cntrs->PcieRetryBufDiagQwordCnt = 0;
-       cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
-       cntrs->LocalLinkIntegrityErrCnt =
-               (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-               dd->ipath_lli_errs : dd->ipath_lli_errors;
-       cntrs->RxVlErrCnt = 0;
-       cntrs->RxDlidFltrCnt = 0;
-}
-
-
-/* no interrupt fallback for these chips */
-static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
-{
-       return 0;
-}
-
-
-/*
- * reset the XGXS (between serdes and IBC).  Slightly less intrusive
- * than resetting the IBC or external link state, and useful in some
- * cases to cause some retraining.  To do this right, we reset IBC
- * as well.
- */
-static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
-{
-       u64 val, prev_val;
-
-       prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-       val = prev_val | INFINIPATH_XGXS_RESET;
-       prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-       ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-}
-
-
-static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
-{
-       int ret;
-
-       switch (which) {
-       case IPATH_IB_CFG_LWID:
-               ret = dd->ipath_link_width_active;
-               break;
-       case IPATH_IB_CFG_SPD:
-               ret = dd->ipath_link_speed_active;
-               break;
-       case IPATH_IB_CFG_LWID_ENB:
-               ret = dd->ipath_link_width_enabled;
-               break;
-       case IPATH_IB_CFG_SPD_ENB:
-               ret = dd->ipath_link_speed_enabled;
-               break;
-       default:
-               ret =  -ENOTSUPP;
-               break;
-       }
-       return ret;
-}
-
-
-/* we assume range checking is already done, if needed */
-static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
-{
-       int ret = 0;
-
-       if (which == IPATH_IB_CFG_LWID_ENB)
-               dd->ipath_link_width_enabled = val;
-       else if (which == IPATH_IB_CFG_SPD_ENB)
-               dd->ipath_link_speed_enabled = val;
-       else
-               ret = -ENOTSUPP;
-       return ret;
-}
-
-
-static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
-{
-}
-
-
-static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
-{
-       ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
-               ipath_ib_linktrstate(dd, ibcs));
-       return 0;
-}
-
-
-/**
- * ipath_init_iba6110_funcs - set up the chip-specific function pointers
- * @dd: the infinipath device
- *
- * This is global, and is called directly at init to set up the
- * chip-specific function pointers for later use.
- */
-void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
-{
-       dd->ipath_f_intrsetup = ipath_ht_intconfig;
-       dd->ipath_f_bus = ipath_setup_ht_config;
-       dd->ipath_f_reset = ipath_setup_ht_reset;
-       dd->ipath_f_get_boardname = ipath_ht_boardname;
-       dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
-       dd->ipath_f_early_init = ipath_ht_early_init;
-       dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
-       dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
-       dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
-       dd->ipath_f_clear_tids = ipath_ht_clear_tids;
-       dd->ipath_f_put_tid = ipath_ht_put_tid;
-       dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
-       dd->ipath_f_setextled = ipath_setup_ht_setextled;
-       dd->ipath_f_get_base_info = ipath_ht_get_base_info;
-       dd->ipath_f_free_irq = ipath_ht_free_irq;
-       dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
-       dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
-       dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
-       dd->ipath_f_config_ports = ipath_ht_config_ports;
-       dd->ipath_f_read_counters = ipath_ht_read_counters;
-       dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
-       dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
-       dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
-       dd->ipath_f_config_jint = ipath_ht_config_jint;
-       dd->ipath_f_ib_updown = ipath_ht_ib_updown;
-
-       /*
-        * initialize chip-specific variables
-        */
-       ipath_init_ht_variables(dd);
-}
diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c
deleted file mode 100644 (file)
index a5eea19..0000000
+++ /dev/null
@@ -1,1062 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/moduleparam.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-/*
- * min buffers we want to have per port, after driver
- */
-#define IPATH_MIN_USER_PORT_BUFCNT 7
-
-/*
- * Number of ports we are configured to use (to allow for more pio
- * buffers per port, etc.)  Zero means use chip value.
- */
-static ushort ipath_cfgports;
-
-module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
-MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
-
-/*
- * Number of buffers reserved for driver (verbs and layered drivers.)
- * Initialized based on number of PIO buffers if not set via module interface.
- * The problem with this is that it's global, but we'll use different
- * numbers for different chip types.
- */
-static ushort ipath_kpiobufs;
-
-static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
-
-module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
-                 &ipath_kpiobufs, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
-
-/**
- * create_port0_egr - allocate the eager TID buffers
- * @dd: the infinipath device
- *
- * This code is now quite different for user and kernel, because
- * the kernel uses skb's, for the accelerated network performance.
- * This is the kernel (port0) version.
- *
- * Allocate the eager TID buffers and program them into infinipath.
- * We use the network layer alloc_skb() allocator to allocate the
- * memory, and either use the buffers as is for things like verbs
- * packets, or pass the buffers up to the ipath layered driver and
- * thence the network layer, replacing them as we do so (see
- * ipath_rcv_layer()).
- */
-static int create_port0_egr(struct ipath_devdata *dd)
-{
-       unsigned e, egrcnt;
-       struct ipath_skbinfo *skbinfo;
-       int ret;
-
-       egrcnt = dd->ipath_p0_rcvegrcnt;
-
-       skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
-       if (skbinfo == NULL) {
-               ipath_dev_err(dd, "allocation error for eager TID "
-                             "skb array\n");
-               ret = -ENOMEM;
-               goto bail;
-       }
-       for (e = 0; e < egrcnt; e++) {
-               /*
-                * This is a bit tricky in that we allocate extra
-                * space for 2 bytes of the 14 byte ethernet header.
-                * These two bytes are passed in the ipath header so
-                * the rest of the data is word aligned.  We allocate
-                * 4 bytes so that the data buffer stays word aligned.
-                * See ipath_kreceive() for more details.
-                */
-               skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
-               if (!skbinfo[e].skb) {
-                       ipath_dev_err(dd, "SKB allocation error for "
-                                     "eager TID %u\n", e);
-                       while (e != 0)
-                               dev_kfree_skb(skbinfo[--e].skb);
-                       vfree(skbinfo);
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-       }
-       /*
-        * After loop above, so we can test non-NULL to see if ready
-        * to use at receive, etc.
-        */
-       dd->ipath_port0_skbinfo = skbinfo;
-
-       for (e = 0; e < egrcnt; e++) {
-               dd->ipath_port0_skbinfo[e].phys =
-                 ipath_map_single(dd->pcidev,
-                                  dd->ipath_port0_skbinfo[e].skb->data,
-                                  dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
-               dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
-                                   ((char __iomem *) dd->ipath_kregbase +
-                                    dd->ipath_rcvegrbase),
-                                   RCVHQ_RCV_TYPE_EAGER,
-                                   dd->ipath_port0_skbinfo[e].phys);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int bringup_link(struct ipath_devdata *dd)
-{
-       u64 val, ibc;
-       int ret = 0;
-
-       /* hold IBC in reset */
-       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-
-       /*
-        * set initial max size pkt IBC will send, including ICRC; it's the
-        * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
-        */
-       val = (dd->ipath_ibmaxlen >> 2) + 1;
-       ibc = val << dd->ibcc_mpl_shift;
-
-       /* flowcontrolwatermark is in units of KBytes */
-       ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
-       /*
-        * How often flowctrl sent.  More or less in usecs; balance against
-        * watermark value, so that in theory senders always get a flow
-        * control update in time to not let the IB link go idle.
-        */
-       ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
-       /* max error tolerance */
-       ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-       /* use "real" buffer space for */
-       ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
-       /* IB credit flow control. */
-       ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-       /* initially come up waiting for TS1, without sending anything. */
-       dd->ipath_ibcctrl = ibc;
-       /*
-        * Want to start out with both LINKCMD and LINKINITCMD in NOP
-        * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
-        * to stay a NOP. Flag that we are disabled, for the (unlikely)
-        * case that some recovery path is trying to bring the link up
-        * before we are ready.
-        */
-       ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
-               INFINIPATH_IBCC_LINKINITCMD_SHIFT;
-       dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-       ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
-                  (unsigned long long) ibc);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
-
-       // be sure chip saw it
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-       ret = dd->ipath_f_bringup_serdes(dd);
-
-       if (ret)
-               dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
-                        "not usable\n");
-       else {
-               /* enable IBC */
-               dd->ipath_control |= INFINIPATH_C_LINKENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                                dd->ipath_control);
-       }
-
-       return ret;
-}
-
-static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
-{
-       struct ipath_portdata *pd;
-
-       pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-       if (pd) {
-               pd->port_dd = dd;
-               pd->port_cnt = 1;
-               /* The port 0 pkey table is used by the layer interface. */
-               pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
-               pd->port_seq_cnt = 1;
-       }
-       return pd;
-}
-
-static int init_chip_first(struct ipath_devdata *dd)
-{
-       struct ipath_portdata *pd;
-       int ret = 0;
-       u64 val;
-
-       spin_lock_init(&dd->ipath_kernel_tid_lock);
-       spin_lock_init(&dd->ipath_user_tid_lock);
-       spin_lock_init(&dd->ipath_sendctrl_lock);
-       spin_lock_init(&dd->ipath_uctxt_lock);
-       spin_lock_init(&dd->ipath_sdma_lock);
-       spin_lock_init(&dd->ipath_gpio_lock);
-       spin_lock_init(&dd->ipath_eep_st_lock);
-       spin_lock_init(&dd->ipath_sdepb_lock);
-       mutex_init(&dd->ipath_eep_lock);
-
-       /*
-        * skip cfgports stuff because we are not allocating memory,
-        * and we don't want problems if the portcnt changed due to
-        * cfgports.  We do still check and report a difference, if
-        * not same (should be impossible).
-        */
-       dd->ipath_f_config_ports(dd, ipath_cfgports);
-       if (!ipath_cfgports)
-               dd->ipath_cfgports = dd->ipath_portcnt;
-       else if (ipath_cfgports <= dd->ipath_portcnt) {
-               dd->ipath_cfgports = ipath_cfgports;
-               ipath_dbg("Configured to use %u ports out of %u in chip\n",
-                         dd->ipath_cfgports, ipath_read_kreg32(dd,
-                         dd->ipath_kregs->kr_portcnt));
-       } else {
-               dd->ipath_cfgports = dd->ipath_portcnt;
-               ipath_dbg("Tried to configured to use %u ports; chip "
-                         "only supports %u\n", ipath_cfgports,
-                         ipath_read_kreg32(dd,
-                                 dd->ipath_kregs->kr_portcnt));
-       }
-       /*
-        * Allocate full portcnt array, rather than just cfgports, because
-        * cleanup iterates across all possible ports.
-        */
-       dd->ipath_pd = kcalloc(dd->ipath_portcnt, sizeof(*dd->ipath_pd),
-                              GFP_KERNEL);
-
-       if (!dd->ipath_pd) {
-               ipath_dev_err(dd, "Unable to allocate portdata array, "
-                             "failing\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       pd = create_portdata0(dd);
-       if (!pd) {
-               ipath_dev_err(dd, "Unable to allocate portdata for port "
-                             "0, failing\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-       dd->ipath_pd[0] = pd;
-
-       dd->ipath_rcvtidcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-       dd->ipath_rcvtidbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-       dd->ipath_rcvegrcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-       dd->ipath_rcvegrbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-       dd->ipath_palign =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-       dd->ipath_piobufbase =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
-       dd->ipath_piosize2k = val & ~0U;
-       dd->ipath_piosize4k = val >> 32;
-       if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
-               ipath_mtu4096 = 0; /* 4KB not supported by this chip */
-       dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
-       dd->ipath_piobcnt2k = val & ~0U;
-       dd->ipath_piobcnt4k = val >> 32;
-       dd->ipath_pio2kbase =
-               (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-                                (dd->ipath_piobufbase & 0xffffffff));
-       if (dd->ipath_piobcnt4k) {
-               dd->ipath_pio4kbase = (u32 __iomem *)
-                       (((char __iomem *) dd->ipath_kregbase) +
-                        (dd->ipath_piobufbase >> 32));
-               /*
-                * 4K buffers take 2 pages; we use roundup just to be
-                * paranoid; we calculate it once here, rather than on
-                * ever buf allocate
-                */
-               dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
-                                         dd->ipath_palign);
-               ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
-                         "(%x aligned)\n",
-                         dd->ipath_piobcnt2k, dd->ipath_piosize2k,
-                         dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
-                         dd->ipath_piosize4k, dd->ipath_pio4kbase,
-                         dd->ipath_4kalign);
-       } else {
-               ipath_dbg("%u 2k piobufs @ %p\n",
-                         dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
-       }
-done:
-       return ret;
-}
-
-/**
- * init_chip_reset - re-initialize after a reset, or enable
- * @dd: the infinipath device
- *
- * sanity check at least some of the values after reset, and
- * ensure no receive or transmit (explicitly, in case reset
- * failed
- */
-static int init_chip_reset(struct ipath_devdata *dd)
-{
-       u32 rtmp;
-       int i;
-       unsigned long flags;
-
-       /*
-        * ensure chip does no sends or receives, tail updates, or
-        * pioavail updates while we re-initialize
-        */
-       dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
-       for (i = 0; i < dd->ipath_portcnt; i++) {
-               clear_bit(dd->ipath_r_portenable_shift + i,
-                         &dd->ipath_rcvctrl);
-               clear_bit(dd->ipath_r_intravail_shift + i,
-                         &dd->ipath_rcvctrl);
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-               dd->ipath_rcvctrl);
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl = 0U; /* no sdma, etc */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-       if (rtmp != dd->ipath_rcvtidcnt)
-               dev_info(&dd->pcidev->dev, "tidcnt was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvtidcnt, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-       if (rtmp != dd->ipath_rcvtidbase)
-               dev_info(&dd->pcidev->dev, "tidbase was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvtidbase, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-       if (rtmp != dd->ipath_rcvegrcnt)
-               dev_info(&dd->pcidev->dev, "egrcnt was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvegrcnt, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-       if (rtmp != dd->ipath_rcvegrbase)
-               dev_info(&dd->pcidev->dev, "egrbase was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvegrbase, rtmp);
-
-       return 0;
-}
-
-static int init_pioavailregs(struct ipath_devdata *dd)
-{
-       int ret;
-
-       dd->ipath_pioavailregs_dma = dma_alloc_coherent(
-               &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
-               GFP_KERNEL);
-       if (!dd->ipath_pioavailregs_dma) {
-               ipath_dev_err(dd, "failed to allocate PIOavail reg area "
-                             "in memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       /*
-        * we really want L2 cache aligned, but for current CPUs of
-        * interest, they are the same.
-        */
-       dd->ipath_statusp = (u64 *)
-               ((char *)dd->ipath_pioavailregs_dma +
-                ((2 * L1_CACHE_BYTES +
-                  dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
-       /* copy the current value now that it's really allocated */
-       *dd->ipath_statusp = dd->_ipath_status;
-       /*
-        * setup buffer to hold freeze msg, accessible to apps,
-        * following statusp
-        */
-       dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
-       /* and its length */
-       dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
-
-       ret = 0;
-
-done:
-       return ret;
-}
-
-/**
- * init_shadow_tids - allocate the shadow TID array
- * @dd: the infinipath device
- *
- * allocate the shadow TID array, so we can ipath_munlock previous
- * entries.  It may make more sense to move the pageshadow to the
- * port data structure, so we only allocate memory for ports actually
- * in use, since we at 8k per port, now.
- */
-static void init_shadow_tids(struct ipath_devdata *dd)
-{
-       struct page **pages;
-       dma_addr_t *addrs;
-
-       pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-                       sizeof(struct page *));
-       if (!pages) {
-               ipath_dev_err(dd, "failed to allocate shadow page * "
-                             "array, no expected sends!\n");
-               dd->ipath_pageshadow = NULL;
-               return;
-       }
-
-       addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-                       sizeof(dma_addr_t));
-       if (!addrs) {
-               ipath_dev_err(dd, "failed to allocate shadow dma handle "
-                             "array, no expected sends!\n");
-               vfree(pages);
-               dd->ipath_pageshadow = NULL;
-               return;
-       }
-
-       dd->ipath_pageshadow = pages;
-       dd->ipath_physshadow = addrs;
-}
-
-static void enable_chip(struct ipath_devdata *dd, int reinit)
-{
-       u32 val;
-       u64 rcvmask;
-       unsigned long flags;
-       int i;
-
-       if (!reinit)
-               init_waitqueue_head(&ipath_state_wait);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       /* Enable PIO send, and update of PIOavail regs to memory. */
-       dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
-               INFINIPATH_S_PIOBUFAVAILUPD;
-
-       /*
-        * Set the PIO avail update threshold to host memory
-        * on chips that support it.
-        */
-       if (dd->ipath_pioupd_thresh)
-               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-                       << INFINIPATH_S_UPDTHRESH_SHIFT;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /*
-        * Enable kernel ports' receive and receive interrupt.
-        * Other ports done as user opens and inits them.
-        */
-       rcvmask = 1ULL;
-       dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
-               (rcvmask << dd->ipath_r_intravail_shift);
-       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
-               dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       /*
-        * now ready for use.  this should be cleared whenever we
-        * detect a reset, or initiate one.
-        */
-       dd->ipath_flags |= IPATH_INITTED;
-
-       /*
-        * Init our shadow copies of head from tail values,
-        * and write head values to match.
-        */
-       val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
-       ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
-
-       /* Initialize so we interrupt on next packet received */
-       ipath_write_ureg(dd, ur_rcvhdrhead,
-                        dd->ipath_rhdrhead_intr_off |
-                        dd->ipath_pd[0]->port_head, 0);
-
-       /*
-        * by now pioavail updates to memory should have occurred, so
-        * copy them into our working/shadow registers; this is in
-        * case something went wrong with abort, but mostly to get the
-        * initial values of the generation bit correct.
-        */
-       for (i = 0; i < dd->ipath_pioavregs; i++) {
-               __le64 pioavail;
-
-               /*
-                * Chip Errata bug 6641; even and odd qwords>3 are swapped.
-                */
-               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-                       pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
-               else
-                       pioavail = dd->ipath_pioavailregs_dma[i];
-               /*
-                * don't need to worry about ipath_pioavailkernel here
-                * because we will call ipath_chg_pioavailkernel() later
-                * in initialization, to busy out buffers as needed
-                */
-               dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
-       }
-       /* can get counters, stats, etc. */
-       dd->ipath_flags |= IPATH_PRESENT;
-}
-
-static int init_housekeeping(struct ipath_devdata *dd, int reinit)
-{
-       char boardn[40];
-       int ret = 0;
-
-       /*
-        * have to clear shadow copies of registers at init that are
-        * not otherwise set here, or all kinds of bizarre things
-        * happen with driver on chip reset
-        */
-       dd->ipath_rcvhdrsize = 0;
-
-       /*
-        * Don't clear ipath_flags as 8bit mode was set before
-        * entering this func. However, we do set the linkstate to
-        * unknown, so we can watch for a transition.
-        * PRESENT is set because we want register reads to work,
-        * and the kernel infrastructure saw it in config space;
-        * We clear it if we have failures.
-        */
-       dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
-       dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
-                            IPATH_LINKDOWN | IPATH_LINKINIT);
-
-       ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
-       dd->ipath_revision =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
-
-       /*
-        * set up fundamental info we need to use the chip; we assume
-        * if the revision reg and these regs are OK, we don't need to
-        * special case the rest
-        */
-       dd->ipath_sregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
-       dd->ipath_cregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
-       dd->ipath_uregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
-       ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
-                  "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
-                  dd->ipath_uregbase, dd->ipath_cregbase);
-       if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
-           || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
-           || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
-           || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
-               ipath_dev_err(dd, "Register read failures from chip, "
-                             "giving up initialization\n");
-               dd->ipath_flags &= ~IPATH_PRESENT;
-               ret = -ENODEV;
-               goto done;
-       }
-
-
-       /* clear diagctrl register, in case diags were running and crashed */
-       ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
-
-       /* clear the initial reset flag, in case first driver load */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-                        INFINIPATH_E_RESET);
-
-       ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
-                  (unsigned long long) dd->ipath_revision,
-                  dd->ipath_pcirev);
-
-       if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
-            INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
-               ipath_dev_err(dd, "Driver only handles version %d, "
-                             "chip swversion is %d (%llx), failng\n",
-                             IPATH_CHIP_SWVERSION,
-                             (int)(dd->ipath_revision >>
-                                   INFINIPATH_R_SOFTWARE_SHIFT) &
-                             INFINIPATH_R_SOFTWARE_MASK,
-                             (unsigned long long) dd->ipath_revision);
-               ret = -ENOSYS;
-               goto done;
-       }
-       dd->ipath_majrev = (u8) ((dd->ipath_revision >>
-                                 INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
-                                INFINIPATH_R_CHIPREVMAJOR_MASK);
-       dd->ipath_minrev = (u8) ((dd->ipath_revision >>
-                                 INFINIPATH_R_CHIPREVMINOR_SHIFT) &
-                                INFINIPATH_R_CHIPREVMINOR_MASK);
-       dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
-                                   INFINIPATH_R_BOARDID_SHIFT) &
-                                  INFINIPATH_R_BOARDID_MASK);
-
-       ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
-
-       snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
-                "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
-                "SW Compat %u\n",
-                IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
-                (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
-                INFINIPATH_R_ARCH_MASK,
-                dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
-                (unsigned)(dd->ipath_revision >>
-                           INFINIPATH_R_SOFTWARE_SHIFT) &
-                INFINIPATH_R_SOFTWARE_MASK);
-
-       ipath_dbg("%s", dd->ipath_boardversion);
-
-       if (ret)
-               goto done;
-
-       if (reinit)
-               ret = init_chip_reset(dd);
-       else
-               ret = init_chip_first(dd);
-
-done:
-       return ret;
-}
-
-static void verify_interrupt(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-
-       if (!dd)
-               return; /* being torn down */
-
-       /*
-        * If we don't have any interrupts, let the user know and
-        * don't bother checking again.
-        */
-       if (dd->ipath_int_counter == 0) {
-               if (!dd->ipath_f_intr_fallback(dd))
-                       dev_err(&dd->pcidev->dev, "No interrupts detected, "
-                               "not usable.\n");
-               else /* re-arm the timer to see if fallback works */
-                       mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
-       } else
-               ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
-                       dd->ipath_int_counter);
-}
-
-/**
- * ipath_init_chip - do the actual initialization sequence on the chip
- * @dd: the infinipath device
- * @reinit: reinitializing, so don't allocate new memory
- *
- * Do the actual initialization sequence on the chip.  This is done
- * both from the init routine called from the PCI infrastructure, and
- * when we reset the chip, or detect that it was reset internally,
- * or it's administratively re-enabled.
- *
- * Memory allocation here and in called routines is only done in
- * the first case (reinit == 0).  We have to be careful, because even
- * without memory allocation, we need to re-write all the chip registers
- * TIDs, etc. after the reset or enable has completed.
- */
-int ipath_init_chip(struct ipath_devdata *dd, int reinit)
-{
-       int ret = 0;
-       u32 kpiobufs, defkbufs;
-       u32 piobufs, uports;
-       u64 val;
-       struct ipath_portdata *pd;
-       gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-
-       ret = init_housekeeping(dd, reinit);
-       if (ret)
-               goto done;
-
-       /*
-        * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
-        * but then it no longer nicely fits power of two, and since
-        * we now use routines that backend onto __get_free_pages, the
-        * rest would be wasted.
-        */
-       dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
-                        dd->ipath_rcvhdrcnt);
-
-       /*
-        * Set up the shadow copies of the piobufavail registers,
-        * which we compare against the chip registers for now, and
-        * the in memory DMA'ed copies of the registers.  This has to
-        * be done early, before we calculate lastport, etc.
-        */
-       piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /*
-        * calc number of pioavail registers, and save it; we have 2
-        * bits per buffer.
-        */
-       dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
-               / (sizeof(u64) * BITS_PER_BYTE / 2);
-       uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
-       if (piobufs > 144)
-               defkbufs = 32 + dd->ipath_pioreserved;
-       else
-               defkbufs = 16 + dd->ipath_pioreserved;
-
-       if (ipath_kpiobufs && (ipath_kpiobufs +
-               (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
-               int i = (int) piobufs -
-                       (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
-               if (i < 1)
-                       i = 1;
-               dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
-                        "%d for kernel leaves too few for %d user ports "
-                        "(%d each); using %u\n", ipath_kpiobufs,
-                        piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
-               /*
-                * shouldn't change ipath_kpiobufs, because could be
-                * different for different devices...
-                */
-               kpiobufs = i;
-       } else if (ipath_kpiobufs)
-               kpiobufs = ipath_kpiobufs;
-       else
-               kpiobufs = defkbufs;
-       dd->ipath_lastport_piobuf = piobufs - kpiobufs;
-       dd->ipath_pbufsport =
-               uports ? dd->ipath_lastport_piobuf / uports : 0;
-       /* if not an even divisor, some user ports get extra buffers */
-       dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
-               (dd->ipath_pbufsport * uports);
-       if (dd->ipath_ports_extrabuf)
-               ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
-                       "ports <= %u\n", dd->ipath_pbufsport,
-                       dd->ipath_ports_extrabuf);
-       dd->ipath_lastpioindex = 0;
-       dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
-       /* ipath_pioavailshadow initialized earlier */
-       ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
-                  "each for %u user ports\n", kpiobufs,
-                  piobufs, dd->ipath_pbufsport, uports);
-       ret = dd->ipath_f_early_init(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Early initialization failure\n");
-               goto done;
-       }
-
-       /*
-        * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
-        * done after early_init.
-        */
-       dd->ipath_hdrqlast =
-               dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
-                        dd->ipath_rcvhdrentsize);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-                        dd->ipath_rcvhdrsize);
-
-       if (!reinit) {
-               ret = init_pioavailregs(dd);
-               init_shadow_tids(dd);
-               if (ret)
-                       goto done;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
-                        dd->ipath_pioavailregs_phys);
-
-       /*
-        * this is to detect s/w errors, which the h/w works around by
-        * ignoring the low 6 bits of address, if it wasn't aligned.
-        */
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
-       if (val != dd->ipath_pioavailregs_phys) {
-               ipath_dev_err(dd, "Catastrophic software error, "
-                             "SendPIOAvailAddr written as %lx, "
-                             "read back as %llx\n",
-                             (unsigned long) dd->ipath_pioavailregs_phys,
-                             (unsigned long long) val);
-               ret = -EINVAL;
-               goto done;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
-
-       /*
-        * make sure we are not in freeze, and PIO send enabled, so
-        * writes to pbc happen
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
-       /*
-        * before error clears, since we expect serdes pll errors during
-        * this, the first time after reset
-        */
-       if (bringup_link(dd)) {
-               dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
-               ret = -ENETDOWN;
-               goto done;
-       }
-
-       /*
-        * clear any "expected" hwerrs from reset and/or initialization
-        * clear any that aren't enabled (at least this once), and then
-        * set the enable mask
-        */
-       dd->ipath_f_init_hwerrors(dd);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                        dd->ipath_hwerrmask);
-
-       /* clear all */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-       /* enable errors that are masked, at least this first time. */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                        ~dd->ipath_maskederrs);
-       dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
-       dd->ipath_errormask =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-       /* clear any interrupts up to this point (ints still not enabled) */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-       dd->ipath_f_tidtemplate(dd);
-
-       /*
-        * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
-        * re-init, the simplest way to handle this is to free
-        * existing, and re-allocate.
-        * Need to re-create rest of port 0 portdata as well.
-        */
-       pd = dd->ipath_pd[0];
-       if (reinit) {
-               struct ipath_portdata *npd;
-
-               /*
-                * Alloc and init new ipath_portdata for port0,
-                * Then free old pd. Could lead to fragmentation, but also
-                * makes later support for hot-swap easier.
-                */
-               npd = create_portdata0(dd);
-               if (npd) {
-                       ipath_free_pddata(dd, pd);
-                       dd->ipath_pd[0] = npd;
-                       pd = npd;
-               } else {
-                       ipath_dev_err(dd, "Unable to allocate portdata"
-                                     " for port 0, failing\n");
-                       ret = -ENOMEM;
-                       goto done;
-               }
-       }
-       ret = ipath_create_rcvhdrq(dd, pd);
-       if (!ret)
-               ret = create_port0_egr(dd);
-       if (ret) {
-               ipath_dev_err(dd, "failed to allocate kernel port's "
-                             "rcvhdrq and/or egr bufs\n");
-               goto done;
-       } else {
-               enable_chip(dd, reinit);
-       }
-
-       /* after enable_chip, so pioavailshadow setup */
-       ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
-
-       /*
-        * Cancel any possible active sends from early driver load.
-        * Follows early_init because some chips have to initialize
-        * PIO buffers in early_init to avoid false parity errors.
-        * After enable and ipath_chg_pioavailkernel so we can safely
-        * enable pioavail updates and PIOENABLE; packets are now
-        * ready to go out.
-        */
-       ipath_cancel_sends(dd, 1);
-
-       if (!reinit) {
-               /*
-                * Used when we close a port, for DMA already in flight
-                * at close.
-                */
-               dd->ipath_dummy_hdrq = dma_alloc_coherent(
-                       &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
-                       &dd->ipath_dummy_hdrq_phys,
-                       gfp_flags);
-               if (!dd->ipath_dummy_hdrq) {
-                       dev_info(&dd->pcidev->dev,
-                               "Couldn't allocate 0x%lx bytes for dummy hdrq\n",
-                               dd->ipath_pd[0]->port_rcvhdrq_size);
-                       /* fallback to just 0'ing */
-                       dd->ipath_dummy_hdrq_phys = 0UL;
-               }
-       }
-
-       /*
-        * cause retrigger of pending interrupts ignored during init,
-        * even if we had errors
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-
-       if (!dd->ipath_stats_timer_active) {
-               /*
-                * first init, or after an admin disable/enable
-                * set up stats retrieval timer, even if we had errors
-                * in last portion of setup
-                */
-               setup_timer(&dd->ipath_stats_timer, ipath_get_faststats,
-                               (unsigned long)dd);
-               /* every 5 seconds; */
-               dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
-               /* takes ~16 seconds to overflow at full IB 4x bandwdith */
-               add_timer(&dd->ipath_stats_timer);
-               dd->ipath_stats_timer_active = 1;
-       }
-
-       /* Set up SendDMA if chip supports it */
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               ret = setup_sdma(dd);
-
-       /* Set up HoL state */
-       setup_timer(&dd->ipath_hol_timer, ipath_hol_event, (unsigned long)dd);
-
-       dd->ipath_hol_state = IPATH_HOL_UP;
-
-done:
-       if (!ret) {
-               *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
-               if (!dd->ipath_f_intrsetup(dd)) {
-                       /* now we can enable all interrupts from the chip */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-                                        -1LL);
-                       /* force re-interrupt of any pending interrupts. */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
-                                        0ULL);
-                       /* chip is usable; mark it as initialized */
-                       *dd->ipath_statusp |= IPATH_STATUS_INITTED;
-
-                       /*
-                        * setup to verify we get an interrupt, and fallback
-                        * to an alternate if necessary and possible
-                        */
-                       if (!reinit) {
-                               setup_timer(&dd->ipath_intrchk_timer,
-                                               verify_interrupt,
-                                               (unsigned long)dd);
-                       }
-                       dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
-                       add_timer(&dd->ipath_intrchk_timer);
-               } else
-                       ipath_dev_err(dd, "No interrupts enabled, couldn't "
-                                     "setup interrupt address\n");
-
-               if (dd->ipath_cfgports > ipath_stats.sps_nports)
-                       /*
-                        * sps_nports is a global, so, we set it to
-                        * the highest number of ports of any of the
-                        * chips we find; we never decrement it, at
-                        * least for now.  Since this might have changed
-                        * over disable/enable or prior to reset, always
-                        * do the check and potentially adjust.
-                        */
-                       ipath_stats.sps_nports = dd->ipath_cfgports;
-       } else
-               ipath_dbg("Failed (%d) to initialize chip\n", ret);
-
-       /* if ret is non-zero, we probably should do some cleanup
-          here... */
-       return ret;
-}
-
-static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
-{
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       unsigned short val;
-       int ret;
-
-       ret = ipath_parse_ushort(str, &val);
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       if (ret < 0)
-               goto bail;
-
-       if (val == 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-               if (dd->ipath_kregbase)
-                       continue;
-               if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
-                          (dd->ipath_cfgports *
-                           IPATH_MIN_USER_PORT_BUFCNT)))
-               {
-                       ipath_dev_err(
-                               dd,
-                               "Allocating %d PIO bufs for kernel leaves "
-                               "too few for %d user ports (%d each)\n",
-                               val, dd->ipath_cfgports - 1,
-                               IPATH_MIN_USER_PORT_BUFCNT);
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               dd->ipath_lastport_piobuf =
-                       dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
-       }
-
-       ipath_kpiobufs = val;
-       ret = 0;
-bail:
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c
deleted file mode 100644 (file)
index 0403fa2..0000000
+++ /dev/null
@@ -1,1271 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-
-/*
- * Called when we might have an error that is specific to a particular
- * PIO buffer, and may need to cancel that buffer, so it can be re-used.
- */
-void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
-{
-       u32 piobcnt;
-       unsigned long sbuf[4];
-       /*
-        * it's possible that sendbuffererror could have bits set; might
-        * have already done this as a result of hardware error handling
-        */
-       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /* read these before writing errorclear */
-       sbuf[0] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror);
-       sbuf[1] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
-       if (piobcnt > 128)
-               sbuf[2] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
-       if (piobcnt > 192)
-               sbuf[3] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
-       else
-               sbuf[3] = 0;
-
-       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
-               int i;
-               if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
-                       time_after(dd->ipath_lastcancel, jiffies)) {
-                       __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
-                                         "SendbufErrs %lx %lx", sbuf[0],
-                                         sbuf[1]);
-                       if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
-                               printk(" %lx %lx ", sbuf[2], sbuf[3]);
-                       printk("\n");
-               }
-
-               for (i = 0; i < piobcnt; i++)
-                       if (test_bit(i, sbuf))
-                               ipath_disarm_piobufs(dd, i, 1);
-               /* ignore armlaunch errs for a bit */
-               dd->ipath_lastcancel = jiffies+3;
-       }
-}
-
-
-/* These are all rcv-related errors which we want to count for stats */
-#define E_SUM_PKTERRS \
-       (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
-        INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
-        INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
-        INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-        INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
-        INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
-
-/* These are all send-related errors which we want to count for stats */
-#define E_SUM_ERRS \
-       (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
-        INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
-        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-        INFINIPATH_E_INVALIDADDR)
-
-/*
- * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
- * errors not related to freeze and cancelling buffers.  Can't ignore
- * armlaunch because could get more while still cleaning up, and need
- * to cancel those as they happen.
- */
-#define E_SPKT_ERRS_IGNORE \
-        (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
-        INFINIPATH_E_SPKTLEN)
-
-/*
- * these are errors that can occur when the link changes state while
- * a packet is being sent or received.  This doesn't cover things
- * like EBP or VCRC that can be the result of a sending having the
- * link change state, so we receive a "known bad" packet.
- */
-#define E_SUM_LINK_PKTERRS \
-       (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-        INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-        INFINIPATH_E_RUNEXPCHAR)
-
-static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       u64 ignore_this_time = 0;
-
-       ipath_disarm_senderrbufs(dd);
-       if ((errs & E_SUM_LINK_PKTERRS) &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               /*
-                * This can happen when SMA is trying to bring the link
-                * up, but the IB link changes state at the "wrong" time.
-                * The IB logic then complains that the packet isn't
-                * valid.  We don't want to confuse people, so we just
-                * don't print them, except at debug
-                */
-               ipath_dbg("Ignoring packet errors %llx, because link not "
-                         "ACTIVE\n", (unsigned long long) errs);
-               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-       }
-
-       return ignore_this_time;
-}
-
-/* generic hw error messages... */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
-       { \
-               .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
-                         INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
-               .msg = "TXE " #a " Memory Parity"            \
-       }
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
-       { \
-               .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
-                         INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
-               .msg = "RXE " #a " Memory Parity"            \
-       }
-
-static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
-       INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
-       INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
-
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
-
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
-};
-
-/**
- * ipath_format_hwmsg - format a single hwerror message
- * @msg message buffer
- * @msgl length of message buffer
- * @hwmsg message to add to message buffer
- */
-static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
-{
-       strlcat(msg, "[", msgl);
-       strlcat(msg, hwmsg, msgl);
-       strlcat(msg, "]", msgl);
-}
-
-/**
- * ipath_format_hwerrors - format hardware error messages for display
- * @hwerrs hardware errors bit vector
- * @hwerrmsgs hardware error descriptions
- * @nhwerrmsgs number of hwerrmsgs
- * @msg message buffer
- * @msgl message buffer length
- */
-void ipath_format_hwerrors(u64 hwerrs,
-                          const struct ipath_hwerror_msgs *hwerrmsgs,
-                          size_t nhwerrmsgs,
-                          char *msg, size_t msgl)
-{
-       int i;
-       const int glen =
-           ARRAY_SIZE(ipath_generic_hwerror_msgs);
-
-       for (i=0; i<glen; i++) {
-               if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
-                       ipath_format_hwmsg(msg, msgl,
-                                          ipath_generic_hwerror_msgs[i].msg);
-               }
-       }
-
-       for (i=0; i<nhwerrmsgs; i++) {
-               if (hwerrs & hwerrmsgs[i].mask) {
-                       ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
-               }
-       }
-}
-
-/* return the strings for the most common link states */
-static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       char *ret;
-       u32 state;
-
-       state = ipath_ib_state(dd, ibcs);
-       if (state == dd->ib_init)
-               ret = "Init";
-       else if (state == dd->ib_arm)
-               ret = "Arm";
-       else if (state == dd->ib_active)
-               ret = "Active";
-       else
-               ret = "Down";
-       return ret;
-}
-
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
-{
-       struct ib_event event;
-
-       event.device = &dd->verbs_dev->ibdev;
-       event.element.port_num = 1;
-       event.event = ev;
-       ib_dispatch_event(&event);
-}
-
-static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
-                                    ipath_err_t errs)
-{
-       u32 ltstate, lstate, ibstate, lastlstate;
-       u32 init = dd->ib_init;
-       u32 arm = dd->ib_arm;
-       u32 active = dd->ib_active;
-       const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-
-       lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
-       ibstate = ipath_ib_state(dd, ibcs);
-       /* linkstate at last interrupt */
-       lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-       ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
-
-       /*
-        * Since going into a recovery state causes the link state to go
-        * down and since recovery is transitory, it is better if we "miss"
-        * ever seeing the link training state go into recovery (i.e.,
-        * ignore this transition for link state special handling purposes)
-        * without even updating ipath_lastibcstat.
-        */
-       if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
-           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
-           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
-               goto done;
-
-       /*
-        * if linkstate transitions into INIT from any of the various down
-        * states, or if it transitions from any of the up (INIT or better)
-        * states into any of the down states (except link recovery), then
-        * call the chip-specific code to take appropriate actions.
-        */
-       if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
-               lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
-               /* transitioned to UP */
-               if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
-                       /* link came up, so we must no longer be disabled */
-                       dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-                       ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
-                       goto skip_ibchange; /* chip-code handled */
-               }
-       } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
-               (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
-               ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
-               ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
-               int handled;
-               handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
-               dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
-               if (handled) {
-                       ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
-                       goto skip_ibchange; /* chip-code handled */
-               }
-       }
-
-       /*
-        * Significant enough to always print and get into logs, if it was
-        * unexpected.  If it was a requested state change, we'll have
-        * already cleared the flags, so we won't print this warning
-        */
-       if ((ibstate != arm && ibstate != active) &&
-           (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
-               dev_info(&dd->pcidev->dev, "Link state changed from %s "
-                        "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
-                        "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
-       }
-
-       if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-           ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-               u32 lastlts;
-               lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-               /*
-                * Ignore cycling back and forth from Polling.Active to
-                * Polling.Quiet while waiting for the other end of the link
-                * to come up, except to try and decide if we are connected
-                * to a live IB device or not.  We will cycle back and
-                * forth between them if no cable is plugged in, the other
-                * device is powered off or disabled, etc.
-                */
-               if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-                   lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-                       if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
-                            (++dd->ipath_ibpollcnt == 40)) {
-                               dd->ipath_flags |= IPATH_NOCABLE;
-                               *dd->ipath_statusp |=
-                                       IPATH_STATUS_IB_NOCABLE;
-                               ipath_cdbg(LINKVERB, "Set NOCABLE\n");
-                       }
-                       ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
-                               ipath_ibcstatus_str[ltstate], ibstate);
-                       goto skip_ibchange;
-               }
-       }
-
-       dd->ipath_ibpollcnt = 0; /* not poll*, now */
-       ipath_stats.sps_iblink++;
-
-       if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
-               u64 linkrecov;
-               linkrecov = ipath_snap_cntr(dd,
-                       dd->ipath_cregs->cr_iblinkerrrecovcnt);
-               if (linkrecov != dd->ipath_lastlinkrecov) {
-                       ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
-                               (unsigned long long) ibcs,
-                               ib_linkstate(dd, ibcs),
-                               ipath_ibcstatus_str[ltstate],
-                               (unsigned long long) linkrecov);
-                       /* and no more until active again */
-                       dd->ipath_lastlinkrecov = 0;
-                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-                       goto skip_ibchange;
-               }
-       }
-
-       if (ibstate == init || ibstate == arm || ibstate == active) {
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
-               if (ibstate == init || ibstate == arm) {
-                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-                       if (dd->ipath_flags & IPATH_LINKACTIVE)
-                               signal_ib_event(dd, IB_EVENT_PORT_ERR);
-               }
-               if (ibstate == arm) {
-                       dd->ipath_flags |= IPATH_LINKARMED;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK |
-                               IPATH_LINKINIT | IPATH_LINKDOWN |
-                               IPATH_LINKACTIVE | IPATH_NOCABLE);
-                       ipath_hol_down(dd);
-               } else  if (ibstate == init) {
-                       /*
-                        * set INIT and DOWN.  Down is checked by
-                        * most of the other code, but INIT is
-                        * useful to know in a few places.
-                        */
-                       dd->ipath_flags |= IPATH_LINKINIT |
-                               IPATH_LINKDOWN;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK |
-                               IPATH_LINKARMED | IPATH_LINKACTIVE |
-                               IPATH_NOCABLE);
-                       ipath_hol_down(dd);
-               } else {  /* active */
-                       dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
-                               dd->ipath_cregs->cr_iblinkerrrecovcnt);
-                       *dd->ipath_statusp |=
-                               IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
-                       dd->ipath_flags |= IPATH_LINKACTIVE;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                               | IPATH_LINKDOWN | IPATH_LINKARMED |
-                               IPATH_NOCABLE);
-                       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-                               ipath_restart_sdma(dd);
-                       signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
-                       /* LED active not handled in chip _f_updown */
-                       dd->ipath_f_setextled(dd, lstate, ltstate);
-                       ipath_hol_up(dd);
-               }
-
-               /*
-                * print after we've already done the work, so as not to
-                * delay the state changes and notifications, for debugging
-                */
-               if (lstate == lastlstate)
-                       ipath_cdbg(LINKVERB, "Unchanged from last: %s "
-                               "(%x)\n", ib_linkstate(dd, ibcs), ibstate);
-               else
-                       ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
-                                 dd->ipath_unit, ib_linkstate(dd, ibcs),
-                                 ipath_ibcstatus_str[ltstate],  ibstate);
-       } else { /* down */
-               if (dd->ipath_flags & IPATH_LINKACTIVE)
-                       signal_ib_event(dd, IB_EVENT_PORT_ERR);
-               dd->ipath_flags |= IPATH_LINKDOWN;
-               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                                    | IPATH_LINKACTIVE |
-                                    IPATH_LINKARMED);
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-               dd->ipath_lli_counter = 0;
-
-               if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
-                       ipath_cdbg(VERBOSE, "Unit %u link state down "
-                                  "(state 0x%x), from %s\n",
-                                  dd->ipath_unit, lstate,
-                                  ib_linkstate(dd, dd->ipath_lastibcstat));
-               else
-                       ipath_cdbg(LINKVERB, "Unit %u link state changed "
-                                  "to %s (0x%x) from down (%x)\n",
-                                  dd->ipath_unit,
-                                  ipath_ibcstatus_str[ltstate],
-                                  ibstate, lastlstate);
-       }
-
-skip_ibchange:
-       dd->ipath_lastibcstat = ibcs;
-done:
-       return;
-}
-
-static void handle_supp_msgs(struct ipath_devdata *dd,
-                            unsigned supp_msgs, char *msg, u32 msgsz)
-{
-       /*
-        * Print the message unless it's ibc status change only, which
-        * happens so often we never want to count it.
-        */
-       if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
-               int iserr;
-               ipath_err_t mask;
-               iserr = ipath_decode_err(dd, msg, msgsz,
-                                        dd->ipath_lasterror &
-                                        ~INFINIPATH_E_IBSTATUSCHANGED);
-
-               mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                       INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
-
-               /* if we're in debug, then don't mask SDMADISABLED msgs */
-               if (ipath_debug & __IPATH_DBG)
-                       mask &= ~INFINIPATH_E_SDMADISABLED;
-
-               if (dd->ipath_lasterror & ~mask)
-                       ipath_dev_err(dd, "Suppressed %u messages for "
-                                     "fast-repeating errors (%s) (%llx)\n",
-                                     supp_msgs, msg,
-                                     (unsigned long long)
-                                     dd->ipath_lasterror);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal", for some
-                        * types of processes (mostly benchmarks) that send
-                        * huge numbers of messages, while not processing
-                        * them. So only complain about these at debug
-                        * level.
-                        */
-                       if (iserr)
-                               ipath_dbg("Suppressed %u messages for %s\n",
-                                         supp_msgs, msg);
-                       else
-                               ipath_cdbg(ERRPKT,
-                                       "Suppressed %u messages for %s\n",
-                                         supp_msgs, msg);
-               }
-       }
-}
-
-static unsigned handle_frequent_errors(struct ipath_devdata *dd,
-                                      ipath_err_t errs, char *msg,
-                                      u32 msgsz, int *noprint)
-{
-       unsigned long nc;
-       static unsigned long nextmsg_time;
-       static unsigned nmsgs, supp_msgs;
-
-       /*
-        * Throttle back "fast" messages to no more than 10 per 5 seconds.
-        * This isn't perfect, but it's a reasonable heuristic. If we get
-        * more than 10, give a 6x longer delay.
-        */
-       nc = jiffies;
-       if (nmsgs > 10) {
-               if (time_before(nc, nextmsg_time)) {
-                       *noprint = 1;
-                       if (!supp_msgs++)
-                               nextmsg_time = nc + HZ * 3;
-               } else if (supp_msgs) {
-                       handle_supp_msgs(dd, supp_msgs, msg, msgsz);
-                       supp_msgs = 0;
-                       nmsgs = 0;
-               }
-       } else if (!nmsgs++ || time_after(nc, nextmsg_time)) {
-               nextmsg_time = nc + HZ / 2;
-       }
-
-       return supp_msgs;
-}
-
-static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       unsigned long flags;
-       int expected;
-
-       if (ipath_debug & __IPATH_DBG) {
-               char msg[128];
-               ipath_decode_err(dd, msg, sizeof msg, errs &
-                       INFINIPATH_E_SDMAERRS);
-               ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
-       }
-       if (ipath_debug & __IPATH_VERBDBG) {
-               unsigned long tl, hd, status, lengen;
-               tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-               hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-               status = ipath_read_kreg64(dd
-                       , dd->ipath_kregs->kr_senddmastatus);
-               lengen = ipath_read_kreg64(dd,
-                       dd->ipath_kregs->kr_senddmalengen);
-               ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
-                       "lengen 0x%lx\n", tl, hd, status, lengen);
-       }
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-       expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!expected)
-               ipath_cancel_sends(dd, 1);
-}
-
-static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
-{
-       unsigned long flags;
-       int expected;
-
-       if ((istat & INFINIPATH_I_SDMAINT) &&
-           !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               ipath_sdma_intr(dd);
-
-       if (istat & INFINIPATH_I_SDMADISABLED) {
-               expected = test_bit(IPATH_SDMA_ABORTING,
-                       &dd->ipath_sdma_status);
-               ipath_dbg("%s SDmaDisabled intr\n",
-                       expected ? "expected" : "unexpected");
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-               if (!expected)
-                       ipath_cancel_sends(dd, 1);
-               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-       }
-}
-
-static int handle_hdrq_full(struct ipath_devdata *dd)
-{
-       int chkerrpkts = 0;
-       u32 hd, tl;
-       u32 i;
-
-       ipath_stats.sps_hdrqfull++;
-       for (i = 0; i < dd->ipath_cfgports; i++) {
-               struct ipath_portdata *pd = dd->ipath_pd[i];
-
-               if (i == 0) {
-                       /*
-                        * For kernel receive queues, we just want to know
-                        * if there are packets in the queue that we can
-                        * process.
-                        */
-                       if (pd->port_head != ipath_get_hdrqtail(pd))
-                               chkerrpkts |= 1 << i;
-                       continue;
-               }
-
-               /* Skip if user context is not open */
-               if (!pd || !pd->port_cnt)
-                       continue;
-
-               /* Don't report the same point multiple times. */
-               if (dd->ipath_flags & IPATH_NODMA_RTAIL)
-                       tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
-               else
-                       tl = ipath_get_rcvhdrtail(pd);
-               if (tl == pd->port_lastrcvhdrqtail)
-                       continue;
-
-               hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
-               if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
-                       pd->port_lastrcvhdrqtail = tl;
-                       pd->port_hdrqfull++;
-                       /* flush hdrqfull so that poll() sees it */
-                       wmb();
-                       wake_up_interruptible(&pd->port_wait);
-               }
-       }
-
-       return chkerrpkts;
-}
-
-static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       char msg[128];
-       u64 ignore_this_time = 0;
-       u64 iserr = 0;
-       int chkerrpkts = 0, noprint = 0;
-       unsigned supp_msgs;
-       int log_idx;
-
-       /*
-        * don't report errors that are masked, either at init
-        * (not set in ipath_errormask), or temporarily (set in
-        * ipath_maskederrs)
-        */
-       errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
-
-       supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
-               &noprint);
-
-       /* do these first, they are most important */
-       if (errs & INFINIPATH_E_HARDWARE) {
-               /* reuse same msg buf */
-               dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
-       } else {
-               u64 mask;
-               for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
-                       mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
-                       if (errs & mask)
-                               ipath_inc_eeprom_err(dd, log_idx, 1);
-               }
-       }
-
-       if (errs & INFINIPATH_E_SDMAERRS)
-               handle_sdma_errors(dd, errs);
-
-       if (!noprint && (errs & ~dd->ipath_e_bitsextant))
-               ipath_dev_err(dd, "error interrupt with unknown errors "
-                             "%llx set\n", (unsigned long long)
-                             (errs & ~dd->ipath_e_bitsextant));
-
-       if (errs & E_SUM_ERRS)
-               ignore_this_time = handle_e_sum_errs(dd, errs);
-       else if ((errs & E_SUM_LINK_PKTERRS) &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               /*
-                * This can happen when SMA is trying to bring the link
-                * up, but the IB link changes state at the "wrong" time.
-                * The IB logic then complains that the packet isn't
-                * valid.  We don't want to confuse people, so we just
-                * don't print them, except at debug
-                */
-               ipath_dbg("Ignoring packet errors %llx, because link not "
-                         "ACTIVE\n", (unsigned long long) errs);
-               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-       }
-
-       if (supp_msgs == 250000) {
-               int s_iserr;
-               /*
-                * It's not entirely reasonable assuming that the errors set
-                * in the last clear period are all responsible for the
-                * problem, but the alternative is to assume it's the only
-                * ones on this particular interrupt, which also isn't great
-                */
-               dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
-
-               dd->ipath_errormask &= ~dd->ipath_maskederrs;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                                dd->ipath_errormask);
-               s_iserr = ipath_decode_err(dd, msg, sizeof msg,
-                                          dd->ipath_maskederrs);
-
-               if (dd->ipath_maskederrs &
-                   ~(INFINIPATH_E_RRCVEGRFULL |
-                     INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
-                       ipath_dev_err(dd, "Temporarily disabling "
-                           "error(s) %llx reporting; too frequent (%s)\n",
-                               (unsigned long long) dd->ipath_maskederrs,
-                               msg);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal",
-                        * for some types of processes (mostly benchmarks)
-                        * that send huge numbers of messages, while not
-                        * processing them.  So only complain about
-                        * these at debug level.
-                        */
-                       if (s_iserr)
-                               ipath_dbg("Temporarily disabling reporting "
-                                   "too frequent queue full errors (%s)\n",
-                                   msg);
-                       else
-                               ipath_cdbg(ERRPKT,
-                                   "Temporarily disabling reporting too"
-                                   " frequent packet errors (%s)\n",
-                                   msg);
-               }
-
-               /*
-                * Re-enable the masked errors after around 3 minutes.  in
-                * ipath_get_faststats().  If we have a series of fast
-                * repeating but different errors, the interval will keep
-                * stretching out, but that's OK, as that's pretty
-                * catastrophic.
-                */
-               dd->ipath_unmasktime = jiffies + HZ * 180;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
-       if (ignore_this_time)
-               errs &= ~ignore_this_time;
-       if (errs & ~dd->ipath_lasterror) {
-               errs &= ~dd->ipath_lasterror;
-               /* never suppress duplicate hwerrors or ibstatuschange */
-               dd->ipath_lasterror |= errs &
-                       ~(INFINIPATH_E_HARDWARE |
-                         INFINIPATH_E_IBSTATUSCHANGED);
-       }
-
-       if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
-               dd->ipath_spectriggerhit++;
-               ipath_dbg("%lu special trigger hits\n",
-                       dd->ipath_spectriggerhit);
-       }
-
-       /* likely due to cancel; so suppress message unless verbose */
-       if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
-               time_after(dd->ipath_lastcancel, jiffies)) {
-               /* armlaunch takes precedence; it often causes both. */
-               ipath_cdbg(VERBOSE,
-                       "Suppressed %s error (%llx) after sendbuf cancel\n",
-                       (errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
-                       "armlaunch" : "sendpktlen", (unsigned long long)errs);
-               errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
-       }
-
-       if (!errs)
-               return 0;
-
-       if (!noprint) {
-               ipath_err_t mask;
-               /*
-                * The ones we mask off are handled specially below
-                * or above.  Also mask SDMADISABLED by default as it
-                * is too chatty.
-                */
-               mask = INFINIPATH_E_IBSTATUSCHANGED |
-                       INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                       INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
-
-               /* if we're in debug, then don't mask SDMADISABLED msgs */
-               if (ipath_debug & __IPATH_DBG)
-                       mask &= ~INFINIPATH_E_SDMADISABLED;
-
-               ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
-       } else
-               /* so we don't need if (!noprint) at strlcat's below */
-               *msg = 0;
-
-       if (errs & E_SUM_PKTERRS) {
-               ipath_stats.sps_pkterrs++;
-               chkerrpkts = 1;
-       }
-       if (errs & E_SUM_ERRS)
-               ipath_stats.sps_errs++;
-
-       if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
-               ipath_stats.sps_crcerrs++;
-               chkerrpkts = 1;
-       }
-       iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
-
-
-       /*
-        * We don't want to print these two as they happen, or we can make
-        * the situation even worse, because it takes so long to print
-        * messages to serial consoles.  Kernel ports get printed from
-        * fast_stats, no more than every 5 seconds, user ports get printed
-        * on close
-        */
-       if (errs & INFINIPATH_E_RRCVHDRFULL)
-               chkerrpkts |= handle_hdrq_full(dd);
-       if (errs & INFINIPATH_E_RRCVEGRFULL) {
-               struct ipath_portdata *pd = dd->ipath_pd[0];
-
-               /*
-                * since this is of less importance and not likely to
-                * happen without also getting hdrfull, only count
-                * occurrences; don't check each port (or even the kernel
-                * vs user)
-                */
-               ipath_stats.sps_etidfull++;
-               if (pd->port_head != ipath_get_hdrqtail(pd))
-                       chkerrpkts |= 1;
-       }
-
-       /*
-        * do this before IBSTATUSCHANGED, in case both bits set in a single
-        * interrupt; we want the STATUSCHANGE to "win", so we do our
-        * internal copy of state machine correctly
-        */
-       if (errs & INFINIPATH_E_RIBLOSTLINK) {
-               /*
-                * force through block below
-                */
-               errs |= INFINIPATH_E_IBSTATUSCHANGED;
-               ipath_stats.sps_iblink++;
-               dd->ipath_flags |= IPATH_LINKDOWN;
-               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                                    | IPATH_LINKARMED | IPATH_LINKACTIVE);
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-
-               ipath_dbg("Lost link, link now down (%s)\n",
-                       ipath_ibcstatus_str[ipath_read_kreg64(dd,
-                       dd->ipath_kregs->kr_ibcstatus) & 0xf]);
-       }
-       if (errs & INFINIPATH_E_IBSTATUSCHANGED)
-               handle_e_ibstatuschanged(dd, errs);
-
-       if (errs & INFINIPATH_E_RESET) {
-               if (!noprint)
-                       ipath_dev_err(dd, "Got reset, requires re-init "
-                                     "(unload and reload driver)\n");
-               dd->ipath_flags &= ~IPATH_INITTED;      /* needs re-init */
-               /* mark as having had error */
-               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
-       }
-
-       if (!noprint && *msg) {
-               if (iserr)
-                       ipath_dev_err(dd, "%s error\n", msg);
-       }
-       if (dd->ipath_state_wanted & dd->ipath_flags) {
-               ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
-                          "waking\n", dd->ipath_state_wanted,
-                          dd->ipath_flags);
-               wake_up_interruptible(&ipath_state_wait);
-       }
-
-       return chkerrpkts;
-}
-
-/*
- * try to cleanup as much as possible for anything that might have gone
- * wrong while in freeze mode, such as pio buffers being written by user
- * processes (causing armlaunch), send errors due to going into freeze mode,
- * etc., and try to avoid causing extra interrupts while doing so.
- * Forcibly update the in-memory pioavail register copies after cleanup
- * because the chip won't do it while in freeze mode (the register values
- * themselves are kept correct).
- * Make sure that we don't lose any important interrupts by using the chip
- * feature that says that writing 0 to a bit in *clear that is set in
- * *status will cause an interrupt to be generated again (if allowed by
- * the *mask value).
- */
-void ipath_clear_freeze(struct ipath_devdata *dd)
-{
-       /* disable error interrupts, to avoid confusion */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
-
-       /* also disable interrupts; errormask is sometimes overwriten */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-       ipath_cancel_sends(dd, 1);
-
-       /* clear the freeze, and be sure chip saw it */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-       /* force in-memory update now we are out of freeze */
-       ipath_force_pio_avail_update(dd);
-
-       /*
-        * force new interrupt if any hwerr, error or interrupt bits are
-        * still set, and clear "safe" send packet errors related to freeze
-        * and cancelling sends.  Re-enable error interrupts before possible
-        * force of re-interrupt on pending interrupts.
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-               E_SPKT_ERRS_IGNORE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-}
-
-
-/* this is separate to allow for better optimization of ipath_intr() */
-
-static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
-{
-       /*
-        * sometimes happen during driver init and unload, don't want
-        * to process any interrupts at that point
-        */
-
-       /* this is just a bandaid, not a fix, if something goes badly
-        * wrong */
-       if (++*unexpectp > 100) {
-               if (++*unexpectp > 105) {
-                       /*
-                        * ok, we must be taking somebody else's interrupts,
-                        * due to a messed up mptable and/or PIRQ table, so
-                        * unregister the interrupt.  We've seen this during
-                        * linuxbios development work, and it may happen in
-                        * the future again.
-                        */
-                       if (dd->pcidev && dd->ipath_irq) {
-                               ipath_dev_err(dd, "Now %u unexpected "
-                                             "interrupts, unregistering "
-                                             "interrupt handler\n",
-                                             *unexpectp);
-                               ipath_dbg("free_irq of irq %d\n",
-                                         dd->ipath_irq);
-                               dd->ipath_f_free_irq(dd);
-                       }
-               }
-               if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
-                       ipath_dev_err(dd, "%u unexpected interrupts, "
-                                     "disabling interrupts completely\n",
-                                     *unexpectp);
-                       /*
-                        * disable all interrupts, something is very wrong
-                        */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-                                        0ULL);
-               }
-       } else if (*unexpectp > 1)
-               ipath_dbg("Interrupt when not ready, should not happen, "
-                         "ignoring\n");
-}
-
-static noinline void ipath_bad_regread(struct ipath_devdata *dd)
-{
-       static int allbits;
-
-       /* separate routine, for better optimization of ipath_intr() */
-
-       /*
-        * We print the message and disable interrupts, in hope of
-        * having a better chance of debugging the problem.
-        */
-       ipath_dev_err(dd,
-                     "Read of interrupt status failed (all bits set)\n");
-       if (allbits++) {
-               /* disable all interrupts, something is very wrong */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-               if (allbits == 2) {
-                       ipath_dev_err(dd, "Still bad interrupt status, "
-                                     "unregistering interrupt\n");
-                       dd->ipath_f_free_irq(dd);
-               } else if (allbits > 2) {
-                       if ((allbits % 10000) == 0)
-                               printk(".");
-               } else
-                       ipath_dev_err(dd, "Disabling interrupts, "
-                                     "multiple errors\n");
-       }
-}
-
-static void handle_layer_pioavail(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int ret;
-
-       ret = ipath_ib_piobufavail(dd->verbs_dev);
-       if (ret > 0)
-               goto set;
-
-       return;
-set:
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                        dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-/*
- * Handle receive interrupts for user ports; this means a user
- * process was waiting for a packet to arrive, and didn't want
- * to poll
- */
-static void handle_urcv(struct ipath_devdata *dd, u64 istat)
-{
-       u64 portr;
-       int i;
-       int rcvdint = 0;
-
-       /*
-        * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
-        * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
-        * would both like timely updates of the bits so that
-        * we don't pass them by unnecessarily.  the rmb()
-        * here ensures that we see them promptly -- the
-        * corresponding wmb()'s are in ipath_poll_urgent()
-        * and ipath_poll_next()...
-        */
-       rmb();
-       portr = ((istat >> dd->ipath_i_rcvavail_shift) &
-                dd->ipath_i_rcvavail_mask) |
-               ((istat >> dd->ipath_i_rcvurg_shift) &
-                dd->ipath_i_rcvurg_mask);
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               struct ipath_portdata *pd = dd->ipath_pd[i];
-
-               if (portr & (1 << i) && pd && pd->port_cnt) {
-                       if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
-                                              &pd->port_flag)) {
-                               clear_bit(i + dd->ipath_r_intravail_shift,
-                                         &dd->ipath_rcvctrl);
-                               wake_up_interruptible(&pd->port_wait);
-                               rcvdint = 1;
-                       } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
-                                                     &pd->port_flag)) {
-                               pd->port_urgent++;
-                               wake_up_interruptible(&pd->port_wait);
-                       }
-               }
-       }
-       if (rcvdint) {
-               /* only want to take one interrupt, so turn off the rcv
-                * interrupt for all the ports that we set the rcv_waiting
-                * (but never for kernel port)
-                */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                                dd->ipath_rcvctrl);
-       }
-}
-
-irqreturn_t ipath_intr(int irq, void *data)
-{
-       struct ipath_devdata *dd = data;
-       u64 istat, chk0rcv = 0;
-       ipath_err_t estat = 0;
-       irqreturn_t ret;
-       static unsigned unexpected = 0;
-       u64 kportrbits;
-
-       ipath_stats.sps_ints++;
-
-       if (dd->ipath_int_counter != (u32) -1)
-               dd->ipath_int_counter++;
-
-       if (!(dd->ipath_flags & IPATH_PRESENT)) {
-               /*
-                * This return value is not great, but we do not want the
-                * interrupt core code to remove our interrupt handler
-                * because we don't appear to be handling an interrupt
-                * during a chip reset.
-                */
-               return IRQ_HANDLED;
-       }
-
-       /*
-        * this needs to be flags&initted, not statusp, so we keep
-        * taking interrupts even after link goes down, etc.
-        * Also, we *must* clear the interrupt at some point, or we won't
-        * take it again, which can be real bad for errors, etc...
-        */
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               ipath_bad_intr(dd, &unexpected);
-               ret = IRQ_NONE;
-               goto bail;
-       }
-
-       istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
-
-       if (unlikely(!istat)) {
-               ipath_stats.sps_nullintr++;
-               ret = IRQ_NONE; /* not our interrupt, or already handled */
-               goto bail;
-       }
-       if (unlikely(istat == -1)) {
-               ipath_bad_regread(dd);
-               /* don't know if it was our interrupt or not */
-               ret = IRQ_NONE;
-               goto bail;
-       }
-
-       if (unexpected)
-               unexpected = 0;
-
-       if (unlikely(istat & ~dd->ipath_i_bitsextant))
-               ipath_dev_err(dd,
-                             "interrupt with unknown interrupts %Lx set\n",
-                             (unsigned long long)
-                             istat & ~dd->ipath_i_bitsextant);
-       else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
-               ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
-                       (unsigned long long) istat);
-
-       if (istat & INFINIPATH_I_ERROR) {
-               ipath_stats.sps_errints++;
-               estat = ipath_read_kreg64(dd,
-                                         dd->ipath_kregs->kr_errorstatus);
-               if (!estat)
-                       dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
-                                "but no error bits set!\n",
-                                (unsigned long long) istat);
-               else if (estat == -1LL)
-                       /*
-                        * should we try clearing all, or hope next read
-                        * works?
-                        */
-                       ipath_dev_err(dd, "Read of error status failed "
-                                     "(all bits set); ignoring\n");
-               else
-                       chk0rcv |= handle_errors(dd, estat);
-       }
-
-       if (istat & INFINIPATH_I_GPIO) {
-               /*
-                * GPIO interrupts fall in two broad classes:
-                * GPIO_2 indicates (on some HT4xx boards) that a packet
-                *        has arrived for Port 0. Checking for this
-                *        is controlled by flag IPATH_GPIO_INTR.
-                * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
-                *        errors that we need to count. Checking for this
-                *        is controlled by flag IPATH_GPIO_ERRINTRS.
-                */
-               u32 gpiostatus;
-               u32 to_clear = 0;
-
-               gpiostatus = ipath_read_kreg32(
-                       dd, dd->ipath_kregs->kr_gpio_status);
-               /* First the error-counter case. */
-               if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
-                   (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
-                       /* want to clear the bits we see asserted. */
-                       to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
-
-                       /*
-                        * Count appropriately, clear bits out of our copy,
-                        * as they have been "handled".
-                        */
-                       if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
-                               ipath_dbg("FlowCtl on UnsupVL\n");
-                               dd->ipath_rxfc_unsupvl_errs++;
-                       }
-                       if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
-                               ipath_dbg("Overrun Threshold exceeded\n");
-                               dd->ipath_overrun_thresh_errs++;
-                       }
-                       if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
-                               ipath_dbg("Local Link Integrity error\n");
-                               dd->ipath_lli_errs++;
-                       }
-                       gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
-               }
-               /* Now the Port0 Receive case */
-               if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
-                   (dd->ipath_flags & IPATH_GPIO_INTR)) {
-                       /*
-                        * GPIO status bit 2 is set, and we expected it.
-                        * clear it and indicate in p0bits.
-                        * This probably only happens if a Port0 pkt
-                        * arrives at _just_ the wrong time, and we
-                        * handle that by seting chk0rcv;
-                        */
-                       to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
-                       gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
-                       chk0rcv = 1;
-               }
-               if (gpiostatus) {
-                       /*
-                        * Some unexpected bits remain. If they could have
-                        * caused the interrupt, complain and clear.
-                        * To avoid repetition of this condition, also clear
-                        * the mask. It is almost certainly due to error.
-                        */
-                       const u32 mask = (u32) dd->ipath_gpio_mask;
-
-                       if (mask & gpiostatus) {
-                               ipath_dbg("Unexpected GPIO IRQ bits %x\n",
-                                 gpiostatus & mask);
-                               to_clear |= (gpiostatus & mask);
-                               dd->ipath_gpio_mask &= ~(gpiostatus & mask);
-                               ipath_write_kreg(dd,
-                                       dd->ipath_kregs->kr_gpio_mask,
-                                       dd->ipath_gpio_mask);
-                       }
-               }
-               if (to_clear) {
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-                                       (u64) to_clear);
-               }
-       }
-
-       /*
-        * Clear the interrupt bits we found set, unless they are receive
-        * related, in which case we already cleared them above, and don't
-        * want to clear them again, because we might lose an interrupt.
-        * Clear it early, so we "know" know the chip will have seen this by
-        * the time we process the queue, and will re-interrupt if necessary.
-        * The processor itself won't take the interrupt again until we return.
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
-
-       /*
-        * Handle kernel receive queues before checking for pio buffers
-        * available since receives can overflow; piobuf waiters can afford
-        * a few extra cycles, since they were waiting anyway, and user's
-        * waiting for receive are at the bottom.
-        */
-       kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
-               (1ULL << dd->ipath_i_rcvurg_shift);
-       if (chk0rcv || (istat & kportrbits)) {
-               istat &= ~kportrbits;
-               ipath_kreceive(dd->ipath_pd[0]);
-       }
-
-       if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
-                    (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
-               handle_urcv(dd, istat);
-
-       if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
-               handle_sdma_intr(dd, istat);
-
-       if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-               /* always process; sdma verbs uses PIO for acks and VL15  */
-               handle_layer_pioavail(dd);
-       }
-
-       ret = IRQ_HANDLED;
-
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h
deleted file mode 100644 (file)
index 66c934a..0000000
+++ /dev/null
@@ -1,1374 +0,0 @@
-#ifndef _IPATH_KERNEL_H
-#define _IPATH_KERNEL_H
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This header file is the base header file for infinipath kernel code
- * ipath_user.h serves a similar purpose for user code.
- */
-
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/scatterlist.h>
-#include <linux/sched.h>
-#include <asm/io.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_common.h"
-#include "ipath_debug.h"
-#include "ipath_registers.h"
-
-/* only s/w major version of InfiniPath we can handle */
-#define IPATH_CHIP_VERS_MAJ 2U
-
-/* don't care about this except printing */
-#define IPATH_CHIP_VERS_MIN 0U
-
-/* temporary, maybe always */
-extern struct infinipath_stats ipath_stats;
-
-#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
-/*
- * First-cut critierion for "device is active" is
- * two thousand dwords combined Tx, Rx traffic per
- * 5-second interval. SMA packets are 64 dwords,
- * and occur "a few per second", presumably each way.
- */
-#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
-/*
- * Struct used to indicate which errors are logged in each of the
- * error-counters that are logged to EEPROM. A counter is incremented
- * _once_ (saturating at 255) for each event with any bits set in
- * the error or hwerror register masks below.
- */
-#define IPATH_EEP_LOG_CNT (4)
-struct ipath_eep_log_mask {
-       u64 errs_to_log;
-       u64 hwerrs_to_log;
-};
-
-struct ipath_portdata {
-       void **port_rcvegrbuf;
-       dma_addr_t *port_rcvegrbuf_phys;
-       /* rcvhdrq base, needs mmap before useful */
-       void *port_rcvhdrq;
-       /* kernel virtual address where hdrqtail is updated */
-       void *port_rcvhdrtail_kvaddr;
-       /*
-        * temp buffer for expected send setup, allocated at open, instead
-        * of each setup call
-        */
-       void *port_tid_pg_list;
-       /* when waiting for rcv or pioavail */
-       wait_queue_head_t port_wait;
-       /*
-        * rcvegr bufs base, physical, must fit
-        * in 44 bits so 32 bit programs mmap64 44 bit works)
-        */
-       dma_addr_t port_rcvegr_phys;
-       /* mmap of hdrq, must fit in 44 bits */
-       dma_addr_t port_rcvhdrq_phys;
-       dma_addr_t port_rcvhdrqtailaddr_phys;
-       /*
-        * number of opens (including slave subports) on this instance
-        * (ignoring forks, dup, etc. for now)
-        */
-       int port_cnt;
-       /*
-        * how much space to leave at start of eager TID entries for
-        * protocol use, on each TID
-        */
-       /* instead of calculating it */
-       unsigned port_port;
-       /* non-zero if port is being shared. */
-       u16 port_subport_cnt;
-       /* non-zero if port is being shared. */
-       u16 port_subport_id;
-       /* number of pio bufs for this port (all procs, if shared) */
-       u32 port_piocnt;
-       /* first pio buffer for this port */
-       u32 port_pio_base;
-       /* chip offset of PIO buffers for this port */
-       u32 port_piobufs;
-       /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
-       u32 port_rcvegrbuf_chunks;
-       /* how many egrbufs per chunk */
-       u32 port_rcvegrbufs_perchunk;
-       /* order for port_rcvegrbuf_pages */
-       size_t port_rcvegrbuf_size;
-       /* rcvhdrq size (for freeing) */
-       size_t port_rcvhdrq_size;
-       /* next expected TID to check when looking for free */
-       u32 port_tidcursor;
-       /* next expected TID to check */
-       unsigned long port_flag;
-       /* what happened */
-       unsigned long int_flag;
-       /* WAIT_RCV that timed out, no interrupt */
-       u32 port_rcvwait_to;
-       /* WAIT_PIO that timed out, no interrupt */
-       u32 port_piowait_to;
-       /* WAIT_RCV already happened, no wait */
-       u32 port_rcvnowait;
-       /* WAIT_PIO already happened, no wait */
-       u32 port_pionowait;
-       /* total number of rcvhdrqfull errors */
-       u32 port_hdrqfull;
-       /*
-        * Used to suppress multiple instances of same
-        * port staying stuck at same point.
-        */
-       u32 port_lastrcvhdrqtail;
-       /* saved total number of rcvhdrqfull errors for poll edge trigger */
-       u32 port_hdrqfull_poll;
-       /* total number of polled urgent packets */
-       u32 port_urgent;
-       /* saved total number of polled urgent packets for poll edge trigger */
-       u32 port_urgent_poll;
-       /* pid of process using this port */
-       struct pid *port_pid;
-       struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
-       /* same size as task_struct .comm[] */
-       char port_comm[TASK_COMM_LEN];
-       /* pkeys set by this use of this port */
-       u16 port_pkeys[4];
-       /* so file ops can get at unit */
-       struct ipath_devdata *port_dd;
-       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
-       void *subport_uregbase;
-       /* An array of pages for the eager receive buffers * N */
-       void *subport_rcvegrbuf;
-       /* An array of pages for the eager header queue entries * N */
-       void *subport_rcvhdr_base;
-       /* The version of the library which opened this port */
-       u32 userversion;
-       /* Bitmask of active slaves */
-       u32 active_slaves;
-       /* Type of packets or conditions we want to poll for */
-       u16 poll_type;
-       /* port rcvhdrq head offset */
-       u32 port_head;
-       /* receive packet sequence counter */
-       u32 port_seq_cnt;
-};
-
-struct sk_buff;
-struct ipath_sge_state;
-struct ipath_verbs_txreq;
-
-/*
- * control information for layered drivers
- */
-struct _ipath_layer {
-       void *l_arg;
-};
-
-struct ipath_skbinfo {
-       struct sk_buff *skb;
-       dma_addr_t phys;
-};
-
-struct ipath_sdma_txreq {
-       int                 flags;
-       int                 sg_count;
-       union {
-               struct scatterlist *sg;
-               void *map_addr;
-       };
-       void              (*callback)(void *, int);
-       void               *callback_cookie;
-       int                 callback_status;
-       u16                 start_idx;  /* sdma private */
-       u16                 next_descq_idx;  /* sdma private */
-       struct list_head    list;       /* sdma private */
-};
-
-struct ipath_sdma_desc {
-       __le64 qw[2];
-};
-
-#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
-#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
-#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
-#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
-#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
-#define IPATH_SDMA_TXREQ_F_VL15         0x20
-
-#define IPATH_SDMA_TXREQ_S_OK        0
-#define IPATH_SDMA_TXREQ_S_SENDERROR 1
-#define IPATH_SDMA_TXREQ_S_ABORTED   2
-#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
-
-#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG    (1ull << 63)
-#define IPATH_SDMA_STATUS_ABORT_IN_PROG                        (1ull << 62)
-#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE         (1ull << 61)
-#define IPATH_SDMA_STATUS_SCB_EMPTY                    (1ull << 30)
-
-/* max dwords in small buffer packet */
-#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
-
-/*
- * Possible IB config parameters for ipath_f_get/set_ib_cfg()
- */
-#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
-#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
-#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
-#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
-#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
-#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
-#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
-#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
-#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
-#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
-#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */
-
-
-struct ipath_devdata {
-       struct list_head ipath_list;
-
-       struct ipath_kregs const *ipath_kregs;
-       struct ipath_cregs const *ipath_cregs;
-
-       /* mem-mapped pointer to base of chip regs */
-       u64 __iomem *ipath_kregbase;
-       /* end of mem-mapped chip space; range checking */
-       u64 __iomem *ipath_kregend;
-       /* physical address of chip for io_remap, etc. */
-       unsigned long ipath_physaddr;
-       /* base of memory alloced for ipath_kregbase, for free */
-       u64 *ipath_kregalloc;
-       /* ipath_cfgports pointers */
-       struct ipath_portdata **ipath_pd;
-       /* sk_buffs used by port 0 eager receive queue */
-       struct ipath_skbinfo *ipath_port0_skbinfo;
-       /* kvirt address of 1st 2k pio buffer */
-       void __iomem *ipath_pio2kbase;
-       /* kvirt address of 1st 4k pio buffer */
-       void __iomem *ipath_pio4kbase;
-       /*
-        * points to area where PIOavail registers will be DMA'ed.
-        * Has to be on a page of it's own, because the page will be
-        * mapped into user program space.  This copy is *ONLY* ever
-        * written by DMA, not by the driver!  Need a copy per device
-        * when we get to multiple devices
-        */
-       volatile __le64 *ipath_pioavailregs_dma;
-       /* physical address where updates occur */
-       dma_addr_t ipath_pioavailregs_phys;
-       struct _ipath_layer ipath_layer;
-       /* setup intr */
-       int (*ipath_f_intrsetup)(struct ipath_devdata *);
-       /* fallback to alternate interrupt type if possible */
-       int (*ipath_f_intr_fallback)(struct ipath_devdata *);
-       /* setup on-chip bus config */
-       int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
-       /* hard reset chip */
-       int (*ipath_f_reset)(struct ipath_devdata *);
-       int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
-                                    size_t);
-       void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
-       void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
-                                       size_t);
-       void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
-       int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
-       int (*ipath_f_early_init)(struct ipath_devdata *);
-       void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
-       void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
-                               u32, unsigned long);
-       void (*ipath_f_tidtemplate)(struct ipath_devdata *);
-       void (*ipath_f_cleanup)(struct ipath_devdata *);
-       void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
-       /* fill out chip-specific fields */
-       int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
-       /* free irq */
-       void (*ipath_f_free_irq)(struct ipath_devdata *);
-       struct ipath_message_header *(*ipath_f_get_msgheader)
-                                       (struct ipath_devdata *, __le32 *);
-       void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
-       int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
-       int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
-       void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
-       void (*ipath_f_read_counters)(struct ipath_devdata *,
-                                       struct infinipath_counters *);
-       void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
-       /* per chip actions needed for IB Link up/down changes */
-       int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
-
-       unsigned ipath_lastegr_idx;
-       struct ipath_ibdev *verbs_dev;
-       struct timer_list verbs_timer;
-       /* total dwords sent (summed from counter) */
-       u64 ipath_sword;
-       /* total dwords rcvd (summed from counter) */
-       u64 ipath_rword;
-       /* total packets sent (summed from counter) */
-       u64 ipath_spkts;
-       /* total packets rcvd (summed from counter) */
-       u64 ipath_rpkts;
-       /* ipath_statusp initially points to this. */
-       u64 _ipath_status;
-       /* GUID for this interface, in network order */
-       __be64 ipath_guid;
-       /*
-        * aggregrate of error bits reported since last cleared, for
-        * limiting of error reporting
-        */
-       ipath_err_t ipath_lasterror;
-       /*
-        * aggregrate of error bits reported since last cleared, for
-        * limiting of hwerror reporting
-        */
-       ipath_err_t ipath_lasthwerror;
-       /* errors masked because they occur too fast */
-       ipath_err_t ipath_maskederrs;
-       u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
-       /* these 5 fields are used to establish deltas for IB Symbol
-        * errors and linkrecovery errors. They can be reported on
-        * some chips during link negotiation prior to INIT, and with
-        * DDR when faking DDR negotiations with non-IBTA switches.
-        * The chip counters are adjusted at driver unload if there is
-        * a non-zero delta.
-        */
-       u64 ibdeltainprog;
-       u64 ibsymdelta;
-       u64 ibsymsnap;
-       u64 iblnkerrdelta;
-       u64 iblnkerrsnap;
-
-       /* time in jiffies at which to re-enable maskederrs */
-       unsigned long ipath_unmasktime;
-       /* count of egrfull errors, combined for all ports */
-       u64 ipath_last_tidfull;
-       /* for ipath_qcheck() */
-       u64 ipath_lastport0rcv_cnt;
-       /* template for writing TIDs  */
-       u64 ipath_tidtemplate;
-       /* value to write to free TIDs */
-       u64 ipath_tidinvalid;
-       /* IBA6120 rcv interrupt setup */
-       u64 ipath_rhdrhead_intr_off;
-
-       /* size of memory at ipath_kregbase */
-       u32 ipath_kregsize;
-       /* number of registers used for pioavail */
-       u32 ipath_pioavregs;
-       /* IPATH_POLL, etc. */
-       u32 ipath_flags;
-       /* ipath_flags driver is waiting for */
-       u32 ipath_state_wanted;
-       /* last buffer for user use, first buf for kernel use is this
-        * index. */
-       u32 ipath_lastport_piobuf;
-       /* is a stats timer active */
-       u32 ipath_stats_timer_active;
-       /* number of interrupts for this device -- saturates... */
-       u32 ipath_int_counter;
-       /* dwords sent read from counter */
-       u32 ipath_lastsword;
-       /* dwords received read from counter */
-       u32 ipath_lastrword;
-       /* sent packets read from counter */
-       u32 ipath_lastspkts;
-       /* received packets read from counter */
-       u32 ipath_lastrpkts;
-       /* pio bufs allocated per port */
-       u32 ipath_pbufsport;
-       /* if remainder on bufs/port, ports < extrabuf get 1 extra */
-       u32 ipath_ports_extrabuf;
-       u32 ipath_pioupd_thresh; /* update threshold, some chips */
-       /*
-        * number of ports configured as max; zero is set to number chip
-        * supports, less gives more pio bufs/port, etc.
-        */
-       u32 ipath_cfgports;
-       /* count of port 0 hdrqfull errors */
-       u32 ipath_p0_hdrqfull;
-       /* port 0 number of receive eager buffers */
-       u32 ipath_p0_rcvegrcnt;
-
-       /*
-        * index of last piobuffer we used.  Speeds up searching, by
-        * starting at this point.  Doesn't matter if multiple cpu's use and
-        * update, last updater is only write that matters.  Whenever it
-        * wraps, we update shadow copies.  Need a copy per device when we
-        * get to multiple devices
-        */
-       u32 ipath_lastpioindex;
-       u32 ipath_lastpioindexl;
-       /* max length of freezemsg */
-       u32 ipath_freezelen;
-       /*
-        * consecutive times we wanted a PIO buffer but were unable to
-        * get one
-        */
-       u32 ipath_consec_nopiobuf;
-       /*
-        * hint that we should update ipath_pioavailshadow before
-        * looking for a PIO buffer
-        */
-       u32 ipath_upd_pio_shadow;
-       /* so we can rewrite it after a chip reset */
-       u32 ipath_pcibar0;
-       /* so we can rewrite it after a chip reset */
-       u32 ipath_pcibar1;
-       u32 ipath_x1_fix_tries;
-       u32 ipath_autoneg_tries;
-       u32 serdes_first_init_done;
-
-       struct ipath_relock {
-               atomic_t ipath_relock_timer_active;
-               struct timer_list ipath_relock_timer;
-               unsigned int ipath_relock_interval; /* in jiffies */
-       } ipath_relock_singleton;
-
-       /* interrupt number */
-       int ipath_irq;
-       /* HT/PCI Vendor ID (here for NodeInfo) */
-       u16 ipath_vendorid;
-       /* HT/PCI Device ID (here for NodeInfo) */
-       u16 ipath_deviceid;
-       /* offset in HT config space of slave/primary interface block */
-       u8 ipath_ht_slave_off;
-       /* for write combining settings */
-       int wc_cookie;
-       /* ref count for each pkey */
-       atomic_t ipath_pkeyrefs[4];
-       /* shadow copy of struct page *'s for exp tid pages */
-       struct page **ipath_pageshadow;
-       /* shadow copy of dma handles for exp tid pages */
-       dma_addr_t *ipath_physshadow;
-       u64 __iomem *ipath_egrtidbase;
-       /* lock to workaround chip bug 9437 and others */
-       spinlock_t ipath_kernel_tid_lock;
-       spinlock_t ipath_user_tid_lock;
-       spinlock_t ipath_sendctrl_lock;
-       /* around ipath_pd and (user ports) port_cnt use (intr vs free) */
-       spinlock_t ipath_uctxt_lock;
-
-       /*
-        * IPATH_STATUS_*,
-        * this address is mapped readonly into user processes so they can
-        * get status cheaply, whenever they want.
-        */
-       u64 *ipath_statusp;
-       /* freeze msg if hw error put chip in freeze */
-       char *ipath_freezemsg;
-       /* pci access data structure */
-       struct pci_dev *pcidev;
-       struct cdev *user_cdev;
-       struct cdev *diag_cdev;
-       struct device *user_dev;
-       struct device *diag_dev;
-       /* timer used to prevent stats overflow, error throttling, etc. */
-       struct timer_list ipath_stats_timer;
-       /* timer to verify interrupts work, and fallback if possible */
-       struct timer_list ipath_intrchk_timer;
-       void *ipath_dummy_hdrq; /* used after port close */
-       dma_addr_t ipath_dummy_hdrq_phys;
-
-       /* SendDMA related entries */
-       spinlock_t            ipath_sdma_lock;
-       unsigned long         ipath_sdma_status;
-       unsigned long         ipath_sdma_abort_jiffies;
-       unsigned long         ipath_sdma_abort_intr_timeout;
-       unsigned long         ipath_sdma_buf_jiffies;
-       struct ipath_sdma_desc *ipath_sdma_descq;
-       u64                   ipath_sdma_descq_added;
-       u64                   ipath_sdma_descq_removed;
-       int                   ipath_sdma_desc_nreserved;
-       u16                   ipath_sdma_descq_cnt;
-       u16                   ipath_sdma_descq_tail;
-       u16                   ipath_sdma_descq_head;
-       u16                   ipath_sdma_next_intr;
-       u16                   ipath_sdma_reset_wait;
-       u8                    ipath_sdma_generation;
-       struct tasklet_struct ipath_sdma_abort_task;
-       struct tasklet_struct ipath_sdma_notify_task;
-       struct list_head      ipath_sdma_activelist;
-       struct list_head      ipath_sdma_notifylist;
-       atomic_t              ipath_sdma_vl15_count;
-       struct timer_list     ipath_sdma_vl15_timer;
-
-       dma_addr_t       ipath_sdma_descq_phys;
-       volatile __le64 *ipath_sdma_head_dma;
-       dma_addr_t       ipath_sdma_head_phys;
-
-       unsigned long ipath_ureg_align; /* user register alignment */
-
-       struct delayed_work ipath_autoneg_work;
-       wait_queue_head_t ipath_autoneg_wait;
-
-       /* HoL blocking / user app forward-progress state */
-       unsigned          ipath_hol_state;
-       unsigned          ipath_hol_next;
-       struct timer_list ipath_hol_timer;
-
-       /*
-        * Shadow copies of registers; size indicates read access size.
-        * Most of them are readonly, but some are write-only register,
-        * where we manipulate the bits in the shadow copy, and then write
-        * the shadow copy to infinipath.
-        *
-        * We deliberately make most of these 32 bits, since they have
-        * restricted range.  For any that we read, we won't to generate 32
-        * bit accesses, since Opteron will generate 2 separate 32 bit HT
-        * transactions for a 64 bit read, and we want to avoid unnecessary
-        * HT transactions.
-        */
-
-       /* This is the 64 bit group */
-
-       /*
-        * shadow of pioavail, check to be sure it's large enough at
-        * init time.
-        */
-       unsigned long ipath_pioavailshadow[8];
-       /* bitmap of send buffers available for the kernel to use with PIO. */
-       unsigned long ipath_pioavailkernel[8];
-       /* shadow of kr_gpio_out, for rmw ops */
-       u64 ipath_gpio_out;
-       /* shadow the gpio mask register */
-       u64 ipath_gpio_mask;
-       /* shadow the gpio output enable, etc... */
-       u64 ipath_extctrl;
-       /* kr_revision shadow */
-       u64 ipath_revision;
-       /*
-        * shadow of ibcctrl, for interrupt handling of link changes,
-        * etc.
-        */
-       u64 ipath_ibcctrl;
-       /*
-        * last ibcstatus, to suppress "duplicate" status change messages,
-        * mostly from 2 to 3
-        */
-       u64 ipath_lastibcstat;
-       /* hwerrmask shadow */
-       ipath_err_t ipath_hwerrmask;
-       ipath_err_t ipath_errormask; /* errormask shadow */
-       /* interrupt config reg shadow */
-       u64 ipath_intconfig;
-       /* kr_sendpiobufbase value */
-       u64 ipath_piobufbase;
-       /* kr_ibcddrctrl shadow */
-       u64 ipath_ibcddrctrl;
-
-       /* these are the "32 bit" regs */
-
-       /*
-        * number of GUIDs in the flash for this interface; may need some
-        * rethinking for setting on other ifaces
-        */
-       u32 ipath_nguid;
-       /*
-        * the following two are 32-bit bitmasks, but {test,clear,set}_bit
-        * all expect bit fields to be "unsigned long"
-        */
-       /* shadow kr_rcvctrl */
-       unsigned long ipath_rcvctrl;
-       /* shadow kr_sendctrl */
-       unsigned long ipath_sendctrl;
-       /* to not count armlaunch after cancel */
-       unsigned long ipath_lastcancel;
-       /* count cases where special trigger was needed (double write) */
-       unsigned long ipath_spectriggerhit;
-
-       /* value we put in kr_rcvhdrcnt */
-       u32 ipath_rcvhdrcnt;
-       /* value we put in kr_rcvhdrsize */
-       u32 ipath_rcvhdrsize;
-       /* value we put in kr_rcvhdrentsize */
-       u32 ipath_rcvhdrentsize;
-       /* offset of last entry in rcvhdrq */
-       u32 ipath_hdrqlast;
-       /* kr_portcnt value */
-       u32 ipath_portcnt;
-       /* kr_pagealign value */
-       u32 ipath_palign;
-       /* number of "2KB" PIO buffers */
-       u32 ipath_piobcnt2k;
-       /* size in bytes of "2KB" PIO buffers */
-       u32 ipath_piosize2k;
-       /* number of "4KB" PIO buffers */
-       u32 ipath_piobcnt4k;
-       /* size in bytes of "4KB" PIO buffers */
-       u32 ipath_piosize4k;
-       u32 ipath_pioreserved; /* reserved special-inkernel; */
-       /* kr_rcvegrbase value */
-       u32 ipath_rcvegrbase;
-       /* kr_rcvegrcnt value */
-       u32 ipath_rcvegrcnt;
-       /* kr_rcvtidbase value */
-       u32 ipath_rcvtidbase;
-       /* kr_rcvtidcnt value */
-       u32 ipath_rcvtidcnt;
-       /* kr_sendregbase */
-       u32 ipath_sregbase;
-       /* kr_userregbase */
-       u32 ipath_uregbase;
-       /* kr_counterregbase */
-       u32 ipath_cregbase;
-       /* shadow the control register contents */
-       u32 ipath_control;
-       /* PCI revision register (HTC rev on FPGA) */
-       u32 ipath_pcirev;
-
-       /* chip address space used by 4k pio buffers */
-       u32 ipath_4kalign;
-       /* The MTU programmed for this unit */
-       u32 ipath_ibmtu;
-       /*
-        * The max size IB packet, included IB headers that we can send.
-        * Starts same as ipath_piosize, but is affected when ibmtu is
-        * changed, or by size of eager buffers
-        */
-       u32 ipath_ibmaxlen;
-       /*
-        * ibmaxlen at init time, limited by chip and by receive buffer
-        * size.  Not changed after init.
-        */
-       u32 ipath_init_ibmaxlen;
-       /* size of each rcvegrbuffer */
-       u32 ipath_rcvegrbufsize;
-       /* localbus width (1, 2,4,8,16,32) from config space  */
-       u32 ipath_lbus_width;
-       /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
-       u32 ipath_lbus_speed;
-       /*
-        * number of sequential ibcstatus change for polling active/quiet
-        * (i.e., link not coming up).
-        */
-       u32 ipath_ibpollcnt;
-       /* low and high portions of MSI capability/vector */
-       u32 ipath_msi_lo;
-       /* saved after PCIe init for restore after reset */
-       u32 ipath_msi_hi;
-       /* MSI data (vector) saved for restore */
-       u16 ipath_msi_data;
-       /* MLID programmed for this instance */
-       u16 ipath_mlid;
-       /* LID programmed for this instance */
-       u16 ipath_lid;
-       /* list of pkeys programmed; 0 if not set */
-       u16 ipath_pkeys[4];
-       /*
-        * ASCII serial number, from flash, large enough for original
-        * all digit strings, and longer QLogic serial number format
-        */
-       u8 ipath_serial[16];
-       /* human readable board version */
-       u8 ipath_boardversion[96];
-       u8 ipath_lbus_info[32]; /* human readable localbus info */
-       /* chip major rev, from ipath_revision */
-       u8 ipath_majrev;
-       /* chip minor rev, from ipath_revision */
-       u8 ipath_minrev;
-       /* board rev, from ipath_revision */
-       u8 ipath_boardrev;
-       /* saved for restore after reset */
-       u8 ipath_pci_cacheline;
-       /* LID mask control */
-       u8 ipath_lmc;
-       /* link width supported */
-       u8 ipath_link_width_supported;
-       /* link speed supported */
-       u8 ipath_link_speed_supported;
-       u8 ipath_link_width_enabled;
-       u8 ipath_link_speed_enabled;
-       u8 ipath_link_width_active;
-       u8 ipath_link_speed_active;
-       /* Rx Polarity inversion (compensate for ~tx on partner) */
-       u8 ipath_rx_pol_inv;
-
-       u8 ipath_r_portenable_shift;
-       u8 ipath_r_intravail_shift;
-       u8 ipath_r_tailupd_shift;
-       u8 ipath_r_portcfg_shift;
-
-       /* unit # of this chip, if present */
-       int ipath_unit;
-
-       /* local link integrity counter */
-       u32 ipath_lli_counter;
-       /* local link integrity errors */
-       u32 ipath_lli_errors;
-       /*
-        * Above counts only cases where _successive_ LocalLinkIntegrity
-        * errors were seen in the receive headers of kern-packets.
-        * Below are the three (monotonically increasing) counters
-        * maintained via GPIO interrupts on iba6120-rev2.
-        */
-       u32 ipath_rxfc_unsupvl_errs;
-       u32 ipath_overrun_thresh_errs;
-       u32 ipath_lli_errs;
-
-       /*
-        * Not all devices managed by a driver instance are the same
-        * type, so these fields must be per-device.
-        */
-       u64 ipath_i_bitsextant;
-       ipath_err_t ipath_e_bitsextant;
-       ipath_err_t ipath_hwe_bitsextant;
-
-       /*
-        * Below should be computable from number of ports,
-        * since they are never modified.
-        */
-       u64 ipath_i_rcvavail_mask;
-       u64 ipath_i_rcvurg_mask;
-       u16 ipath_i_rcvurg_shift;
-       u16 ipath_i_rcvavail_shift;
-
-       /*
-        * Register bits for selecting i2c direction and values, used for
-        * I2C serial flash.
-        */
-       u8 ipath_gpio_sda_num;
-       u8 ipath_gpio_scl_num;
-       u8 ipath_i2c_chain_type;
-       u64 ipath_gpio_sda;
-       u64 ipath_gpio_scl;
-
-       /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
-       spinlock_t ipath_gpio_lock;
-
-       /*
-        * IB link and linktraining states and masks that vary per chip in
-        * some way.  Set at init, to avoid each IB status change interrupt
-        */
-       u8 ibcs_ls_shift;
-       u8 ibcs_lts_mask;
-       u32 ibcs_mask;
-       u32 ib_init;
-       u32 ib_arm;
-       u32 ib_active;
-
-       u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
-
-       /*
-        * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol
-        * reg. Changes for IBA7220
-        */
-       u8 ibcc_lic_mask; /* LinkInitCmd */
-       u8 ibcc_lc_shift; /* LinkCmd */
-       u8 ibcc_mpl_shift; /* Maxpktlen */
-
-       u8 delay_mult;
-
-       /* used to override LED behavior */
-       u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
-       u16 ipath_led_override_timeoff; /* delta to next timer event */
-       u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
-       u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
-       atomic_t ipath_led_override_timer_active;
-       /* Used to flash LEDs in override mode */
-       struct timer_list ipath_led_override_timer;
-
-       /* Support (including locks) for EEPROM logging of errors and time */
-       /* control access to actual counters, timer */
-       spinlock_t ipath_eep_st_lock;
-       /* control high-level access to EEPROM */
-       struct mutex ipath_eep_lock;
-       /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
-       uint64_t ipath_traffic_wds;
-       /* active time is kept in seconds, but logged in hours */
-       atomic_t ipath_active_time;
-       /* Below are nominal shadow of EEPROM, new since last EEPROM update */
-       uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
-       uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
-       uint16_t ipath_eep_hrs;
-       /*
-        * masks for which bits of errs, hwerrs that cause
-        * each of the counters to increment.
-        */
-       struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
-
-       /* interrupt mitigation reload register info */
-       u16 ipath_jint_idle_ticks;      /* idle clock ticks */
-       u16 ipath_jint_max_packets;     /* max packets across all ports */
-
-       /*
-        * lock for access to SerDes, and flags to sequence preset
-        * versus steady-state. 7220-only at the moment.
-        */
-       spinlock_t ipath_sdepb_lock;
-       u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
-};
-
-/* ipath_hol_state values (stopping/starting user proc, send flushing) */
-#define IPATH_HOL_UP       0
-#define IPATH_HOL_DOWN     1
-/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
-#define IPATH_HOL_DOWNSTOP 0
-#define IPATH_HOL_DOWNCONT 1
-
-/* bit positions for sdma_status */
-#define IPATH_SDMA_ABORTING  0
-#define IPATH_SDMA_DISARMED  1
-#define IPATH_SDMA_DISABLED  2
-#define IPATH_SDMA_LAYERBUF  3
-#define IPATH_SDMA_RUNNING  30
-#define IPATH_SDMA_SHUTDOWN 31
-
-/* bit combinations that correspond to abort states */
-#define IPATH_SDMA_ABORT_NONE 0
-#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
-#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED))
-#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-
-#define IPATH_SDMA_BUF_NONE 0
-#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
-
-/* Private data for file operations */
-struct ipath_filedata {
-       struct ipath_portdata *pd;
-       unsigned subport;
-       unsigned tidcursor;
-       struct ipath_user_sdma_queue *pq;
-};
-extern struct list_head ipath_dev_list;
-extern spinlock_t ipath_devs_lock;
-extern struct ipath_devdata *ipath_lookup(int unit);
-
-int ipath_init_chip(struct ipath_devdata *, int);
-int ipath_enable_wc(struct ipath_devdata *dd);
-void ipath_disable_wc(struct ipath_devdata *dd);
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
-void ipath_shutdown_device(struct ipath_devdata *);
-void ipath_clear_freeze(struct ipath_devdata *);
-
-struct file_operations;
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-                   struct cdev **cdevp, struct device **devp);
-void ipath_cdev_cleanup(struct cdev **cdevp,
-                       struct device **devp);
-
-int ipath_diag_add(struct ipath_devdata *);
-void ipath_diag_remove(struct ipath_devdata *);
-
-extern wait_queue_head_t ipath_state_wait;
-
-int ipath_user_add(struct ipath_devdata *dd);
-void ipath_user_remove(struct ipath_devdata *dd);
-
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
-
-extern int ipath_diag_inuse;
-
-irqreturn_t ipath_intr(int irq, void *devid);
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-                    ipath_err_t err);
-#if __IPATH_INFO || __IPATH_DBG
-extern const char *ipath_ibcstatus_str[];
-#endif
-
-/* clean up any per-chip chip-specific stuff */
-void ipath_chip_cleanup(struct ipath_devdata *);
-/* clean up any chip type-specific stuff */
-void ipath_chip_done(void);
-
-void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
-                         unsigned cnt);
-void ipath_cancel_sends(struct ipath_devdata *, int);
-
-int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
-void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
-
-int ipath_parse_ushort(const char *str, unsigned short *valp);
-
-void ipath_kreceive(struct ipath_portdata *);
-int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
-int ipath_reset_device(int);
-void ipath_get_faststats(unsigned long);
-int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
-int ipath_set_linkstate(struct ipath_devdata *, u8);
-int ipath_set_mtu(struct ipath_devdata *, u16);
-int ipath_set_lid(struct ipath_devdata *, u32, u8);
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
-void ipath_enable_armlaunch(struct ipath_devdata *);
-void ipath_disable_armlaunch(struct ipath_devdata *);
-void ipath_hol_down(struct ipath_devdata *);
-void ipath_hol_up(struct ipath_devdata *);
-void ipath_hol_event(unsigned long);
-void ipath_toggle_rclkrls(struct ipath_devdata *);
-void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
-void ipath_set_relock_poll(struct ipath_devdata *, int);
-void ipath_shutdown_relock_poll(struct ipath_devdata *);
-
-/* for use in system calls, where we want to know device type, etc. */
-#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
-#define subport_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->subport
-#define tidcursor_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->tidcursor
-#define user_sdma_queue_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->pq
-
-/*
- * values for ipath_flags
- */
-               /* chip can report link latency (IB 1.2) */
-#define IPATH_HAS_LINK_LATENCY 0x1
-               /* The chip is up and initted */
-#define IPATH_INITTED       0x2
-               /* set if any user code has set kr_rcvhdrsize */
-#define IPATH_RCVHDRSZ_SET  0x4
-               /* The chip is present and valid for accesses */
-#define IPATH_PRESENT       0x8
-               /* HT link0 is only 8 bits wide, ignore upper byte crc
-                * errors, etc. */
-#define IPATH_8BIT_IN_HT0   0x10
-               /* HT link1 is only 8 bits wide, ignore upper byte crc
-                * errors, etc. */
-#define IPATH_8BIT_IN_HT1   0x20
-               /* The link is down */
-#define IPATH_LINKDOWN      0x40
-               /* The link level is up (0x11) */
-#define IPATH_LINKINIT      0x80
-               /* The link is in the armed (0x21) state */
-#define IPATH_LINKARMED     0x100
-               /* The link is in the active (0x31) state */
-#define IPATH_LINKACTIVE    0x200
-               /* link current state is unknown */
-#define IPATH_LINKUNK       0x400
-               /* Write combining flush needed for PIO */
-#define IPATH_PIO_FLUSH_WC  0x1000
-               /* DMA Receive tail pointer */
-#define IPATH_NODMA_RTAIL   0x2000
-               /* no IB cable, or no device on IB cable */
-#define IPATH_NOCABLE       0x4000
-               /* Supports port zero per packet receive interrupts via
-                * GPIO */
-#define IPATH_GPIO_INTR     0x8000
-               /* uses the coded 4byte TID, not 8 byte */
-#define IPATH_4BYTE_TID     0x10000
-               /* packet/word counters are 32 bit, else those 4 counters
-                * are 64bit */
-#define IPATH_32BITCOUNTERS 0x20000
-               /* Interrupt register is 64 bits */
-#define IPATH_INTREG_64     0x40000
-               /* can miss port0 rx interrupts */
-#define IPATH_DISABLED      0x80000 /* administratively disabled */
-               /* Use GPIO interrupts for new counters */
-#define IPATH_GPIO_ERRINTRS 0x100000
-#define IPATH_SWAP_PIOBUFS  0x200000
-               /* Supports Send DMA */
-#define IPATH_HAS_SEND_DMA  0x400000
-               /* Supports Send Count (not just word count) in PBC */
-#define IPATH_HAS_PBC_CNT   0x800000
-               /* Suppress heartbeat, even if turning off loopback */
-#define IPATH_NO_HRTBT      0x1000000
-#define IPATH_HAS_THRESH_UPDATE 0x4000000
-#define IPATH_HAS_MULT_IB_SPEED 0x8000000
-#define IPATH_IB_AUTONEG_INPROG 0x10000000
-#define IPATH_IB_AUTONEG_FAILED 0x20000000
-               /* Linkdown-disable intentionally, Do not attempt to bring up */
-#define IPATH_IB_LINK_DISABLED 0x40000000
-#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
-
-/* Bits in GPIO for the added interrupts */
-#define IPATH_GPIO_PORT0_BIT 2
-#define IPATH_GPIO_RXUVL_BIT 3
-#define IPATH_GPIO_OVRUN_BIT 4
-#define IPATH_GPIO_LLI_BIT 5
-#define IPATH_GPIO_ERRINTR_MASK 0x38
-
-/* portdata flag bit offsets */
-               /* waiting for a packet to arrive */
-#define IPATH_PORT_WAITING_RCV   2
-               /* master has not finished initializing */
-#define IPATH_PORT_MASTER_UNINIT 4
-               /* waiting for an urgent packet to arrive */
-#define IPATH_PORT_WAITING_URG 5
-
-/* free up any allocated data at closes */
-void ipath_free_data(struct ipath_portdata *dd);
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-                               unsigned len, int avail);
-void ipath_init_iba6110_funcs(struct ipath_devdata *);
-void ipath_get_eeprom_info(struct ipath_devdata *);
-int ipath_update_eeprom_log(struct ipath_devdata *dd);
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
-u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
-void ipath_disarm_senderrbufs(struct ipath_devdata *);
-void ipath_force_pio_avail_update(struct ipath_devdata *);
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
-
-/*
- * Set LED override, only the two LSBs have "public" meaning, but
- * any non-zero value substitutes them for the Link and LinkTrain
- * LED states.
- */
-#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
-#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
-
-/* send dma routines */
-int setup_sdma(struct ipath_devdata *);
-void teardown_sdma(struct ipath_devdata *);
-void ipath_restart_sdma(struct ipath_devdata *);
-void ipath_sdma_intr(struct ipath_devdata *);
-int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
-                         u32, struct ipath_verbs_txreq *);
-/* ipath_sdma_lock should be locked before calling this. */
-int ipath_sdma_make_progress(struct ipath_devdata *dd);
-
-/* must be called under ipath_sdma_lock */
-static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
-{
-       return dd->ipath_sdma_descq_cnt -
-               (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
-               1 - dd->ipath_sdma_desc_nreserved;
-}
-
-static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
-{
-       dd->ipath_sdma_desc_nreserved += cnt;
-}
-
-static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
-{
-       dd->ipath_sdma_desc_nreserved -= cnt;
-}
-
-/*
- * number of words used for protocol header if not set by ipath_userinit();
- */
-#define IPATH_DFLT_RCVHDRSIZE 9
-
-int ipath_get_user_pages(unsigned long, size_t, struct page **);
-void ipath_release_user_pages(struct page **, size_t);
-void ipath_release_user_pages_on_close(struct page **, size_t);
-int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
-int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
-int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
-int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
-
-/* these are used for the registers that vary with port */
-void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
-                          unsigned, u64);
-
-/*
- * We could have a single register get/put routine, that takes a group type,
- * but this is somewhat clearer and cleaner.  It also gives us some error
- * checking.  64 bit register reads should always work, but are inefficient
- * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
- * so we use kreg32 wherever possible.  User register and counter register
- * reads are always 32 bit reads, so only one form of those routines.
- */
-
-/*
- * At the moment, none of the s-registers are writable, so no
- * ipath_write_sreg().
- */
-
-/**
- * ipath_read_ureg32 - read 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @port: port number
- *
- * Return the contents of a register that is virtualized to be per port.
- * Returns -1 on errors (not distinguishable from valid contents at
- * runtime; we may add a separate error variable at some point).
- */
-static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
-                                   ipath_ureg regno, int port)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-
-       return readl(regno + (u64 __iomem *)
-                    (dd->ipath_uregbase +
-                     (char __iomem *)dd->ipath_kregbase +
-                     dd->ipath_ureg_align * port));
-}
-
-/**
- * ipath_write_ureg - write 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @value: value
- * @port: port
- *
- * Write the contents of a register that is virtualized to be per port.
- */
-static inline void ipath_write_ureg(const struct ipath_devdata *dd,
-                                   ipath_ureg regno, u64 value, int port)
-{
-       u64 __iomem *ubase = (u64 __iomem *)
-               (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
-                dd->ipath_ureg_align * port);
-       if (dd->ipath_kregbase)
-               writeq(value, &ubase[regno]);
-}
-
-static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
-                                   ipath_kreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return -1;
-       return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
-                                   ipath_kreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return -1;
-
-       return readq(&dd->ipath_kregbase[regno]);
-}
-
-static inline void ipath_write_kreg(const struct ipath_devdata *dd,
-                                   ipath_kreg regno, u64 value)
-{
-       if (dd->ipath_kregbase)
-               writeq(value, &dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
-                                 ipath_sreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-
-       return readq(regno + (u64 __iomem *)
-                    (dd->ipath_cregbase +
-                     (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
-                                        ipath_sreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-       return readl(regno + (u64 __iomem *)
-                    (dd->ipath_cregbase +
-                     (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline void ipath_write_creg(const struct ipath_devdata *dd,
-                                   ipath_creg regno, u64 value)
-{
-       if (dd->ipath_kregbase)
-               writeq(value, regno + (u64 __iomem *)
-                      (dd->ipath_cregbase +
-                       (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
-{
-       *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
-}
-
-static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
-{
-       return (u32) le64_to_cpu(*((volatile __le64 *)
-                               pd->port_rcvhdrtail_kvaddr));
-}
-
-static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
-{
-       const struct ipath_devdata *dd = pd->port_dd;
-       u32 hdrqtail;
-
-       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-               __le32 *rhf_addr;
-               u32 seq;
-
-               rhf_addr = (__le32 *) pd->port_rcvhdrq +
-                       pd->port_head + dd->ipath_rhf_offset;
-               seq = ipath_hdrget_seq(rhf_addr);
-               hdrqtail = pd->port_head;
-               if (seq == pd->port_seq_cnt)
-                       hdrqtail++;
-       } else
-               hdrqtail = ipath_get_rcvhdrtail(pd);
-
-       return hdrqtail;
-}
-
-static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
-{
-       return (dd->ipath_flags & IPATH_INTREG_64) ?
-               ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return linkstate
- * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
- * everywhere, anyway (and should be, for almost all purposes).
- */
-static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
-               INFINIPATH_IBCS_LINKSTATE_MASK;
-       if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
-               state = INFINIPATH_IBCS_L_STATE_ACTIVE;
-       return state;
-}
-
-/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
-static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-               dd->ibcs_lts_mask;
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return logical link state
- * combination of link state and linktraining state (down, active, init,
- * arm, etc.
- */
-static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
-{
-       u32 ibs;
-       ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-               dd->ibcs_lts_mask;
-       ibs |= (u32)(ibcs &
-               (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
-       return ibs;
-}
-
-/*
- * sysfs interface.
- */
-
-struct device_driver;
-
-extern const char ib_ipath_version[];
-
-extern const struct attribute_group *ipath_driver_attr_groups[];
-
-int ipath_device_create_group(struct device *, struct ipath_devdata *);
-void ipath_device_remove_group(struct device *, struct ipath_devdata *);
-int ipath_expose_reset(struct device *);
-
-int ipath_init_ipathfs(void);
-void ipath_exit_ipathfs(void);
-int ipathfs_add_device(struct ipath_devdata *);
-int ipathfs_remove_device(struct ipath_devdata *);
-
-/*
- * dma_addr wrappers - all 0's invalid for hw
- */
-dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
-                         size_t, int);
-dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
-const char *ipath_get_unit_name(int unit);
-
-/*
- * Flush write combining store buffers (if present) and perform a write
- * barrier.
- */
-#if defined(CONFIG_X86_64)
-#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
-#else
-#define ipath_flush_wc() wmb()
-#endif
-
-extern unsigned ipath_debug; /* debugging bit mask */
-extern unsigned ipath_linkrecovery;
-extern unsigned ipath_mtu4096;
-extern struct mutex ipath_mutex;
-
-#define IPATH_DRV_NAME         "ib_ipath"
-#define IPATH_MAJOR            233
-#define IPATH_USER_MINOR_BASE  0
-#define IPATH_DIAGPKT_MINOR    127
-#define IPATH_DIAG_MINOR_BASE  129
-#define IPATH_NMINORS          255
-
-#define ipath_dev_err(dd,fmt,...) \
-       do { \
-               const struct ipath_devdata *__dd = (dd); \
-               if (__dd->pcidev) \
-                       dev_err(&__dd->pcidev->dev, "%s: " fmt, \
-                               ipath_get_unit_name(__dd->ipath_unit), \
-                               ##__VA_ARGS__); \
-               else \
-                       printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
-                              ipath_get_unit_name(__dd->ipath_unit), \
-                              ##__VA_ARGS__); \
-       } while (0)
-
-#if _IPATH_DEBUGGING
-
-# define __IPATH_DBG_WHICH(which,fmt,...) \
-       do { \
-               if (unlikely(ipath_debug & (which))) \
-                       printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
-                              __func__,##__VA_ARGS__); \
-       } while(0)
-
-# define ipath_dbg(fmt,...) \
-       __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
-# define ipath_cdbg(which,fmt,...) \
-       __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
-
-#else /* ! _IPATH_DEBUGGING */
-
-# define ipath_dbg(fmt,...)
-# define ipath_cdbg(which,fmt,...)
-
-#endif /* _IPATH_DEBUGGING */
-
-/*
- * this is used for formatting hw error messages...
- */
-struct ipath_hwerror_msgs {
-       u64 mask;
-       const char *msg;
-};
-
-#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
-
-/* in ipath_intr.c... */
-void ipath_format_hwerrors(u64 hwerrs,
-                          const struct ipath_hwerror_msgs *hwerrmsgs,
-                          size_t nhwerrmsgs,
-                          char *msg, size_t lmsg);
-
-#endif                         /* _IPATH_KERNEL_H */
diff --git a/drivers/staging/rdma/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c
deleted file mode 100644 (file)
index c0e933f..0000000
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <asm/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_alloc_lkey - allocate an lkey
- * @rkt: lkey table in which to allocate the lkey
- * @mr: memory region that this lkey protects
- *
- * Returns 1 if successful, otherwise returns 0.
- */
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
-{
-       unsigned long flags;
-       u32 r;
-       u32 n;
-       int ret;
-
-       spin_lock_irqsave(&rkt->lock, flags);
-
-       /* Find the next available LKEY */
-       r = n = rkt->next;
-       for (;;) {
-               if (rkt->table[r] == NULL)
-                       break;
-               r = (r + 1) & (rkt->max - 1);
-               if (r == n) {
-                       spin_unlock_irqrestore(&rkt->lock, flags);
-                       ipath_dbg("LKEY table full\n");
-                       ret = 0;
-                       goto bail;
-               }
-       }
-       rkt->next = (r + 1) & (rkt->max - 1);
-       /*
-        * Make sure lkey is never zero which is reserved to indicate an
-        * unrestricted LKEY.
-        */
-       rkt->gen++;
-       mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
-               ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
-                << 8);
-       if (mr->lkey == 0) {
-               mr->lkey |= 1 << 8;
-               rkt->gen++;
-       }
-       rkt->table[r] = mr;
-       spin_unlock_irqrestore(&rkt->lock, flags);
-
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_free_lkey - free an lkey
- * @rkt: table from which to free the lkey
- * @lkey: lkey id to free
- */
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
-{
-       unsigned long flags;
-       u32 r;
-
-       if (lkey == 0)
-               return;
-       r = lkey >> (32 - ib_ipath_lkey_table_size);
-       spin_lock_irqsave(&rkt->lock, flags);
-       rkt->table[r] = NULL;
-       spin_unlock_irqrestore(&rkt->lock, flags);
-}
-
-/**
- * ipath_lkey_ok - check IB SGE for validity and initialize
- * @rkt: table containing lkey to check SGE against
- * @isge: outgoing internal SGE
- * @sge: SGE to check
- * @acc: access flags
- *
- * Return 1 if valid and successful, otherwise returns 0.
- *
- * Check the IB SGE for validity and initialize our internal version
- * of it.
- */
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-                 struct ib_sge *sge, int acc)
-{
-       struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
-       struct ipath_mregion *mr;
-       unsigned n, m;
-       size_t off;
-       int ret;
-
-       /*
-        * We use LKEY == zero for kernel virtual addresses
-        * (see ipath_get_dma_mr and ipath_dma.c).
-        */
-       if (sge->lkey == 0) {
-               /* always a kernel port, no locking needed */
-               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-               if (pd->user) {
-                       ret = 0;
-                       goto bail;
-               }
-               isge->mr = NULL;
-               isge->vaddr = (void *) sge->addr;
-               isge->length = sge->length;
-               isge->sge_length = sge->length;
-               ret = 1;
-               goto bail;
-       }
-       mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
-                    qp->ibqp.pd != mr->pd)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off = sge->addr - mr->user_base;
-       if (unlikely(sge->addr < mr->user_base ||
-                    off + sge->length > mr->length ||
-                    (mr->access_flags & acc) != acc)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off += mr->offset;
-       m = 0;
-       n = 0;
-       while (off >= mr->map[m]->segs[n].length) {
-               off -= mr->map[m]->segs[n].length;
-               n++;
-               if (n >= IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       isge->mr = mr;
-       isge->vaddr = mr->map[m]->segs[n].vaddr + off;
-       isge->length = mr->map[m]->segs[n].length - off;
-       isge->sge_length = sge->length;
-       isge->m = m;
-       isge->n = n;
-
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_rkey_ok - check the IB virtual address, length, and RKEY
- * @dev: infiniband device
- * @ss: SGE state
- * @len: length of data
- * @vaddr: virtual address to place data
- * @rkey: rkey to check
- * @acc: access flags
- *
- * Return 1 if successful, otherwise 0.
- */
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-                 u32 len, u64 vaddr, u32 rkey, int acc)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_lkey_table *rkt = &dev->lk_table;
-       struct ipath_sge *sge = &ss->sge;
-       struct ipath_mregion *mr;
-       unsigned n, m;
-       size_t off;
-       int ret;
-
-       /*
-        * We use RKEY == zero for kernel virtual addresses
-        * (see ipath_get_dma_mr and ipath_dma.c).
-        */
-       if (rkey == 0) {
-               /* always a kernel port, no locking needed */
-               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-               if (pd->user) {
-                       ret = 0;
-                       goto bail;
-               }
-               sge->mr = NULL;
-               sge->vaddr = (void *) vaddr;
-               sge->length = len;
-               sge->sge_length = len;
-               ss->sg_list = NULL;
-               ss->num_sge = 1;
-               ret = 1;
-               goto bail;
-       }
-
-       mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != rkey ||
-                    qp->ibqp.pd != mr->pd)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off = vaddr - mr->iova;
-       if (unlikely(vaddr < mr->iova || off + len > mr->length ||
-                    (mr->access_flags & acc) == 0)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off += mr->offset;
-       m = 0;
-       n = 0;
-       while (off >= mr->map[m]->segs[n].length) {
-               off -= mr->map[m]->segs[n].length;
-               n++;
-               if (n >= IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       sge->mr = mr;
-       sge->vaddr = mr->map[m]->segs[n].vaddr + off;
-       sge->length = mr->map[m]->segs[n].length - off;
-       sge->sge_length = len;
-       sge->m = m;
-       sge->n = n;
-       ss->sg_list = NULL;
-       ss->num_sge = 1;
-
-       ret = 1;
-
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c
deleted file mode 100644 (file)
index ad3a926..0000000
+++ /dev/null
@@ -1,1521 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_pma.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define IB_SMP_UNSUP_VERSION   cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD    cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD   cpu_to_be16(0x001C)
-
-static int reply(struct ib_smp *smp)
-{
-       /*
-        * The verbs framework will handle the directed/LID route
-        * packet changes.
-        */
-       smp->method = IB_MGMT_METHOD_GET_RESP;
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-               smp->status |= IB_SMP_DIRECTION;
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-static int recv_subn_get_nodedescription(struct ib_smp *smp,
-                                        struct ib_device *ibdev)
-{
-       if (smp->attr_mod)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
-
-       return reply(smp);
-}
-
-struct nodeinfo {
-       u8 base_version;
-       u8 class_version;
-       u8 node_type;
-       u8 num_ports;
-       __be64 sys_guid;
-       __be64 node_guid;
-       __be64 port_guid;
-       __be16 partition_cap;
-       __be16 device_id;
-       __be32 revision;
-       u8 local_port_num;
-       u8 vendor_id[3];
-} __attribute__ ((packed));
-
-static int recv_subn_get_nodeinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
-       struct ipath_devdata *dd = to_idev(ibdev)->dd;
-       u32 vendor, majrev, minrev;
-
-       /* GUID 0 is illegal */
-       if (smp->attr_mod || (dd->ipath_guid == 0))
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       nip->base_version = 1;
-       nip->class_version = 1;
-       nip->node_type = 1;     /* channel adapter */
-       /*
-        * XXX The num_ports value will need a layer function to get
-        * the value if we ever have more than one IB port on a chip.
-        * We will also need to get the GUID for the port.
-        */
-       nip->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       nip->sys_guid = to_idev(ibdev)->sys_image_guid;
-       nip->node_guid = dd->ipath_guid;
-       nip->port_guid = dd->ipath_guid;
-       nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
-       nip->device_id = cpu_to_be16(dd->ipath_deviceid);
-       majrev = dd->ipath_majrev;
-       minrev = dd->ipath_minrev;
-       nip->revision = cpu_to_be32((majrev << 16) | minrev);
-       nip->local_port_num = port;
-       vendor = dd->ipath_vendorid;
-       nip->vendor_id[0] = IPATH_SRC_OUI_1;
-       nip->vendor_id[1] = IPATH_SRC_OUI_2;
-       nip->vendor_id[2] = IPATH_SRC_OUI_3;
-
-       return reply(smp);
-}
-
-static int recv_subn_get_guidinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev)
-{
-       u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
-       __be64 *p = (__be64 *) smp->data;
-
-       /* 32 blocks of 8 64-bit GUIDs per block */
-
-       memset(smp->data, 0, sizeof(smp->data));
-
-       /*
-        * We only support one GUID for now.  If this changes, the
-        * portinfo.guid_cap field needs to be updated too.
-        */
-       if (startgx == 0) {
-               __be64 g = to_idev(ibdev)->dd->ipath_guid;
-               if (g == 0)
-                       /* GUID 0 is illegal */
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               else
-                       /* The first is a copy of the read-only HW GUID. */
-                       *p = g;
-       } else
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return reply(smp);
-}
-
-static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
-{
-       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
-}
-
-static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
-{
-       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
-}
-
-static int get_overrunthreshold(struct ipath_devdata *dd)
-{
-       return (dd->ipath_ibcctrl >>
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-}
-
-/**
- * set_overrunthreshold - set the overrun threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
-{
-       unsigned v;
-
-       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-       if (v != n) {
-               dd->ipath_ibcctrl &=
-                       ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
-                         INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
-               dd->ipath_ibcctrl |=
-                       (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-       }
-       return 0;
-}
-
-static int get_phyerrthreshold(struct ipath_devdata *dd)
-{
-       return (dd->ipath_ibcctrl >>
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-}
-
-/**
- * set_phyerrthreshold - set the physical error threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
-{
-       unsigned v;
-
-       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-       if (v != n) {
-               dd->ipath_ibcctrl &=
-                       ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
-                         INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
-               dd->ipath_ibcctrl |=
-                       (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-       }
-       return 0;
-}
-
-/**
- * get_linkdowndefaultstate - get the default linkdown state
- * @dd: the infinipath device
- *
- * Returns zero if the default is POLL, 1 if the default is SLEEP.
- */
-static int get_linkdowndefaultstate(struct ipath_devdata *dd)
-{
-       return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
-}
-
-static int recv_subn_get_portinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct ipath_ibdev *dev;
-       struct ipath_devdata *dd;
-       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-       u16 lid;
-       u8 ibcstat;
-       u8 mtu;
-       int ret;
-
-       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               ret = reply(smp);
-               goto bail;
-       }
-
-       dev = to_idev(ibdev);
-       dd = dev->dd;
-
-       /* Clear all fields.  Only set the non-zero fields. */
-       memset(smp->data, 0, sizeof(smp->data));
-
-       /* Only return the mkey if the protection field allows it. */
-       if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
-           dev->mkeyprot == 0)
-               pip->mkey = dev->mkey;
-       pip->gid_prefix = dev->gid_prefix;
-       lid = dd->ipath_lid;
-       pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
-       pip->sm_lid = cpu_to_be16(dev->sm_lid);
-       pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
-       /* pip->diag_code; */
-       pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
-       pip->local_port_num = port;
-       pip->link_width_enabled = dd->ipath_link_width_enabled;
-       pip->link_width_supported = dd->ipath_link_width_supported;
-       pip->link_width_active = dd->ipath_link_width_active;
-       pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
-       ibcstat = dd->ipath_lastibcstat;
-       /* map LinkState to IB portinfo values.  */
-       pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
-
-       pip->portphysstate_linkdown =
-               (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
-               (get_linkdowndefaultstate(dd) ? 1 : 2);
-       pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
-       pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
-               dd->ipath_link_speed_enabled;
-       switch (dd->ipath_ibmtu) {
-       case 4096:
-               mtu = IB_MTU_4096;
-               break;
-       case 2048:
-               mtu = IB_MTU_2048;
-               break;
-       case 1024:
-               mtu = IB_MTU_1024;
-               break;
-       case 512:
-               mtu = IB_MTU_512;
-               break;
-       case 256:
-               mtu = IB_MTU_256;
-               break;
-       default:                /* oops, something is wrong */
-               mtu = IB_MTU_2048;
-               break;
-       }
-       pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
-       pip->vlcap_inittype = 0x10;     /* VLCap = VL0, InitType = 0 */
-       pip->vl_high_limit = dev->vl_high_limit;
-       /* pip->vl_arb_high_cap; // only one VL */
-       /* pip->vl_arb_low_cap; // only one VL */
-       /* InitTypeReply = 0 */
-       /* our mtu cap depends on whether 4K MTU enabled or not */
-       pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-       /* HCAs ignore VLStallCount and HOQLife */
-       /* pip->vlstallcnt_hoqlife; */
-       pip->operationalvl_pei_peo_fpi_fpo = 0x10;      /* OVLs = 1 */
-       pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
-       /* P_KeyViolations are counted by hardware. */
-       pip->pkey_violations =
-               cpu_to_be16((ipath_get_cr_errpkey(dd) -
-                            dev->z_pkey_violations) & 0xFFFF);
-       pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
-       /* Only the hardware GUID is supported for now */
-       pip->guid_cap = 1;
-       pip->clientrereg_resv_subnetto = dev->subnet_timeout;
-       /* 32.768 usec. response time (guessing) */
-       pip->resv_resptimevalue = 3;
-       pip->localphyerrors_overrunerrors =
-               (get_phyerrthreshold(dd) << 4) |
-               get_overrunthreshold(dd);
-       /* pip->max_credit_hint; */
-       if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
-               u32 v;
-
-               v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
-               pip->link_roundtrip_latency[0] = v >> 16;
-               pip->link_roundtrip_latency[1] = v >> 8;
-               pip->link_roundtrip_latency[2] = v;
-       }
-
-       ret = reply(smp);
-
-bail:
-       return ret;
-}
-
-/**
- * get_pkeys - return the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the pkey table is placed here
- */
-static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
-{
-       /* always a kernel port, no locking needed */
-       struct ipath_portdata *pd = dd->ipath_pd[0];
-
-       memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
-
-       return 0;
-}
-
-static int recv_subn_get_pkeytable(struct ib_smp *smp,
-                                  struct ib_device *ibdev)
-{
-       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-       u16 *p = (u16 *) smp->data;
-       __be16 *q = (__be16 *) smp->data;
-
-       /* 64 blocks of 32 16-bit P_Key entries */
-
-       memset(smp->data, 0, sizeof(smp->data));
-       if (startpx == 0) {
-               struct ipath_ibdev *dev = to_idev(ibdev);
-               unsigned i, n = ipath_get_npkeys(dev->dd);
-
-               get_pkeys(dev->dd, p);
-
-               for (i = 0; i < n; i++)
-                       q[i] = cpu_to_be16(p[i]);
-       } else
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return reply(smp);
-}
-
-static int recv_subn_set_guidinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev)
-{
-       /* The only GUID we support is the first read-only entry. */
-       return recv_subn_get_guidinfo(smp, ibdev);
-}
-
-/**
- * set_linkdowndefaultstate - set the default linkdown state
- * @dd: the infinipath device
- * @sleep: the new state
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
-{
-       if (sleep)
-               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-       else
-               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                        dd->ipath_ibcctrl);
-       return 0;
-}
-
-/**
- * recv_subn_set_portinfo - set port information
- * @smp: the incoming SM packet
- * @ibdev: the infiniband device
- * @port: the port on the device
- *
- * Set Portinfo (see ch. 14.2.5.6).
- */
-static int recv_subn_set_portinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-       struct ib_event event;
-       struct ipath_ibdev *dev;
-       struct ipath_devdata *dd;
-       char clientrereg = 0;
-       u16 lid, smlid;
-       u8 lwe;
-       u8 lse;
-       u8 state;
-       u16 lstate;
-       u32 mtu;
-       int ret, ore;
-
-       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
-               goto err;
-
-       dev = to_idev(ibdev);
-       dd = dev->dd;
-       event.device = ibdev;
-       event.element.port_num = port;
-
-       dev->mkey = pip->mkey;
-       dev->gid_prefix = pip->gid_prefix;
-       dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
-
-       lid = be16_to_cpu(pip->lid);
-       if (dd->ipath_lid != lid ||
-           dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
-               /* Must be a valid unicast LID address. */
-               if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
-                       goto err;
-               ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
-               event.event = IB_EVENT_LID_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       smlid = be16_to_cpu(pip->sm_lid);
-       if (smlid != dev->sm_lid) {
-               /* Must be a valid unicast LID address. */
-               if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
-                       goto err;
-               dev->sm_lid = smlid;
-               event.event = IB_EVENT_SM_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       /* Allow 1x or 4x to be set (see 14.2.6.6). */
-       lwe = pip->link_width_enabled;
-       if (lwe) {
-               if (lwe == 0xFF)
-                       lwe = dd->ipath_link_width_supported;
-               else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
-                       goto err;
-               set_link_width_enabled(dd, lwe);
-       }
-
-       /* Allow 2.5 or 5.0 Gbs. */
-       lse = pip->linkspeedactive_enabled & 0xF;
-       if (lse) {
-               if (lse == 15)
-                       lse = dd->ipath_link_speed_supported;
-               else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
-                       goto err;
-               set_link_speed_enabled(dd, lse);
-       }
-
-       /* Set link down default state. */
-       switch (pip->portphysstate_linkdown & 0xF) {
-       case 0: /* NOP */
-               break;
-       case 1: /* SLEEP */
-               if (set_linkdowndefaultstate(dd, 1))
-                       goto err;
-               break;
-       case 2: /* POLL */
-               if (set_linkdowndefaultstate(dd, 0))
-                       goto err;
-               break;
-       default:
-               goto err;
-       }
-
-       dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
-       dev->vl_high_limit = pip->vl_high_limit;
-
-       switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
-       case IB_MTU_256:
-               mtu = 256;
-               break;
-       case IB_MTU_512:
-               mtu = 512;
-               break;
-       case IB_MTU_1024:
-               mtu = 1024;
-               break;
-       case IB_MTU_2048:
-               mtu = 2048;
-               break;
-       case IB_MTU_4096:
-               if (!ipath_mtu4096)
-                       goto err;
-               mtu = 4096;
-               break;
-       default:
-               /* XXX We have already partially updated our state! */
-               goto err;
-       }
-       ipath_set_mtu(dd, mtu);
-
-       dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
-
-       /* We only support VL0 */
-       if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
-               goto err;
-
-       if (pip->mkey_violations == 0)
-               dev->mkey_violations = 0;
-
-       /*
-        * Hardware counter can't be reset so snapshot and subtract
-        * later.
-        */
-       if (pip->pkey_violations == 0)
-               dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
-
-       if (pip->qkey_violations == 0)
-               dev->qkey_violations = 0;
-
-       ore = pip->localphyerrors_overrunerrors;
-       if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
-               goto err;
-
-       if (set_overrunthreshold(dd, (ore & 0xF)))
-               goto err;
-
-       dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
-
-       if (pip->clientrereg_resv_subnetto & 0x80) {
-               clientrereg = 1;
-               event.event = IB_EVENT_CLIENT_REREGISTER;
-               ib_dispatch_event(&event);
-       }
-
-       /*
-        * Do the port state change now that the other link parameters
-        * have been set.
-        * Changing the port physical state only makes sense if the link
-        * is down or is being set to down.
-        */
-       state = pip->linkspeed_portstate & 0xF;
-       lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
-       if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
-               goto err;
-
-       /*
-        * Only state changes of DOWN, ARM, and ACTIVE are valid
-        * and must be in the correct state to take effect (see 7.2.6).
-        */
-       switch (state) {
-       case IB_PORT_NOP:
-               if (lstate == 0)
-                       break;
-               /* FALLTHROUGH */
-       case IB_PORT_DOWN:
-               if (lstate == 0)
-                       lstate = IPATH_IB_LINKDOWN_ONLY;
-               else if (lstate == 1)
-                       lstate = IPATH_IB_LINKDOWN_SLEEP;
-               else if (lstate == 2)
-                       lstate = IPATH_IB_LINKDOWN;
-               else if (lstate == 3)
-                       lstate = IPATH_IB_LINKDOWN_DISABLE;
-               else
-                       goto err;
-               ipath_set_linkstate(dd, lstate);
-               if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
-                       ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-                       goto done;
-               }
-               ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
-                               IPATH_LINKACTIVE, 1000);
-               break;
-       case IB_PORT_ARMED:
-               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-               break;
-       case IB_PORT_ACTIVE:
-               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-               break;
-       default:
-               /* XXX We have already partially updated our state! */
-               goto err;
-       }
-
-       ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-       if (clientrereg)
-               pip->clientrereg_resv_subnetto |= 0x80;
-
-       goto done;
-
-err:
-       smp->status |= IB_SMP_INVALID_FIELD;
-       ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-done:
-       return ret;
-}
-
-/**
- * rm_pkey - decrecment the reference count for the given PKEY
- * @dd: the infinipath device
- * @key: the PKEY index
- *
- * Return true if this was the last reference and the hardware table entry
- * needs to be changed.
- */
-static int rm_pkey(struct ipath_devdata *dd, u16 key)
-{
-       int i;
-       int ret;
-
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (dd->ipath_pkeys[i] != key)
-                       continue;
-               if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
-                       dd->ipath_pkeys[i] = 0;
-                       ret = 1;
-                       goto bail;
-               }
-               break;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * add_pkey - add the given PKEY to the hardware table
- * @dd: the infinipath device
- * @key: the PKEY
- *
- * Return an error code if unable to add the entry, zero if no change,
- * or 1 if the hardware PKEY register needs to be updated.
- */
-static int add_pkey(struct ipath_devdata *dd, u16 key)
-{
-       int i;
-       u16 lkey = key & 0x7FFF;
-       int any = 0;
-       int ret;
-
-       if (lkey == 0x7FFF) {
-               ret = 0;
-               goto bail;
-       }
-
-       /* Look for an empty slot or a matching PKEY. */
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i]) {
-                       any++;
-                       continue;
-               }
-               /* If it matches exactly, try to increment the ref count */
-               if (dd->ipath_pkeys[i] == key) {
-                       if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
-                               ret = 0;
-                               goto bail;
-                       }
-                       /* Lost the race. Look for an empty slot below. */
-                       atomic_dec(&dd->ipath_pkeyrefs[i]);
-                       any++;
-               }
-               /*
-                * It makes no sense to have both the limited and unlimited
-                * PKEY set at the same time since the unlimited one will
-                * disable the limited one.
-                */
-               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (!any) {
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i] &&
-                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-                       /* for ipathstats, etc. */
-                       ipath_stats.sps_pkeys[i] = lkey;
-                       dd->ipath_pkeys[i] = key;
-                       ret = 1;
-                       goto bail;
-               }
-       }
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-/**
- * set_pkeys - set the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the PKEY table
- */
-static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
-{
-       struct ipath_portdata *pd;
-       int i;
-       int changed = 0;
-
-       /* always a kernel port, no locking needed */
-       pd = dd->ipath_pd[0];
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               u16 key = pkeys[i];
-               u16 okey = pd->port_pkeys[i];
-
-               if (key == okey)
-                       continue;
-               /*
-                * The value of this PKEY table entry is changing.
-                * Remove the old entry in the hardware's array of PKEYs.
-                */
-               if (okey & 0x7FFF)
-                       changed |= rm_pkey(dd, okey);
-               if (key & 0x7FFF) {
-                       int ret = add_pkey(dd, key);
-
-                       if (ret < 0)
-                               key = 0;
-                       else
-                               changed |= ret;
-               }
-               pd->port_pkeys[i] = key;
-       }
-       if (changed) {
-               u64 pkey;
-               struct ib_event event;
-
-               pkey = (u64) dd->ipath_pkeys[0] |
-                       ((u64) dd->ipath_pkeys[1] << 16) |
-                       ((u64) dd->ipath_pkeys[2] << 32) |
-                       ((u64) dd->ipath_pkeys[3] << 48);
-               ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
-                          (unsigned long long) pkey);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-                                pkey);
-
-               event.event = IB_EVENT_PKEY_CHANGE;
-               event.device = &dd->verbs_dev->ibdev;
-               event.element.port_num = port;
-               ib_dispatch_event(&event);
-       }
-       return 0;
-}
-
-static int recv_subn_set_pkeytable(struct ib_smp *smp,
-                                  struct ib_device *ibdev, u8 port)
-{
-       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-       __be16 *p = (__be16 *) smp->data;
-       u16 *q = (u16 *) smp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       unsigned i, n = ipath_get_npkeys(dev->dd);
-
-       for (i = 0; i < n; i++)
-               q[i] = be16_to_cpu(p[i]);
-
-       if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return recv_subn_get_pkeytable(smp, ibdev);
-}
-
-static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
-{
-       struct ib_class_port_info *p =
-               (struct ib_class_port_info *)pmp->data;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       if (pmp->mad_hdr.attr_mod != 0)
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       /* Indicate AllPortSelect is valid (only one port anyway) */
-       p->capability_mask = cpu_to_be16(1 << 8);
-       p->base_version = 1;
-       p->class_version = 1;
-       /*
-        * Expected response time is 4.096 usec. * 2^18 == 1.073741824
-        * sec.
-        */
-       p->resp_time_value = 18;
-
-       return reply((struct ib_smp *) pmp);
-}
-
-/*
- * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
- * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
- * We support 5 counters which only count the mandatory quantities.
- */
-#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
-                                   COUNTER_MASK(1, 1) | \
-                                   COUNTER_MASK(1, 2) | \
-                                   COUNTER_MASK(1, 3) | \
-                                   COUNTER_MASK(1, 4))
-
-static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
-                                          struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portsamplescontrol *p =
-               (struct ib_pma_portsamplescontrol *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       unsigned long flags;
-       u8 port_select = p->port_select;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-       /*
-        * Ticks are 10x the link transfer period which for 2.5Gbs is 4
-        * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
-        * intervals are counted in ticks.  Since we use Linux timers, that
-        * count in jiffies, we can't sample for less than 1000 ticks if HZ
-        * == 1000 (4000 ticks if HZ is 250).  link_speed_active returns 2 for
-        * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
-        * have hardware support for delaying packets.
-        */
-       if (crp->cr_psstat)
-               p->tick = dev->dd->ipath_link_speed_active - 1;
-       else
-               p->tick = 250;          /* 1 usec. */
-       p->counter_width = 4;   /* 32 bit counters */
-       p->counter_mask0_9 = COUNTER_MASK0_9;
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (crp->cr_psstat)
-               p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               p->sample_status = dev->pma_sample_status;
-       p->sample_start = cpu_to_be32(dev->pma_sample_start);
-       p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
-       p->tag = cpu_to_be16(dev->pma_tag);
-       p->counter_select[0] = dev->pma_counter_select[0];
-       p->counter_select[1] = dev->pma_counter_select[1];
-       p->counter_select[2] = dev->pma_counter_select[2];
-       p->counter_select[3] = dev->pma_counter_select[3];
-       p->counter_select[4] = dev->pma_counter_select[4];
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
-                                          struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portsamplescontrol *p =
-               (struct ib_pma_portsamplescontrol *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       unsigned long flags;
-       u8 status;
-       int ret;
-
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (p->port_select != port && p->port_select != 0xFF)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               ret = reply((struct ib_smp *) pmp);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       if (status == IB_PMA_SAMPLE_STATUS_DONE) {
-               dev->pma_sample_start = be32_to_cpu(p->sample_start);
-               dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
-               dev->pma_tag = be16_to_cpu(p->tag);
-               dev->pma_counter_select[0] = p->counter_select[0];
-               dev->pma_counter_select[1] = p->counter_select[1];
-               dev->pma_counter_select[2] = p->counter_select[2];
-               dev->pma_counter_select[3] = p->counter_select[3];
-               dev->pma_counter_select[4] = p->counter_select[4];
-               if (crp->cr_psstat) {
-                       ipath_write_creg(dev->dd, crp->cr_psinterval,
-                                        dev->pma_sample_interval);
-                       ipath_write_creg(dev->dd, crp->cr_psstart,
-                                        dev->pma_sample_start);
-               } else
-                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
-
-bail:
-       return ret;
-}
-
-static u64 get_counter(struct ipath_ibdev *dev,
-                      struct ipath_cregs const *crp,
-                      __be16 sel)
-{
-       u64 ret;
-
-       switch (sel) {
-       case IB_PMA_PORT_XMIT_DATA:
-               ret = (crp->cr_psxmitdatacount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
-                       dev->ipath_sword;
-               break;
-       case IB_PMA_PORT_RCV_DATA:
-               ret = (crp->cr_psrcvdatacount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
-                       dev->ipath_rword;
-               break;
-       case IB_PMA_PORT_XMIT_PKTS:
-               ret = (crp->cr_psxmitpktscount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
-                       dev->ipath_spkts;
-               break;
-       case IB_PMA_PORT_RCV_PKTS:
-               ret = (crp->cr_psrcvpktscount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
-                       dev->ipath_rpkts;
-               break;
-       case IB_PMA_PORT_XMIT_WAIT:
-               ret = (crp->cr_psxmitwaitcount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
-                       dev->ipath_xmit_wait;
-               break;
-       default:
-               ret = 0;
-       }
-
-       return ret;
-}
-
-static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
-                                         struct ib_device *ibdev)
-{
-       struct ib_pma_portsamplesresult *p =
-               (struct ib_pma_portsamplesresult *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       u8 status;
-       int i;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-       p->tag = cpu_to_be16(dev->pma_tag);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       p->sample_status = cpu_to_be16(status);
-       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-                   cpu_to_be32(
-                       get_counter(dev, crp, dev->pma_counter_select[i]));
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
-                                             struct ib_device *ibdev)
-{
-       struct ib_pma_portsamplesresult_ext *p =
-               (struct ib_pma_portsamplesresult_ext *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       u8 status;
-       int i;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-       p->tag = cpu_to_be16(dev->pma_tag);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       p->sample_status = cpu_to_be16(status);
-       /* 64 bits */
-       p->extended_width = cpu_to_be32(0x80000000);
-       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-                   cpu_to_be64(
-                       get_counter(dev, crp, dev->pma_counter_select[i]));
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
-                                    struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_verbs_counters cntrs;
-       u8 port_select = p->port_select;
-
-       ipath_get_counters(dev->dd, &cntrs);
-
-       /* Adjust counters for any resets done. */
-       cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
-       cntrs.link_error_recovery_counter -=
-               dev->z_link_error_recovery_counter;
-       cntrs.link_downed_counter -= dev->z_link_downed_counter;
-       cntrs.port_rcv_errors += dev->rcv_errors;
-       cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
-       cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
-       cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
-       cntrs.port_xmit_data -= dev->z_port_xmit_data;
-       cntrs.port_rcv_data -= dev->z_port_rcv_data;
-       cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
-       cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
-       cntrs.local_link_integrity_errors -=
-               dev->z_local_link_integrity_errors;
-       cntrs.excessive_buffer_overrun_errors -=
-               dev->z_excessive_buffer_overrun_errors;
-       cntrs.vl15_dropped -= dev->z_vl15_dropped;
-       cntrs.vl15_dropped += dev->n_vl15_dropped;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       if (cntrs.symbol_error_counter > 0xFFFFUL)
-               p->symbol_error_counter = cpu_to_be16(0xFFFF);
-       else
-               p->symbol_error_counter =
-                       cpu_to_be16((u16)cntrs.symbol_error_counter);
-       if (cntrs.link_error_recovery_counter > 0xFFUL)
-               p->link_error_recovery_counter = 0xFF;
-       else
-               p->link_error_recovery_counter =
-                       (u8)cntrs.link_error_recovery_counter;
-       if (cntrs.link_downed_counter > 0xFFUL)
-               p->link_downed_counter = 0xFF;
-       else
-               p->link_downed_counter = (u8)cntrs.link_downed_counter;
-       if (cntrs.port_rcv_errors > 0xFFFFUL)
-               p->port_rcv_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_errors =
-                       cpu_to_be16((u16) cntrs.port_rcv_errors);
-       if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
-               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_remphys_errors =
-                       cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
-       if (cntrs.port_xmit_discards > 0xFFFFUL)
-               p->port_xmit_discards = cpu_to_be16(0xFFFF);
-       else
-               p->port_xmit_discards =
-                       cpu_to_be16((u16)cntrs.port_xmit_discards);
-       if (cntrs.local_link_integrity_errors > 0xFUL)
-               cntrs.local_link_integrity_errors = 0xFUL;
-       if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
-               cntrs.excessive_buffer_overrun_errors = 0xFUL;
-       p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
-               cntrs.excessive_buffer_overrun_errors;
-       if (cntrs.vl15_dropped > 0xFFFFUL)
-               p->vl15_dropped = cpu_to_be16(0xFFFF);
-       else
-               p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
-       if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
-               p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
-       if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
-               p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
-       if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
-               p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_xmit_packets =
-                       cpu_to_be32((u32)cntrs.port_xmit_packets);
-       if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
-               p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_rcv_packets =
-                       cpu_to_be32((u32) cntrs.port_rcv_packets);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
-                                        struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters_ext *p =
-               (struct ib_pma_portcounters_ext *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       u64 swords, rwords, spkts, rpkts, xwait;
-       u8 port_select = p->port_select;
-
-       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-                               &rpkts, &xwait);
-
-       /* Adjust counters for any resets done. */
-       swords -= dev->z_port_xmit_data;
-       rwords -= dev->z_port_rcv_data;
-       spkts -= dev->z_port_xmit_packets;
-       rpkts -= dev->z_port_rcv_packets;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       p->port_xmit_data = cpu_to_be64(swords);
-       p->port_rcv_data = cpu_to_be64(rwords);
-       p->port_xmit_packets = cpu_to_be64(spkts);
-       p->port_rcv_packets = cpu_to_be64(rpkts);
-       p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
-       p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
-       p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
-       p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
-                                    struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_verbs_counters cntrs;
-
-       /*
-        * Since the HW doesn't support clearing counters, we save the
-        * current count and subtract it from future responses.
-        */
-       ipath_get_counters(dev->dd, &cntrs);
-
-       if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
-               dev->z_symbol_error_counter = cntrs.symbol_error_counter;
-
-       if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
-               dev->z_link_error_recovery_counter =
-                       cntrs.link_error_recovery_counter;
-
-       if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
-               dev->z_link_downed_counter = cntrs.link_downed_counter;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
-               dev->z_port_rcv_errors =
-                       cntrs.port_rcv_errors + dev->rcv_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
-               dev->z_port_rcv_remphys_errors =
-                       cntrs.port_rcv_remphys_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
-               dev->z_port_xmit_discards = cntrs.port_xmit_discards;
-
-       if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
-               dev->z_local_link_integrity_errors =
-                       cntrs.local_link_integrity_errors;
-
-       if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
-               dev->z_excessive_buffer_overrun_errors =
-                       cntrs.excessive_buffer_overrun_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
-               dev->n_vl15_dropped = 0;
-               dev->z_vl15_dropped = cntrs.vl15_dropped;
-       }
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
-               dev->z_port_xmit_data = cntrs.port_xmit_data;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
-               dev->z_port_rcv_data = cntrs.port_rcv_data;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
-               dev->z_port_xmit_packets = cntrs.port_xmit_packets;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
-               dev->z_port_rcv_packets = cntrs.port_rcv_packets;
-
-       return recv_pma_get_portcounters(pmp, ibdev, port);
-}
-
-static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
-                                        struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       u64 swords, rwords, spkts, rpkts, xwait;
-
-       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-                               &rpkts, &xwait);
-
-       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
-               dev->z_port_xmit_data = swords;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
-               dev->z_port_rcv_data = rwords;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
-               dev->z_port_xmit_packets = spkts;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
-               dev->z_port_rcv_packets = rpkts;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
-               dev->n_unicast_xmit = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
-               dev->n_unicast_rcv = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
-               dev->n_multicast_xmit = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
-               dev->n_multicast_rcv = 0;
-
-       return recv_pma_get_portcounters_ext(pmp, ibdev, port);
-}
-
-static int process_subn(struct ib_device *ibdev, int mad_flags,
-                       u8 port_num, const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_smp *smp = (struct ib_smp *)out_mad;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       *out_mad = *in_mad;
-       if (smp->class_version != 1) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply(smp);
-               goto bail;
-       }
-
-       /* Is the mkey in the process of expiring? */
-       if (dev->mkey_lease_timeout &&
-           time_after_eq(jiffies, dev->mkey_lease_timeout)) {
-               /* Clear timeout and mkey protection field. */
-               dev->mkey_lease_timeout = 0;
-               dev->mkeyprot = 0;
-       }
-
-       /*
-        * M_Key checking depends on
-        * Portinfo:M_Key_protect_bits
-        */
-       if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
-           dev->mkey != smp->mkey &&
-           (smp->method == IB_MGMT_METHOD_SET ||
-            (smp->method == IB_MGMT_METHOD_GET &&
-             dev->mkeyprot >= 2))) {
-               if (dev->mkey_violations != 0xFFFF)
-                       ++dev->mkey_violations;
-               if (dev->mkey_lease_timeout ||
-                   dev->mkey_lease_period == 0) {
-                       ret = IB_MAD_RESULT_SUCCESS |
-                               IB_MAD_RESULT_CONSUMED;
-                       goto bail;
-               }
-               dev->mkey_lease_timeout = jiffies +
-                       dev->mkey_lease_period * HZ;
-               /* Future: Generate a trap notice. */
-               ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               goto bail;
-       } else if (dev->mkey_lease_timeout)
-               dev->mkey_lease_timeout = 0;
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_NODE_DESC:
-                       ret = recv_subn_get_nodedescription(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_NODE_INFO:
-                       ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_GUID_INFO:
-                       ret = recv_subn_get_guidinfo(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_PORT_INFO:
-                       ret = recv_subn_get_portinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_PKEY_TABLE:
-                       ret = recv_subn_get_pkeytable(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_SM_INFO:
-                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-                               ret = IB_MAD_RESULT_SUCCESS |
-                                       IB_MAD_RESULT_CONSUMED;
-                               goto bail;
-                       }
-                       if (dev->port_cap_flags & IB_PORT_SM) {
-                               ret = IB_MAD_RESULT_SUCCESS;
-                               goto bail;
-                       }
-                       /* FALLTHROUGH */
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply(smp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_SET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_GUID_INFO:
-                       ret = recv_subn_set_guidinfo(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_PORT_INFO:
-                       ret = recv_subn_set_portinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_PKEY_TABLE:
-                       ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_SM_INFO:
-                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-                               ret = IB_MAD_RESULT_SUCCESS |
-                                       IB_MAD_RESULT_CONSUMED;
-                               goto bail;
-                       }
-                       if (dev->port_cap_flags & IB_PORT_SM) {
-                               ret = IB_MAD_RESULT_SUCCESS;
-                               goto bail;
-                       }
-                       /* FALLTHROUGH */
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply(smp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_REPORT_RESP:
-       case IB_MGMT_METHOD_TRAP_REPRESS:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
-       default:
-               smp->status |= IB_SMP_UNSUP_METHOD;
-               ret = reply(smp);
-       }
-
-bail:
-       return ret;
-}
-
-static int process_perf(struct ib_device *ibdev, u8 port_num,
-                       const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
-       int ret;
-
-       *out_mad = *in_mad;
-       if (pmp->mad_hdr.class_version != 1) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_smp *) pmp);
-               goto bail;
-       }
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_CLASS_PORT_INFO:
-                       ret = recv_pma_get_classportinfo(pmp);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_CONTROL:
-                       ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
-                                                             port_num);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_RESULT:
-                       ret = recv_pma_get_portsamplesresult(pmp, ibdev);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_RESULT_EXT:
-                       ret = recv_pma_get_portsamplesresult_ext(pmp,
-                                                                ibdev);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS:
-                       ret = recv_pma_get_portcounters(pmp, ibdev,
-                                                       port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = recv_pma_get_portcounters_ext(pmp, ibdev,
-                                                           port_num);
-                       goto bail;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_smp *) pmp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_SET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_PORT_SAMPLES_CONTROL:
-                       ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
-                                                             port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS:
-                       ret = recv_pma_set_portcounters(pmp, ibdev,
-                                                       port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = recv_pma_set_portcounters_ext(pmp, ibdev,
-                                                           port_num);
-                       goto bail;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_smp *) pmp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_smp *) pmp);
-       }
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_process_mad - process an incoming MAD packet
- * @ibdev: the infiniband device this packet came in on
- * @mad_flags: MAD flags
- * @port_num: the port number this packet came in on
- * @in_wc: the work completion entry for this packet
- * @in_grh: the global route header for this packet
- * @in_mad: the incoming MAD
- * @out_mad: any outgoing MAD reply
- *
- * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
- * interested in processing.
- *
- * Note that the verbs framework has already done the MAD sanity checks,
- * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
- * MADs.
- *
- * This is called by the ib_mad module.
- */
-int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                     const struct ib_mad_hdr *in, size_t in_mad_size,
-                     struct ib_mad_hdr *out, size_t *out_mad_size,
-                     u16 *out_mad_pkey_index)
-{
-       int ret;
-       const struct ib_mad *in_mad = (const struct ib_mad *)in;
-       struct ib_mad *out_mad = (struct ib_mad *)out;
-
-       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-                        *out_mad_size != sizeof(*out_mad)))
-               return IB_MAD_RESULT_FAILURE;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               ret = process_subn(ibdev, mad_flags, port_num,
-                                  in_mad, out_mad);
-               goto bail;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf(ibdev, port_num, in_mad, out_mad);
-               goto bail;
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-       }
-
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c
deleted file mode 100644 (file)
index e732742..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <asm/pgtable.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_release_mmap_info - free mmap info structure
- * @ref: a pointer to the kref within struct ipath_mmap_info
- */
-void ipath_release_mmap_info(struct kref *ref)
-{
-       struct ipath_mmap_info *ip =
-               container_of(ref, struct ipath_mmap_info, ref);
-       struct ipath_ibdev *dev = to_idev(ip->context->device);
-
-       spin_lock_irq(&dev->pending_lock);
-       list_del(&ip->pending_mmaps);
-       spin_unlock_irq(&dev->pending_lock);
-
-       vfree(ip->obj);
-       kfree(ip);
-}
-
-/*
- * open and close keep track of how many times the CQ is mapped,
- * to avoid releasing it.
- */
-static void ipath_vma_open(struct vm_area_struct *vma)
-{
-       struct ipath_mmap_info *ip = vma->vm_private_data;
-
-       kref_get(&ip->ref);
-}
-
-static void ipath_vma_close(struct vm_area_struct *vma)
-{
-       struct ipath_mmap_info *ip = vma->vm_private_data;
-
-       kref_put(&ip->ref, ipath_release_mmap_info);
-}
-
-static const struct vm_operations_struct ipath_vm_ops = {
-       .open =     ipath_vma_open,
-       .close =    ipath_vma_close,
-};
-
-/**
- * ipath_mmap - create a new mmap region
- * @context: the IB user context of the process making the mmap() call
- * @vma: the VMA to be initialized
- * Return zero if the mmap is OK. Otherwise, return an errno.
- */
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       struct ipath_ibdev *dev = to_idev(context->device);
-       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-       unsigned long size = vma->vm_end - vma->vm_start;
-       struct ipath_mmap_info *ip, *pp;
-       int ret = -EINVAL;
-
-       /*
-        * Search the device's list of objects waiting for a mmap call.
-        * Normally, this list is very short since a call to create a
-        * CQ, QP, or SRQ is soon followed by a call to mmap().
-        */
-       spin_lock_irq(&dev->pending_lock);
-       list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
-                                pending_mmaps) {
-               /* Only the creator is allowed to mmap the object */
-               if (context != ip->context || (__u64) offset != ip->offset)
-                       continue;
-               /* Don't allow a mmap larger than the object. */
-               if (size > ip->size)
-                       break;
-
-               list_del_init(&ip->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-
-               ret = remap_vmalloc_range(vma, ip->obj, 0);
-               if (ret)
-                       goto done;
-               vma->vm_ops = &ipath_vm_ops;
-               vma->vm_private_data = ip;
-               ipath_vma_open(vma);
-               goto done;
-       }
-       spin_unlock_irq(&dev->pending_lock);
-done:
-       return ret;
-}
-
-/*
- * Allocate information for ipath_mmap
- */
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-                                              u32 size,
-                                              struct ib_ucontext *context,
-                                              void *obj) {
-       struct ipath_mmap_info *ip;
-
-       ip = kmalloc(sizeof *ip, GFP_KERNEL);
-       if (!ip)
-               goto bail;
-
-       size = PAGE_ALIGN(size);
-
-       spin_lock_irq(&dev->mmap_offset_lock);
-       if (dev->mmap_offset == 0)
-               dev->mmap_offset = PAGE_SIZE;
-       ip->offset = dev->mmap_offset;
-       dev->mmap_offset += size;
-       spin_unlock_irq(&dev->mmap_offset_lock);
-
-       INIT_LIST_HEAD(&ip->pending_mmaps);
-       ip->size = size;
-       ip->context = context;
-       ip->obj = obj;
-       kref_init(&ip->ref);
-
-bail:
-       return ip;
-}
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-                           struct ipath_mmap_info *ip,
-                           u32 size, void *obj) {
-       size = PAGE_ALIGN(size);
-
-       spin_lock_irq(&dev->mmap_offset_lock);
-       if (dev->mmap_offset == 0)
-               dev->mmap_offset = PAGE_SIZE;
-       ip->offset = dev->mmap_offset;
-       dev->mmap_offset += size;
-       spin_unlock_irq(&dev->mmap_offset_lock);
-
-       ip->size = size;
-       ip->obj = obj;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
deleted file mode 100644 (file)
index b76b0ce..0000000
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/slab.h>
-
-#include <rdma/ib_umem.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-
-/* Fast memory region */
-struct ipath_fmr {
-       struct ib_fmr ibfmr;
-       u8 page_shift;
-       struct ipath_mregion mr;        /* must be last */
-};
-
-static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
-{
-       return container_of(ibfmr, struct ipath_fmr, ibfmr);
-}
-
-/**
- * ipath_get_dma_mr - get a DMA memory region
- * @pd: protection domain for this memory region
- * @acc: access flags
- *
- * Returns the memory region on success, otherwise returns an errno.
- * Note that all DMA addresses should be created via the
- * struct ib_dma_mapping_ops functions (see ipath_dma.c).
- */
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct ipath_mr *mr;
-       struct ib_mr *ret;
-
-       mr = kzalloc(sizeof *mr, GFP_KERNEL);
-       if (!mr) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       mr->mr.access_flags = acc;
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
-static struct ipath_mr *alloc_mr(int count,
-                                struct ipath_lkey_table *lk_table)
-{
-       struct ipath_mr *mr;
-       int m, i = 0;
-
-       /* Allocate struct plus pointers to first level page tables. */
-       m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-       mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
-       if (!mr)
-               goto done;
-
-       /* Allocate first level page tables. */
-       for (; i < m; i++) {
-               mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
-               if (!mr->mr.map[i])
-                       goto bail;
-       }
-       mr->mr.mapsz = m;
-
-       if (!ipath_alloc_lkey(lk_table, &mr->mr))
-               goto bail;
-       mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
-
-       goto done;
-
-bail:
-       while (i) {
-               i--;
-               kfree(mr->mr.map[i]);
-       }
-       kfree(mr);
-       mr = NULL;
-
-done:
-       return mr;
-}
-
-/**
- * ipath_reg_user_mr - register a userspace memory region
- * @pd: protection domain for this memory region
- * @start: starting userspace address
- * @length: length of region to register
- * @virt_addr: virtual address to use (from HCA's point of view)
- * @mr_access_flags: access flags for this memory region
- * @udata: unused by the InfiniPath driver
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                               u64 virt_addr, int mr_access_flags,
-                               struct ib_udata *udata)
-{
-       struct ipath_mr *mr;
-       struct ib_umem *umem;
-       int n, m, entry;
-       struct scatterlist *sg;
-       struct ib_mr *ret;
-
-       if (length == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       umem = ib_umem_get(pd->uobject->context, start, length,
-                          mr_access_flags, 0);
-       if (IS_ERR(umem))
-               return (void *) umem;
-
-       n = umem->nmap;
-       mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
-       if (!mr) {
-               ret = ERR_PTR(-ENOMEM);
-               ib_umem_release(umem);
-               goto bail;
-       }
-
-       mr->mr.pd = pd;
-       mr->mr.user_base = start;
-       mr->mr.iova = virt_addr;
-       mr->mr.length = length;
-       mr->mr.offset = ib_umem_offset(umem);
-       mr->mr.access_flags = mr_access_flags;
-       mr->mr.max_segs = n;
-       mr->umem = umem;
-
-       m = 0;
-       n = 0;
-       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               void *vaddr;
-
-               vaddr = page_address(sg_page(sg));
-               if (!vaddr) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-               mr->mr.map[m]->segs[n].vaddr = vaddr;
-               mr->mr.map[m]->segs[n].length = umem->page_size;
-               n++;
-               if (n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_dereg_mr - unregister and free a memory region
- * @ibmr: the memory region to free
- *
- * Returns 0 on success.
- *
- * Note that this is called to free MRs created by ipath_get_dma_mr()
- * or ipath_reg_user_mr().
- */
-int ipath_dereg_mr(struct ib_mr *ibmr)
-{
-       struct ipath_mr *mr = to_imr(ibmr);
-       int i;
-
-       ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
-       i = mr->mr.mapsz;
-       while (i) {
-               i--;
-               kfree(mr->mr.map[i]);
-       }
-
-       if (mr->umem)
-               ib_umem_release(mr->umem);
-
-       kfree(mr);
-       return 0;
-}
-
-/**
- * ipath_alloc_fmr - allocate a fast memory region
- * @pd: the protection domain for this memory region
- * @mr_access_flags: access flags for this memory region
- * @fmr_attr: fast memory region attributes
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-                              struct ib_fmr_attr *fmr_attr)
-{
-       struct ipath_fmr *fmr;
-       int m, i = 0;
-       struct ib_fmr *ret;
-
-       /* Allocate struct plus pointers to first level page tables. */
-       m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-       fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
-       if (!fmr)
-               goto bail;
-
-       /* Allocate first level page tables. */
-       for (; i < m; i++) {
-               fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
-                                        GFP_KERNEL);
-               if (!fmr->mr.map[i])
-                       goto bail;
-       }
-       fmr->mr.mapsz = m;
-
-       /*
-        * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
-        * rkey.
-        */
-       if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
-               goto bail;
-       fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
-       /*
-        * Resources are allocated but no valid mapping (RKEY can't be
-        * used).
-        */
-       fmr->mr.pd = pd;
-       fmr->mr.user_base = 0;
-       fmr->mr.iova = 0;
-       fmr->mr.length = 0;
-       fmr->mr.offset = 0;
-       fmr->mr.access_flags = mr_access_flags;
-       fmr->mr.max_segs = fmr_attr->max_pages;
-       fmr->page_shift = fmr_attr->page_shift;
-
-       ret = &fmr->ibfmr;
-       goto done;
-
-bail:
-       while (i)
-               kfree(fmr->mr.map[--i]);
-       kfree(fmr);
-       ret = ERR_PTR(-ENOMEM);
-
-done:
-       return ret;
-}
-
-/**
- * ipath_map_phys_fmr - set up a fast memory region
- * @ibmfr: the fast memory region to set up
- * @page_list: the list of pages to associate with the fast memory region
- * @list_len: the number of pages to associate with the fast memory region
- * @iova: the virtual address of the start of the fast memory region
- *
- * This may be called from interrupt context.
- */
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-                      int list_len, u64 iova)
-{
-       struct ipath_fmr *fmr = to_ifmr(ibfmr);
-       struct ipath_lkey_table *rkt;
-       unsigned long flags;
-       int m, n, i;
-       u32 ps;
-       int ret;
-
-       if (list_len > fmr->mr.max_segs) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       rkt = &to_idev(ibfmr->device)->lk_table;
-       spin_lock_irqsave(&rkt->lock, flags);
-       fmr->mr.user_base = iova;
-       fmr->mr.iova = iova;
-       ps = 1 << fmr->page_shift;
-       fmr->mr.length = list_len * ps;
-       m = 0;
-       n = 0;
-       ps = 1 << fmr->page_shift;
-       for (i = 0; i < list_len; i++) {
-               fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
-               fmr->mr.map[m]->segs[n].length = ps;
-               if (++n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       spin_unlock_irqrestore(&rkt->lock, flags);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_unmap_fmr - unmap fast memory regions
- * @fmr_list: the list of fast memory regions to unmap
- *
- * Returns 0 on success.
- */
-int ipath_unmap_fmr(struct list_head *fmr_list)
-{
-       struct ipath_fmr *fmr;
-       struct ipath_lkey_table *rkt;
-       unsigned long flags;
-
-       list_for_each_entry(fmr, fmr_list, ibfmr.list) {
-               rkt = &to_idev(fmr->ibfmr.device)->lk_table;
-               spin_lock_irqsave(&rkt->lock, flags);
-               fmr->mr.user_base = 0;
-               fmr->mr.iova = 0;
-               fmr->mr.length = 0;
-               spin_unlock_irqrestore(&rkt->lock, flags);
-       }
-       return 0;
-}
-
-/**
- * ipath_dealloc_fmr - deallocate a fast memory region
- * @ibfmr: the fast memory region to deallocate
- *
- * Returns 0 on success.
- */
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
-{
-       struct ipath_fmr *fmr = to_ifmr(ibfmr);
-       int i;
-
-       ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
-       i = fmr->mr.mapsz;
-       while (i)
-               kfree(fmr->mr.map[--i]);
-       kfree(fmr);
-       return 0;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c
deleted file mode 100644 (file)
index 280cd2d..0000000
+++ /dev/null
@@ -1,1079 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-#define BITS_PER_PAGE          (PAGE_SIZE*BITS_PER_BYTE)
-#define BITS_PER_PAGE_MASK     (BITS_PER_PAGE-1)
-#define mk_qpn(qpt, map, off)  (((map) - (qpt)->map) * BITS_PER_PAGE + \
-                                (off))
-#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
-                                                     BITS_PER_PAGE, off)
-
-/*
- * Convert the AETH credit code into the number of credits.
- */
-static u32 credit_table[31] = {
-       0,                      /* 0 */
-       1,                      /* 1 */
-       2,                      /* 2 */
-       3,                      /* 3 */
-       4,                      /* 4 */
-       6,                      /* 5 */
-       8,                      /* 6 */
-       12,                     /* 7 */
-       16,                     /* 8 */
-       24,                     /* 9 */
-       32,                     /* A */
-       48,                     /* B */
-       64,                     /* C */
-       96,                     /* D */
-       128,                    /* E */
-       192,                    /* F */
-       256,                    /* 10 */
-       384,                    /* 11 */
-       512,                    /* 12 */
-       768,                    /* 13 */
-       1024,                   /* 14 */
-       1536,                   /* 15 */
-       2048,                   /* 16 */
-       3072,                   /* 17 */
-       4096,                   /* 18 */
-       6144,                   /* 19 */
-       8192,                   /* 1A */
-       12288,                  /* 1B */
-       16384,                  /* 1C */
-       24576,                  /* 1D */
-       32768                   /* 1E */
-};
-
-
-static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
-{
-       unsigned long page = get_zeroed_page(GFP_KERNEL);
-       unsigned long flags;
-
-       /*
-        * Free the page if someone raced with us installing it.
-        */
-
-       spin_lock_irqsave(&qpt->lock, flags);
-       if (map->page)
-               free_page(page);
-       else
-               map->page = (void *)page;
-       spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-
-static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
-{
-       u32 i, offset, max_scan, qpn;
-       struct qpn_map *map;
-       u32 ret = -1;
-
-       if (type == IB_QPT_SMI)
-               ret = 0;
-       else if (type == IB_QPT_GSI)
-               ret = 1;
-
-       if (ret != -1) {
-               map = &qpt->map[0];
-               if (unlikely(!map->page)) {
-                       get_map_page(qpt, map);
-                       if (unlikely(!map->page)) {
-                               ret = -ENOMEM;
-                               goto bail;
-                       }
-               }
-               if (!test_and_set_bit(ret, map->page))
-                       atomic_dec(&map->n_free);
-               else
-                       ret = -EBUSY;
-               goto bail;
-       }
-
-       qpn = qpt->last + 1;
-       if (qpn >= QPN_MAX)
-               qpn = 2;
-       offset = qpn & BITS_PER_PAGE_MASK;
-       map = &qpt->map[qpn / BITS_PER_PAGE];
-       max_scan = qpt->nmaps - !offset;
-       for (i = 0;;) {
-               if (unlikely(!map->page)) {
-                       get_map_page(qpt, map);
-                       if (unlikely(!map->page))
-                               break;
-               }
-               if (likely(atomic_read(&map->n_free))) {
-                       do {
-                               if (!test_and_set_bit(offset, map->page)) {
-                                       atomic_dec(&map->n_free);
-                                       qpt->last = qpn;
-                                       ret = qpn;
-                                       goto bail;
-                               }
-                               offset = find_next_offset(map, offset);
-                               qpn = mk_qpn(qpt, map, offset);
-                               /*
-                                * This test differs from alloc_pidmap().
-                                * If find_next_offset() does find a zero
-                                * bit, we don't need to check for QPN
-                                * wrapping around past our starting QPN.
-                                * We just need to be sure we don't loop
-                                * forever.
-                                */
-                       } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
-               }
-               /*
-                * In order to keep the number of pages allocated to a
-                * minimum, we scan the all existing pages before increasing
-                * the size of the bitmap table.
-                */
-               if (++i > max_scan) {
-                       if (qpt->nmaps == QPNMAP_ENTRIES)
-                               break;
-                       map = &qpt->map[qpt->nmaps++];
-                       offset = 0;
-               } else if (map < &qpt->map[qpt->nmaps]) {
-                       ++map;
-                       offset = 0;
-               } else {
-                       map = &qpt->map[0];
-                       offset = 2;
-               }
-               qpn = mk_qpn(qpt, map, offset);
-       }
-
-       ret = -ENOMEM;
-
-bail:
-       return ret;
-}
-
-static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-       struct qpn_map *map;
-
-       map = qpt->map + qpn / BITS_PER_PAGE;
-       if (map->page)
-               clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
-       atomic_inc(&map->n_free);
-}
-
-/**
- * ipath_alloc_qpn - allocate a QP number
- * @qpt: the QP table
- * @qp: the QP
- * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
- *
- * Allocate the next available QPN and put the QP into the hash table.
- * The hash table holds a reference to the QP.
- */
-static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
-                          enum ib_qp_type type)
-{
-       unsigned long flags;
-       int ret;
-
-       ret = alloc_qpn(qpt, type);
-       if (ret < 0)
-               goto bail;
-       qp->ibqp.qp_num = ret;
-
-       /* Add the QP to the hash table. */
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       ret %= qpt->max;
-       qp->next = qpt->table[ret];
-       qpt->table[ret] = qp;
-       atomic_inc(&qp->refcount);
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_free_qp - remove a QP from the QP table
- * @qpt: the QP table
- * @qp: the QP to remove
- *
- * Remove the QP from the table so it can't be found asynchronously by
- * the receive interrupt routine.
- */
-static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
-{
-       struct ipath_qp *q, **qpp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       /* Remove QP from the hash table. */
-       qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
-       for (; (q = *qpp) != NULL; qpp = &q->next) {
-               if (q == qp) {
-                       *qpp = qp->next;
-                       qp->next = NULL;
-                       atomic_dec(&qp->refcount);
-                       break;
-               }
-       }
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-/**
- * ipath_free_all_qps - check for QPs still in use
- * @qpt: the QP table to empty
- *
- * There should not be any QPs still in use.
- * Free memory for table.
- */
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
-{
-       unsigned long flags;
-       struct ipath_qp *qp;
-       u32 n, qp_inuse = 0;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-       for (n = 0; n < qpt->max; n++) {
-               qp = qpt->table[n];
-               qpt->table[n] = NULL;
-
-               for (; qp; qp = qp->next)
-                       qp_inuse++;
-       }
-       spin_unlock_irqrestore(&qpt->lock, flags);
-
-       for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
-               if (qpt->map[n].page)
-                       free_page((unsigned long) qpt->map[n].page);
-       return qp_inuse;
-}
-
-/**
- * ipath_lookup_qpn - return the QP with the given QPN
- * @qpt: the QP table
- * @qpn: the QP number to look up
- *
- * The caller is responsible for decrementing the QP reference count
- * when done.
- */
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-       unsigned long flags;
-       struct ipath_qp *qp;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
-               if (qp->ibqp.qp_num == qpn) {
-                       atomic_inc(&qp->refcount);
-                       break;
-               }
-       }
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-       return qp;
-}
-
-/**
- * ipath_reset_qp - initialize the QP state to the reset state
- * @qp: the QP to reset
- * @type: the QP type
- */
-static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
-{
-       qp->remote_qpn = 0;
-       qp->qkey = 0;
-       qp->qp_access_flags = 0;
-       atomic_set(&qp->s_dma_busy, 0);
-       qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
-       qp->s_hdrwords = 0;
-       qp->s_wqe = NULL;
-       qp->s_pkt_delay = 0;
-       qp->s_draining = 0;
-       qp->s_psn = 0;
-       qp->r_psn = 0;
-       qp->r_msn = 0;
-       if (type == IB_QPT_RC) {
-               qp->s_state = IB_OPCODE_RC_SEND_LAST;
-               qp->r_state = IB_OPCODE_RC_SEND_LAST;
-       } else {
-               qp->s_state = IB_OPCODE_UC_SEND_LAST;
-               qp->r_state = IB_OPCODE_UC_SEND_LAST;
-       }
-       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
-       qp->r_nak_state = 0;
-       qp->r_aflags = 0;
-       qp->r_flags = 0;
-       qp->s_rnr_timeout = 0;
-       qp->s_head = 0;
-       qp->s_tail = 0;
-       qp->s_cur = 0;
-       qp->s_last = 0;
-       qp->s_ssn = 1;
-       qp->s_lsn = 0;
-       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
-       qp->r_head_ack_queue = 0;
-       qp->s_tail_ack_queue = 0;
-       qp->s_num_rd_atomic = 0;
-       if (qp->r_rq.wq) {
-               qp->r_rq.wq->head = 0;
-               qp->r_rq.wq->tail = 0;
-       }
-}
-
-/**
- * ipath_error_qp - put a QP into the error state
- * @qp: the QP to put into the error state
- * @err: the receive completion error to signal if a RWQE is active
- *
- * Flushes both send and receive work queues.
- * Returns true if last WQE event should be generated.
- * The QP s_lock should be held and interrupts disabled.
- * If we are already in error state, just return.
- */
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ib_wc wc;
-       int ret = 0;
-
-       if (qp->state == IB_QPS_ERR)
-               goto bail;
-
-       qp->state = IB_QPS_ERR;
-
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       if (!list_empty(&qp->piowait))
-               list_del_init(&qp->piowait);
-       spin_unlock(&dev->pending_lock);
-
-       /* Schedule the sending tasklet to drain the send work queue. */
-       if (qp->s_last != qp->s_head)
-               ipath_schedule_send(qp);
-
-       memset(&wc, 0, sizeof(wc));
-       wc.qp = &qp->ibqp;
-       wc.opcode = IB_WC_RECV;
-
-       if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
-               wc.wr_id = qp->r_wr_id;
-               wc.status = err;
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-       }
-       wc.status = IB_WC_WR_FLUSH_ERR;
-
-       if (qp->r_rq.wq) {
-               struct ipath_rwq *wq;
-               u32 head;
-               u32 tail;
-
-               spin_lock(&qp->r_rq.lock);
-
-               /* sanity check pointers before trusting them */
-               wq = qp->r_rq.wq;
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               while (tail != head) {
-                       wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
-                       if (++tail >= qp->r_rq.size)
-                               tail = 0;
-                       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-               }
-               wq->tail = tail;
-
-               spin_unlock(&qp->r_rq.lock);
-       } else if (qp->ibqp.event_handler)
-               ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_modify_qp - modify the attributes of a queue pair
- * @ibqp: the queue pair who's attributes we're modifying
- * @attr: the new attributes
- * @attr_mask: the mask of attributes to modify
- * @udata: user data for ipathverbs.so
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_qp *qp = to_iqp(ibqp);
-       enum ib_qp_state cur_state, new_state;
-       int lastwqe = 0;
-       int ret;
-
-       spin_lock_irq(&qp->s_lock);
-
-       cur_state = attr_mask & IB_QP_CUR_STATE ?
-               attr->cur_qp_state : qp->state;
-       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-                               attr_mask, IB_LINK_LAYER_UNSPECIFIED))
-               goto inval;
-
-       if (attr_mask & IB_QP_AV) {
-               if (attr->ah_attr.dlid == 0 ||
-                   attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
-                       goto inval;
-
-               if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
-                   (attr->ah_attr.grh.sgid_index > 1))
-                       goto inval;
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX)
-               if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
-                       goto inval;
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER)
-               if (attr->min_rnr_timer > 31)
-                       goto inval;
-
-       if (attr_mask & IB_QP_PORT)
-               if (attr->port_num == 0 ||
-                   attr->port_num > ibqp->device->phys_port_cnt)
-                       goto inval;
-
-       /*
-        * don't allow invalid Path MTU values or greater than 2048
-        * unless we are configured for a 4KB MTU
-        */
-       if ((attr_mask & IB_QP_PATH_MTU) &&
-               (ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
-               (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
-               goto inval;
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE)
-               if (attr->path_mig_state != IB_MIG_MIGRATED &&
-                   attr->path_mig_state != IB_MIG_REARM)
-                       goto inval;
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-               if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
-                       goto inval;
-
-       switch (new_state) {
-       case IB_QPS_RESET:
-               if (qp->state != IB_QPS_RESET) {
-                       qp->state = IB_QPS_RESET;
-                       spin_lock(&dev->pending_lock);
-                       if (!list_empty(&qp->timerwait))
-                               list_del_init(&qp->timerwait);
-                       if (!list_empty(&qp->piowait))
-                               list_del_init(&qp->piowait);
-                       spin_unlock(&dev->pending_lock);
-                       qp->s_flags &= ~IPATH_S_ANY_WAIT;
-                       spin_unlock_irq(&qp->s_lock);
-                       /* Stop the sending tasklet */
-                       tasklet_kill(&qp->s_task);
-                       wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-                       spin_lock_irq(&qp->s_lock);
-               }
-               ipath_reset_qp(qp, ibqp->qp_type);
-               break;
-
-       case IB_QPS_SQD:
-               qp->s_draining = qp->s_last != qp->s_cur;
-               qp->state = new_state;
-               break;
-
-       case IB_QPS_SQE:
-               if (qp->ibqp.qp_type == IB_QPT_RC)
-                       goto inval;
-               qp->state = new_state;
-               break;
-
-       case IB_QPS_ERR:
-               lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-               break;
-
-       default:
-               qp->state = new_state;
-               break;
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX)
-               qp->s_pkey_index = attr->pkey_index;
-
-       if (attr_mask & IB_QP_DEST_QPN)
-               qp->remote_qpn = attr->dest_qp_num;
-
-       if (attr_mask & IB_QP_SQ_PSN) {
-               qp->s_psn = qp->s_next_psn = attr->sq_psn;
-               qp->s_last_psn = qp->s_next_psn - 1;
-       }
-
-       if (attr_mask & IB_QP_RQ_PSN)
-               qp->r_psn = attr->rq_psn;
-
-       if (attr_mask & IB_QP_ACCESS_FLAGS)
-               qp->qp_access_flags = attr->qp_access_flags;
-
-       if (attr_mask & IB_QP_AV) {
-               qp->remote_ah_attr = attr->ah_attr;
-               qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
-       }
-
-       if (attr_mask & IB_QP_PATH_MTU)
-               qp->path_mtu = attr->path_mtu;
-
-       if (attr_mask & IB_QP_RETRY_CNT)
-               qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
-
-       if (attr_mask & IB_QP_RNR_RETRY) {
-               qp->s_rnr_retry = attr->rnr_retry;
-               if (qp->s_rnr_retry > 7)
-                       qp->s_rnr_retry = 7;
-               qp->s_rnr_retry_cnt = qp->s_rnr_retry;
-       }
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER)
-               qp->r_min_rnr_timer = attr->min_rnr_timer;
-
-       if (attr_mask & IB_QP_TIMEOUT)
-               qp->timeout = attr->timeout;
-
-       if (attr_mask & IB_QP_QKEY)
-               qp->qkey = attr->qkey;
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-               qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
-
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
-               qp->s_max_rd_atomic = attr->max_rd_atomic;
-
-       spin_unlock_irq(&qp->s_lock);
-
-       if (lastwqe) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-       ret = 0;
-       goto bail;
-
-inval:
-       spin_unlock_irq(&qp->s_lock);
-       ret = -EINVAL;
-
-bail:
-       return ret;
-}
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                  int attr_mask, struct ib_qp_init_attr *init_attr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-
-       attr->qp_state = qp->state;
-       attr->cur_qp_state = attr->qp_state;
-       attr->path_mtu = qp->path_mtu;
-       attr->path_mig_state = 0;
-       attr->qkey = qp->qkey;
-       attr->rq_psn = qp->r_psn;
-       attr->sq_psn = qp->s_next_psn;
-       attr->dest_qp_num = qp->remote_qpn;
-       attr->qp_access_flags = qp->qp_access_flags;
-       attr->cap.max_send_wr = qp->s_size - 1;
-       attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
-       attr->cap.max_send_sge = qp->s_max_sge;
-       attr->cap.max_recv_sge = qp->r_rq.max_sge;
-       attr->cap.max_inline_data = 0;
-       attr->ah_attr = qp->remote_ah_attr;
-       memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
-       attr->pkey_index = qp->s_pkey_index;
-       attr->alt_pkey_index = 0;
-       attr->en_sqd_async_notify = 0;
-       attr->sq_draining = qp->s_draining;
-       attr->max_rd_atomic = qp->s_max_rd_atomic;
-       attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
-       attr->min_rnr_timer = qp->r_min_rnr_timer;
-       attr->port_num = 1;
-       attr->timeout = qp->timeout;
-       attr->retry_cnt = qp->s_retry_cnt;
-       attr->rnr_retry = qp->s_rnr_retry_cnt;
-       attr->alt_port_num = 0;
-       attr->alt_timeout = 0;
-
-       init_attr->event_handler = qp->ibqp.event_handler;
-       init_attr->qp_context = qp->ibqp.qp_context;
-       init_attr->send_cq = qp->ibqp.send_cq;
-       init_attr->recv_cq = qp->ibqp.recv_cq;
-       init_attr->srq = qp->ibqp.srq;
-       init_attr->cap = attr->cap;
-       if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
-               init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-       else
-               init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
-       init_attr->qp_type = qp->ibqp.qp_type;
-       init_attr->port_num = 1;
-       return 0;
-}
-
-/**
- * ipath_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 ipath_compute_aeth(struct ipath_qp *qp)
-{
-       u32 aeth = qp->r_msn & IPATH_MSN_MASK;
-
-       if (qp->ibqp.srq) {
-               /*
-                * Shared receive queues don't generate credits.
-                * Set the credit field to the invalid value.
-                */
-               aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
-       } else {
-               u32 min, max, x;
-               u32 credits;
-               struct ipath_rwq *wq = qp->r_rq.wq;
-               u32 head;
-               u32 tail;
-
-               /* sanity check pointers before trusting them */
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               /*
-                * Compute the number of credits available (RWQEs).
-                * XXX Not holding the r_rq.lock here so there is a small
-                * chance that the pair of reads are not atomic.
-                */
-               credits = head - tail;
-               if ((int)credits < 0)
-                       credits += qp->r_rq.size;
-               /*
-                * Binary search the credit table to find the code to
-                * use.
-                */
-               min = 0;
-               max = 31;
-               for (;;) {
-                       x = (min + max) / 2;
-                       if (credit_table[x] == credits)
-                               break;
-                       if (credit_table[x] > credits)
-                               max = x;
-                       else if (min == x)
-                               break;
-                       else
-                               min = x;
-               }
-               aeth |= x << IPATH_AETH_CREDIT_SHIFT;
-       }
-       return cpu_to_be32(aeth);
-}
-
-/**
- * ipath_create_qp - create a queue pair for a device
- * @ibpd: the protection domain who's device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: unused by InfiniPath
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-                             struct ib_qp_init_attr *init_attr,
-                             struct ib_udata *udata)
-{
-       struct ipath_qp *qp;
-       int err;
-       struct ipath_swqe *swq = NULL;
-       struct ipath_ibdev *dev;
-       size_t sz;
-       size_t sg_list_sz;
-       struct ib_qp *ret;
-
-       if (init_attr->create_flags) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
-           init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       /* Check receive queue parameters if no SRQ is specified. */
-       if (!init_attr->srq) {
-               if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
-                   init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-               if (init_attr->cap.max_send_sge +
-                   init_attr->cap.max_send_wr +
-                   init_attr->cap.max_recv_sge +
-                   init_attr->cap.max_recv_wr == 0) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-       }
-
-       switch (init_attr->qp_type) {
-       case IB_QPT_UC:
-       case IB_QPT_RC:
-       case IB_QPT_UD:
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               sz = sizeof(struct ipath_sge) *
-                       init_attr->cap.max_send_sge +
-                       sizeof(struct ipath_swqe);
-               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
-               if (swq == NULL) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail;
-               }
-               sz = sizeof(*qp);
-               sg_list_sz = 0;
-               if (init_attr->srq) {
-                       struct ipath_srq *srq = to_isrq(init_attr->srq);
-
-                       if (srq->rq.max_sge > 1)
-                               sg_list_sz = sizeof(*qp->r_sg_list) *
-                                       (srq->rq.max_sge - 1);
-               } else if (init_attr->cap.max_recv_sge > 1)
-                       sg_list_sz = sizeof(*qp->r_sg_list) *
-                               (init_attr->cap.max_recv_sge - 1);
-               qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
-               if (!qp) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_swq;
-               }
-               if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
-                   init_attr->qp_type == IB_QPT_SMI ||
-                   init_attr->qp_type == IB_QPT_GSI)) {
-                       qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
-                       if (!qp->r_ud_sg_list) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_qp;
-                       }
-               } else
-                       qp->r_ud_sg_list = NULL;
-               if (init_attr->srq) {
-                       sz = 0;
-                       qp->r_rq.size = 0;
-                       qp->r_rq.max_sge = 0;
-                       qp->r_rq.wq = NULL;
-                       init_attr->cap.max_recv_wr = 0;
-                       init_attr->cap.max_recv_sge = 0;
-               } else {
-                       qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
-                       qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
-                       sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
-                               sizeof(struct ipath_rwqe);
-                       qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
-                                             qp->r_rq.size * sz);
-                       if (!qp->r_rq.wq) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_sg_list;
-                       }
-               }
-
-               /*
-                * ib_create_qp() will initialize qp->ibqp
-                * except for qp->ibqp.qp_num.
-                */
-               spin_lock_init(&qp->s_lock);
-               spin_lock_init(&qp->r_rq.lock);
-               atomic_set(&qp->refcount, 0);
-               init_waitqueue_head(&qp->wait);
-               init_waitqueue_head(&qp->wait_dma);
-               tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
-               INIT_LIST_HEAD(&qp->piowait);
-               INIT_LIST_HEAD(&qp->timerwait);
-               qp->state = IB_QPS_RESET;
-               qp->s_wq = swq;
-               qp->s_size = init_attr->cap.max_send_wr + 1;
-               qp->s_max_sge = init_attr->cap.max_send_sge;
-               if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
-                       qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
-               else
-                       qp->s_flags = 0;
-               dev = to_idev(ibpd->device);
-               err = ipath_alloc_qpn(&dev->qp_table, qp,
-                                     init_attr->qp_type);
-               if (err) {
-                       ret = ERR_PTR(err);
-                       vfree(qp->r_rq.wq);
-                       goto bail_sg_list;
-               }
-               qp->ip = NULL;
-               qp->s_tx = NULL;
-               ipath_reset_qp(qp, init_attr->qp_type);
-               break;
-
-       default:
-               /* Don't support raw QPs */
-               ret = ERR_PTR(-ENOSYS);
-               goto bail;
-       }
-
-       init_attr->cap.max_inline_data = 0;
-
-       /*
-        * Return the address of the RWQ as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               if (!qp->r_rq.wq) {
-                       __u64 offset = 0;
-
-                       err = ib_copy_to_udata(udata, &offset,
-                                              sizeof(offset));
-                       if (err) {
-                               ret = ERR_PTR(err);
-                               goto bail_ip;
-                       }
-               } else {
-                       u32 s = sizeof(struct ipath_rwq) +
-                               qp->r_rq.size * sz;
-
-                       qp->ip =
-                           ipath_create_mmap_info(dev, s,
-                                                  ibpd->uobject->context,
-                                                  qp->r_rq.wq);
-                       if (!qp->ip) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_ip;
-                       }
-
-                       err = ib_copy_to_udata(udata, &(qp->ip->offset),
-                                              sizeof(qp->ip->offset));
-                       if (err) {
-                               ret = ERR_PTR(err);
-                               goto bail_ip;
-                       }
-               }
-       }
-
-       spin_lock(&dev->n_qps_lock);
-       if (dev->n_qps_allocated == ib_ipath_max_qps) {
-               spin_unlock(&dev->n_qps_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_qps_allocated++;
-       spin_unlock(&dev->n_qps_lock);
-
-       if (qp->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = &qp->ibqp;
-       goto bail;
-
-bail_ip:
-       if (qp->ip)
-               kref_put(&qp->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(qp->r_rq.wq);
-       ipath_free_qp(&dev->qp_table, qp);
-       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-bail_sg_list:
-       kfree(qp->r_ud_sg_list);
-bail_qp:
-       kfree(qp);
-bail_swq:
-       vfree(swq);
-bail:
-       return ret;
-}
-
-/**
- * ipath_destroy_qp - destroy a queue pair
- * @ibqp: the queue pair to destroy
- *
- * Returns 0 on success.
- *
- * Note that this can be called while the QP is actively sending or
- * receiving!
- */
-int ipath_destroy_qp(struct ib_qp *ibqp)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-
-       /* Make sure HW and driver activity is stopped. */
-       spin_lock_irq(&qp->s_lock);
-       if (qp->state != IB_QPS_RESET) {
-               qp->state = IB_QPS_RESET;
-               spin_lock(&dev->pending_lock);
-               if (!list_empty(&qp->timerwait))
-                       list_del_init(&qp->timerwait);
-               if (!list_empty(&qp->piowait))
-                       list_del_init(&qp->piowait);
-               spin_unlock(&dev->pending_lock);
-               qp->s_flags &= ~IPATH_S_ANY_WAIT;
-               spin_unlock_irq(&qp->s_lock);
-               /* Stop the sending tasklet */
-               tasklet_kill(&qp->s_task);
-               wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-       } else
-               spin_unlock_irq(&qp->s_lock);
-
-       ipath_free_qp(&dev->qp_table, qp);
-
-       if (qp->s_tx) {
-               atomic_dec(&qp->refcount);
-               if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-                       kfree(qp->s_tx->txreq.map_addr);
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
-               spin_unlock_irq(&dev->pending_lock);
-               qp->s_tx = NULL;
-       }
-
-       wait_event(qp->wait, !atomic_read(&qp->refcount));
-
-       /* all user's cleaned up, mark it available */
-       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-       spin_lock(&dev->n_qps_lock);
-       dev->n_qps_allocated--;
-       spin_unlock(&dev->n_qps_lock);
-
-       if (qp->ip)
-               kref_put(&qp->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(qp->r_rq.wq);
-       kfree(qp->r_ud_sg_list);
-       vfree(qp->s_wq);
-       kfree(qp);
-       return 0;
-}
-
-/**
- * ipath_init_qp_table - initialize the QP table for a device
- * @idev: the device who's QP table we're initializing
- * @size: the size of the QP table
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
-{
-       int i;
-       int ret;
-
-       idev->qp_table.last = 1;        /* QPN 0 and 1 are special. */
-       idev->qp_table.max = size;
-       idev->qp_table.nmaps = 1;
-       idev->qp_table.table = kcalloc(size, sizeof(*idev->qp_table.table),
-                                      GFP_KERNEL);
-       if (idev->qp_table.table == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
-               atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
-               idev->qp_table.map[i].page = NULL;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_credit - flush the send work queue of a QP
- * @qp: the qp who's send work queue to flush
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
-{
-       u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
-
-       /*
-        * If the credit is invalid, we can send
-        * as many packets as we like.  Otherwise, we have to
-        * honor the credit field.
-        */
-       if (credit == IPATH_AETH_CREDIT_INVAL)
-               qp->s_lsn = (u32) -1;
-       else if (qp->s_lsn != (u32) -1) {
-               /* Compute new LSN (i.e., MSN + credit) */
-               credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
-               if (ipath_cmp24(credit, qp->s_lsn) > 0)
-                       qp->s_lsn = credit;
-       }
-
-       /* Restart sending if it was blocked due to lack of credits. */
-       if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
-           qp->s_cur != qp->s_head &&
-           (qp->s_lsn == (u32) -1 ||
-            ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
-                        qp->s_lsn + 1) <= 0))
-               ipath_schedule_send(qp);
-}
diff --git a/drivers/staging/rdma/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c
deleted file mode 100644 (file)
index d4aa535..0000000
+++ /dev/null
@@ -1,1969 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_RC_##x
-
-static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
-                      u32 psn, u32 pmtu)
-{
-       u32 len;
-
-       len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-       ss->sge = wqe->sg_list[0];
-       ss->sg_list = wqe->sg_list + 1;
-       ss->num_sge = wqe->wr.num_sge;
-       ipath_skip_sge(ss, len);
-       return wqe->length - len;
-}
-
-/**
- * ipath_init_restart- initialize the qp->s_sge after a restart
- * @qp: the QP who's SGE we're restarting
- * @wqe: the work queue to initialize the QP's SGE from
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
-{
-       struct ipath_ibdev *dev;
-
-       qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
-                               ib_mtu_enum_to_int(qp->path_mtu));
-       dev = to_idev(qp->ibqp.device);
-       spin_lock(&dev->pending_lock);
-       if (list_empty(&qp->timerwait))
-               list_add_tail(&qp->timerwait,
-                             &dev->pending[dev->pending_index]);
-       spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
- * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- *
- * Return 1 if constructed; otherwise, return 0.
- * Note that we are in the responder's side of the QP context.
- * Note the QP s_lock must be held.
- */
-static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                            struct ipath_other_headers *ohdr, u32 pmtu)
-{
-       struct ipath_ack_entry *e;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-
-       /* Don't send an ACK if we aren't supposed to. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto bail;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-
-       switch (qp->s_ack_state) {
-       case OP(RDMA_READ_RESPONSE_LAST):
-       case OP(RDMA_READ_RESPONSE_ONLY):
-       case OP(ATOMIC_ACKNOWLEDGE):
-               /*
-                * We can increment the tail pointer now that the last
-                * response has been sent instead of only being
-                * constructed.
-                */
-               if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
-                       qp->s_tail_ack_queue = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_ONLY):
-       case OP(ACKNOWLEDGE):
-               /* Check for no next entry in the queue. */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
-                       if (qp->s_flags & IPATH_S_ACK_PENDING)
-                               goto normal;
-                       qp->s_ack_state = OP(ACKNOWLEDGE);
-                       goto bail;
-               }
-
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST)) {
-                       /* Copy SGE state in case we need to resend */
-                       qp->s_ack_rdma_sge = e->rdma_sge;
-                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
-                       len = e->rdma_sge.sge.sge_length;
-                       if (len > pmtu) {
-                               len = pmtu;
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
-                       } else {
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-                               e->sent = 1;
-                       }
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_rdma_psn = e->psn;
-                       bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-               } else {
-                       /* COMPARE_SWAP or FETCH_ADD */
-                       qp->s_cur_sge = NULL;
-                       len = 0;
-                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
-                       ohdr->u.at.aeth = ipath_compute_aeth(qp);
-                       ohdr->u.at.atomic_ack_eth[0] =
-                               cpu_to_be32(e->atomic_data >> 32);
-                       ohdr->u.at.atomic_ack_eth[1] =
-                               cpu_to_be32(e->atomic_data);
-                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
-                       bth2 = e->psn;
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               len = qp->s_ack_rdma_sge.sge.sge_length;
-               if (len > pmtu)
-                       len = pmtu;
-               else {
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-                       qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-               break;
-
-       default:
-       normal:
-               /*
-                * Send a regular ACK.
-                * Set the s_ack_state so we wait until after sending
-                * the ACK before setting s_ack_state to ACKNOWLEDGE
-                * (see above).
-                */
-               qp->s_ack_state = OP(SEND_ONLY);
-               qp->s_flags &= ~IPATH_S_ACK_PENDING;
-               qp->s_cur_sge = NULL;
-               if (qp->s_nak_state)
-                       ohdr->u.aeth =
-                               cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-                                           (qp->s_nak_state <<
-                                            IPATH_AETH_CREDIT_SHIFT));
-               else
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-               hwords++;
-               len = 0;
-               bth0 = OP(ACKNOWLEDGE) << 24;
-               bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
-       }
-       qp->s_hdrwords = hwords;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
-       return 1;
-
-bail:
-       return 0;
-}
-
-/**
- * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_rc_req(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_other_headers *ohdr;
-       struct ipath_sge_state *ss;
-       struct ipath_swqe *wqe;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       char newreq;
-       unsigned long flags;
-       int ret = 0;
-
-       ohdr = &qp->s_hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &qp->s_hdr.u.l.oth;
-
-       /*
-        * The lock is needed to synchronize between the sending tasklet,
-        * the receive interrupt handler, and timeout resends.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Sending responses has higher priority over sending requests. */
-       if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-            (qp->s_flags & IPATH_S_ACK_PENDING) ||
-            qp->s_ack_state != OP(ACKNOWLEDGE)) &&
-           ipath_make_rc_ack(dev, qp, ohdr, pmtu))
-               goto done;
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       /* Leave BUSY set until RNR timeout. */
-       if (qp->s_rnr_timeout) {
-               qp->s_flags |= IPATH_S_WAITING;
-               goto bail;
-       }
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-       bth0 = 1 << 22; /* Set M bit */
-
-       /* Send a request. */
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       switch (qp->s_state) {
-       default:
-               if (!(ib_ipath_state_ops[qp->state] &
-                   IPATH_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /*
-                * Resend an old request or start a new one.
-                *
-                * We keep track of the current SWQE so that
-                * we don't reset the "furthest progress" state
-                * if we need to back up.
-                */
-               newreq = 0;
-               if (qp->s_cur == qp->s_tail) {
-                       /* Check if send work queue is empty. */
-                       if (qp->s_tail == qp->s_head)
-                               goto bail;
-                       /*
-                        * If a fence is requested, wait for previous
-                        * RDMA read and atomic operations to finish.
-                        */
-                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-                           qp->s_num_rd_atomic) {
-                               qp->s_flags |= IPATH_S_FENCE_PENDING;
-                               goto bail;
-                       }
-                       wqe->psn = qp->s_next_psn;
-                       newreq = 1;
-               }
-               /*
-                * Note that we have to be careful not to modify the
-                * original work request since we may need to resend
-                * it.
-                */
-               len = wqe->length;
-               ss = &qp->s_sge;
-               bth2 = 0;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       /* If no credit, return. */
-                       if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       wqe->lpsn = wqe->psn;
-                       if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND)
-                               qp->s_state = OP(SEND_ONLY);
-                       else {
-                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-                       bth2 = 1 << 31; /* Request ACK. */
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-                       if (newreq && qp->s_lsn != (u32) -1)
-                               qp->s_lsn++;
-                       /* FALLTHROUGH */
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       /* If no credit, return. */
-                       if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       wqe->lpsn = wqe->psn;
-                       if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= 1 << 23;
-                       }
-                       bth2 = 1 << 31; /* Request ACK. */
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_READ:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (qp->s_lsn != (u32) -1)
-                                       qp->s_lsn++;
-                               /*
-                                * Adjust s_next_psn to count the
-                                * expected number of responses.
-                                */
-                               if (len > pmtu)
-                                       qp->s_next_psn += (len - 1) / pmtu;
-                               wqe->lpsn = qp->s_next_psn++;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       qp->s_state = OP(RDMA_READ_REQUEST);
-                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_ATOMIC_CMP_AND_SWP:
-               case IB_WR_ATOMIC_FETCH_AND_ADD:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (qp->s_lsn != (u32) -1)
-                                       qp->s_lsn++;
-                               wqe->lpsn = wqe->psn;
-                       }
-                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-                               qp->s_state = OP(COMPARE_SWAP);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.swap);
-                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                       } else {
-                               qp->s_state = OP(FETCH_ADD);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                               ohdr->u.atomic_eth.compare_data = 0;
-                       }
-                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr >> 32);
-                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr);
-                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
-                               wqe->atomic_wr.rkey);
-                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_len = wqe->length;
-               if (newreq) {
-                       qp->s_tail++;
-                       if (qp->s_tail >= qp->s_size)
-                               qp->s_tail = 0;
-               }
-               bth2 |= qp->s_psn & IPATH_PSN_MASK;
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       qp->s_psn = wqe->lpsn + 1;
-               else {
-                       qp->s_psn++;
-                       if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                               qp->s_next_psn = qp->s_psn;
-               }
-               /*
-                * Put the QP on the pending list so lost ACKs will cause
-                * a retry.  More than one request can be pending so the
-                * QP may already be on the dev->pending list.
-                */
-               spin_lock(&dev->pending_lock);
-               if (list_empty(&qp->timerwait))
-                       list_add_tail(&qp->timerwait,
-                                     &dev->pending[dev->pending_index]);
-               spin_unlock(&dev->pending_lock);
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               /*
-                * This case can only happen if a send is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               /* FALLTHROUGH */
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND)
-                       qp->s_state = OP(SEND_LAST);
-               else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= 1 << 23;
-               bth2 |= 1 << 31;        /* Request ACK. */
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /*
-                * This case can only happen if a RDMA write is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               else {
-                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-               }
-               bth2 |= 1 << 31;        /* Request ACK. */
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /*
-                * This case can only happen if a RDMA read is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-               ohdr->u.rc.reth.vaddr =
-                       cpu_to_be64(wqe->rdma_wr.remote_addr + len);
-               ohdr->u.rc.reth.rkey =
-                       cpu_to_be32(wqe->rdma_wr.rkey);
-               ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
-               qp->s_state = OP(RDMA_READ_REQUEST);
-               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-               bth2 = qp->s_psn & IPATH_PSN_MASK;
-               qp->s_psn = wqe->lpsn + 1;
-               ss = NULL;
-               len = 0;
-               qp->s_cur++;
-               if (qp->s_cur == qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
-               bth2 |= 1 << 31;        /* Request ACK. */
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       qp->s_cur_sge = ss;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * send_rc_ack - Construct an ACK packet and send it
- * @qp: a pointer to the QP
- *
- * This is called from ipath_rc_rcv() and only uses the receive
- * side QP state.
- * Note that RDMA reads and atomics are handled in the
- * send side QP state and tasklet.
- */
-static void send_rc_ack(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_devdata *dd;
-       u16 lrh0;
-       u32 bth0;
-       u32 hwords;
-       u32 __iomem *piobuf;
-       struct ipath_ib_header hdr;
-       struct ipath_other_headers *ohdr;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
-       if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-           (qp->s_flags & IPATH_S_ACK_PENDING) ||
-           qp->s_ack_state != OP(ACKNOWLEDGE))
-               goto queue_ack;
-
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       /* Don't try to send ACKs if the link isn't ACTIVE */
-       dd = dev->dd;
-       if (!(dd->ipath_flags & IPATH_LINKACTIVE))
-               goto done;
-
-       piobuf = ipath_getpiobuf(dd, 0, NULL);
-       if (!piobuf) {
-               /*
-                * We are out of PIO buffers at the moment.
-                * Pass responsibility for sending the ACK to the
-                * send tasklet so that when a PIO buffer becomes
-                * available, the ACK is sent ahead of other outgoing
-                * packets.
-                */
-               spin_lock_irqsave(&qp->s_lock, flags);
-               goto queue_ack;
-       }
-
-       /* Construct the header. */
-       ohdr = &hdr.u.oth;
-       lrh0 = IPATH_LRH_BTH;
-       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
-       hwords = 6;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               hwords += ipath_make_grh(dev, &hdr.u.l.grh,
-                                        &qp->remote_ah_attr.grh,
-                                        hwords, 0);
-               ohdr = &hdr.u.l.oth;
-               lrh0 = IPATH_LRH_GRH;
-       }
-       /* read pkey_index w/o lock (its atomic) */
-       bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
-               (OP(ACKNOWLEDGE) << 24) | (1 << 22);
-       if (qp->r_nak_state)
-               ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-                                           (qp->r_nak_state <<
-                                            IPATH_AETH_CREDIT_SHIFT));
-       else
-               ohdr->u.aeth = ipath_compute_aeth(qp);
-       lrh0 |= qp->remote_ah_attr.sl << 4;
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
-                                qp->remote_ah_attr.src_path_bits);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
-
-       writeq(hwords + 1, piobuf);
-
-       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-               u32 *hdrp = (u32 *) &hdr;
-
-               ipath_flush_wc();
-               __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
-               ipath_flush_wc();
-               __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
-       } else
-               __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
-
-       ipath_flush_wc();
-
-       dev->n_unicast_xmit++;
-       goto done;
-
-queue_ack:
-       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
-               dev->n_rc_qacks++;
-               qp->s_flags |= IPATH_S_ACK_PENDING;
-               qp->s_nak_state = qp->r_nak_state;
-               qp->s_ack_psn = qp->r_ack_psn;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return;
-}
-
-/**
- * reset_psn - reset the QP state to send starting from PSN
- * @qp: the QP
- * @psn: the packet sequence number to restart at
- *
- * This is called from ipath_rc_rcv() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held.
- */
-static void reset_psn(struct ipath_qp *qp, u32 psn)
-{
-       u32 n = qp->s_last;
-       struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
-       u32 opcode;
-
-       qp->s_cur = n;
-
-       /*
-        * If we are starting the request from the beginning,
-        * let the normal send code handle initialization.
-        */
-       if (ipath_cmp24(psn, wqe->psn) <= 0) {
-               qp->s_state = OP(SEND_LAST);
-               goto done;
-       }
-
-       /* Find the work request opcode corresponding to the given PSN. */
-       opcode = wqe->wr.opcode;
-       for (;;) {
-               int diff;
-
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-               wqe = get_swqe_ptr(qp, n);
-               diff = ipath_cmp24(psn, wqe->psn);
-               if (diff < 0)
-                       break;
-               qp->s_cur = n;
-               /*
-                * If we are starting the request from the beginning,
-                * let the normal send code handle initialization.
-                */
-               if (diff == 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       goto done;
-               }
-               opcode = wqe->wr.opcode;
-       }
-
-       /*
-        * Set the state to restart in the middle of a request.
-        * Don't change the s_sge, s_cur_sge, or s_cur_size.
-        * See ipath_make_rc_req().
-        */
-       switch (opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
-               break;
-
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
-               break;
-
-       case IB_WR_RDMA_READ:
-               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               break;
-
-       default:
-               /*
-                * This case shouldn't happen since its only
-                * one PSN per req.
-                */
-               qp->s_state = OP(SEND_LAST);
-       }
-done:
-       qp->s_psn = psn;
-}
-
-/**
- * ipath_restart_rc - back up requester to resend the last un-ACKed request
- * @qp: the QP to restart
- * @psn: packet sequence number for the request
- * @wc: the work completion request
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
-{
-       struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
-       struct ipath_ibdev *dev;
-
-       if (qp->s_retry == 0) {
-               ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-               ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-               goto bail;
-       }
-       qp->s_retry--;
-
-       /*
-        * Remove the QP from the timeout queue.
-        * Note: it may already have been removed by ipath_ib_timer().
-        */
-       dev = to_idev(qp->ibqp.device);
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       if (!list_empty(&qp->piowait))
-               list_del_init(&qp->piowait);
-       spin_unlock(&dev->pending_lock);
-
-       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-               dev->n_rc_resends++;
-       else
-               dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
-
-       reset_psn(qp, psn);
-       ipath_schedule_send(qp);
-
-bail:
-       return;
-}
-
-static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
-{
-       qp->s_last_psn = psn;
-}
-
-/**
- * do_rc_ack - process an incoming RC ACK
- * @qp: the QP the ACK came in on
- * @psn: the packet sequence number of the ACK
- * @opcode: the opcode of the request that resulted in the ACK
- *
- * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held and interrupts disabled.
- * Returns 1 if OK, 0 if current operation should be aborted (NAK).
- */
-static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
-                    u64 val)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ib_wc wc;
-       enum ib_wc_status status;
-       struct ipath_swqe *wqe;
-       int ret = 0;
-       u32 ack_psn;
-       int diff;
-
-       /*
-        * Remove the QP from the timeout queue (or RNR timeout queue).
-        * If ipath_ib_timer() has already removed it,
-        * it's OK since we hold the QP s_lock and ipath_restart_rc()
-        * just won't find anything to restart if we ACK everything.
-        */
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       spin_unlock(&dev->pending_lock);
-
-       /*
-        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
-        * requests and implicitly NAK RDMA read and atomic requests issued
-        * before the NAK'ed request.  The MSN won't include the NAK'ed
-        * request but will include an ACK'ed request(s).
-        */
-       ack_psn = psn;
-       if (aeth >> 29)
-               ack_psn--;
-       wqe = get_swqe_ptr(qp, qp->s_last);
-
-       /*
-        * The MSN might be for a later WQE than the PSN indicates so
-        * only complete WQEs that the PSN finishes.
-        */
-       while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
-               /*
-                * RDMA_READ_RESPONSE_ONLY is a special case since
-                * we want to generate completion events for everything
-                * before the RDMA read, copy the data, then generate
-                * the completion for the read.
-                */
-               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
-                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
-                   diff == 0) {
-                       ret = 1;
-                       goto bail;
-               }
-               /*
-                * If this request is a RDMA read or atomic, and the ACK is
-                * for a later operation, this ACK NAKs the RDMA read or
-                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
-                * can ACK a RDMA read and likewise for atomic ops.  Note
-                * that the NAK case can only happen if relaxed ordering is
-                * used and requests are sent after an RDMA read or atomic
-                * is sent but before the response is received.
-                */
-               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
-                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-                       /*
-                        * The last valid PSN seen is the previous
-                        * request's.
-                        */
-                       update_last_psn(qp, wqe->psn - 1);
-                       /* Retry this request. */
-                       ipath_restart_rc(qp, wqe->psn);
-                       /*
-                        * No need to process the ACK/NAK since we are
-                        * restarting an earlier request.
-                        */
-                       goto bail;
-               }
-               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-                       *(u64 *) wqe->sg_list[0].vaddr = val;
-               if (qp->s_num_rd_atomic &&
-                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
-                       qp->s_num_rd_atomic--;
-                       /* Restart sending task if fence is complete */
-                       if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
-                            !qp->s_num_rd_atomic) ||
-                           qp->s_flags & IPATH_S_RDMAR_PENDING)
-                               ipath_schedule_send(qp);
-               }
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof wc);
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       wc.src_qp = qp->remote_qpn;
-                       wc.slid = qp->remote_ah_attr.dlid;
-                       wc.sl = qp->remote_ah_attr.sl;
-                       ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
-               }
-               qp->s_retry = qp->s_retry_cnt;
-               /*
-                * If we are completing a request which is in the process of
-                * being resent, we can stop resending it since we know the
-                * responder has already seen it.
-                */
-               if (qp->s_last == qp->s_cur) {
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       qp->s_last = qp->s_cur;
-                       if (qp->s_last == qp->s_tail)
-                               break;
-                       wqe = get_swqe_ptr(qp, qp->s_cur);
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = wqe->psn;
-               } else {
-                       if (++qp->s_last >= qp->s_size)
-                               qp->s_last = 0;
-                       if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
-                               qp->s_draining = 0;
-                       if (qp->s_last == qp->s_tail)
-                               break;
-                       wqe = get_swqe_ptr(qp, qp->s_last);
-               }
-       }
-
-       switch (aeth >> 29) {
-       case 0:         /* ACK */
-               dev->n_rc_acks++;
-               /* If this is a partial ACK, reset the retransmit timer. */
-               if (qp->s_last != qp->s_tail) {
-                       spin_lock(&dev->pending_lock);
-                       if (list_empty(&qp->timerwait))
-                               list_add_tail(&qp->timerwait,
-                                       &dev->pending[dev->pending_index]);
-                       spin_unlock(&dev->pending_lock);
-                       /*
-                        * If we get a partial ACK for a resent operation,
-                        * we can stop resending the earlier packets and
-                        * continue with the next packet the receiver wants.
-                        */
-                       if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-                               reset_psn(qp, psn + 1);
-                               ipath_schedule_send(qp);
-                       }
-               } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = psn + 1;
-               }
-               ipath_get_credit(qp, aeth);
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               qp->s_retry = qp->s_retry_cnt;
-               update_last_psn(qp, psn);
-               ret = 1;
-               goto bail;
-
-       case 1:         /* RNR NAK */
-               dev->n_rnr_naks++;
-               if (qp->s_last == qp->s_tail)
-                       goto bail;
-               if (qp->s_rnr_retry == 0) {
-                       status = IB_WC_RNR_RETRY_EXC_ERR;
-                       goto class_b;
-               }
-               if (qp->s_rnr_retry_cnt < 7)
-                       qp->s_rnr_retry--;
-
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       dev->n_rc_resends++;
-               else
-                       dev->n_rc_resends +=
-                               (qp->s_psn - psn) & IPATH_PSN_MASK;
-
-               reset_psn(qp, psn);
-
-               qp->s_rnr_timeout =
-                       ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
-                                          IPATH_AETH_CREDIT_MASK];
-               ipath_insert_rnr_queue(qp);
-               ipath_schedule_send(qp);
-               goto bail;
-
-       case 3:         /* NAK */
-               if (qp->s_last == qp->s_tail)
-                       goto bail;
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-               switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
-                       IPATH_AETH_CREDIT_MASK) {
-               case 0: /* PSN sequence error */
-                       dev->n_seq_naks++;
-                       /*
-                        * Back up to the responder's expected PSN.
-                        * Note that we might get a NAK in the middle of an
-                        * RDMA READ response which terminates the RDMA
-                        * READ.
-                        */
-                       ipath_restart_rc(qp, psn);
-                       break;
-
-               case 1: /* Invalid Request */
-                       status = IB_WC_REM_INV_REQ_ERR;
-                       dev->n_other_naks++;
-                       goto class_b;
-
-               case 2: /* Remote Access Error */
-                       status = IB_WC_REM_ACCESS_ERR;
-                       dev->n_other_naks++;
-                       goto class_b;
-
-               case 3: /* Remote Operation Error */
-                       status = IB_WC_REM_OP_ERR;
-                       dev->n_other_naks++;
-               class_b:
-                       ipath_send_complete(qp, wqe, status);
-                       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       break;
-
-               default:
-                       /* Ignore other reserved NAK error codes */
-                       goto reserved;
-               }
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               goto bail;
-
-       default:                /* 2: reserved */
-       reserved:
-               /* Ignore reserved NAK codes. */
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_rc_rcv_resp - process an incoming RC response packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @hdrsize: the header length
- * @pmtu: the path MTU
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an incoming RC response
- * packet for the given QP.
- * Called at interrupt level.
- */
-static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
-                                    struct ipath_other_headers *ohdr,
-                                    void *data, u32 tlen,
-                                    struct ipath_qp *qp,
-                                    u32 opcode,
-                                    u32 psn, u32 hdrsize, u32 pmtu,
-                                    int header_in_data)
-{
-       struct ipath_swqe *wqe;
-       enum ib_wc_status status;
-       unsigned long flags;
-       int diff;
-       u32 pad;
-       u32 aeth;
-       u64 val;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Double check we can process this now that we hold the s_lock. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto ack_done;
-
-       /* Ignore invalid responses. */
-       if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
-               goto ack_done;
-
-       /* Ignore duplicate responses. */
-       diff = ipath_cmp24(psn, qp->s_last_psn);
-       if (unlikely(diff <= 0)) {
-               /* Update credits for "ghost" ACKs */
-               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
-                       if (!header_in_data)
-                               aeth = be32_to_cpu(ohdr->u.aeth);
-                       else {
-                               aeth = be32_to_cpu(((__be32 *) data)[0]);
-                               data += sizeof(__be32);
-                       }
-                       if ((aeth >> 29) == 0)
-                               ipath_get_credit(qp, aeth);
-               }
-               goto ack_done;
-       }
-
-       if (unlikely(qp->s_last == qp->s_tail))
-               goto ack_done;
-       wqe = get_swqe_ptr(qp, qp->s_last);
-       status = IB_WC_SUCCESS;
-
-       switch (opcode) {
-       case OP(ACKNOWLEDGE):
-       case OP(ATOMIC_ACKNOWLEDGE):
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else {
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               }
-               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-                       if (!header_in_data) {
-                               __be32 *p = ohdr->u.at.atomic_ack_eth;
-
-                               val = ((u64) be32_to_cpu(p[0]) << 32) |
-                                       be32_to_cpu(p[1]);
-                       } else
-                               val = be64_to_cpu(((__be64 *) data)[0]);
-               } else
-                       val = 0;
-               if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
-                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
-                       goto ack_done;
-               hdrsize += 4;
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_middle;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /* no AETH, no ACK */
-               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-                       dev->n_rdma_seq++;
-                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-                               goto ack_done;
-                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-                       goto ack_done;
-               }
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-       read_middle:
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto ack_len_err;
-               if (unlikely(pmtu >= qp->s_rdma_read_len))
-                       goto ack_len_err;
-
-               /* We got a response so update the timeout. */
-               spin_lock(&dev->pending_lock);
-               if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
-                       list_move_tail(&qp->timerwait,
-                                      &dev->pending[dev->pending_index]);
-               spin_unlock(&dev->pending_lock);
-
-               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
-                       qp->s_retry = qp->s_retry_cnt;
-
-               /*
-                * Update the RDMA receive state but do the copy w/o
-                * holding the locks and blocking interrupts.
-                */
-               qp->s_rdma_read_len -= pmtu;
-               update_last_psn(qp, psn);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
-               goto bail;
-
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-               if (!do_rc_ack(qp, aeth, psn, opcode, 0))
-                       goto ack_done;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 0 && <= pmtu.
-                * Remember to account for the AETH header (4) and
-                * ICRC (4).
-                */
-               if (unlikely(tlen < (hdrsize + pad + 8)))
-                       goto ack_len_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_last;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /* ACKs READ req. */
-               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-                       dev->n_rdma_seq++;
-                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-                               goto ack_done;
-                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-                       goto ack_done;
-               }
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 1 && <= pmtu.
-                * Remember to account for the AETH header (4) and
-                * ICRC (4).
-                */
-               if (unlikely(tlen <= (hdrsize + pad + 8)))
-                       goto ack_len_err;
-       read_last:
-               tlen -= hdrsize + pad + 8;
-               if (unlikely(tlen != qp->s_rdma_read_len))
-                       goto ack_len_err;
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else {
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               }
-               ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
-               (void) do_rc_ack(qp, aeth, psn,
-                                OP(RDMA_READ_RESPONSE_LAST), 0);
-               goto ack_done;
-       }
-
-ack_op_err:
-       status = IB_WC_LOC_QP_OP_ERR;
-       goto ack_err;
-
-ack_len_err:
-       status = IB_WC_LOC_LEN_ERR;
-ack_err:
-       ipath_send_complete(qp, wqe, status);
-       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-ack_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-bail:
-       return;
-}
-
-/**
- * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @diff: the difference between the PSN and the expected PSN
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an unexpected
- * incoming RC packet for the given QP.
- * Called at interrupt level.
- * Return 1 if no more processing is needed; otherwise return 0 to
- * schedule a response to be sent.
- */
-static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
-                                    struct ipath_other_headers *ohdr,
-                                    void *data,
-                                    struct ipath_qp *qp,
-                                    u32 opcode,
-                                    u32 psn,
-                                    int diff,
-                                    int header_in_data)
-{
-       struct ipath_ack_entry *e;
-       u8 i, prev;
-       int old_req;
-       unsigned long flags;
-
-       if (diff > 0) {
-               /*
-                * Packet sequence error.
-                * A NAK will ACK earlier sends and RDMA writes.
-                * Don't queue the NAK if we already sent one.
-                */
-               if (!qp->r_nak_state) {
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       goto send_ack;
-               }
-               goto done;
-       }
-
-       /*
-        * Handle a duplicate request.  Don't re-execute SEND, RDMA
-        * write or atomic op.  Don't NAK errors, just silently drop
-        * the duplicate request.  Note that r_sge, r_len, and
-        * r_rcv_len may be in use so don't modify them.
-        *
-        * We are supposed to ACK the earliest duplicate PSN but we
-        * can coalesce an outstanding duplicate ACK.  We have to
-        * send the earliest so that RDMA reads can be restarted at
-        * the requester's expected PSN.
-        *
-        * First, find where this duplicate PSN falls within the
-        * ACKs previously sent.
-        */
-       psn &= IPATH_PSN_MASK;
-       e = NULL;
-       old_req = 1;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       /* Double check we can process this now that we hold the s_lock. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto unlock_done;
-
-       for (i = qp->r_head_ack_queue; ; i = prev) {
-               if (i == qp->s_tail_ack_queue)
-                       old_req = 0;
-               if (i)
-                       prev = i - 1;
-               else
-                       prev = IPATH_MAX_RDMA_ATOMIC;
-               if (prev == qp->r_head_ack_queue) {
-                       e = NULL;
-                       break;
-               }
-               e = &qp->s_ack_queue[prev];
-               if (!e->opcode) {
-                       e = NULL;
-                       break;
-               }
-               if (ipath_cmp24(psn, e->psn) >= 0) {
-                       if (prev == qp->s_tail_ack_queue)
-                               old_req = 0;
-                       break;
-               }
-       }
-       switch (opcode) {
-       case OP(RDMA_READ_REQUEST): {
-               struct ib_reth *reth;
-               u32 offset;
-               u32 len;
-
-               /*
-                * If we didn't find the RDMA read request in the ack queue,
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
-                       goto unlock_done;
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               /*
-                * Address range must be a subset of the original
-                * request and start on pmtu boundaries.
-                * We reuse the old ack_queue slot since the requester
-                * should not back up and request an earlier PSN for the
-                * same request.
-                */
-               offset = ((psn - e->psn) & IPATH_PSN_MASK) *
-                       ib_mtu_enum_to_int(qp->path_mtu);
-               len = be32_to_cpu(reth->length);
-               if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
-                       goto unlock_done;
-               if (len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       ok = ipath_rkey_ok(qp, &e->rdma_sge,
-                                          len, vaddr, rkey,
-                                          IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto unlock_done;
-               } else {
-                       e->rdma_sge.sg_list = NULL;
-                       e->rdma_sge.num_sge = 0;
-                       e->rdma_sge.sge.mr = NULL;
-                       e->rdma_sge.sge.vaddr = NULL;
-                       e->rdma_sge.sge.length = 0;
-                       e->rdma_sge.sge.sge_length = 0;
-               }
-               e->psn = psn;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               /*
-                * If we didn't find the atomic request in the ack queue
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != (u8) opcode || old_req)
-                       goto unlock_done;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       default:
-               if (old_req)
-                       goto unlock_done;
-               /*
-                * Resend the most recent ACK if this request is
-                * after all the previous RDMA reads and atomics.
-                */
-               if (i == qp->r_head_ack_queue) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->r_psn - 1;
-                       goto send_ack;
-               }
-               /*
-                * Try to send a simple ACK to work around a Mellanox bug
-                * which doesn't accept a RDMA read response or atomic
-                * response as an ACK for earlier SENDs or RDMA writes.
-                */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
-                   !(qp->s_flags & IPATH_S_ACK_PENDING) &&
-                   qp->s_ack_state == OP(ACKNOWLEDGE)) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
-                       goto send_ack;
-               }
-               /*
-                * Resend the RDMA read or atomic op which
-                * ACKs this duplicate request.
-                */
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = i;
-               break;
-       }
-       qp->r_nak_state = 0;
-       ipath_schedule_send(qp);
-
-unlock_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return 1;
-
-send_ack:
-       return 0;
-}
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
-{
-       unsigned long flags;
-       int lastwqe;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       lastwqe = ipath_error_qp(qp, err);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       if (lastwqe) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-}
-
-static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
-{
-       unsigned next;
-
-       next = n + 1;
-       if (next > IPATH_MAX_RDMA_ATOMIC)
-               next = 0;
-       if (n == qp->s_tail_ack_queue) {
-               qp->s_tail_ack_queue = next;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-       }
-}
-
-/**
- * ipath_rc_rcv - process an incoming RC packet
- * @dev: the device this packet came in on
- * @hdr: the header of this packet
- * @has_grh: true if the header has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- *
- * This is called from ipath_qp_rcv() to process an incoming RC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       u32 opcode;
-       u32 hdrsize;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       int diff;
-       struct ib_reth *reth;
-       int header_in_data;
-       unsigned long flags;
-
-       /* Validate the SLID. See Ch. 9.6.1.5 */
-       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-               goto done;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12;       /* LRH + BTH */
-               psn = be32_to_cpu(ohdr->bth[2]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
-               /*
-                * The header with GRH is 60 bytes and the core driver sets
-                * the eager header buffer size to 56 bytes so the last 4
-                * bytes of the BTH header (PSN) is in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       psn = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               } else
-                       psn = be32_to_cpu(ohdr->bth[2]);
-       }
-
-       /*
-        * Process responses (ACKs) before anything else.  Note that the
-        * packet sequence number will be for something in the send work
-        * queue rather than the expected receive packet sequence number.
-        * In other words, this QP is the requester.
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
-                                 hdrsize, pmtu, header_in_data);
-               goto done;
-       }
-
-       /* Compute 24 bits worth of difference. */
-       diff = ipath_cmp24(psn, qp->r_psn);
-       if (unlikely(diff)) {
-               if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
-                                      psn, diff, header_in_data))
-                       goto done;
-               goto send_ack;
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       default:
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       goto nack_inv;
-               /*
-                * Note that it is up to the requester to not send a new
-                * RDMA read or atomic operation before receiving an ACK
-                * for the previous operation.
-                */
-               break;
-       }
-
-       memset(&wc, 0, sizeof wc);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-       case OP(RDMA_WRITE_MIDDLE):
-       send_middle:
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto nack_inv;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto nack_inv;
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               /* consume RWQE */
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto send_last;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-       send_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST):
-       case OP(RDMA_WRITE_LAST):
-       send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto nack_inv;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto nack_inv;
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               qp->r_msn++;
-               if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-                       break;
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               else
-                       wc.opcode = IB_WC_RECV;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               wc.sl = qp->remote_ah_attr.sl;
-               /* Signal completion event if the solicited bit is set. */
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                              (ohdr->bth[0] &
-                               cpu_to_be32(1 << 23)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE)))
-                       goto nack_inv;
-               /* consume RWQE */
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               hdrsize += sizeof(*reth);
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(qp, &qp->r_sge,
-                                          qp->r_len, vaddr, rkey,
-                                          IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto nack_acc;
-               } else {
-                       qp->r_sge.sg_list = NULL;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_FIRST))
-                       goto send_middle;
-               else if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto send_last;
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(RDMA_READ_REQUEST): {
-               struct ipath_ack_entry *e;
-               u32 len;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_READ)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > IPATH_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               /* Double check we can process this while holding the s_lock. */
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-                       goto unlock;
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       ipath_update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               len = be32_to_cpu(reth->length);
-               if (len) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
-                                          rkey, IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto nack_acc_unlck;
-                       /*
-                        * Update the next expected PSN.  We add 1 later
-                        * below, so only add the remainder here.
-                        */
-                       if (len > pmtu)
-                               qp->r_psn += (len - 1) / pmtu;
-               } else {
-                       e->rdma_sge.sg_list = NULL;
-                       e->rdma_sge.num_sge = 0;
-                       e->rdma_sge.sge.mr = NULL;
-                       e->rdma_sge.sge.vaddr = NULL;
-                       e->rdma_sge.sge.length = 0;
-                       e->rdma_sge.sge.sge_length = 0;
-               }
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               /*
-                * We need to increment the MSN here instead of when we
-                * finish sending the result since a duplicate request would
-                * increment it more than once.
-                */
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-
-               goto unlock;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               struct ib_atomic_eth *ateth;
-               struct ipath_ack_entry *e;
-               u64 vaddr;
-               atomic64_t *maddr;
-               u64 sdata;
-               u32 rkey;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > IPATH_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               /* Double check we can process this while holding the s_lock. */
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-                       goto unlock;
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       ipath_update_ack_queue(qp, next);
-               }
-               if (!header_in_data)
-                       ateth = &ohdr->u.atomic_eth;
-               else
-                       ateth = (struct ib_atomic_eth *)data;
-               vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
-                       be32_to_cpu(ateth->vaddr[1]);
-               if (unlikely(vaddr & (sizeof(u64) - 1)))
-                       goto nack_inv_unlck;
-               rkey = be32_to_cpu(ateth->rkey);
-               /* Check rkey & NAK */
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
-                                           sizeof(u64), vaddr, rkey,
-                                           IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc_unlck;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-               sdata = be64_to_cpu(ateth->swap_data);
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
-                       (u64) atomic64_add_return(sdata, maddr) - sdata :
-                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-                                     be64_to_cpu(ateth->compare_data),
-                                     sdata);
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn & IPATH_PSN_MASK;
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-
-               goto unlock;
-       }
-
-       default:
-               /* NAK unknown opcodes. */
-               goto nack_inv;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       qp->r_ack_psn = psn;
-       qp->r_nak_state = 0;
-       /* Send an ACK if requested or required. */
-       if (psn & (1 << 31))
-               goto send_ack;
-       goto done;
-
-rnr_nak:
-       qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
-       qp->r_ack_psn = qp->r_psn;
-       goto send_ack;
-
-nack_inv_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_inv:
-       ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
-       qp->r_ack_psn = qp->r_psn;
-       goto send_ack;
-
-nack_acc_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_acc:
-       ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-send_ack:
-       send_rc_ack(qp);
-       goto done;
-
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h
deleted file mode 100644 (file)
index 8f44d0c..0000000
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_REGISTERS_H
-#define _IPATH_REGISTERS_H
-
-/*
- * This file should only be included by kernel source, and by the diags.  It
- * defines the registers, and their contents, for InfiniPath chips.
- */
-
-/*
- * These are the InfiniPath register and buffer bit definitions,
- * that are visible to software, and needed only by the kernel
- * and diag code.  A few, that are visible to protocol and user
- * code are in ipath_common.h.  Some bits are specific
- * to a given chip implementation, and have been moved to the
- * chip-specific source file
- */
-
-/* kr_revision bits */
-#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
-#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
-#define INFINIPATH_R_ARCH_MASK 0xFF
-#define INFINIPATH_R_ARCH_SHIFT 16
-#define INFINIPATH_R_SOFTWARE_MASK 0xFF
-#define INFINIPATH_R_SOFTWARE_SHIFT 24
-#define INFINIPATH_R_BOARDID_MASK 0xFF
-#define INFINIPATH_R_BOARDID_SHIFT 32
-
-/* kr_control bits */
-#define INFINIPATH_C_FREEZEMODE 0x00000002
-#define INFINIPATH_C_LINKENABLE 0x00000004
-
-/* kr_sendctrl bits */
-#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
-#define INFINIPATH_S_UPDTHRESH_SHIFT 24
-#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
-
-#define IPATH_S_ABORT          0
-#define IPATH_S_PIOINTBUFAVAIL 1
-#define IPATH_S_PIOBUFAVAILUPD 2
-#define IPATH_S_PIOENABLE      3
-#define IPATH_S_SDMAINTENABLE  9
-#define IPATH_S_SDMASINGLEDESCRIPTOR   10
-#define IPATH_S_SDMAENABLE     11
-#define IPATH_S_SDMAHALT       12
-#define IPATH_S_DISARM         31
-
-#define INFINIPATH_S_ABORT             (1U << IPATH_S_ABORT)
-#define INFINIPATH_S_PIOINTBUFAVAIL    (1U << IPATH_S_PIOINTBUFAVAIL)
-#define INFINIPATH_S_PIOBUFAVAILUPD    (1U << IPATH_S_PIOBUFAVAILUPD)
-#define INFINIPATH_S_PIOENABLE         (1U << IPATH_S_PIOENABLE)
-#define INFINIPATH_S_SDMAINTENABLE     (1U << IPATH_S_SDMAINTENABLE)
-#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
-                                       (1U << IPATH_S_SDMASINGLEDESCRIPTOR)
-#define INFINIPATH_S_SDMAENABLE                (1U << IPATH_S_SDMAENABLE)
-#define INFINIPATH_S_SDMAHALT          (1U << IPATH_S_SDMAHALT)
-#define INFINIPATH_S_DISARM            (1U << IPATH_S_DISARM)
-
-/* kr_rcvctrl bits that are the same on multiple chips */
-#define INFINIPATH_R_PORTENABLE_SHIFT 0
-#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_SDMAINT           0x8000000000000000ULL
-#define INFINIPATH_I_SDMADISABLED      0x4000000000000000ULL
-#define INFINIPATH_I_ERROR             0x0000000080000000ULL
-#define INFINIPATH_I_SPIOSENT          0x0000000040000000ULL
-#define INFINIPATH_I_SPIOBUFAVAIL      0x0000000020000000ULL
-#define INFINIPATH_I_GPIO              0x0000000010000000ULL
-#define INFINIPATH_I_JINT              0x0000000004000000ULL
-
-/* kr_errorstatus, kr_errorclear, kr_errormask bits */
-#define INFINIPATH_E_RFORMATERR                        0x0000000000000001ULL
-#define INFINIPATH_E_RVCRC                     0x0000000000000002ULL
-#define INFINIPATH_E_RICRC                     0x0000000000000004ULL
-#define INFINIPATH_E_RMINPKTLEN                        0x0000000000000008ULL
-#define INFINIPATH_E_RMAXPKTLEN                        0x0000000000000010ULL
-#define INFINIPATH_E_RLONGPKTLEN               0x0000000000000020ULL
-#define INFINIPATH_E_RSHORTPKTLEN              0x0000000000000040ULL
-#define INFINIPATH_E_RUNEXPCHAR                        0x0000000000000080ULL
-#define INFINIPATH_E_RUNSUPVL                  0x0000000000000100ULL
-#define INFINIPATH_E_REBP                      0x0000000000000200ULL
-#define INFINIPATH_E_RIBFLOW                   0x0000000000000400ULL
-#define INFINIPATH_E_RBADVERSION               0x0000000000000800ULL
-#define INFINIPATH_E_RRCVEGRFULL               0x0000000000001000ULL
-#define INFINIPATH_E_RRCVHDRFULL               0x0000000000002000ULL
-#define INFINIPATH_E_RBADTID                   0x0000000000004000ULL
-#define INFINIPATH_E_RHDRLEN                   0x0000000000008000ULL
-#define INFINIPATH_E_RHDR                      0x0000000000010000ULL
-#define INFINIPATH_E_RIBLOSTLINK               0x0000000000020000ULL
-#define INFINIPATH_E_SENDSPECIALTRIGGER                0x0000000008000000ULL
-#define INFINIPATH_E_SDMADISABLED              0x0000000010000000ULL
-#define INFINIPATH_E_SMINPKTLEN                        0x0000000020000000ULL
-#define INFINIPATH_E_SMAXPKTLEN                        0x0000000040000000ULL
-#define INFINIPATH_E_SUNDERRUN                 0x0000000080000000ULL
-#define INFINIPATH_E_SPKTLEN                   0x0000000100000000ULL
-#define INFINIPATH_E_SDROPPEDSMPPKT            0x0000000200000000ULL
-#define INFINIPATH_E_SDROPPEDDATAPKT           0x0000000400000000ULL
-#define INFINIPATH_E_SPIOARMLAUNCH             0x0000000800000000ULL
-#define INFINIPATH_E_SUNEXPERRPKTNUM           0x0000001000000000ULL
-#define INFINIPATH_E_SUNSUPVL                  0x0000002000000000ULL
-#define INFINIPATH_E_SENDBUFMISUSE             0x0000004000000000ULL
-#define INFINIPATH_E_SDMAGENMISMATCH           0x0000008000000000ULL
-#define INFINIPATH_E_SDMAOUTOFBOUND            0x0000010000000000ULL
-#define INFINIPATH_E_SDMATAILOUTOFBOUND                0x0000020000000000ULL
-#define INFINIPATH_E_SDMABASE                  0x0000040000000000ULL
-#define INFINIPATH_E_SDMA1STDESC               0x0000080000000000ULL
-#define INFINIPATH_E_SDMARPYTAG                        0x0000100000000000ULL
-#define INFINIPATH_E_SDMADWEN                  0x0000200000000000ULL
-#define INFINIPATH_E_SDMAMISSINGDW             0x0000400000000000ULL
-#define INFINIPATH_E_SDMAUNEXPDATA             0x0000800000000000ULL
-#define INFINIPATH_E_IBSTATUSCHANGED           0x0001000000000000ULL
-#define INFINIPATH_E_INVALIDADDR               0x0002000000000000ULL
-#define INFINIPATH_E_RESET                     0x0004000000000000ULL
-#define INFINIPATH_E_HARDWARE                  0x0008000000000000ULL
-#define INFINIPATH_E_SDMADESCADDRMISALIGN      0x0010000000000000ULL
-#define INFINIPATH_E_INVALIDEEPCMD             0x0020000000000000ULL
-
-/*
- * this is used to print "common" packet errors only when the
- * __IPATH_ERRPKTDBG bit is set in ipath_debug.
- */
-#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
-               | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
-               | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
-               | INFINIPATH_E_REBP )
-
-/* Convenience for decoding Send DMA errors */
-#define INFINIPATH_E_SDMAERRS ( \
-       INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
-       INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
-       INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
-       INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
-       INFINIPATH_E_SDMAUNEXPDATA | \
-       INFINIPATH_E_SDMADESCADDRMISALIGN | \
-       INFINIPATH_E_SDMADISABLED | \
-       INFINIPATH_E_SENDBUFMISUSE)
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
- * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
- *             bit 4: flag buffer, 5: datainfo, 6: header info */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF  0x1ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC  0x2ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
-/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
-/* waldo specific -- find the rest in ipath_6110.c */
-#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
-/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
-#define INFINIPATH_HWE_MEMBISTFAILED   0x0040000000000000ULL
-
-/* kr_hwdiagctrl bits */
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
-#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
-#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-
-/* kr_ibcctrl bits */
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
-#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
-/* cycle through TS1/TS2 till OK */
-#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
-/* wait for TS1, then go on */
-#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
-#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
-#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKCMD_DOWN 1         /* move to 0x11 */
-#define INFINIPATH_IBCC_LINKCMD_ARMED 2                /* move to 0x21 */
-#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3       /* move to 0x31 */
-#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
-#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
-#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
-#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
-#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
-#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
-#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
-
-/* kr_ibcstatus bits */
-#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
-#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
-
-#define INFINIPATH_IBCS_TXREADY       0x40000000
-#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
-/* link training states (shift by
-   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
-#define INFINIPATH_IBCS_LT_STATE_DISABLED      0x00
-#define INFINIPATH_IBCS_LT_STATE_LINKUP                0x01
-#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE    0x02
-#define INFINIPATH_IBCS_LT_STATE_POLLQUIET     0x03
-#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY    0x04
-#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET    0x05
-#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE   0x08
-#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG    0x09
-#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT    0x0a
-#define INFINIPATH_IBCS_LT_STATE_CFGIDLE       0x0b
-#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN        0x0c
-#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT        0x0e
-#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE   0x0f
-/* link state machine states (shift by ibcs_ls_shift) */
-#define INFINIPATH_IBCS_L_STATE_DOWN           0x0
-#define INFINIPATH_IBCS_L_STATE_INIT           0x1
-#define INFINIPATH_IBCS_L_STATE_ARM            0x2
-#define INFINIPATH_IBCS_L_STATE_ACTIVE         0x3
-#define INFINIPATH_IBCS_L_STATE_ACT_DEFER      0x4
-
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
-#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
-#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
-
-/* kr_extctrl bits */
-#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
-#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
-#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
-#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
-#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
-#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
-#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
-#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
-#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
-#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
-#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
-#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
-#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
-
-/* kr_partitionkey bits */
-#define INFINIPATH_PKEY_SIZE 16
-#define INFINIPATH_PKEY_MASK 0xFFFF
-#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
-
-/* kr_serdesconfig0 bits */
-#define INFINIPATH_SERDC0_RESET_MASK  0xfULL   /* overal reset bits */
-#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL    /* pll reset */
-/* tx idle enables (per lane) */
-#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
-/* rx detect enables (per lane) */
-#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
-/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */
-#define INFINIPATH_SERDC0_L1PWR_DN      0xF0ULL
-
-/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
-#define INFINIPATH_XGXS_RX_POL_SHIFT 19
-#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
-
-
-/*
- * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
- * PIO send buffers.  This is well beyond anything currently
- * defined in the InfiniBand spec.
- */
-#define IPATH_PIO_MAXIBHDR 128
-
-typedef u64 ipath_err_t;
-
-/* The following change with the type of device, so
- * need to be part of the ipath_devdata struct, or
- * we could have problems plugging in devices of
- * different types (e.g. one HT, one PCIE)
- * in one system, to be managed by one driver.
- * On the other hand, this file is may also be included
- * by other code, so leave the declarations here
- * temporarily. Minor footprint issue if common-model
- * linker used, none if C89+ linker used.
- */
-
-/* mask of defined bits for various registers */
-extern u64 infinipath_i_bitsextant;
-extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
-
-/* masks that are different in various chips, or only exist in some chips */
-extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
-
-/*
- * These are the infinipath general register numbers (not offsets).
- * The kernel registers are used directly, those beyond the kernel
- * registers are calculated from one of the base registers.  The use of
- * an integer type doesn't allow type-checking as thorough as, say,
- * an enum but allows for better hiding of chip differences.
- */
-typedef const u16 ipath_kreg,  /* infinipath general registers */
- ipath_creg,                   /* infinipath counter registers */
- ipath_sreg;                   /* kernel-only, infinipath send registers */
-
-/*
- * These are the chip registers common to all infinipath chips, and
- * used both by the kernel and the diagnostics or other user code.
- * They are all implemented such that 64 bit accesses work.
- * Some implement no more than 32 bits.  Because 64 bit reads
- * require 2 HT cmds on opteron, we access those with 32 bit
- * reads for efficiency (they are written as 64 bits, since
- * the extra 32 bits are nearly free on writes, and it slightly reduces
- * complexity).  The rest are all accessed as 64 bits.
- */
-struct ipath_kregs {
-       /* These are the 32 bit group */
-       ipath_kreg kr_control;
-       ipath_kreg kr_counterregbase;
-       ipath_kreg kr_intmask;
-       ipath_kreg kr_intstatus;
-       ipath_kreg kr_pagealign;
-       ipath_kreg kr_portcnt;
-       ipath_kreg kr_rcvtidbase;
-       ipath_kreg kr_rcvtidcnt;
-       ipath_kreg kr_rcvegrbase;
-       ipath_kreg kr_rcvegrcnt;
-       ipath_kreg kr_scratch;
-       ipath_kreg kr_sendctrl;
-       ipath_kreg kr_sendpiobufbase;
-       ipath_kreg kr_sendpiobufcnt;
-       ipath_kreg kr_sendpiosize;
-       ipath_kreg kr_sendregbase;
-       ipath_kreg kr_userregbase;
-       /* These are the 64 bit group */
-       ipath_kreg kr_debugport;
-       ipath_kreg kr_debugportselect;
-       ipath_kreg kr_errorclear;
-       ipath_kreg kr_errormask;
-       ipath_kreg kr_errorstatus;
-       ipath_kreg kr_extctrl;
-       ipath_kreg kr_extstatus;
-       ipath_kreg kr_gpio_clear;
-       ipath_kreg kr_gpio_mask;
-       ipath_kreg kr_gpio_out;
-       ipath_kreg kr_gpio_status;
-       ipath_kreg kr_hwdiagctrl;
-       ipath_kreg kr_hwerrclear;
-       ipath_kreg kr_hwerrmask;
-       ipath_kreg kr_hwerrstatus;
-       ipath_kreg kr_ibcctrl;
-       ipath_kreg kr_ibcstatus;
-       ipath_kreg kr_intblocked;
-       ipath_kreg kr_intclear;
-       ipath_kreg kr_interruptconfig;
-       ipath_kreg kr_mdio;
-       ipath_kreg kr_partitionkey;
-       ipath_kreg kr_rcvbthqp;
-       ipath_kreg kr_rcvbufbase;
-       ipath_kreg kr_rcvbufsize;
-       ipath_kreg kr_rcvctrl;
-       ipath_kreg kr_rcvhdrcnt;
-       ipath_kreg kr_rcvhdrentsize;
-       ipath_kreg kr_rcvhdrsize;
-       ipath_kreg kr_rcvintmembase;
-       ipath_kreg kr_rcvintmemsize;
-       ipath_kreg kr_revision;
-       ipath_kreg kr_sendbuffererror;
-       ipath_kreg kr_sendpioavailaddr;
-       ipath_kreg kr_serdesconfig0;
-       ipath_kreg kr_serdesconfig1;
-       ipath_kreg kr_serdesstatus;
-       ipath_kreg kr_txintmembase;
-       ipath_kreg kr_txintmemsize;
-       ipath_kreg kr_xgxsconfig;
-       ipath_kreg kr_ibpllcfg;
-       /* use these two (and the following N ports) only with
-        * ipath_k*_kreg64_port(); not *kreg64() */
-       ipath_kreg kr_rcvhdraddr;
-       ipath_kreg kr_rcvhdrtailaddr;
-
-       /* remaining registers are not present on all types of infinipath
-          chips  */
-       ipath_kreg kr_rcvpktledcnt;
-       ipath_kreg kr_pcierbuftestreg0;
-       ipath_kreg kr_pcierbuftestreg1;
-       ipath_kreg kr_pcieq0serdesconfig0;
-       ipath_kreg kr_pcieq0serdesconfig1;
-       ipath_kreg kr_pcieq0serdesstatus;
-       ipath_kreg kr_pcieq1serdesconfig0;
-       ipath_kreg kr_pcieq1serdesconfig1;
-       ipath_kreg kr_pcieq1serdesstatus;
-       ipath_kreg kr_hrtbt_guid;
-       ipath_kreg kr_ibcddrctrl;
-       ipath_kreg kr_ibcddrstatus;
-       ipath_kreg kr_jintreload;
-
-       /* send dma related regs */
-       ipath_kreg kr_senddmabase;
-       ipath_kreg kr_senddmalengen;
-       ipath_kreg kr_senddmatail;
-       ipath_kreg kr_senddmahead;
-       ipath_kreg kr_senddmaheadaddr;
-       ipath_kreg kr_senddmabufmask0;
-       ipath_kreg kr_senddmabufmask1;
-       ipath_kreg kr_senddmabufmask2;
-       ipath_kreg kr_senddmastatus;
-
-       /* SerDes related regs (IBA7220-only) */
-       ipath_kreg kr_ibserdesctrl;
-       ipath_kreg kr_ib_epbacc;
-       ipath_kreg kr_ib_epbtrans;
-       ipath_kreg kr_pcie_epbacc;
-       ipath_kreg kr_pcie_epbtrans;
-       ipath_kreg kr_ib_ddsrxeq;
-};
-
-struct ipath_cregs {
-       ipath_creg cr_badformatcnt;
-       ipath_creg cr_erricrccnt;
-       ipath_creg cr_errlinkcnt;
-       ipath_creg cr_errlpcrccnt;
-       ipath_creg cr_errpkey;
-       ipath_creg cr_errrcvflowctrlcnt;
-       ipath_creg cr_err_rlencnt;
-       ipath_creg cr_errslencnt;
-       ipath_creg cr_errtidfull;
-       ipath_creg cr_errtidvalid;
-       ipath_creg cr_errvcrccnt;
-       ipath_creg cr_ibstatuschange;
-       ipath_creg cr_intcnt;
-       ipath_creg cr_invalidrlencnt;
-       ipath_creg cr_invalidslencnt;
-       ipath_creg cr_lbflowstallcnt;
-       ipath_creg cr_iblinkdowncnt;
-       ipath_creg cr_iblinkerrrecovcnt;
-       ipath_creg cr_ibsymbolerrcnt;
-       ipath_creg cr_pktrcvcnt;
-       ipath_creg cr_pktrcvflowctrlcnt;
-       ipath_creg cr_pktsendcnt;
-       ipath_creg cr_pktsendflowcnt;
-       ipath_creg cr_portovflcnt;
-       ipath_creg cr_rcvebpcnt;
-       ipath_creg cr_rcvovflcnt;
-       ipath_creg cr_rxdroppktcnt;
-       ipath_creg cr_senddropped;
-       ipath_creg cr_sendstallcnt;
-       ipath_creg cr_sendunderruncnt;
-       ipath_creg cr_unsupvlcnt;
-       ipath_creg cr_wordrcvcnt;
-       ipath_creg cr_wordsendcnt;
-       ipath_creg cr_vl15droppedpktcnt;
-       ipath_creg cr_rxotherlocalphyerrcnt;
-       ipath_creg cr_excessbufferovflcnt;
-       ipath_creg cr_locallinkintegrityerrcnt;
-       ipath_creg cr_rxvlerrcnt;
-       ipath_creg cr_rxdlidfltrcnt;
-       ipath_creg cr_psstat;
-       ipath_creg cr_psstart;
-       ipath_creg cr_psinterval;
-       ipath_creg cr_psrcvdatacount;
-       ipath_creg cr_psrcvpktscount;
-       ipath_creg cr_psxmitdatacount;
-       ipath_creg cr_psxmitpktscount;
-       ipath_creg cr_psxmitwaitcount;
-};
-
-#endif                         /* _IPATH_REGISTERS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c
deleted file mode 100644 (file)
index e541a01..0000000
+++ /dev/null
@@ -1,733 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/spinlock.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/*
- * Convert the AETH RNR timeout code into the number of milliseconds.
- */
-const u32 ib_ipath_rnr_table[32] = {
-       656,                    /* 0 */
-       1,                      /* 1 */
-       1,                      /* 2 */
-       1,                      /* 3 */
-       1,                      /* 4 */
-       1,                      /* 5 */
-       1,                      /* 6 */
-       1,                      /* 7 */
-       1,                      /* 8 */
-       1,                      /* 9 */
-       1,                      /* A */
-       1,                      /* B */
-       1,                      /* C */
-       1,                      /* D */
-       2,                      /* E */
-       2,                      /* F */
-       3,                      /* 10 */
-       4,                      /* 11 */
-       6,                      /* 12 */
-       8,                      /* 13 */
-       11,                     /* 14 */
-       16,                     /* 15 */
-       21,                     /* 16 */
-       31,                     /* 17 */
-       41,                     /* 18 */
-       62,                     /* 19 */
-       82,                     /* 1A */
-       123,                    /* 1B */
-       164,                    /* 1C */
-       246,                    /* 1D */
-       328,                    /* 1E */
-       492                     /* 1F */
-};
-
-/**
- * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
- * @qp: the QP
- *
- * Called with the QP s_lock held and interrupts disabled.
- * XXX Use a simple list for now.  We might need a priority
- * queue if we have lots of QPs waiting for RNR timeouts
- * but that should be rare.
- */
-void ipath_insert_rnr_queue(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-
-       /* We already did a spin_lock_irqsave(), so just use spin_lock */
-       spin_lock(&dev->pending_lock);
-       if (list_empty(&dev->rnrwait))
-               list_add(&qp->timerwait, &dev->rnrwait);
-       else {
-               struct list_head *l = &dev->rnrwait;
-               struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
-                                                 timerwait);
-
-               while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
-                       qp->s_rnr_timeout -= nqp->s_rnr_timeout;
-                       l = l->next;
-                       if (l->next == &dev->rnrwait) {
-                               nqp = NULL;
-                               break;
-                       }
-                       nqp = list_entry(l->next, struct ipath_qp,
-                                        timerwait);
-               }
-               if (nqp)
-                       nqp->s_rnr_timeout -= qp->s_rnr_timeout;
-               list_add(&qp->timerwait, l);
-       }
-       spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_init_sge - Validate a RWQE and fill in the SGE state
- * @qp: the QP
- *
- * Return 1 if OK.
- */
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-                  u32 *lengthp, struct ipath_sge_state *ss)
-{
-       int i, j, ret;
-       struct ib_wc wc;
-
-       *lengthp = 0;
-       for (i = j = 0; i < wqe->num_sge; i++) {
-               if (wqe->sg_list[i].length == 0)
-                       continue;
-               /* Check LKEY */
-               if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
-                                  &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-                       goto bad_lkey;
-               *lengthp += wqe->sg_list[i].length;
-               j++;
-       }
-       ss->num_sge = j;
-       ret = 1;
-       goto bail;
-
-bad_lkey:
-       memset(&wc, 0, sizeof(wc));
-       wc.wr_id = wqe->wr_id;
-       wc.status = IB_WC_LOC_PROT_ERR;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       /* Signal solicited completion event. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
- * @qp: the QP
- * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
- *
- * Return 0 if no RWQE is available, otherwise return 1.
- *
- * Can be called from interrupt level.
- */
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
-{
-       unsigned long flags;
-       struct ipath_rq *rq;
-       struct ipath_rwq *wq;
-       struct ipath_srq *srq;
-       struct ipath_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       u32 tail;
-       int ret;
-
-       if (qp->ibqp.srq) {
-               srq = to_isrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       spin_lock_irqsave(&rq->lock, flags);
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       do {
-               if (unlikely(tail == wq->head)) {
-                       ret = 0;
-                       goto unlock;
-               }
-               /* Make sure entry is read after head index is read. */
-               smp_rmb();
-               wqe = get_rwqe_ptr(rq, tail);
-               if (++tail >= rq->size)
-                       tail = 0;
-               if (wr_id_only)
-                       break;
-               qp->r_sge.sg_list = qp->r_sg_list;
-       } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
-       qp->r_wr_id = wqe->wr_id;
-       wq->tail = tail;
-
-       ret = 1;
-       set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
-       if (handler) {
-               u32 n;
-
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-                       goto bail;
-               }
-       }
-unlock:
-       spin_unlock_irqrestore(&rq->lock, flags);
-bail:
-       return ret;
-}
-
-/**
- * ipath_ruc_loopback - handle UC and RC lookback requests
- * @sqp: the sending QP
- *
- * This is called from ipath_do_send() to
- * forward a WQE addressed to the same HCA.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ipath_ruc_loopback(struct ipath_qp *sqp)
-{
-       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-       struct ipath_qp *qp;
-       struct ipath_swqe *wqe;
-       struct ipath_sge *sge;
-       unsigned long flags;
-       struct ib_wc wc;
-       u64 sdata;
-       atomic64_t *maddr;
-       enum ib_wc_status send_status;
-
-       /*
-        * Note that we check the responder QP state after
-        * checking the requester's state.
-        */
-       qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-
-       spin_lock_irqsave(&sqp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-           !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-               goto unlock;
-
-       sqp->s_flags |= IPATH_S_BUSY;
-
-again:
-       if (sqp->s_last == sqp->s_head)
-               goto clr_busy;
-       wqe = get_swqe_ptr(sqp, sqp->s_last);
-
-       /* Return if it is not OK to start a new work reqeust. */
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
-                       goto clr_busy;
-               /* We are in the error state, flush the work request. */
-               send_status = IB_WC_WR_FLUSH_ERR;
-               goto flush_send;
-       }
-
-       /*
-        * We can rely on the entry not changing without the s_lock
-        * being held until we update s_last.
-        * We increment s_cur to indicate s_last is in progress.
-        */
-       if (sqp->s_last == sqp->s_cur) {
-               if (++sqp->s_cur >= sqp->s_size)
-                       sqp->s_cur = 0;
-       }
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               /*
-                * For RC, the requester would timeout and retry so
-                * shortcut the timeouts and just signal too many retries.
-                */
-               if (sqp->ibqp.qp_type == IB_QPT_RC)
-                       send_status = IB_WC_RETRY_EXC_ERR;
-               else
-                       send_status = IB_WC_SUCCESS;
-               goto serr;
-       }
-
-       memset(&wc, 0, sizeof wc);
-       send_status = IB_WC_SUCCESS;
-
-       sqp->s_sge.sge = wqe->sg_list[0];
-       sqp->s_sge.sg_list = wqe->sg_list + 1;
-       sqp->s_sge.num_sge = wqe->wr.num_sge;
-       sqp->s_len = wqe->length;
-       switch (wqe->wr.opcode) {
-       case IB_WR_SEND_WITH_IMM:
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               /* FALLTHROUGH */
-       case IB_WR_SEND:
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               break;
-
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               /* FALLTHROUGH */
-       case IB_WR_RDMA_WRITE:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               if (wqe->length == 0)
-                       break;
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
-                                           wqe->rdma_wr.remote_addr,
-                                           wqe->rdma_wr.rkey,
-                                           IB_ACCESS_REMOTE_WRITE)))
-                       goto acc_err;
-               break;
-
-       case IB_WR_RDMA_READ:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto inv_err;
-               if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
-                                           wqe->rdma_wr.remote_addr,
-                                           wqe->rdma_wr.rkey,
-                                           IB_ACCESS_REMOTE_READ)))
-                       goto acc_err;
-               qp->r_sge.sge = wqe->sg_list[0];
-               qp->r_sge.sg_list = wqe->sg_list + 1;
-               qp->r_sge.num_sge = wqe->wr.num_sge;
-               break;
-
-       case IB_WR_ATOMIC_CMP_AND_SWP:
-       case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto inv_err;
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
-                                           wqe->atomic_wr.remote_addr,
-                                           wqe->atomic_wr.rkey,
-                                           IB_ACCESS_REMOTE_ATOMIC)))
-                       goto acc_err;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-               sdata = wqe->atomic_wr.compare_add;
-               *(u64 *) sqp->s_sge.sge.vaddr =
-                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-                       (u64) atomic64_add_return(sdata, maddr) - sdata :
-                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-                                     sdata, wqe->atomic_wr.swap);
-               goto send_comp;
-
-       default:
-               send_status = IB_WC_LOC_QP_OP_ERR;
-               goto serr;
-       }
-
-       sge = &sqp->s_sge.sge;
-       while (sqp->s_len) {
-               u32 len = sqp->s_len;
-
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--sqp->s_sge.num_sge)
-                               *sge = *sqp->s_sge.sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               sqp->s_len -= len;
-       }
-
-       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-               goto send_comp;
-
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-       else
-               wc.opcode = IB_WC_RECV;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.byte_len = wqe->length;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = qp->remote_qpn;
-       wc.slid = qp->remote_ah_attr.dlid;
-       wc.sl = qp->remote_ah_attr.sl;
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-flush_send:
-       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-       ipath_send_complete(sqp, wqe, send_status);
-       goto again;
-
-rnr_nak:
-       /* Handle RNR NAK */
-       if (qp->ibqp.qp_type == IB_QPT_UC)
-               goto send_comp;
-       /*
-        * Note: we don't need the s_lock held since the BUSY flag
-        * makes this single threaded.
-        */
-       if (sqp->s_rnr_retry == 0) {
-               send_status = IB_WC_RNR_RETRY_EXC_ERR;
-               goto serr;
-       }
-       if (sqp->s_rnr_retry_cnt < 7)
-               sqp->s_rnr_retry--;
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
-               goto clr_busy;
-       sqp->s_flags |= IPATH_S_WAITING;
-       dev->n_rnr_naks++;
-       sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
-       ipath_insert_rnr_queue(sqp);
-       goto clr_busy;
-
-inv_err:
-       send_status = IB_WC_REM_INV_REQ_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-acc_err:
-       send_status = IB_WC_REM_ACCESS_ERR;
-       wc.status = IB_WC_LOC_PROT_ERR;
-err:
-       /* responder goes to error state */
-       ipath_rc_error(qp, wc.status);
-
-serr:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       ipath_send_complete(sqp, wqe, send_status);
-       if (sqp->ibqp.qp_type == IB_QPT_RC) {
-               int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-               sqp->s_flags &= ~IPATH_S_BUSY;
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               if (lastwqe) {
-                       struct ib_event ev;
-
-                       ev.device = sqp->ibqp.device;
-                       ev.element.qp = &sqp->ibqp;
-                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-               }
-               goto done;
-       }
-clr_busy:
-       sqp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-       if (qp && atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
-{
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
-           qp->ibqp.qp_type == IB_QPT_SMI) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-}
-
-/**
- * ipath_no_bufs_available - tell the layer driver we need buffers
- * @qp: the QP that caused the problem
- * @dev: the device we ran out of buffers on
- *
- * Called when we run out of PIO buffers.
- * If we are now in the error state, return zero to flush the
- * send work request.
- */
-static int ipath_no_bufs_available(struct ipath_qp *qp,
-                                   struct ipath_ibdev *dev)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       /*
-        * Note that as soon as want_buffer() is called and
-        * possibly before it returns, ipath_ib_piobufavail()
-        * could be called. Therefore, put QP on the piowait list before
-        * enabling the PIO avail interrupt.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-               dev->n_piowait++;
-               qp->s_flags |= IPATH_S_WAITING;
-               qp->s_flags &= ~IPATH_S_BUSY;
-               spin_lock(&dev->pending_lock);
-               if (list_empty(&qp->piowait))
-                       list_add_tail(&qp->piowait, &dev->piowait);
-               spin_unlock(&dev->pending_lock);
-       } else
-               ret = 0;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (ret)
-               want_buffer(dev->dd, qp);
-       return ret;
-}
-
-/**
- * ipath_make_grh - construct a GRH header
- * @dev: a pointer to the ipath device
- * @hdr: a pointer to the GRH header being constructed
- * @grh: the global route address to send to
- * @hwords: the number of 32 bit words of header being sent
- * @nwords: the number of 32 bit words of data being sent
- *
- * Return the size of the header in 32 bit words.
- */
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-                  struct ib_global_route *grh, u32 hwords, u32 nwords)
-{
-       hdr->version_tclass_flow =
-               cpu_to_be32((6 << 28) |
-                           (grh->traffic_class << 20) |
-                           grh->flow_label);
-       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
-       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
-       hdr->next_hdr = 0x1B;
-       hdr->hop_limit = grh->hop_limit;
-       /* The SGID is 32-bit aligned. */
-       hdr->sgid.global.subnet_prefix = dev->gid_prefix;
-       hdr->sgid.global.interface_id = dev->dd->ipath_guid;
-       hdr->dgid = grh->dgid;
-
-       /* GRH header size in 32-bit words. */
-       return sizeof(struct ib_grh) / sizeof(u32);
-}
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                          struct ipath_other_headers *ohdr,
-                          u32 bth0, u32 bth2)
-{
-       u16 lrh0;
-       u32 nwords;
-       u32 extra_bytes;
-
-       /* Construct the header. */
-       extra_bytes = -qp->s_cur_size & 3;
-       nwords = (qp->s_cur_size + extra_bytes) >> 2;
-       lrh0 = IPATH_LRH_BTH;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-                                                &qp->remote_ah_attr.grh,
-                                                qp->s_hdrwords, nwords);
-               lrh0 = IPATH_LRH_GRH;
-       }
-       lrh0 |= qp->remote_ah_attr.sl << 4;
-       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
-                                      qp->remote_ah_attr.src_path_bits);
-       bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
-       bth0 |= extra_bytes << 20;
-       ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(bth2);
-}
-
-/**
- * ipath_do_send - perform a send on a QP
- * @data: contains a pointer to the QP
- *
- * Process entries in the send work queue until credit or queue is
- * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, two threads could send packets out of order.
- */
-void ipath_do_send(unsigned long data)
-{
-       struct ipath_qp *qp = (struct ipath_qp *)data;
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       int (*make_req)(struct ipath_qp *qp);
-       unsigned long flags;
-
-       if ((qp->ibqp.qp_type == IB_QPT_RC ||
-            qp->ibqp.qp_type == IB_QPT_UC) &&
-           qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
-               ipath_ruc_loopback(qp);
-               goto bail;
-       }
-
-       if (qp->ibqp.qp_type == IB_QPT_RC)
-              make_req = ipath_make_rc_req;
-       else if (qp->ibqp.qp_type == IB_QPT_UC)
-              make_req = ipath_make_uc_req;
-       else
-              make_req = ipath_make_ud_req;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-           !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               goto bail;
-       }
-
-       qp->s_flags |= IPATH_S_BUSY;
-
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-again:
-       /* Check for a constructed packet to be sent. */
-       if (qp->s_hdrwords != 0) {
-               /*
-                * If no PIO bufs are available, return.  An interrupt will
-                * call ipath_ib_piobufavail() when one is available.
-                */
-               if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
-                                    qp->s_cur_sge, qp->s_cur_size)) {
-                       if (ipath_no_bufs_available(qp, dev))
-                               goto bail;
-               }
-               dev->n_unicast_xmit++;
-               /* Record that we sent the packet and s_hdr is empty. */
-               qp->s_hdrwords = 0;
-       }
-
-       if (make_req(qp))
-               goto again;
-
-bail:;
-}
-
-/*
- * This should be called with s_lock held.
- */
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-                        enum ib_wc_status status)
-{
-       u32 old_last, last;
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       /* See ch. 11.2.4.1 and 10.7.3.1 */
-       if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-           status != IB_WC_SUCCESS) {
-               struct ib_wc wc;
-
-               memset(&wc, 0, sizeof wc);
-               wc.wr_id = wqe->wr.wr_id;
-               wc.status = status;
-               wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-               wc.qp = &qp->ibqp;
-               if (status == IB_WC_SUCCESS)
-                       wc.byte_len = wqe->length;
-               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
-                              status != IB_WC_SUCCESS);
-       }
-
-       old_last = last = qp->s_last;
-       if (++last >= qp->s_size)
-               last = 0;
-       qp->s_last = last;
-       if (qp->s_cur == old_last)
-               qp->s_cur = last;
-       if (qp->s_tail == old_last)
-               qp->s_tail = last;
-       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-               qp->s_draining = 0;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c
deleted file mode 100644 (file)
index 1ffc06a..0000000
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
-
-static void vl15_watchdog_enq(struct ipath_devdata *dd)
-{
-       /* ipath_sdma_lock must already be held */
-       if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
-               unsigned long interval = (HZ + 19) / 20;
-               dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
-               add_timer(&dd->ipath_sdma_vl15_timer);
-       }
-}
-
-static void vl15_watchdog_deq(struct ipath_devdata *dd)
-{
-       /* ipath_sdma_lock must already be held */
-       if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
-               unsigned long interval = (HZ + 19) / 20;
-               mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
-       } else {
-               del_timer(&dd->ipath_sdma_vl15_timer);
-       }
-}
-
-static void vl15_watchdog_timeout(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
-               ipath_dbg("vl15 watchdog timeout - clearing\n");
-               ipath_cancel_sends(dd, 1);
-               ipath_hol_down(dd);
-       } else {
-               ipath_dbg("vl15 watchdog timeout - "
-                         "condition already cleared\n");
-       }
-}
-
-static void unmap_desc(struct ipath_devdata *dd, unsigned head)
-{
-       __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
-       u64 desc[2];
-       dma_addr_t addr;
-       size_t len;
-
-       desc[0] = le64_to_cpu(descqp[0]);
-       desc[1] = le64_to_cpu(descqp[1]);
-
-       addr = (desc[1] << 32) | (desc[0] >> 32);
-       len = (desc[0] >> 14) & (0x7ffULL << 2);
-       dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
-}
-
-/*
- * ipath_sdma_lock should be locked before calling this.
- */
-int ipath_sdma_make_progress(struct ipath_devdata *dd)
-{
-       struct list_head *lp = NULL;
-       struct ipath_sdma_txreq *txp = NULL;
-       u16 dmahead;
-       u16 start_idx = 0;
-       int progress = 0;
-
-       if (!list_empty(&dd->ipath_sdma_activelist)) {
-               lp = dd->ipath_sdma_activelist.next;
-               txp = list_entry(lp, struct ipath_sdma_txreq, list);
-               start_idx = txp->start_idx;
-       }
-
-       /*
-        * Read the SDMA head register in order to know that the
-        * interrupt clear has been written to the chip.
-        * Otherwise, we may not get an interrupt for the last
-        * descriptor in the queue.
-        */
-       dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
-       /* sanity check return value for error handling (chip reset, etc.) */
-       if (dmahead >= dd->ipath_sdma_descq_cnt)
-               goto done;
-
-       while (dd->ipath_sdma_descq_head != dmahead) {
-               if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
-                   dd->ipath_sdma_descq_head == start_idx) {
-                       unmap_desc(dd, dd->ipath_sdma_descq_head);
-                       start_idx++;
-                       if (start_idx == dd->ipath_sdma_descq_cnt)
-                               start_idx = 0;
-               }
-
-               /* increment free count and head */
-               dd->ipath_sdma_descq_removed++;
-               if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
-                       dd->ipath_sdma_descq_head = 0;
-
-               if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
-                       /* move to notify list */
-                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                               vl15_watchdog_deq(dd);
-                       list_move_tail(lp, &dd->ipath_sdma_notifylist);
-                       if (!list_empty(&dd->ipath_sdma_activelist)) {
-                               lp = dd->ipath_sdma_activelist.next;
-                               txp = list_entry(lp, struct ipath_sdma_txreq,
-                                                list);
-                               start_idx = txp->start_idx;
-                       } else {
-                               lp = NULL;
-                               txp = NULL;
-                       }
-               }
-               progress = 1;
-       }
-
-       if (progress)
-               tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-done:
-       return progress;
-}
-
-static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
-{
-       struct ipath_sdma_txreq *txp, *txp_next;
-
-       list_for_each_entry_safe(txp, txp_next, list, list) {
-               list_del_init(&txp->list);
-
-               if (txp->callback)
-                       (*txp->callback)(txp->callback_cookie,
-                                        txp->callback_status);
-       }
-}
-
-static void sdma_notify_taskbody(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       struct list_head list;
-
-       INIT_LIST_HEAD(&list);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       list_splice_init(&dd->ipath_sdma_notifylist, &list);
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       ipath_sdma_notify(dd, &list);
-
-       /*
-        * The IB verbs layer needs to see the callback before getting
-        * the call to ipath_ib_piobufavail() because the callback
-        * handles releasing resources the next send will need.
-        * Otherwise, we could do these calls in
-        * ipath_sdma_make_progress().
-        */
-       ipath_ib_piobufavail(dd->verbs_dev);
-}
-
-static void sdma_notify_task(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               sdma_notify_taskbody(dd);
-}
-
-static void dump_sdma_state(struct ipath_devdata *dd)
-{
-       unsigned long reg;
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
-       ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
-       ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-       ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-       ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
-}
-
-static void sdma_abort_task(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-       u64 status;
-       unsigned long flags;
-
-       if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               return;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
-
-       /* nothing to do */
-       if (status == IPATH_SDMA_ABORT_NONE)
-               goto unlock;
-
-       /* ipath_sdma_abort() is done, waiting for interrupt */
-       if (status == IPATH_SDMA_ABORT_DISARMED) {
-               if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
-                       goto resched_noprint;
-               /* give up, intr got lost somewhere */
-               ipath_dbg("give up waiting for SDMADISABLED intr\n");
-               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               status = IPATH_SDMA_ABORT_ABORTED;
-       }
-
-       /* everything is stopped, time to clean up and restart */
-       if (status == IPATH_SDMA_ABORT_ABORTED) {
-               struct ipath_sdma_txreq *txp, *txpnext;
-               u64 hwstatus;
-               int notify = 0;
-
-               hwstatus = ipath_read_kreg64(dd,
-                               dd->ipath_kregs->kr_senddmastatus);
-
-               if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
-                                IPATH_SDMA_STATUS_ABORT_IN_PROG             |
-                                IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
-                   !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
-                       if (dd->ipath_sdma_reset_wait > 0) {
-                               /* not done shutting down sdma */
-                               --dd->ipath_sdma_reset_wait;
-                               goto resched;
-                       }
-                       ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
-                               "status after SDMA reset, continuing\n");
-                       dump_sdma_state(dd);
-               }
-
-               /* dequeue all "sent" requests */
-               list_for_each_entry_safe(txp, txpnext,
-                                        &dd->ipath_sdma_activelist, list) {
-                       txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
-                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                               vl15_watchdog_deq(dd);
-                       list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-                       notify = 1;
-               }
-               if (notify)
-                       tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-               /* reset our notion of head and tail */
-               dd->ipath_sdma_descq_tail = 0;
-               dd->ipath_sdma_descq_head = 0;
-               dd->ipath_sdma_head_dma[0] = 0;
-               dd->ipath_sdma_generation = 0;
-               dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
-
-               /* Reset SendDmaLenGen */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
-                       (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
-
-               /* done with sdma state for a bit */
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-               /*
-                * Don't restart sdma here (with the exception
-                * below). Wait until link is up to ACTIVE.  VL15 MADs
-                * used to bring the link up use PIO, and multiple link
-                * transitions otherwise cause the sdma engine to be
-                * stopped and started multiple times.
-                * The disable is done here, including the shadow,
-                * so the state is kept consistent.
-                * See ipath_restart_sdma() for the actual starting
-                * of sdma.
-                */
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-               /* make sure I see next message */
-               dd->ipath_sdma_abort_jiffies = 0;
-
-               /*
-                * Not everything that takes SDMA offline is a link
-                * status change.  If the link was up, restart SDMA.
-                */
-               if (dd->ipath_flags & IPATH_LINKACTIVE)
-                       ipath_restart_sdma(dd);
-
-               goto done;
-       }
-
-resched:
-       /*
-        * for now, keep spinning
-        * JAG - this is bad to just have default be a loop without
-        * state change
-        */
-       if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
-               ipath_dbg("looping with status 0x%08lx\n",
-                         dd->ipath_sdma_status);
-               dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
-       }
-resched_noprint:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-       return;
-
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-done:
-       return;
-}
-
-/*
- * This is called from interrupt context.
- */
-void ipath_sdma_intr(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       (void) ipath_sdma_make_progress(dd);
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-}
-
-static int alloc_sdma(struct ipath_devdata *dd)
-{
-       int ret = 0;
-
-       /* Allocate memory for SendDMA descriptor FIFO */
-       dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
-               SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
-
-       if (!dd->ipath_sdma_descq) {
-               ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
-                       "FIFO memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       dd->ipath_sdma_descq_cnt =
-               SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
-
-       /* Allocate memory for DMA of head register to memory */
-       dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
-               PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
-       if (!dd->ipath_sdma_head_dma) {
-               ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
-               ret = -ENOMEM;
-               goto cleanup_descq;
-       }
-       dd->ipath_sdma_head_dma[0] = 0;
-
-       setup_timer(&dd->ipath_sdma_vl15_timer, vl15_watchdog_timeout,
-                       (unsigned long)dd);
-
-       atomic_set(&dd->ipath_sdma_vl15_count, 0);
-
-       goto done;
-
-cleanup_descq:
-       dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-               (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
-       dd->ipath_sdma_descq = NULL;
-       dd->ipath_sdma_descq_phys = 0;
-done:
-       return ret;
-}
-
-int setup_sdma(struct ipath_devdata *dd)
-{
-       int ret = 0;
-       unsigned i, n;
-       u64 tmp64;
-       u64 senddmabufmask[3] = { 0 };
-       unsigned long flags;
-
-       ret = alloc_sdma(dd);
-       if (ret)
-               goto done;
-
-       if (!dd->ipath_sdma_descq) {
-               ipath_dev_err(dd, "SendDMA memory not allocated\n");
-               goto done;
-       }
-
-       /*
-        * Set initial status as if we had been up, then gone down.
-        * This lets initial start on transition to ACTIVE be the
-        * same as restart after link flap.
-        */
-       dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
-       dd->ipath_sdma_abort_jiffies = 0;
-       dd->ipath_sdma_generation = 0;
-       dd->ipath_sdma_descq_tail = 0;
-       dd->ipath_sdma_descq_head = 0;
-       dd->ipath_sdma_descq_removed = 0;
-       dd->ipath_sdma_descq_added = 0;
-
-       /* Set SendDmaBase */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
-                        dd->ipath_sdma_descq_phys);
-       /* Set SendDmaLenGen */
-       tmp64 = dd->ipath_sdma_descq_cnt;
-       tmp64 |= 1<<18; /* enable generation checking */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
-       /* Set SendDmaTail */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
-                        dd->ipath_sdma_descq_tail);
-       /* Set SendDmaHeadAddr */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
-                        dd->ipath_sdma_head_phys);
-
-       /*
-        * Reserve all the former "kernel" piobufs, using high number range
-        * so we get as many 4K buffers as possible
-        */
-       n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
-       ipath_chg_pioavailkernel(dd, i, n - i , 0);
-       for (; i < n; ++i) {
-               unsigned word = i / 64;
-               unsigned bit = i & 63;
-               BUG_ON(word >= 3);
-               senddmabufmask[word] |= 1ULL << bit;
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
-                        senddmabufmask[0]);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
-                        senddmabufmask[1]);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
-                        senddmabufmask[2]);
-
-       INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
-       INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
-
-       tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
-                    (unsigned long) dd);
-       tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
-                    (unsigned long) dd);
-
-       /*
-        * No use to turn on SDMA here, as link is probably not ACTIVE
-        * Just mark it RUNNING and enable the interrupt, and let the
-        * ipath_restart_sdma() on link transition to ACTIVE actually
-        * enable it.
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-done:
-       return ret;
-}
-
-void teardown_sdma(struct ipath_devdata *dd)
-{
-       struct ipath_sdma_txreq *txp, *txpnext;
-       unsigned long flags;
-       dma_addr_t sdma_head_phys = 0;
-       dma_addr_t sdma_descq_phys = 0;
-       void *sdma_descq = NULL;
-       void *sdma_head_dma = NULL;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-       __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       tasklet_kill(&dd->ipath_sdma_abort_task);
-       tasklet_kill(&dd->ipath_sdma_notify_task);
-
-       /* turn off sdma */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-               dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       /* dequeue all "sent" requests */
-       list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
-                                list) {
-               txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
-               if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                       vl15_watchdog_deq(dd);
-               list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       sdma_notify_taskbody(dd);
-
-       del_timer_sync(&dd->ipath_sdma_vl15_timer);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       dd->ipath_sdma_abort_jiffies = 0;
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
-
-       if (dd->ipath_sdma_head_dma) {
-               sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
-               sdma_head_phys = dd->ipath_sdma_head_phys;
-               dd->ipath_sdma_head_dma = NULL;
-               dd->ipath_sdma_head_phys = 0;
-       }
-
-       if (dd->ipath_sdma_descq) {
-               sdma_descq = dd->ipath_sdma_descq;
-               sdma_descq_phys = dd->ipath_sdma_descq_phys;
-               dd->ipath_sdma_descq = NULL;
-               dd->ipath_sdma_descq_phys = 0;
-       }
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       if (sdma_head_dma)
-               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                 sdma_head_dma, sdma_head_phys);
-
-       if (sdma_descq)
-               dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-                                 sdma_descq, sdma_descq_phys);
-}
-
-/*
- * [Re]start SDMA, if we use it, and it's not already OK.
- * This is called on transition to link ACTIVE, either the first or
- * subsequent times.
- */
-void ipath_restart_sdma(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int needed = 1;
-
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-               goto bail;
-
-       /*
-        * First, make sure we should, which is to say,
-        * check that we are "RUNNING" (not in teardown)
-        * and not "SHUTDOWN"
-        */
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
-               || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       needed = 0;
-       else {
-               __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-               __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!needed) {
-               ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
-                       dd->ipath_sdma_status);
-               goto bail;
-       }
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       /*
-        * First clear, just to be safe. Enable is only done
-        * in chip on 0->1 transition
-        */
-       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /* notify upper layers */
-       ipath_ib_piobufavail(dd->verbs_dev);
-
-bail:
-       return;
-}
-
-static inline void make_sdma_desc(struct ipath_devdata *dd,
-       u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
-{
-       WARN_ON(addr & 3);
-       /* SDmaPhyAddr[47:32] */
-       sdmadesc[1] = addr >> 32;
-       /* SDmaPhyAddr[31:0] */
-       sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
-       /* SDmaGeneration[1:0] */
-       sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
-       /* SDmaDwordCount[10:0] */
-       sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
-       /* SDmaBufOffset[12:2] */
-       sdmadesc[0] |= dwoffset & 0x7ffULL;
-}
-
-/*
- * This function queues one IB packet onto the send DMA queue per call.
- * The caller is responsible for checking:
- * 1) The number of send DMA descriptor entries is less than the size of
- *    the descriptor queue.
- * 2) The IB SGE addresses and lengths are 32-bit aligned
- *    (except possibly the last SGE's length)
- * 3) The SGE addresses are suitable for passing to dma_map_single().
- */
-int ipath_sdma_verbs_send(struct ipath_devdata *dd,
-       struct ipath_sge_state *ss, u32 dwords,
-       struct ipath_verbs_txreq *tx)
-{
-
-       unsigned long flags;
-       struct ipath_sge *sge;
-       int ret = 0;
-       u16 tail;
-       __le64 *descqp;
-       u64 sdmadesc[2];
-       u32 dwoffset;
-       dma_addr_t addr;
-
-       if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
-               ipath_dbg("packet size %X > ibmax %X, fail\n",
-                       tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
-               ret = -EMSGSIZE;
-               goto fail;
-       }
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-retry:
-       if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
-               ret = -EBUSY;
-               goto unlock;
-       }
-
-       if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
-               if (ipath_sdma_make_progress(dd))
-                       goto retry;
-               ret = -ENOBUFS;
-               goto unlock;
-       }
-
-       addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
-                             tx->map_len, DMA_TO_DEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, addr))
-               goto ioerr;
-
-       dwoffset = tx->map_len >> 2;
-       make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
-
-       /* SDmaFirstDesc */
-       sdmadesc[0] |= 1ULL << 12;
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-               sdmadesc[0] |= 1ULL << 14;      /* SDmaUseLargeBuf */
-
-       /* write to the descq */
-       tail = dd->ipath_sdma_descq_tail;
-       descqp = &dd->ipath_sdma_descq[tail].qw[0];
-       *descqp++ = cpu_to_le64(sdmadesc[0]);
-       *descqp++ = cpu_to_le64(sdmadesc[1]);
-
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
-               tx->txreq.start_idx = tail;
-
-       /* increment the tail */
-       if (++tail == dd->ipath_sdma_descq_cnt) {
-               tail = 0;
-               descqp = &dd->ipath_sdma_descq[0].qw[0];
-               ++dd->ipath_sdma_generation;
-       }
-
-       sge = &ss->sge;
-       while (dwords) {
-               u32 dw;
-               u32 len;
-
-               len = dwords << 2;
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               dw = (len + 3) >> 2;
-               addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
-                                     DMA_TO_DEVICE);
-               if (dma_mapping_error(&dd->pcidev->dev, addr))
-                       goto unmap;
-               make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
-               /* SDmaUseLargeBuf has to be set in every descriptor */
-               if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-                       sdmadesc[0] |= 1ULL << 14;
-               /* write to the descq */
-               *descqp++ = cpu_to_le64(sdmadesc[0]);
-               *descqp++ = cpu_to_le64(sdmadesc[1]);
-
-               /* increment the tail */
-               if (++tail == dd->ipath_sdma_descq_cnt) {
-                       tail = 0;
-                       descqp = &dd->ipath_sdma_descq[0].qw[0];
-                       ++dd->ipath_sdma_generation;
-               }
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-
-               dwoffset += dw;
-               dwords -= dw;
-       }
-
-       if (!tail)
-               descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
-       descqp -= 2;
-       /* SDmaLastDesc */
-       descqp[0] |= cpu_to_le64(1ULL << 11);
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
-               /* SDmaIntReq */
-               descqp[0] |= cpu_to_le64(1ULL << 15);
-       }
-
-       /* Commit writes to memory and advance the tail on the chip */
-       wmb();
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-
-       tx->txreq.next_descq_idx = tail;
-       tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
-       dd->ipath_sdma_descq_tail = tail;
-       dd->ipath_sdma_descq_added += tx->txreq.sg_count;
-       list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
-               vl15_watchdog_enq(dd);
-       goto unlock;
-
-unmap:
-       while (tail != dd->ipath_sdma_descq_tail) {
-               if (!tail)
-                       tail = dd->ipath_sdma_descq_cnt - 1;
-               else
-                       tail--;
-               unmap_desc(dd, tail);
-       }
-ioerr:
-       ret = -EIO;
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-fail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c
deleted file mode 100644 (file)
index 2627198..0000000
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_post_srq_receive - post a receive on a shared receive queue
- * @ibsrq: the SRQ to post the receive on
- * @wr: the list of work requests to post
- * @bad_wr: the first WR to cause a problem is put here
- *
- * This may be called from interrupt context.
- */
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-                          struct ib_recv_wr **bad_wr)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_rwq *wq;
-       unsigned long flags;
-       int ret;
-
-       for (; wr; wr = wr->next) {
-               struct ipath_rwqe *wqe;
-               u32 next;
-               int i;
-
-               if ((unsigned) wr->num_sge > srq->rq.max_sge) {
-                       *bad_wr = wr;
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               spin_lock_irqsave(&srq->rq.lock, flags);
-               wq = srq->rq.wq;
-               next = wq->head + 1;
-               if (next >= srq->rq.size)
-                       next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&srq->rq.lock, flags);
-                       *bad_wr = wr;
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               wqe = get_rwqe_ptr(&srq->rq, wq->head);
-               wqe->wr_id = wr->wr_id;
-               wqe->num_sge = wr->num_sge;
-               for (i = 0; i < wr->num_sge; i++)
-                       wqe->sg_list[i] = wr->sg_list[i];
-               /* Make sure queue entry is written before the head index. */
-               smp_wmb();
-               wq->head = next;
-               spin_unlock_irqrestore(&srq->rq.lock, flags);
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_create_srq - create a shared receive queue
- * @ibpd: the protection domain of the SRQ to create
- * @srq_init_attr: the attributes of the SRQ
- * @udata: data from libipathverbs when creating a user SRQ
- */
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-                               struct ib_srq_init_attr *srq_init_attr,
-                               struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibpd->device);
-       struct ipath_srq *srq;
-       u32 sz;
-       struct ib_srq *ret;
-
-       if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
-               ret = ERR_PTR(-ENOSYS);
-               goto done;
-       }
-
-       if (srq_init_attr->attr.max_wr == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
-           (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
-       /*
-        * Need to use vmalloc() if we want to support large #s of entries.
-        */
-       srq->rq.size = srq_init_attr->attr.max_wr + 1;
-       srq->rq.max_sge = srq_init_attr->attr.max_sge;
-       sz = sizeof(struct ib_sge) * srq->rq.max_sge +
-               sizeof(struct ipath_rwqe);
-       srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
-       if (!srq->rq.wq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_srq;
-       }
-
-       /*
-        * Return the address of the RWQ as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
-               u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
-
-               srq->ip =
-                   ipath_create_mmap_info(dev, s,
-                                          ibpd->uobject->context,
-                                          srq->rq.wq);
-               if (!srq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_wq;
-               }
-
-               err = ib_copy_to_udata(udata, &srq->ip->offset,
-                                      sizeof(srq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
-                       goto bail_ip;
-               }
-       } else
-               srq->ip = NULL;
-
-       /*
-        * ib_create_srq() will initialize srq->ibsrq.
-        */
-       spin_lock_init(&srq->rq.lock);
-       srq->rq.wq->head = 0;
-       srq->rq.wq->tail = 0;
-       srq->limit = srq_init_attr->attr.srq_limit;
-
-       spin_lock(&dev->n_srqs_lock);
-       if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
-               spin_unlock(&dev->n_srqs_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_srqs_allocated++;
-       spin_unlock(&dev->n_srqs_lock);
-
-       if (srq->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = &srq->ibsrq;
-       goto done;
-
-bail_ip:
-       kfree(srq->ip);
-bail_wq:
-       vfree(srq->rq.wq);
-bail_srq:
-       kfree(srq);
-done:
-       return ret;
-}
-
-/**
- * ipath_modify_srq - modify a shared receive queue
- * @ibsrq: the SRQ to modify
- * @attr: the new attributes of the SRQ
- * @attr_mask: indicates which attributes to modify
- * @udata: user data for ipathverbs.so
- */
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                    enum ib_srq_attr_mask attr_mask,
-                    struct ib_udata *udata)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_rwq *wq;
-       int ret = 0;
-
-       if (attr_mask & IB_SRQ_MAX_WR) {
-               struct ipath_rwq *owq;
-               struct ipath_rwqe *p;
-               u32 sz, size, n, head, tail;
-
-               /* Check that the requested sizes are below the limits. */
-               if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
-                   ((attr_mask & IB_SRQ_LIMIT) ?
-                    attr->srq_limit : srq->limit) > attr->max_wr) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               sz = sizeof(struct ipath_rwqe) +
-                       srq->rq.max_sge * sizeof(struct ib_sge);
-               size = attr->max_wr + 1;
-               wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
-               if (!wq) {
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               /* Check that we can write the offset to mmap. */
-               if (udata && udata->inlen >= sizeof(__u64)) {
-                       __u64 offset_addr;
-                       __u64 offset = 0;
-
-                       ret = ib_copy_from_udata(&offset_addr, udata,
-                                                sizeof(offset_addr));
-                       if (ret)
-                               goto bail_free;
-                       udata->outbuf =
-                               (void __user *) (unsigned long) offset_addr;
-                       ret = ib_copy_to_udata(udata, &offset,
-                                              sizeof(offset));
-                       if (ret)
-                               goto bail_free;
-               }
-
-               spin_lock_irq(&srq->rq.lock);
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               owq = srq->rq.wq;
-               head = owq->head;
-               if (head >= srq->rq.size)
-                       head = 0;
-               tail = owq->tail;
-               if (tail >= srq->rq.size)
-                       tail = 0;
-               n = head;
-               if (n < tail)
-                       n += srq->rq.size - tail;
-               else
-                       n -= tail;
-               if (size <= n) {
-                       ret = -EINVAL;
-                       goto bail_unlock;
-               }
-               n = 0;
-               p = wq->wq;
-               while (tail != head) {
-                       struct ipath_rwqe *wqe;
-                       int i;
-
-                       wqe = get_rwqe_ptr(&srq->rq, tail);
-                       p->wr_id = wqe->wr_id;
-                       p->num_sge = wqe->num_sge;
-                       for (i = 0; i < wqe->num_sge; i++)
-                               p->sg_list[i] = wqe->sg_list[i];
-                       n++;
-                       p = (struct ipath_rwqe *)((char *) p + sz);
-                       if (++tail >= srq->rq.size)
-                               tail = 0;
-               }
-               srq->rq.wq = wq;
-               srq->rq.size = size;
-               wq->head = n;
-               wq->tail = 0;
-               if (attr_mask & IB_SRQ_LIMIT)
-                       srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
-
-               vfree(owq);
-
-               if (srq->ip) {
-                       struct ipath_mmap_info *ip = srq->ip;
-                       struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
-                       u32 s = sizeof(struct ipath_rwq) + size * sz;
-
-                       ipath_update_mmap_info(dev, ip, s, wq);
-
-                       /*
-                        * Return the offset to mmap.
-                        * See ipath_mmap() for details.
-                        */
-                       if (udata && udata->inlen >= sizeof(__u64)) {
-                               ret = ib_copy_to_udata(udata, &ip->offset,
-                                                      sizeof(ip->offset));
-                               if (ret)
-                                       goto bail;
-                       }
-
-                       spin_lock_irq(&dev->pending_lock);
-                       if (list_empty(&ip->pending_mmaps))
-                               list_add(&ip->pending_mmaps,
-                                        &dev->pending_mmaps);
-                       spin_unlock_irq(&dev->pending_lock);
-               }
-       } else if (attr_mask & IB_SRQ_LIMIT) {
-               spin_lock_irq(&srq->rq.lock);
-               if (attr->srq_limit >= srq->rq.size)
-                       ret = -EINVAL;
-               else
-                       srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
-       }
-       goto bail;
-
-bail_unlock:
-       spin_unlock_irq(&srq->rq.lock);
-bail_free:
-       vfree(wq);
-bail:
-       return ret;
-}
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-
-       attr->max_wr = srq->rq.size - 1;
-       attr->max_sge = srq->rq.max_sge;
-       attr->srq_limit = srq->limit;
-       return 0;
-}
-
-/**
- * ipath_destroy_srq - destroy a shared receive queue
- * @ibsrq: the SRQ to destroy
- */
-int ipath_destroy_srq(struct ib_srq *ibsrq)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_ibdev *dev = to_idev(ibsrq->device);
-
-       spin_lock(&dev->n_srqs_lock);
-       dev->n_srqs_allocated--;
-       spin_unlock(&dev->n_srqs_lock);
-       if (srq->ip)
-               kref_put(&srq->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(srq->rq.wq);
-       kfree(srq);
-
-       return 0;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c
deleted file mode 100644 (file)
index f63e143..0000000
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_kernel.h"
-
-struct infinipath_stats ipath_stats;
-
-/**
- * ipath_snap_cntr - snapshot a chip counter
- * @dd: the infinipath device
- * @creg: the counter to snapshot
- *
- * called from add_timer and user counter read calls, to deal with
- * counters that wrap in "human time".  The words sent and received, and
- * the packets sent and received are all that we worry about.  For now,
- * at least, we don't worry about error counters, because if they wrap
- * that quickly, we probably don't care.  We may eventually just make this
- * handle all the counters.  word counters can wrap in about 20 seconds
- * of full bandwidth traffic, packet counters in a few hours.
- */
-
-u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
-{
-       u32 val, reg64 = 0;
-       u64 val64;
-       unsigned long t0, t1;
-       u64 ret;
-
-       t0 = jiffies;
-       /* If fast increment counters are only 32 bits, snapshot them,
-        * and maintain them as 64bit values in the driver */
-       if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
-           (creg == dd->ipath_cregs->cr_wordsendcnt ||
-            creg == dd->ipath_cregs->cr_wordrcvcnt ||
-            creg == dd->ipath_cregs->cr_pktsendcnt ||
-            creg == dd->ipath_cregs->cr_pktrcvcnt)) {
-               val64 = ipath_read_creg(dd, creg);
-               val = val64 == ~0ULL ? ~0U : 0;
-               reg64 = 1;
-       } else                  /* val64 just to keep gcc quiet... */
-               val64 = val = ipath_read_creg32(dd, creg);
-       /*
-        * See if a second has passed.  This is just a way to detect things
-        * that are quite broken.  Normally this should take just a few
-        * cycles (the check is for long enough that we don't care if we get
-        * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
-        * normal NB values
-        */
-       t1 = jiffies;
-       if (time_before(t0 + HZ, t1) && val == -1) {
-               ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
-                             creg);
-               ret = 0ULL;
-               goto bail;
-       }
-       if (reg64) {
-               ret = val64;
-               goto bail;
-       }
-
-       if (creg == dd->ipath_cregs->cr_wordsendcnt) {
-               if (val != dd->ipath_lastsword) {
-                       dd->ipath_sword += val - dd->ipath_lastsword;
-                       dd->ipath_lastsword = val;
-               }
-               val64 = dd->ipath_sword;
-       } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
-               if (val != dd->ipath_lastrword) {
-                       dd->ipath_rword += val - dd->ipath_lastrword;
-                       dd->ipath_lastrword = val;
-               }
-               val64 = dd->ipath_rword;
-       } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
-               if (val != dd->ipath_lastspkts) {
-                       dd->ipath_spkts += val - dd->ipath_lastspkts;
-                       dd->ipath_lastspkts = val;
-               }
-               val64 = dd->ipath_spkts;
-       } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
-               if (val != dd->ipath_lastrpkts) {
-                       dd->ipath_rpkts += val - dd->ipath_lastrpkts;
-                       dd->ipath_lastrpkts = val;
-               }
-               val64 = dd->ipath_rpkts;
-       } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
-               if (dd->ibdeltainprog)
-                       val64 -= val64 - dd->ibsymsnap;
-               val64 -= dd->ibsymdelta;
-       } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
-               if (dd->ibdeltainprog)
-                       val64 -= val64 - dd->iblnkerrsnap;
-               val64 -= dd->iblnkerrdelta;
-       } else
-               val64 = (u64) val;
-
-       ret = val64;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
- * @dd: the infinipath device
- *
- * print the delta of egrfull/hdrqfull errors for kernel ports no more than
- * every 5 seconds.  User processes are printed at close, but kernel doesn't
- * close, so...  Separate routine so may call from other places someday, and
- * so function name when printed by _IPATH_INFO is meaningfull
- */
-static void ipath_qcheck(struct ipath_devdata *dd)
-{
-       static u64 last_tot_hdrqfull;
-       struct ipath_portdata *pd = dd->ipath_pd[0];
-       size_t blen = 0;
-       char buf[128];
-       u32 hdrqtail;
-
-       *buf = 0;
-       if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
-               blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
-                               pd->port_hdrqfull -
-                               dd->ipath_p0_hdrqfull);
-               dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
-       }
-       if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
-               blen += snprintf(buf + blen, sizeof buf - blen,
-                                "%srcvegrfull %llu",
-                                blen ? ", " : "",
-                                (unsigned long long)
-                                (ipath_stats.sps_etidfull -
-                                 dd->ipath_last_tidfull));
-               dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
-       }
-
-       /*
-        * this is actually the number of hdrq full interrupts, not actual
-        * events, but at the moment that's mostly what I'm interested in.
-        * Actual count, etc. is in the counters, if needed.  For production
-        * users this won't ordinarily be printed.
-        */
-
-       if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
-           ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
-               blen += snprintf(buf + blen, sizeof buf - blen,
-                                "%shdrqfull %llu (all ports)",
-                                blen ? ", " : "",
-                                (unsigned long long)
-                                (ipath_stats.sps_hdrqfull -
-                                 last_tot_hdrqfull));
-               last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
-       }
-       if (blen)
-               ipath_dbg("%s\n", buf);
-
-       hdrqtail = ipath_get_hdrqtail(pd);
-       if (pd->port_head != hdrqtail) {
-               if (dd->ipath_lastport0rcv_cnt ==
-                   ipath_stats.sps_port0pkts) {
-                       ipath_cdbg(PKT, "missing rcv interrupts? "
-                                  "port0 hd=%x tl=%x; port0pkts %llx; write"
-                                  " hd (w/intr)\n",
-                                  pd->port_head, hdrqtail,
-                                  (unsigned long long)
-                                  ipath_stats.sps_port0pkts);
-                       ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
-                               dd->ipath_rhdrhead_intr_off, pd->port_port);
-               }
-               dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
-       }
-}
-
-static void ipath_chk_errormask(struct ipath_devdata *dd)
-{
-       static u32 fixed;
-       u32 ctrl;
-       unsigned long errormask;
-       unsigned long hwerrs;
-
-       if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-
-       if (errormask == dd->ipath_errormask)
-               return;
-       fixed++;
-
-       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-
-       if ((hwerrs & dd->ipath_hwerrmask) ||
-               (ctrl & INFINIPATH_C_FREEZEMODE)) {
-               /* force re-interrupt of pending events, just in case */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-               dev_info(&dd->pcidev->dev,
-                       "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
-                       fixed, errormask, (unsigned long)dd->ipath_errormask,
-                       ctrl, hwerrs);
-       } else
-               ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
-                       fixed, errormask,
-                       (unsigned long)dd->ipath_errormask);
-}
-
-
-/**
- * ipath_get_faststats - get word counters from chip before they overflow
- * @opaque - contains a pointer to the infinipath device ipath_devdata
- *
- * called from add_timer
- */
-void ipath_get_faststats(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-       int i;
-       static unsigned cnt;
-       unsigned long flags;
-       u64 traffic_wds;
-
-       /*
-        * don't access the chip while running diags, or memory diags can
-        * fail
-        */
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
-           ipath_diag_inuse)
-               /* but re-arm the timer, for diags case; won't hurt other */
-               goto done;
-
-       /*
-        * We now try to maintain a "active timer", based on traffic
-        * exceeding a threshold, so we need to check the word-counts
-        * even if they are 64-bit.
-        */
-       traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       traffic_wds -= dd->ipath_traffic_wds;
-       dd->ipath_traffic_wds += traffic_wds;
-       if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
-               atomic_add(5, &dd->ipath_active_time); /* S/B #define */
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-
-       if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-       }
-
-       ipath_qcheck(dd);
-
-       /*
-        * deal with repeat error suppression.  Doesn't really matter if
-        * last error was almost a full interval ago, or just a few usecs
-        * ago; still won't get more than 2 per interval.  We may want
-        * longer intervals for this eventually, could do with mod, counter
-        * or separate timer.  Also see code in ipath_handle_errors() and
-        * ipath_handle_hwerrors().
-        */
-
-       if (dd->ipath_lasterror)
-               dd->ipath_lasterror = 0;
-       if (dd->ipath_lasthwerror)
-               dd->ipath_lasthwerror = 0;
-       if (dd->ipath_maskederrs
-           && time_after(jiffies, dd->ipath_unmasktime)) {
-               char ebuf[256];
-               int iserr;
-               iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
-                                        dd->ipath_maskederrs);
-               if (dd->ipath_maskederrs &
-                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                     INFINIPATH_E_PKTERRS))
-                       ipath_dev_err(dd, "Re-enabling masked errors "
-                                     "(%s)\n", ebuf);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal", for some
-                        * types of processes (mostly benchmarks) that send
-                        * huge numbers of messages, while not processing
-                        * them.  So only complain about these at debug
-                        * level.
-                        */
-                       if (iserr)
-                               ipath_dbg(
-                                       "Re-enabling queue full errors (%s)\n",
-                                       ebuf);
-                       else
-                               ipath_cdbg(ERRPKT, "Re-enabling packet"
-                                       " problem interrupt (%s)\n", ebuf);
-               }
-
-               /* re-enable masked errors */
-               dd->ipath_errormask |= dd->ipath_maskederrs;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                                dd->ipath_errormask);
-               dd->ipath_maskederrs = 0;
-       }
-
-       /* limit qfull messages to ~one per minute per port */
-       if ((++cnt & 0x10)) {
-               for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
-                       struct ipath_portdata *pd = dd->ipath_pd[i];
-
-                       if (pd && pd->port_lastrcvhdrqtail != -1)
-                               pd->port_lastrcvhdrqtail = -1;
-               }
-       }
-
-       ipath_chk_errormask(dd);
-done:
-       mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
-}
diff --git a/drivers/staging/rdma/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c
deleted file mode 100644 (file)
index b12b1f6..0000000
+++ /dev/null
@@ -1,1237 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/ctype.h>
-#include <linux/stat.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-/**
- * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
- * @str: the string containing the number
- * @valp: where to put the result
- *
- * returns the number of bytes consumed, or negative value on error
- */
-int ipath_parse_ushort(const char *str, unsigned short *valp)
-{
-       unsigned long val;
-       char *end;
-       int ret;
-
-       if (!isdigit(str[0])) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       val = simple_strtoul(str, &end, 0);
-
-       if (val > 0xffff) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       *valp = val;
-
-       ret = end + 1 - str;
-       if (ret == 0)
-               ret = -EINVAL;
-
-bail:
-       return ret;
-}
-
-static ssize_t show_version(struct device_driver *dev, char *buf)
-{
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
-}
-
-static ssize_t show_num_units(struct device_driver *dev, char *buf)
-{
-       return scnprintf(buf, PAGE_SIZE, "%d\n",
-                        ipath_count_units(NULL, NULL, NULL));
-}
-
-static ssize_t show_status(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-
-       if (!dd->ipath_statusp) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
-                       (unsigned long long) *(dd->ipath_statusp));
-
-bail:
-       return ret;
-}
-
-static const char *ipath_status_str[] = {
-       "Initted",
-       "Disabled",
-       "Admin_Disabled",
-       "", /* This used to be the old "OIB_SMA" status. */
-       "", /* This used to be the old "SMA" status. */
-       "Present",
-       "IB_link_up",
-       "IB_configured",
-       "NoIBcable",
-       "Fatal_Hardware_Error",
-       NULL,
-};
-
-static ssize_t show_status_str(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int i, any;
-       u64 s;
-       ssize_t ret;
-
-       if (!dd->ipath_statusp) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       s = *(dd->ipath_statusp);
-       *buf = '\0';
-       for (any = i = 0; s && ipath_status_str[i]; i++) {
-               if (s & 1) {
-                       if (any && strlcat(buf, " ", PAGE_SIZE) >=
-                           PAGE_SIZE)
-                               /* overflow */
-                               break;
-                       if (strlcat(buf, ipath_status_str[i],
-                                   PAGE_SIZE) >= PAGE_SIZE)
-                               break;
-                       any = 1;
-               }
-               s >>= 1;
-       }
-       if (any)
-               strlcat(buf, "\n", PAGE_SIZE);
-
-       ret = strlen(buf);
-
-bail:
-       return ret;
-}
-
-static ssize_t show_boardversion(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
-}
-
-static ssize_t show_localbus_info(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
-}
-
-static ssize_t show_lmc(struct device *dev,
-                       struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
-}
-
-static ssize_t store_lmc(struct device *dev,
-                        struct device_attribute *attr,
-                        const char *buf,
-                        size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 lmc = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &lmc);
-       if (ret < 0)
-               goto invalid;
-
-       if (lmc > 7) {
-               ret = -EINVAL;
-               goto invalid;
-       }
-
-       ipath_set_lid(dd, dd->ipath_lid, lmc);
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
-bail:
-       return ret;
-}
-
-static ssize_t show_lid(struct device *dev,
-                       struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
-}
-
-static ssize_t store_lid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 lid = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &lid);
-       if (ret < 0)
-               goto invalid;
-
-       if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
-               ret = -EINVAL;
-               goto invalid;
-       }
-
-       ipath_set_lid(dd, lid, dd->ipath_lmc);
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
-bail:
-       return ret;
-}
-
-static ssize_t show_mlid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
-}
-
-static ssize_t store_mlid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 mlid;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &mlid);
-       if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
-               goto invalid;
-
-       dd->ipath_mlid = mlid;
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid MLID\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_guid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u8 *guid;
-
-       guid = (u8 *) & (dd->ipath_guid);
-
-       return scnprintf(buf, PAGE_SIZE,
-                        "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
-                        guid[0], guid[1], guid[2], guid[3],
-                        guid[4], guid[5], guid[6], guid[7]);
-}
-
-static ssize_t store_guid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       unsigned short guid[8];
-       __be64 new_guid;
-       u8 *ng;
-       int i;
-
-       if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
-                  &guid[0], &guid[1], &guid[2], &guid[3],
-                  &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
-               goto invalid;
-
-       ng = (u8 *) &new_guid;
-
-       for (i = 0; i < 8; i++) {
-               if (guid[i] > 0xff)
-                       goto invalid;
-               ng[i] = guid[i];
-       }
-
-       if (new_guid == 0)
-               goto invalid;
-
-       dd->ipath_guid = new_guid;
-       dd->ipath_nguid = 1;
-       if (dd->verbs_dev)
-               dd->verbs_dev->ibdev.node_guid = new_guid;
-
-       ret = strlen(buf);
-       goto bail;
-
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid GUID\n");
-       ret = -EINVAL;
-
-bail:
-       return ret;
-}
-
-static ssize_t show_nguid(struct device *dev,
-                         struct device_attribute *attr,
-                         char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
-}
-
-static ssize_t show_nports(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       /* Return the number of user ports available. */
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
-}
-
-static ssize_t show_serial(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       buf[sizeof dd->ipath_serial] = '\0';
-       memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
-       strcat(buf, "\n");
-       return strlen(buf);
-}
-
-static ssize_t show_unit(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
-}
-
-static ssize_t show_jint_max_packets(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
-}
-
-static ssize_t store_jint_max_packets(struct device *dev,
-                                     struct device_attribute *attr,
-                                     const char *buf,
-                                     size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 v = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &v);
-       if (ret < 0)
-               ipath_dev_err(dd, "invalid jint_max_packets.\n");
-       else
-               dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
-
-       return ret;
-}
-
-static ssize_t show_jint_idle_ticks(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
-}
-
-static ssize_t store_jint_idle_ticks(struct device *dev,
-                                    struct device_attribute *attr,
-                                    const char *buf,
-                                    size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 v = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &v);
-       if (ret < 0)
-               ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
-       else
-               dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
-
-       return ret;
-}
-
-#define DEVICE_COUNTER(name, attr) \
-       static ssize_t show_counter_##name(struct device *dev, \
-                                          struct device_attribute *attr, \
-                                          char *buf) \
-       { \
-               struct ipath_devdata *dd = dev_get_drvdata(dev); \
-               return scnprintf(\
-                       buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
-                       ipath_snap_cntr( \
-                               dd, offsetof(struct infinipath_counters, \
-                                            attr) / sizeof(u64)));     \
-       } \
-       static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
-
-DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
-DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
-DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
-DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
-DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
-DEVICE_COUNTER(lb_ints, LBIntCnt);
-DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
-DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
-DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
-DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
-DEVICE_COUNTER(rx_dwords, RxDwordCnt);
-DEVICE_COUNTER(rx_ebps, RxEBPCnt);
-DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
-DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
-DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
-DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
-DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
-DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
-DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
-DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
-DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
-DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
-DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
-DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
-DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
-DEVICE_COUNTER(tx_dwords, TxDwordCnt);
-DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
-DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
-DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
-DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
-DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
-DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
-
-static struct attribute *dev_counter_attributes[] = {
-       &dev_attr_ib_link_downeds.attr,
-       &dev_attr_ib_link_err_recoveries.attr,
-       &dev_attr_ib_status_changes.attr,
-       &dev_attr_ib_symbol_errs.attr,
-       &dev_attr_lb_flow_stalls.attr,
-       &dev_attr_lb_ints.attr,
-       &dev_attr_rx_bad_formats.attr,
-       &dev_attr_rx_buf_ovfls.attr,
-       &dev_attr_rx_data_pkts.attr,
-       &dev_attr_rx_dropped_pkts.attr,
-       &dev_attr_rx_dwords.attr,
-       &dev_attr_rx_ebps.attr,
-       &dev_attr_rx_flow_ctrl_errs.attr,
-       &dev_attr_rx_flow_pkts.attr,
-       &dev_attr_rx_icrc_errs.attr,
-       &dev_attr_rx_len_errs.attr,
-       &dev_attr_rx_link_problems.attr,
-       &dev_attr_rx_lpcrc_errs.attr,
-       &dev_attr_rx_max_min_len_errs.attr,
-       &dev_attr_rx_p0_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p1_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p2_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p3_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p4_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p5_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p6_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p7_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p8_hdr_egr_ovfls.attr,
-       &dev_attr_rx_pkey_mismatches.attr,
-       &dev_attr_rx_tid_full_errs.attr,
-       &dev_attr_rx_tid_valid_errs.attr,
-       &dev_attr_rx_vcrc_errs.attr,
-       &dev_attr_tx_data_pkts.attr,
-       &dev_attr_tx_dropped_pkts.attr,
-       &dev_attr_tx_dwords.attr,
-       &dev_attr_tx_flow_pkts.attr,
-       &dev_attr_tx_flow_stalls.attr,
-       &dev_attr_tx_len_errs.attr,
-       &dev_attr_tx_max_min_len_errs.attr,
-       &dev_attr_tx_underruns.attr,
-       &dev_attr_tx_unsup_vl_errs.attr,
-       NULL
-};
-
-static struct attribute_group dev_counter_attr_group = {
-       .name = "counters",
-       .attrs = dev_counter_attributes
-};
-
-static ssize_t store_reset(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       if (count < 5 || memcmp(buf, "reset", 5)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (dd->ipath_flags & IPATH_DISABLED) {
-               /*
-                * post-reset init would re-enable interrupts, etc.
-                * so don't allow reset on disabled devices.  Not
-                * perfect error, but about the best choice.
-                */
-               dev_info(dev,"Unit %d is disabled, can't reset\n",
-                        dd->ipath_unit);
-               ret = -EINVAL;
-               goto bail;
-       }
-       ret = ipath_reset_device(dd->ipath_unit);
-bail:
-       return ret<0 ? ret : count;
-}
-
-static ssize_t store_link_state(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 state;
-
-       ret = ipath_parse_ushort(buf, &state);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_linkstate(dd, state);
-       if (r < 0) {
-               ret = r;
-               goto bail;
-       }
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid link state\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_mtu(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
-}
-
-static ssize_t store_mtu(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       u16 mtu = 0;
-       int r;
-
-       ret = ipath_parse_ushort(buf, &mtu);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_mtu(dd, mtu);
-       if (r < 0)
-               ret = r;
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid MTU\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_enabled(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                        (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
-}
-
-static ssize_t store_enabled(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       u16 enable = 0;
-
-       ret = ipath_parse_ushort(buf, &enable);
-       if (ret < 0) {
-               ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
-               goto bail;
-       }
-
-       if (enable) {
-               if (!(dd->ipath_flags & IPATH_DISABLED))
-                       goto bail;
-
-               dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
-               /* same as post-reset */
-               ret = ipath_init_chip(dd, 1);
-               if (ret)
-                       ipath_dev_err(dd, "Failed to enable unit %d\n",
-                                     dd->ipath_unit);
-               else {
-                       dd->ipath_flags &= ~IPATH_DISABLED;
-                       *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
-               }
-       } else if (!(dd->ipath_flags & IPATH_DISABLED)) {
-               dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
-               ipath_shutdown_device(dd);
-               dd->ipath_flags |= IPATH_DISABLED;
-               *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
-       }
-
-bail:
-       return ret;
-}
-
-static ssize_t store_rx_pol_inv(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_rx_pol_inv(dd, val);
-       if (r < 0) {
-               ret = r;
-               goto bail;
-       }
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
-bail:
-       return ret;
-}
-
-static ssize_t store_led_override(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret > 0)
-               ipath_set_led_override(dd, val);
-       else
-               ipath_dev_err(dd, "attempt to set invalid LED override\n");
-       return ret;
-}
-
-static ssize_t show_logged_errs(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int idx, count;
-
-       /* force consistency with actual EEPROM */
-       if (ipath_update_eeprom_log(dd) != 0)
-               return -ENXIO;
-
-       count = 0;
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
-                       dd->ipath_eep_st_errs[idx],
-                       idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
-       }
-
-       return count;
-}
-
-/*
- * New sysfs entries to control various IB config. These all turn into
- * accesses via ipath_f_get/set_ib_cfg.
- *
- * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
- */
-static ssize_t show_hrtbt_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_hrtbt_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 3)
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
-               goto bail;
-       }
-
-       /*
-        * Set the "intentional" heartbeat enable per either of
-        * "Enable" and "Auto", as these are normally set together.
-        * This bit is consulted when leaving loopback mode,
-        * because entering loopback mode overrides it and automatically
-        * disables heartbeat.
-        */
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
-       if (r < 0)
-               ret = r;
-       else if (val == IPATH_IB_HRTBT_OFF)
-               dd->ipath_flags |= IPATH_NO_HRTBT;
-       else
-               dd->ipath_flags &= ~IPATH_NO_HRTBT;
-
-bail:
-       return ret;
-}
-
-/*
- * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
- * _not_ the particular encoding of any given chip)
- */
-static ssize_t show_lwid_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_lwid_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && (val == 0 || val > 3))
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Link Width (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/* Get current link width */
-static ssize_t show_lwid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-/*
- * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
- */
-static ssize_t show_spd_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_spd_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Link Speed (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/* Get current link speed */
-static ssize_t show_spd(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-/*
- * Get/Set RX polarity-invert enable. 0=no, 1=yes.
- */
-static ssize_t show_rx_polinv_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_rx_polinv_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 1) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Rx Polarity (enable)\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/*
- * Get/Set RX lane-reversal enable. 0=no, 1=yes.
- */
-static ssize_t show_lanerev_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_lanerev_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 1) {
-               ret = -EINVAL;
-               ipath_dev_err(dd,
-                       "attempt to set invalid Lane reversal (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
-static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
-
-static struct attribute *driver_attributes[] = {
-       &driver_attr_num_units.attr,
-       &driver_attr_version.attr,
-       NULL
-};
-
-static struct attribute_group driver_attr_group = {
-       .attrs = driver_attributes
-};
-
-static ssize_t store_tempsense(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf,
-                              size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, stat;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret <= 0) {
-               ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
-               goto bail;
-       }
-       /* If anything but the highest limit, enable T_CRIT_A "interrupt" */
-       stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set tempsense config\n");
-               ret = -1;
-               goto bail;
-       }
-       stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set local Tcrit\n");
-               ret = -1;
-               goto bail;
-       }
-       stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set remote Tcrit\n");
-               ret = -1;
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-/*
- * dump tempsense regs. in decimal, to ease shell-scripts.
- */
-static ssize_t show_tempsense(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-       int idx;
-       u8 regvals[8];
-
-       ret = -ENXIO;
-       for (idx = 0; idx < 8; ++idx) {
-               if (idx == 6)
-                       continue;
-               ret = ipath_tempsense_read(dd, idx);
-               if (ret < 0)
-                       break;
-               regvals[idx] = ret;
-       }
-       if (idx == 8)
-               ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
-                       *(signed char *)(regvals),
-                       *(signed char *)(regvals + 1),
-                       regvals[2], regvals[3],
-                       *(signed char *)(regvals + 5),
-                       *(signed char *)(regvals + 7));
-       return ret;
-}
-
-const struct attribute_group *ipath_driver_attr_groups[] = {
-       &driver_attr_group,
-       NULL,
-};
-
-static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
-static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
-static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
-static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
-static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
-static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
-static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
-static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
-static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
-static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
-static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
-static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
-static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
-static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
-static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
-                  show_jint_max_packets, store_jint_max_packets);
-static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
-                  show_jint_idle_ticks, store_jint_idle_ticks);
-static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
-                  show_tempsense, store_tempsense);
-
-static struct attribute *dev_attributes[] = {
-       &dev_attr_guid.attr,
-       &dev_attr_lmc.attr,
-       &dev_attr_lid.attr,
-       &dev_attr_link_state.attr,
-       &dev_attr_mlid.attr,
-       &dev_attr_mtu.attr,
-       &dev_attr_nguid.attr,
-       &dev_attr_nports.attr,
-       &dev_attr_serial.attr,
-       &dev_attr_status.attr,
-       &dev_attr_status_str.attr,
-       &dev_attr_boardversion.attr,
-       &dev_attr_unit.attr,
-       &dev_attr_enabled.attr,
-       &dev_attr_rx_pol_inv.attr,
-       &dev_attr_led_override.attr,
-       &dev_attr_logged_errors.attr,
-       &dev_attr_tempsense.attr,
-       &dev_attr_localbus_info.attr,
-       NULL
-};
-
-static struct attribute_group dev_attr_group = {
-       .attrs = dev_attributes
-};
-
-static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
-                  store_hrtbt_enb);
-static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
-                  store_lwid_enb);
-static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
-static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
-                  store_spd_enb);
-static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
-static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
-                  store_rx_polinv_enb);
-static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
-                  store_lanerev_enb);
-
-static struct attribute *dev_ibcfg_attributes[] = {
-       &dev_attr_hrtbt_enable.attr,
-       &dev_attr_link_width_enable.attr,
-       &dev_attr_link_width.attr,
-       &dev_attr_link_speed_enable.attr,
-       &dev_attr_link_speed.attr,
-       &dev_attr_rx_pol_inv_enable.attr,
-       &dev_attr_rx_lane_rev_enable.attr,
-       NULL
-};
-
-static struct attribute_group dev_ibcfg_attr_group = {
-       .attrs = dev_ibcfg_attributes
-};
-
-/**
- * ipath_expose_reset - create a device reset file
- * @dev: the device structure
- *
- * Only expose a file that lets us reset the device after someone
- * enters diag mode.  A device reset is quite likely to crash the
- * machine entirely, so we don't want to normally make it
- * available.
- *
- * Called with ipath_mutex held.
- */
-int ipath_expose_reset(struct device *dev)
-{
-       static int exposed;
-       int ret;
-
-       if (!exposed) {
-               ret = device_create_file(dev, &dev_attr_reset);
-               exposed = 1;
-       } else {
-               ret = 0;
-       }
-
-       return ret;
-}
-
-int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
-{
-       int ret;
-
-       ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
-       if (ret)
-               goto bail;
-
-       ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
-       if (ret)
-               goto bail_attrs;
-
-       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-               ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
-               if (ret)
-                       goto bail_counter;
-               ret = device_create_file(dev, &dev_attr_jint_max_packets);
-               if (ret)
-                       goto bail_idle;
-
-               ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
-               if (ret)
-                       goto bail_max;
-       }
-
-       return 0;
-
-bail_max:
-       device_remove_file(dev, &dev_attr_jint_max_packets);
-bail_idle:
-       device_remove_file(dev, &dev_attr_jint_idle_ticks);
-bail_counter:
-       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-bail_attrs:
-       sysfs_remove_group(&dev->kobj, &dev_attr_group);
-bail:
-       return ret;
-}
-
-void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
-{
-       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-
-       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-               sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
-               device_remove_file(dev, &dev_attr_jint_idle_ticks);
-               device_remove_file(dev, &dev_attr_jint_max_packets);
-       }
-
-       sysfs_remove_group(&dev->kobj, &dev_attr_group);
-
-       device_remove_file(dev, &dev_attr_reset);
-}
diff --git a/drivers/staging/rdma/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c
deleted file mode 100644 (file)
index 0246b30..0000000
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_UC_##x
-
-/**
- * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_uc_req(struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       struct ipath_swqe *wqe;
-       unsigned long flags;
-       u32 hwords;
-       u32 bth0;
-       u32 len;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       int ret = 0;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       ohdr = &qp->s_hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &qp->s_hdr.u.l.oth;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-       bth0 = 1 << 22; /* Set M bit */
-
-       /* Get the next send request. */
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       qp->s_wqe = NULL;
-       switch (qp->s_state) {
-       default:
-               if (!(ib_ipath_state_ops[qp->state] &
-                   IPATH_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /* Check if send work queue is empty. */
-               if (qp->s_cur == qp->s_head)
-                       goto bail;
-               /*
-                * Start a new request.
-                */
-               qp->s_psn = wqe->psn = qp->s_next_psn;
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_len = len = wqe->length;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND)
-                               qp->s_state = OP(SEND_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / 4;
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= 1 << 23;
-                       }
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               break;
-
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND)
-                       qp->s_state = OP(SEND_LAST);
-               else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= 1 << 23;
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               else {
-                       qp->s_state =
-                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-               }
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(to_idev(qp->ibqp.device),
-                             qp, ohdr, bth0 | (qp->s_state << 24),
-                             qp->s_next_psn++ & IPATH_PSN_MASK);
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_uc_rcv - handle an incoming UC packet
- * @dev: the device the packet came in on
- * @hdr: the header of the packet
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the length of the packet
- * @qp: the QP for this packet.
- *
- * This is called from ipath_qp_rcv() to process an incoming UC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       int opcode;
-       u32 hdrsize;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       struct ib_reth *reth;
-       int header_in_data;
-
-       /* Validate the SLID. See Ch. 9.6.1.5 */
-       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-               goto done;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12;       /* LRH + BTH */
-               psn = be32_to_cpu(ohdr->bth[2]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
-               /*
-                * The header with GRH is 60 bytes and the
-                * core driver sets the eager header buffer
-                * size to 56 bytes so the last 4 bytes of
-                * the BTH header (PSN) is in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       psn = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               } else
-                       psn = be32_to_cpu(ohdr->bth[2]);
-       }
-       /*
-        * The opcode is in the low byte when its in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-       memset(&wc, 0, sizeof wc);
-
-       /* Compare the PSN verses the expected PSN. */
-       if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
-               /*
-                * Handle a sequence error.
-                * Silently drop any current message.
-                */
-               qp->r_psn = psn;
-       inv:
-               qp->r_state = OP(SEND_LAST);
-               switch (opcode) {
-               case OP(SEND_FIRST):
-               case OP(SEND_ONLY):
-               case OP(SEND_ONLY_WITH_IMMEDIATE):
-                       goto send_first;
-
-               case OP(RDMA_WRITE_FIRST):
-               case OP(RDMA_WRITE_ONLY):
-               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-                       goto rdma_first;
-
-               default:
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       default:
-               if (opcode == OP(SEND_FIRST) ||
-                   opcode == OP(SEND_ONLY) ||
-                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_FIRST) ||
-                   opcode == OP(RDMA_WRITE_ONLY) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-       }
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-       send_first:
-               if (qp->r_flags & IPATH_R_REUSE_SGE) {
-                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
-                       qp->r_sge = qp->s_rdma_read_sge;
-               } else if (!ipath_get_rwqe(qp, 0)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Save the WQE so we can reuse it in case of an error. */
-               qp->s_rdma_read_sge = qp->r_sge;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto send_last;
-               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
-                       goto send_last_imm;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len)) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-       send_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST):
-       send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len)) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               wc.opcode = IB_WC_RECV;
-       last_imm:
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               wc.sl = qp->remote_ah_attr.sl;
-               /* Signal completion event if the solicited bit is set. */
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                              (ohdr->bth[0] &
-                               cpu_to_be32(1 << 23)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
-       rdma_first:
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               hdrsize += sizeof(*reth);
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey */
-                       ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
-                                          vaddr, rkey,
-                                          IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok)) {
-                               dev->n_pkt_drops++;
-                               goto done;
-                       }
-               } else {
-                       qp->r_sge.sg_list = NULL;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto rdma_last;
-               else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       goto rdma_last_imm;
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-       rdma_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               if (qp->r_flags & IPATH_R_REUSE_SGE)
-                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
-               else if (!ipath_get_rwqe(qp, 1)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               wc.byte_len = qp->r_len;
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               goto last_imm;
-
-       case OP(RDMA_WRITE_LAST):
-       rdma_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               break;
-
-       default:
-               /* Drop packet for unknown opcodes. */
-               dev->n_pkt_drops++;
-               goto done;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-done:
-       return;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c
deleted file mode 100644 (file)
index 385d941..0000000
+++ /dev/null
@@ -1,579 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_ud_loopback - handle send on loopback QPs
- * @sqp: the sending QP
- * @swqe: the send work request
- *
- * This is called from ipath_make_ud_req() to forward a WQE addressed
- * to the same HCA.
- * Note that the receive interrupt handler may be calling ipath_ud_rcv()
- * while this is being called.
- */
-static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
-{
-       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-       struct ipath_qp *qp;
-       struct ib_ah_attr *ah_attr;
-       unsigned long flags;
-       struct ipath_rq *rq;
-       struct ipath_srq *srq;
-       struct ipath_sge_state rsge;
-       struct ipath_sge *sge;
-       struct ipath_rwq *wq;
-       struct ipath_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       struct ib_wc wc;
-       u32 tail;
-       u32 rlen;
-       u32 length;
-
-       qp = ipath_lookup_qpn(&dev->qp_table, swqe->ud_wr.remote_qpn);
-       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               goto done;
-       }
-
-       /*
-        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       if (unlikely(qp->ibqp.qp_num &&
-                    ((int) swqe->ud_wr.remote_qkey < 0 ?
-                     sqp->qkey : swqe->ud_wr.remote_qkey) != qp->qkey)) {
-               /* XXX OK to lose a count once in a while. */
-               dev->qkey_violations++;
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       length = swqe->length;
-       memset(&wc, 0, sizeof wc);
-       wc.byte_len = length + sizeof(struct ib_grh);
-
-       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = swqe->wr.ex.imm_data;
-       }
-
-       /*
-        * This would be a lot simpler if we could call ipath_get_rwqe()
-        * but that uses state that the receive interrupt handler uses
-        * so we would need to lock out receive interrupts while doing
-        * local loopback.
-        */
-       if (qp->ibqp.srq) {
-               srq = to_isrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        * Note that it is safe to drop the lock after changing rq->tail
-        * since ipath_post_receive() won't fill the empty slot.
-        */
-       spin_lock_irqsave(&rq->lock, flags);
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       if (unlikely(tail == wq->head)) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       wqe = get_rwqe_ptr(rq, tail);
-       rsge.sg_list = qp->r_ud_sg_list;
-       if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       /* Silently drop packets which are too big. */
-       if (wc.byte_len > rlen) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       if (++tail >= rq->size)
-               tail = 0;
-       wq->tail = tail;
-       wc.wr_id = wqe->wr_id;
-       if (handler) {
-               u32 n;
-
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-               } else
-                       spin_unlock_irqrestore(&rq->lock, flags);
-       } else
-               spin_unlock_irqrestore(&rq->lock, flags);
-
-       ah_attr = &to_iah(swqe->ud_wr.ah)->attr;
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
-               wc.wc_flags |= IB_WC_GRH;
-       } else
-               ipath_skip_sge(&rsge, sizeof(struct ib_grh));
-       sge = swqe->sg_list;
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               ipath_copy_sge(&rsge, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--swqe->wr.num_sge)
-                               sge++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = sqp->ibqp.qp_num;
-       /* XXX do we know which pkey matched? Only needed for GSI. */
-       wc.pkey_index = 0;
-       wc.slid = dev->dd->ipath_lid |
-               (ah_attr->src_path_bits &
-                ((1 << dev->dd->ipath_lmc) - 1));
-       wc.sl = ah_attr->sl;
-       wc.dlid_path_bits =
-               ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      swqe->ud_wr.wr.send_flags & IB_SEND_SOLICITED);
-drop:
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-done:;
-}
-
-/**
- * ipath_make_ud_req - construct a UD request packet
- * @qp: the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_ud_req(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_other_headers *ohdr;
-       struct ib_ah_attr *ah_attr;
-       struct ipath_swqe *wqe;
-       unsigned long flags;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth0;
-       u16 lrh0;
-       u16 lid;
-       int ret = 0;
-       int next_cur;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       if (qp->s_cur == qp->s_head)
-               goto bail;
-
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       next_cur = qp->s_cur + 1;
-       if (next_cur >= qp->s_size)
-               next_cur = 0;
-
-       /* Construct the header. */
-       ah_attr = &to_iah(wqe->ud_wr.ah)->attr;
-       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
-               if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
-                       dev->n_multicast_xmit++;
-               else
-                       dev->n_unicast_xmit++;
-       } else {
-               dev->n_unicast_xmit++;
-               lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
-               if (unlikely(lid == dev->dd->ipath_lid)) {
-                       /*
-                        * If DMAs are in progress, we can't generate
-                        * a completion for the loopback packet since
-                        * it would be out of order.
-                        * XXX Instead of waiting, we could queue a
-                        * zero length descriptor so we get a callback.
-                        */
-                       if (atomic_read(&qp->s_dma_busy)) {
-                               qp->s_flags |= IPATH_S_WAIT_DMA;
-                               goto bail;
-                       }
-                       qp->s_cur = next_cur;
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       ipath_ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, flags);
-                       ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
-                       goto done;
-               }
-       }
-
-       qp->s_cur = next_cur;
-       extra_bytes = -wqe->length & 3;
-       nwords = (wqe->length + extra_bytes) >> 2;
-
-       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
-       qp->s_hdrwords = 7;
-       qp->s_cur_size = wqe->length;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_dmult = ah_attr->static_rate;
-       qp->s_wqe = wqe;
-       qp->s_sge.sge = wqe->sg_list[0];
-       qp->s_sge.sg_list = wqe->sg_list + 1;
-       qp->s_sge.num_sge = wqe->ud_wr.wr.num_sge;
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               /* Header size in 32-bit words. */
-               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-                                                &ah_attr->grh,
-                                                qp->s_hdrwords, nwords);
-               lrh0 = IPATH_LRH_GRH;
-               ohdr = &qp->s_hdr.u.l.oth;
-               /*
-                * Don't worry about sending to locally attached multicast
-                * QPs.  It is unspecified by the spec. what happens.
-                */
-       } else {
-               /* Header size in 32-bit words. */
-               lrh0 = IPATH_LRH_BTH;
-               ohdr = &qp->s_hdr.u.oth;
-       }
-       if (wqe->ud_wr.wr.opcode == IB_WR_SEND_WITH_IMM) {
-               qp->s_hdrwords++;
-               ohdr->u.ud.imm_data = wqe->ud_wr.wr.ex.imm_data;
-               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-       } else
-               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-       lrh0 |= ah_attr->sl << 4;
-       if (qp->ibqp.qp_type == IB_QPT_SMI)
-               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
-       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-       qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
-       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
-                                          SIZE_OF_CRC);
-       lid = dev->dd->ipath_lid;
-       if (lid) {
-               lid |= ah_attr->src_path_bits &
-                       ((1 << dev->dd->ipath_lmc) - 1);
-               qp->s_hdr.lrh[3] = cpu_to_be16(lid);
-       } else
-               qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
-       if (wqe->ud_wr.wr.send_flags & IB_SEND_SOLICITED)
-               bth0 |= 1 << 23;
-       bth0 |= extra_bytes << 20;
-       bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
-               ipath_get_pkey(dev->dd, qp->s_pkey_index);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       /*
-        * Use the multicast QP if the destination LID is a multicast LID.
-        */
-       ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-               ah_attr->dlid != IPATH_PERMISSIVE_LID ?
-               cpu_to_be32(IPATH_MULTICAST_QPN) :
-               cpu_to_be32(wqe->ud_wr.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
-       /*
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-                                        qp->qkey : wqe->ud_wr.remote_qkey);
-       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_ud_rcv - receive an incoming UD packet
- * @dev: the device the packet came in on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_qp_rcv() to process an incoming UD packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       int opcode;
-       u32 hdrsize;
-       u32 pad;
-       struct ib_wc wc;
-       u32 qkey;
-       u32 src_qp;
-       u16 dlid;
-       int header_in_data;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12 + 8;   /* LRH + BTH + DETH */
-               qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-               src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
-               /*
-                * The header with GRH is 68 bytes and the core driver sets
-                * the eager header buffer size to 56 bytes so the last 12
-                * bytes of the IB header is in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       qkey = be32_to_cpu(((__be32 *) data)[1]);
-                       src_qp = be32_to_cpu(((__be32 *) data)[2]);
-                       data += 12;
-               } else {
-                       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-                       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-               }
-       }
-       src_qp &= IPATH_QPN_MASK;
-
-       /*
-        * Check that the permissive LID is only used on QP0
-        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
-        */
-       if (qp->ibqp.qp_num) {
-               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                            hdr->lrh[3] == IB_LID_PERMISSIVE)) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               if (unlikely(qkey != qp->qkey)) {
-                       /* XXX OK to lose a count once in a while. */
-                       dev->qkey_violations++;
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-       } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                  hdr->lrh[3] == IB_LID_PERMISSIVE) {
-               struct ib_smp *smp = (struct ib_smp *) data;
-
-               if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-       }
-
-       /*
-        * The opcode is in the low byte when its in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (qp->ibqp.qp_num > 1 &&
-           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else
-                       wc.ex.imm_data = ohdr->u.ud.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               hdrsize += sizeof(u32);
-       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-       } else {
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-
-       /* Get the number of bytes the message was padded by. */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4))) {
-               /* Drop incomplete packets. */
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-       tlen -= hdrsize + pad + 4;
-
-       /* Drop invalid MAD packets (see 13.5.3.1). */
-       if (unlikely((qp->ibqp.qp_num == 0 &&
-                     (tlen != 256 ||
-                      (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
-                    (qp->ibqp.qp_num == 1 &&
-                     (tlen != 256 ||
-                      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       wc.byte_len = tlen + sizeof(struct ib_grh);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & IPATH_R_REUSE_SGE)
-               qp->r_flags &= ~IPATH_R_REUSE_SGE;
-       else if (!ipath_get_rwqe(qp, 0)) {
-               /*
-                * Count VL15 packets dropped due to no receive buffer.
-                * Otherwise, count them as buffer overruns since usually,
-                * the HW will be able to receive packets even if there are
-                * no QPs with posted receive buffers.
-                */
-               if (qp->ibqp.qp_num == 0)
-                       dev->n_vl15_dropped++;
-               else
-                       dev->rcv_errors++;
-               goto bail;
-       }
-       /* Silently drop packets which are too big. */
-       if (wc.byte_len > qp->r_len) {
-               qp->r_flags |= IPATH_R_REUSE_SGE;
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-       if (has_grh) {
-               ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-                              sizeof(struct ib_grh));
-               wc.wc_flags |= IB_WC_GRH;
-       } else
-               ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
-       ipath_copy_sge(&qp->r_sge, data,
-                      wc.byte_len - sizeof(struct ib_grh));
-       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-               goto bail;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = src_qp;
-       /* XXX do we know which pkey matched? Only needed for GSI. */
-       wc.pkey_index = 0;
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
-       dlid = be16_to_cpu(hdr->lrh[1]);
-       /*
-        * Save the LMC lower bits if the destination LID is a unicast LID.
-        */
-       wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
-               dlid & ((1 << dev->dd->ipath_lmc) - 1);
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      (ohdr->bth[0] &
-                       cpu_to_be32(1 << 23)) != 0);
-
-bail:;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c
deleted file mode 100644 (file)
index d29b4da..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/mm.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-
-#include "ipath_kernel.h"
-
-static void __ipath_release_user_pages(struct page **p, size_t num_pages,
-                                  int dirty)
-{
-       size_t i;
-
-       for (i = 0; i < num_pages; i++) {
-               ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
-                          (unsigned long) num_pages, p[i]);
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
-}
-
-/* call with current->mm->mmap_sem held */
-static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-                                 struct page **p)
-{
-       unsigned long lock_limit;
-       size_t got;
-       int ret;
-
-       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-       if (num_pages > lock_limit) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
-                  (unsigned long) num_pages, start_page);
-
-       for (got = 0; got < num_pages; got += ret) {
-               ret = get_user_pages(current, current->mm,
-                                    start_page + got * PAGE_SIZE,
-                                    num_pages - got, 1, 1,
-                                    p + got, NULL);
-               if (ret < 0)
-                       goto bail_release;
-       }
-
-       current->mm->pinned_vm += num_pages;
-
-       ret = 0;
-       goto bail;
-
-bail_release:
-       __ipath_release_user_pages(p, got, 0);
-bail:
-       return ret;
-}
-
-/**
- * ipath_map_page - a safety wrapper around pci_map_page()
- *
- * A dma_addr of all 0's is interpreted by the chip as "disabled".
- * Unfortunately, it can also be a valid dma_addr returned on some
- * architectures.
- *
- * The powerpc iommu assigns dma_addrs in ascending order, so we don't
- * have to bother with retries or mapping a dummy page to insure we
- * don't just get the same mapping again.
- *
- * I'm sure we won't be so lucky with other iommu's, so FIXME.
- */
-dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
-       unsigned long offset, size_t size, int direction)
-{
-       dma_addr_t phys;
-
-       phys = pci_map_page(hwdev, page, offset, size, direction);
-
-       if (phys == 0) {
-               pci_unmap_page(hwdev, phys, size, direction);
-               phys = pci_map_page(hwdev, page, offset, size, direction);
-               /*
-                * FIXME: If we get 0 again, we should keep this page,
-                * map another, then free the 0 page.
-                */
-       }
-
-       return phys;
-}
-
-/**
- * ipath_map_single - a safety wrapper around pci_map_single()
- *
- * Same idea as ipath_map_page().
- */
-dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
-       int direction)
-{
-       dma_addr_t phys;
-
-       phys = pci_map_single(hwdev, ptr, size, direction);
-
-       if (phys == 0) {
-               pci_unmap_single(hwdev, phys, size, direction);
-               phys = pci_map_single(hwdev, ptr, size, direction);
-               /*
-                * FIXME: If we get 0 again, we should keep this page,
-                * map another, then free the 0 page.
-                */
-       }
-
-       return phys;
-}
-
-/**
- * ipath_get_user_pages - lock user pages into memory
- * @start_page: the start page
- * @num_pages: the number of pages
- * @p: the output page structures
- *
- * This function takes a given start page (page aligned user virtual
- * address) and pins it and the following specified number of pages.  For
- * now, num_pages is always 1, but that will probably change at some point
- * (because caller is doing expected sends on a single virtually contiguous
- * buffer, so we can do all pages at once).
- */
-int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-                        struct page **p)
-{
-       int ret;
-
-       down_write(&current->mm->mmap_sem);
-
-       ret = __ipath_get_user_pages(start_page, num_pages, p);
-
-       up_write(&current->mm->mmap_sem);
-
-       return ret;
-}
-
-void ipath_release_user_pages(struct page **p, size_t num_pages)
-{
-       down_write(&current->mm->mmap_sem);
-
-       __ipath_release_user_pages(p, num_pages, 1);
-
-       current->mm->pinned_vm -= num_pages;
-
-       up_write(&current->mm->mmap_sem);
-}
-
-struct ipath_user_pages_work {
-       struct work_struct work;
-       struct mm_struct *mm;
-       unsigned long num_pages;
-};
-
-static void user_pages_account(struct work_struct *_work)
-{
-       struct ipath_user_pages_work *work =
-               container_of(_work, struct ipath_user_pages_work, work);
-
-       down_write(&work->mm->mmap_sem);
-       work->mm->pinned_vm -= work->num_pages;
-       up_write(&work->mm->mmap_sem);
-       mmput(work->mm);
-       kfree(work);
-}
-
-void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
-{
-       struct ipath_user_pages_work *work;
-       struct mm_struct *mm;
-
-       __ipath_release_user_pages(p, num_pages, 1);
-
-       mm = get_task_mm(current);
-       if (!mm)
-               return;
-
-       work = kmalloc(sizeof(*work), GFP_KERNEL);
-       if (!work)
-               goto bail_mm;
-
-       INIT_WORK(&work->work, user_pages_account);
-       work->mm = mm;
-       work->num_pages = num_pages;
-
-       queue_work(ib_wq, &work->work);
-       return;
-
-bail_mm:
-       mmput(mm);
-       return;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c
deleted file mode 100644 (file)
index 8c12e3c..0000000
+++ /dev/null
@@ -1,874 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/uio.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-
-#include "ipath_kernel.h"
-#include "ipath_user_sdma.h"
-
-/* minimum size of header */
-#define IPATH_USER_SDMA_MIN_HEADER_LENGTH      64
-/* expected size of headers (for dma_pool) */
-#define IPATH_USER_SDMA_EXP_HEADER_LENGTH      64
-/* length mask in PBC (lower 11 bits) */
-#define IPATH_PBC_LENGTH_MASK                  ((1 << 11) - 1)
-
-struct ipath_user_sdma_pkt {
-       u8 naddr;               /* dimension of addr (1..3) ... */
-       u32 counter;            /* sdma pkts queued counter for this entry */
-       u64 added;              /* global descq number of entries */
-
-       struct {
-               u32 offset;                     /* offset for kvaddr, addr */
-               u32 length;                     /* length in page */
-               u8  put_page;                   /* should we put_page? */
-               u8  dma_mapped;                 /* is page dma_mapped? */
-               struct page *page;              /* may be NULL (coherent mem) */
-               void *kvaddr;                   /* FIXME: only for pio hack */
-               dma_addr_t addr;
-       } addr[4];   /* max pages, any more and we coalesce */
-       struct list_head list;  /* list element */
-};
-
-struct ipath_user_sdma_queue {
-       /*
-        * pkts sent to dma engine are queued on this
-        * list head.  the type of the elements of this
-        * list are struct ipath_user_sdma_pkt...
-        */
-       struct list_head sent;
-
-       /* headers with expected length are allocated from here... */
-       char header_cache_name[64];
-       struct dma_pool *header_cache;
-
-       /* packets are allocated from the slab cache... */
-       char pkt_slab_name[64];
-       struct kmem_cache *pkt_slab;
-
-       /* as packets go on the queued queue, they are counted... */
-       u32 counter;
-       u32 sent_counter;
-
-       /* dma page table */
-       struct rb_root dma_pages_root;
-
-       /* protect everything above... */
-       struct mutex lock;
-};
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
-{
-       struct ipath_user_sdma_queue *pq =
-               kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
-
-       if (!pq)
-               goto done;
-
-       pq->counter = 0;
-       pq->sent_counter = 0;
-       INIT_LIST_HEAD(&pq->sent);
-
-       mutex_init(&pq->lock);
-
-       snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
-                "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
-       pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
-                                        sizeof(struct ipath_user_sdma_pkt),
-                                        0, 0, NULL);
-
-       if (!pq->pkt_slab)
-               goto err_kfree;
-
-       snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
-                "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
-       pq->header_cache = dma_pool_create(pq->header_cache_name,
-                                          dev,
-                                          IPATH_USER_SDMA_EXP_HEADER_LENGTH,
-                                          4, 0);
-       if (!pq->header_cache)
-               goto err_slab;
-
-       pq->dma_pages_root = RB_ROOT;
-
-       goto done;
-
-err_slab:
-       kmem_cache_destroy(pq->pkt_slab);
-err_kfree:
-       kfree(pq);
-       pq = NULL;
-
-done:
-       return pq;
-}
-
-static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
-                                     int i, size_t offset, size_t len,
-                                     int put_page, int dma_mapped,
-                                     struct page *page,
-                                     void *kvaddr, dma_addr_t dma_addr)
-{
-       pkt->addr[i].offset = offset;
-       pkt->addr[i].length = len;
-       pkt->addr[i].put_page = put_page;
-       pkt->addr[i].dma_mapped = dma_mapped;
-       pkt->addr[i].page = page;
-       pkt->addr[i].kvaddr = kvaddr;
-       pkt->addr[i].addr = dma_addr;
-}
-
-static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
-                                       u32 counter, size_t offset,
-                                       size_t len, int dma_mapped,
-                                       struct page *page,
-                                       void *kvaddr, dma_addr_t dma_addr)
-{
-       pkt->naddr = 1;
-       pkt->counter = counter;
-       ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
-                                 kvaddr, dma_addr);
-}
-
-/* we've too many pages in the iovec, coalesce to a single page */
-static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
-                                   struct ipath_user_sdma_pkt *pkt,
-                                   const struct iovec *iov,
-                                   unsigned long niov) {
-       int ret = 0;
-       struct page *page = alloc_page(GFP_KERNEL);
-       void *mpage_save;
-       char *mpage;
-       int i;
-       int len = 0;
-       dma_addr_t dma_addr;
-
-       if (!page) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       mpage = kmap(page);
-       mpage_save = mpage;
-       for (i = 0; i < niov; i++) {
-               int cfur;
-
-               cfur = copy_from_user(mpage,
-                                     iov[i].iov_base, iov[i].iov_len);
-               if (cfur) {
-                       ret = -EFAULT;
-                       goto free_unmap;
-               }
-
-               mpage += iov[i].iov_len;
-               len += iov[i].iov_len;
-       }
-
-       dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
-                               DMA_TO_DEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-               ret = -ENOMEM;
-               goto free_unmap;
-       }
-
-       ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
-                                 dma_addr);
-       pkt->naddr = 2;
-
-       goto done;
-
-free_unmap:
-       kunmap(page);
-       __free_page(page);
-done:
-       return ret;
-}
-
-/* how many pages in this iovec element? */
-static int ipath_user_sdma_num_pages(const struct iovec *iov)
-{
-       const unsigned long addr  = (unsigned long) iov->iov_base;
-       const unsigned long  len  = iov->iov_len;
-       const unsigned long spage = addr & PAGE_MASK;
-       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
-
-       return 1 + ((epage - spage) >> PAGE_SHIFT);
-}
-
-/* truncate length to page boundary */
-static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
-{
-       const unsigned long offset = offset_in_page(addr);
-
-       return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
-}
-
-static void ipath_user_sdma_free_pkt_frag(struct device *dev,
-                                         struct ipath_user_sdma_queue *pq,
-                                         struct ipath_user_sdma_pkt *pkt,
-                                         int frag)
-{
-       const int i = frag;
-
-       if (pkt->addr[i].page) {
-               if (pkt->addr[i].dma_mapped)
-                       dma_unmap_page(dev,
-                                      pkt->addr[i].addr,
-                                      pkt->addr[i].length,
-                                      DMA_TO_DEVICE);
-
-               if (pkt->addr[i].kvaddr)
-                       kunmap(pkt->addr[i].page);
-
-               if (pkt->addr[i].put_page)
-                       put_page(pkt->addr[i].page);
-               else
-                       __free_page(pkt->addr[i].page);
-       } else if (pkt->addr[i].kvaddr)
-               /* free coherent mem from cache... */
-               dma_pool_free(pq->header_cache,
-                             pkt->addr[i].kvaddr, pkt->addr[i].addr);
-}
-
-/* return number of pages pinned... */
-static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
-                                    struct ipath_user_sdma_pkt *pkt,
-                                    unsigned long addr, int tlen, int npages)
-{
-       struct page *pages[2];
-       int j;
-       int ret;
-
-       ret = get_user_pages_fast(addr, npages, 0, pages);
-       if (ret != npages) {
-               int i;
-
-               for (i = 0; i < ret; i++)
-                       put_page(pages[i]);
-
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       for (j = 0; j < npages; j++) {
-               /* map the pages... */
-               const int flen =
-                       ipath_user_sdma_page_length(addr, tlen);
-               dma_addr_t dma_addr =
-                       dma_map_page(&dd->pcidev->dev,
-                                    pages[j], 0, flen, DMA_TO_DEVICE);
-               unsigned long fofs = offset_in_page(addr);
-
-               if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-
-               ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
-                                         pages[j], kmap(pages[j]),
-                                         dma_addr);
-
-               pkt->naddr++;
-               addr += flen;
-               tlen -= flen;
-       }
-
-done:
-       return ret;
-}
-
-static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
-                                  struct ipath_user_sdma_queue *pq,
-                                  struct ipath_user_sdma_pkt *pkt,
-                                  const struct iovec *iov,
-                                  unsigned long niov)
-{
-       int ret = 0;
-       unsigned long idx;
-
-       for (idx = 0; idx < niov; idx++) {
-               const int npages = ipath_user_sdma_num_pages(iov + idx);
-               const unsigned long addr = (unsigned long) iov[idx].iov_base;
-
-               ret = ipath_user_sdma_pin_pages(dd, pkt,
-                                               addr, iov[idx].iov_len,
-                                               npages);
-               if (ret < 0)
-                       goto free_pkt;
-       }
-
-       goto done;
-
-free_pkt:
-       for (idx = 0; idx < pkt->naddr; idx++)
-               ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
-
-done:
-       return ret;
-}
-
-static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
-                                       struct ipath_user_sdma_queue *pq,
-                                       struct ipath_user_sdma_pkt *pkt,
-                                       const struct iovec *iov,
-                                       unsigned long niov, int npages)
-{
-       int ret = 0;
-
-       if (npages >= ARRAY_SIZE(pkt->addr))
-               ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
-       else
-               ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
-
-       return ret;
-}
-
-/* free a packet list -- return counter value of last packet */
-static void ipath_user_sdma_free_pkt_list(struct device *dev,
-                                         struct ipath_user_sdma_queue *pq,
-                                         struct list_head *list)
-{
-       struct ipath_user_sdma_pkt *pkt, *pkt_next;
-
-       list_for_each_entry_safe(pkt, pkt_next, list, list) {
-               int i;
-
-               for (i = 0; i < pkt->naddr; i++)
-                       ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
-
-               kmem_cache_free(pq->pkt_slab, pkt);
-       }
-}
-
-/*
- * copy headers, coalesce etc -- pq->lock must be held
- *
- * we queue all the packets to list, returning the
- * number of bytes total.  list must be empty initially,
- * as, if there is an error we clean it...
- */
-static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
-                                     struct ipath_user_sdma_queue *pq,
-                                     struct list_head *list,
-                                     const struct iovec *iov,
-                                     unsigned long niov,
-                                     int maxpkts)
-{
-       unsigned long idx = 0;
-       int ret = 0;
-       int npkts = 0;
-       struct page *page = NULL;
-       __le32 *pbc;
-       dma_addr_t dma_addr;
-       struct ipath_user_sdma_pkt *pkt = NULL;
-       size_t len;
-       size_t nw;
-       u32 counter = pq->counter;
-       int dma_mapped = 0;
-
-       while (idx < niov && npkts < maxpkts) {
-               const unsigned long addr = (unsigned long) iov[idx].iov_base;
-               const unsigned long idx_save = idx;
-               unsigned pktnw;
-               unsigned pktnwc;
-               int nfrags = 0;
-               int npages = 0;
-               int cfur;
-
-               dma_mapped = 0;
-               len = iov[idx].iov_len;
-               nw = len >> 2;
-               page = NULL;
-
-               pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
-               if (!pkt) {
-                       ret = -ENOMEM;
-                       goto free_list;
-               }
-
-               if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
-                   len > PAGE_SIZE || len & 3 || addr & 3) {
-                       ret = -EINVAL;
-                       goto free_pkt;
-               }
-
-               if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
-                       pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
-                                            &dma_addr);
-               else
-                       pbc = NULL;
-
-               if (!pbc) {
-                       page = alloc_page(GFP_KERNEL);
-                       if (!page) {
-                               ret = -ENOMEM;
-                               goto free_pkt;
-                       }
-                       pbc = kmap(page);
-               }
-
-               cfur = copy_from_user(pbc, iov[idx].iov_base, len);
-               if (cfur) {
-                       ret = -EFAULT;
-                       goto free_pbc;
-               }
-
-               /*
-                * this assignment is a bit strange.  it's because the
-                * the pbc counts the number of 32 bit words in the full
-                * packet _except_ the first word of the pbc itself...
-                */
-               pktnwc = nw - 1;
-
-               /*
-                * pktnw computation yields the number of 32 bit words
-                * that the caller has indicated in the PBC.  note that
-                * this is one less than the total number of words that
-                * goes to the send DMA engine as the first 32 bit word
-                * of the PBC itself is not counted.  Armed with this count,
-                * we can verify that the packet is consistent with the
-                * iovec lengths.
-                */
-               pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
-               if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
-                       ret = -EINVAL;
-                       goto free_pbc;
-               }
-
-
-               idx++;
-               while (pktnwc < pktnw && idx < niov) {
-                       const size_t slen = iov[idx].iov_len;
-                       const unsigned long faddr =
-                               (unsigned long) iov[idx].iov_base;
-
-                       if (slen & 3 || faddr & 3 || !slen ||
-                           slen > PAGE_SIZE) {
-                               ret = -EINVAL;
-                               goto free_pbc;
-                       }
-
-                       npages++;
-                       if ((faddr & PAGE_MASK) !=
-                           ((faddr + slen - 1) & PAGE_MASK))
-                               npages++;
-
-                       pktnwc += slen >> 2;
-                       idx++;
-                       nfrags++;
-               }
-
-               if (pktnwc != pktnw) {
-                       ret = -EINVAL;
-                       goto free_pbc;
-               }
-
-               if (page) {
-                       dma_addr = dma_map_page(&dd->pcidev->dev,
-                                               page, 0, len, DMA_TO_DEVICE);
-                       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-                               ret = -ENOMEM;
-                               goto free_pbc;
-                       }
-
-                       dma_mapped = 1;
-               }
-
-               ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
-                                           page, pbc, dma_addr);
-
-               if (nfrags) {
-                       ret = ipath_user_sdma_init_payload(dd, pq, pkt,
-                                                          iov + idx_save + 1,
-                                                          nfrags, npages);
-                       if (ret < 0)
-                               goto free_pbc_dma;
-               }
-
-               counter++;
-               npkts++;
-
-               list_add_tail(&pkt->list, list);
-       }
-
-       ret = idx;
-       goto done;
-
-free_pbc_dma:
-       if (dma_mapped)
-               dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
-free_pbc:
-       if (page) {
-               kunmap(page);
-               __free_page(page);
-       } else
-               dma_pool_free(pq->header_cache, pbc, dma_addr);
-free_pkt:
-       kmem_cache_free(pq->pkt_slab, pkt);
-free_list:
-       ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
-done:
-       return ret;
-}
-
-static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
-                                                u32 c)
-{
-       pq->sent_counter = c;
-}
-
-/* try to clean out queue -- needs pq->lock */
-static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
-                                      struct ipath_user_sdma_queue *pq)
-{
-       struct list_head free_list;
-       struct ipath_user_sdma_pkt *pkt;
-       struct ipath_user_sdma_pkt *pkt_prev;
-       int ret = 0;
-
-       INIT_LIST_HEAD(&free_list);
-
-       list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
-               s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
-
-               if (descd < 0)
-                       break;
-
-               list_move_tail(&pkt->list, &free_list);
-
-               /* one more packet cleaned */
-               ret++;
-       }
-
-       if (!list_empty(&free_list)) {
-               u32 counter;
-
-               pkt = list_entry(free_list.prev,
-                                struct ipath_user_sdma_pkt, list);
-               counter = pkt->counter;
-
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-               ipath_user_sdma_set_complete_counter(pq, counter);
-       }
-
-       return ret;
-}
-
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
-{
-       if (!pq)
-               return;
-
-       kmem_cache_destroy(pq->pkt_slab);
-       dma_pool_destroy(pq->header_cache);
-       kfree(pq);
-}
-
-/* clean descriptor queue, returns > 0 if some elements cleaned */
-static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
-{
-       int ret;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       ret = ipath_sdma_make_progress(dd);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       return ret;
-}
-
-/* we're in close, drain packets so that we can cleanup successfully... */
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-                                struct ipath_user_sdma_queue *pq)
-{
-       int i;
-
-       if (!pq)
-               return;
-
-       for (i = 0; i < 100; i++) {
-               mutex_lock(&pq->lock);
-               if (list_empty(&pq->sent)) {
-                       mutex_unlock(&pq->lock);
-                       break;
-               }
-               ipath_user_sdma_hwqueue_clean(dd);
-               ipath_user_sdma_queue_clean(dd, pq);
-               mutex_unlock(&pq->lock);
-               msleep(10);
-       }
-
-       if (!list_empty(&pq->sent)) {
-               struct list_head free_list;
-
-               printk(KERN_INFO "drain: lists not empty: forcing!\n");
-               INIT_LIST_HEAD(&free_list);
-               mutex_lock(&pq->lock);
-               list_splice_init(&pq->sent, &free_list);
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-               mutex_unlock(&pq->lock);
-       }
-}
-
-static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
-                                          u64 addr, u64 dwlen, u64 dwoffset)
-{
-       return cpu_to_le64(/* SDmaPhyAddr[31:0] */
-                          ((addr & 0xfffffffcULL) << 32) |
-                          /* SDmaGeneration[1:0] */
-                          ((dd->ipath_sdma_generation & 3ULL) << 30) |
-                          /* SDmaDwordCount[10:0] */
-                          ((dwlen & 0x7ffULL) << 16) |
-                          /* SDmaBufOffset[12:2] */
-                          (dwoffset & 0x7ffULL));
-}
-
-static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
-{
-       return descq | cpu_to_le64(1ULL << 12);
-}
-
-static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
-{
-                                             /* last */  /* dma head */
-       return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
-}
-
-static inline __le64 ipath_sdma_make_desc1(u64 addr)
-{
-       /* SDmaPhyAddr[47:32] */
-       return cpu_to_le64(addr >> 32);
-}
-
-static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
-                                     struct ipath_user_sdma_pkt *pkt, int idx,
-                                     unsigned ofs, u16 tail)
-{
-       const u64 addr = (u64) pkt->addr[idx].addr +
-               (u64) pkt->addr[idx].offset;
-       const u64 dwlen = (u64) pkt->addr[idx].length / 4;
-       __le64 *descqp;
-       __le64 descq0;
-
-       descqp = &dd->ipath_sdma_descq[tail].qw[0];
-
-       descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
-       if (idx == 0)
-               descq0 = ipath_sdma_make_first_desc0(descq0);
-       if (idx == pkt->naddr - 1)
-               descq0 = ipath_sdma_make_last_desc0(descq0);
-
-       descqp[0] = descq0;
-       descqp[1] = ipath_sdma_make_desc1(addr);
-}
-
-/* pq->lock must be held, get packets on the wire... */
-static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
-                                    struct ipath_user_sdma_queue *pq,
-                                    struct list_head *pktlist)
-{
-       int ret = 0;
-       unsigned long flags;
-       u16 tail;
-
-       if (list_empty(pktlist))
-               return 0;
-
-       if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
-               return -ECOMM;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
-               ret = -ECOMM;
-               goto unlock;
-       }
-
-       tail = dd->ipath_sdma_descq_tail;
-       while (!list_empty(pktlist)) {
-               struct ipath_user_sdma_pkt *pkt =
-                       list_entry(pktlist->next, struct ipath_user_sdma_pkt,
-                                  list);
-               int i;
-               unsigned ofs = 0;
-               u16 dtail = tail;
-
-               if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
-                       goto unlock_check_tail;
-
-               for (i = 0; i < pkt->naddr; i++) {
-                       ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
-                       ofs += pkt->addr[i].length >> 2;
-
-                       if (++tail == dd->ipath_sdma_descq_cnt) {
-                               tail = 0;
-                               ++dd->ipath_sdma_generation;
-                       }
-               }
-
-               if ((ofs<<2) > dd->ipath_ibmaxlen) {
-                       ipath_dbg("packet size %X > ibmax %X, fail\n",
-                               ofs<<2, dd->ipath_ibmaxlen);
-                       ret = -EMSGSIZE;
-                       goto unlock;
-               }
-
-               /*
-                * if the packet is >= 2KB mtu equivalent, we have to use
-                * the large buffers, and have to mark each descriptor as
-                * part of a large buffer packet.
-                */
-               if (ofs >= IPATH_SMALLBUF_DWORDS) {
-                       for (i = 0; i < pkt->naddr; i++) {
-                               dd->ipath_sdma_descq[dtail].qw[0] |=
-                                       cpu_to_le64(1ULL << 14);
-                               if (++dtail == dd->ipath_sdma_descq_cnt)
-                                       dtail = 0;
-                       }
-               }
-
-               dd->ipath_sdma_descq_added += pkt->naddr;
-               pkt->added = dd->ipath_sdma_descq_added;
-               list_move_tail(&pkt->list, &pq->sent);
-               ret++;
-       }
-
-unlock_check_tail:
-       /* advance the tail on the chip if necessary */
-       if (dd->ipath_sdma_descq_tail != tail) {
-               wmb();
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-               dd->ipath_sdma_descq_tail = tail;
-       }
-
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       return ret;
-}
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-                          struct ipath_user_sdma_queue *pq,
-                          const struct iovec *iov,
-                          unsigned long dim)
-{
-       int ret = 0;
-       struct list_head list;
-       int npkts = 0;
-
-       INIT_LIST_HEAD(&list);
-
-       mutex_lock(&pq->lock);
-
-       if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
-               ipath_user_sdma_hwqueue_clean(dd);
-               ipath_user_sdma_queue_clean(dd, pq);
-       }
-
-       while (dim) {
-               const int mxp = 8;
-
-               ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
-               if (ret <= 0)
-                       goto done_unlock;
-               else {
-                       dim -= ret;
-                       iov += ret;
-               }
-
-               /* force packets onto the sdma hw queue... */
-               if (!list_empty(&list)) {
-                       /*
-                        * lazily clean hw queue.  the 4 is a guess of about
-                        * how many sdma descriptors a packet will take (it
-                        * doesn't have to be perfect).
-                        */
-                       if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
-                               ipath_user_sdma_hwqueue_clean(dd);
-                               ipath_user_sdma_queue_clean(dd, pq);
-                       }
-
-                       ret = ipath_user_sdma_push_pkts(dd, pq, &list);
-                       if (ret < 0)
-                               goto done_unlock;
-                       else {
-                               npkts += ret;
-                               pq->counter += ret;
-
-                               if (!list_empty(&list))
-                                       goto done_unlock;
-                       }
-               }
-       }
-
-done_unlock:
-       if (!list_empty(&list))
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
-       mutex_unlock(&pq->lock);
-
-       return (ret < 0) ? ret : npkts;
-}
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-                                 struct ipath_user_sdma_queue *pq)
-{
-       int ret = 0;
-
-       mutex_lock(&pq->lock);
-       ipath_user_sdma_hwqueue_clean(dd);
-       ret = ipath_user_sdma_queue_clean(dd, pq);
-       mutex_unlock(&pq->lock);
-
-       return ret;
-}
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
-{
-       return pq->sent_counter;
-}
-
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
-{
-       return pq->counter;
-}
-
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h
deleted file mode 100644 (file)
index fc76316..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/device.h>
-
-struct ipath_user_sdma_queue;
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-                          struct ipath_user_sdma_queue *pq,
-                          const struct iovec *iov,
-                          unsigned long dim);
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-                                 struct ipath_user_sdma_queue *pq);
-
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-                                struct ipath_user_sdma_queue *pq);
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
deleted file mode 100644 (file)
index 53f9dca..0000000
+++ /dev/null
@@ -1,2376 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_user_verbs.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/utsname.h>
-#include <linux/rculist.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-static unsigned int ib_ipath_qp_table_size = 251;
-module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
-MODULE_PARM_DESC(qp_table_size, "QP table size");
-
-unsigned int ib_ipath_lkey_table_size = 12;
-module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
-                  S_IRUGO);
-MODULE_PARM_DESC(lkey_table_size,
-                "LKEY table size in bits (2^n, 1 <= n <= 23)");
-
-static unsigned int ib_ipath_max_pds = 0xFFFF;
-module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_pds,
-                "Maximum number of protection domains to support");
-
-static unsigned int ib_ipath_max_ahs = 0xFFFF;
-module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
-
-unsigned int ib_ipath_max_cqes = 0x2FFFF;
-module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqes,
-                "Maximum number of completion queue entries to support");
-
-unsigned int ib_ipath_max_cqs = 0x1FFFF;
-module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
-
-unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
-module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
-                  S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
-
-unsigned int ib_ipath_max_qps = 16384;
-module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
-
-unsigned int ib_ipath_max_sges = 0x60;
-module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
-
-unsigned int ib_ipath_max_mcast_grps = 16384;
-module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
-                  S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_grps,
-                "Maximum number of multicast groups to support");
-
-unsigned int ib_ipath_max_mcast_qp_attached = 16;
-module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_qp_attached,
-                "Maximum number of attached QPs to support");
-
-unsigned int ib_ipath_max_srqs = 1024;
-module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
-
-unsigned int ib_ipath_max_srq_sges = 128;
-module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
-
-unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
-module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
-
-static unsigned int ib_ipath_disable_sma;
-module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(disable_sma, "Disable the SMA");
-
-/*
- * Note that it is OK to post send work requests in the SQE and ERR
- * states; ipath_do_send() will process them and generate error
- * completions as per IB 1.2 C10-96.
- */
-const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
-       [IB_QPS_RESET] = 0,
-       [IB_QPS_INIT] = IPATH_POST_RECV_OK,
-       [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
-       [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
-           IPATH_PROCESS_NEXT_SEND_OK,
-       [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
-       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-       [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
-           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-};
-
-struct ipath_ucontext {
-       struct ib_ucontext ibucontext;
-};
-
-static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
-                                                 *ibucontext)
-{
-       return container_of(ibucontext, struct ipath_ucontext, ibucontext);
-}
-
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
-       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-       [IB_WR_SEND] = IB_WC_SEND,
-       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
-/*
- * System image GUID.
- */
-static __be64 sys_image_guid;
-
-/**
- * ipath_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- */
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               memcpy(sge->vaddr, data, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-}
-
-/**
- * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
- * @ss: the SGE state
- * @length: the number of bytes to skip
- */
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-}
-
-/*
- * Count the number of DMA descriptors needed to send length bytes of data.
- * Don't modify the ipath_sge_state to get the count.
- * Return zero if any of the segments is not aligned.
- */
-static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sg_list = ss->sg_list;
-       struct ipath_sge sge = ss->sge;
-       u8 num_sge = ss->num_sge;
-       u32 ndesc = 1;  /* count the header */
-
-       while (length) {
-               u32 len = sge.length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge.sge_length)
-                       len = sge.sge_length;
-               BUG_ON(len == 0);
-               if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
-                   (len != length && (len & (sizeof(u32) - 1)))) {
-                       ndesc = 0;
-                       break;
-               }
-               ndesc++;
-               sge.vaddr += len;
-               sge.length -= len;
-               sge.sge_length -= len;
-               if (sge.sge_length == 0) {
-                       if (--num_sge)
-                               sge = *sg_list++;
-               } else if (sge.length == 0 && sge.mr != NULL) {
-                       if (++sge.n >= IPATH_SEGSZ) {
-                               if (++sge.m >= sge.mr->mapsz)
-                                       break;
-                               sge.n = 0;
-                       }
-                       sge.vaddr =
-                               sge.mr->map[sge.m]->segs[sge.n].vaddr;
-                       sge.length =
-                               sge.mr->map[sge.m]->segs[sge.n].length;
-               }
-               length -= len;
-       }
-       return ndesc;
-}
-
-/*
- * Copy from the SGEs to the data buffer.
- */
-static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
-                               u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               memcpy(data, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-}
-
-/**
- * ipath_post_one_send - post one RC, UC, or UD send work request
- * @qp: the QP to post on
- * @wr: the work request to send
- */
-static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-       struct ipath_swqe *wqe;
-       u32 next;
-       int i;
-       int j;
-       int acc;
-       int ret;
-       unsigned long flags;
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (qp->ibqp.qp_type != IB_QPT_SMI &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               ret = -ENETDOWN;
-               goto bail;
-       }
-
-       /* Check that state is OK to post send. */
-       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
-               goto bail_inval;
-
-       /* IB spec says that num_sge == 0 is OK. */
-       if (wr->num_sge > qp->s_max_sge)
-               goto bail_inval;
-
-       /*
-        * Don't allow RDMA reads or atomic operations on UC or
-        * undefined operations.
-        * Make sure buffer is large enough to hold the result for atomics.
-        */
-       if (qp->ibqp.qp_type == IB_QPT_UC) {
-               if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
-                       goto bail_inval;
-       } else if (qp->ibqp.qp_type == IB_QPT_UD) {
-               /* Check UD opcode */
-               if (wr->opcode != IB_WR_SEND &&
-                   wr->opcode != IB_WR_SEND_WITH_IMM)
-                       goto bail_inval;
-               /* Check UD destination address PD */
-               if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
-                       goto bail_inval;
-       } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
-               goto bail_inval;
-       else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-                  (wr->num_sge == 0 ||
-                   wr->sg_list[0].length < sizeof(u64) ||
-                   wr->sg_list[0].addr & (sizeof(u64) - 1)))
-               goto bail_inval;
-       else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
-               goto bail_inval;
-
-       next = qp->s_head + 1;
-       if (next >= qp->s_size)
-               next = 0;
-       if (next == qp->s_last) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       wqe = get_swqe_ptr(qp, qp->s_head);
-
-       if (qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_RC)
-               memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
-       else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
-                wr->opcode == IB_WR_RDMA_WRITE ||
-                wr->opcode == IB_WR_RDMA_READ)
-               memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
-       else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-               memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
-       else
-               memcpy(&wqe->wr, wr, sizeof(wqe->wr));
-
-       wqe->length = 0;
-       if (wr->num_sge) {
-               acc = wr->opcode >= IB_WR_RDMA_READ ?
-                       IB_ACCESS_LOCAL_WRITE : 0;
-               for (i = 0, j = 0; i < wr->num_sge; i++) {
-                       u32 length = wr->sg_list[i].length;
-                       int ok;
-
-                       if (length == 0)
-                               continue;
-                       ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
-                                          &wr->sg_list[i], acc);
-                       if (!ok)
-                               goto bail_inval;
-                       wqe->length += length;
-                       j++;
-               }
-               wqe->wr.num_sge = j;
-       }
-       if (qp->ibqp.qp_type == IB_QPT_UC ||
-           qp->ibqp.qp_type == IB_QPT_RC) {
-               if (wqe->length > 0x80000000U)
-                       goto bail_inval;
-       } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
-               goto bail_inval;
-       wqe->ssn = qp->s_ssn++;
-       qp->s_head = next;
-
-       ret = 0;
-       goto bail;
-
-bail_inval:
-       ret = -EINVAL;
-bail:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_post_send - post a send on a QP
- * @ibqp: the QP to post the send on
- * @wr: the list of work requests to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
-                          struct ib_send_wr **bad_wr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       int err = 0;
-
-       for (; wr; wr = wr->next) {
-               err = ipath_post_one_send(qp, wr);
-               if (err) {
-                       *bad_wr = wr;
-                       goto bail;
-               }
-       }
-
-       /* Try to do the send work in the caller's context. */
-       ipath_do_send((unsigned long) qp);
-
-bail:
-       return err;
-}
-
-/**
- * ipath_post_receive - post a receive on a QP
- * @ibqp: the QP to post the receive on
- * @wr: the WR to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
-                             struct ib_recv_wr **bad_wr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_rwq *wq = qp->r_rq.wq;
-       unsigned long flags;
-       int ret;
-
-       /* Check that state is OK to post receive. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
-               *bad_wr = wr;
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       for (; wr; wr = wr->next) {
-               struct ipath_rwqe *wqe;
-               u32 next;
-               int i;
-
-               if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
-                       *bad_wr = wr;
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               spin_lock_irqsave(&qp->r_rq.lock, flags);
-               next = wq->head + 1;
-               if (next >= qp->r_rq.size)
-                       next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-                       *bad_wr = wr;
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
-               wqe->wr_id = wr->wr_id;
-               wqe->num_sge = wr->num_sge;
-               for (i = 0; i < wr->num_sge; i++)
-                       wqe->sg_list[i] = wr->sg_list[i];
-               /* Make sure queue entry is written before the head index. */
-               smp_wmb();
-               wq->head = next;
-               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_qp_rcv - processing an incoming packet on a QP
- * @dev: the device the packet came on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_ib_rcv() to process an incoming packet
- * for the given QP.
- * Called at interrupt level.
- */
-static void ipath_qp_rcv(struct ipath_ibdev *dev,
-                        struct ipath_ib_header *hdr, int has_grh,
-                        void *data, u32 tlen, struct ipath_qp *qp)
-{
-       /* Check for valid receive state. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               return;
-       }
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               if (ib_ipath_disable_sma)
-                       break;
-               /* FALLTHROUGH */
-       case IB_QPT_UD:
-               ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       case IB_QPT_RC:
-               ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       case IB_QPT_UC:
-               ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       default:
-               break;
-       }
-}
-
-/**
- * ipath_ib_rcv - process an incoming packet
- * @arg: the device pointer
- * @rhdr: the header of the packet
- * @data: the packet data
- * @tlen: the packet length
- *
- * This is called from ipath_kreceive() to process an incoming packet at
- * interrupt level. Tlen is the length of the header + data + CRC in bytes.
- */
-void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
-                 u32 tlen)
-{
-       struct ipath_ib_header *hdr = rhdr;
-       struct ipath_other_headers *ohdr;
-       struct ipath_qp *qp;
-       u32 qp_num;
-       int lnh;
-       u8 opcode;
-       u16 lid;
-
-       if (unlikely(dev == NULL))
-               goto bail;
-
-       if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
-               dev->rcv_errors++;
-               goto bail;
-       }
-
-       /* Check for a valid destination LID (see ch. 7.11.1). */
-       lid = be16_to_cpu(hdr->lrh[1]);
-       if (lid < IPATH_MULTICAST_LID_BASE) {
-               lid &= ~((1 << dev->dd->ipath_lmc) - 1);
-               if (unlikely(lid != dev->dd->ipath_lid)) {
-                       dev->rcv_errors++;
-                       goto bail;
-               }
-       }
-
-       /* Check for GRH */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == IPATH_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == IPATH_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else {
-               dev->rcv_errors++;
-               goto bail;
-       }
-
-       opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
-       dev->opstats[opcode].n_bytes += tlen;
-       dev->opstats[opcode].n_packets++;
-
-       /* Get the destination QP number. */
-       qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
-       if (qp_num == IPATH_MULTICAST_QPN) {
-               struct ipath_mcast *mcast;
-               struct ipath_mcast_qp *p;
-
-               if (lnh != IPATH_LRH_GRH) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
-               if (mcast == NULL) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               dev->n_multicast_rcv++;
-               list_for_each_entry_rcu(p, &mcast->qp_list, list)
-                       ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
-               /*
-                * Notify ipath_multicast_detach() if it is waiting for us
-                * to finish.
-                */
-               if (atomic_dec_return(&mcast->refcount) <= 1)
-                       wake_up(&mcast->wait);
-       } else {
-               qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
-               if (qp) {
-                       dev->n_unicast_rcv++;
-                       ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
-                                    tlen, qp);
-                       /*
-                        * Notify ipath_destroy_qp() if it is waiting
-                        * for us to finish.
-                        */
-                       if (atomic_dec_and_test(&qp->refcount))
-                               wake_up(&qp->wait);
-               } else
-                       dev->n_pkt_drops++;
-       }
-
-bail:;
-}
-
-/**
- * ipath_ib_timer - verbs timer
- * @arg: the device pointer
- *
- * This is called from ipath_do_rcv_timer() at interrupt level to check for
- * QPs which need retransmits and to collect performance numbers.
- */
-static void ipath_ib_timer(struct ipath_ibdev *dev)
-{
-       struct ipath_qp *resend = NULL;
-       struct ipath_qp *rnr = NULL;
-       struct list_head *last;
-       struct ipath_qp *qp;
-       unsigned long flags;
-
-       if (dev == NULL)
-               return;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       /* Start filling the next pending queue. */
-       if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
-               dev->pending_index = 0;
-       /* Save any requests still in the new queue, they have timed out. */
-       last = &dev->pending[dev->pending_index];
-       while (!list_empty(last)) {
-               qp = list_entry(last->next, struct ipath_qp, timerwait);
-               list_del_init(&qp->timerwait);
-               qp->timer_next = resend;
-               resend = qp;
-               atomic_inc(&qp->refcount);
-       }
-       last = &dev->rnrwait;
-       if (!list_empty(last)) {
-               qp = list_entry(last->next, struct ipath_qp, timerwait);
-               if (--qp->s_rnr_timeout == 0) {
-                       do {
-                               list_del_init(&qp->timerwait);
-                               qp->timer_next = rnr;
-                               rnr = qp;
-                               atomic_inc(&qp->refcount);
-                               if (list_empty(last))
-                                       break;
-                               qp = list_entry(last->next, struct ipath_qp,
-                                               timerwait);
-                       } while (qp->s_rnr_timeout == 0);
-               }
-       }
-       /*
-        * We should only be in the started state if pma_sample_start != 0
-        */
-       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
-           --dev->pma_sample_start == 0) {
-               dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
-               ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
-                                       &dev->ipath_rword,
-                                       &dev->ipath_spkts,
-                                       &dev->ipath_rpkts,
-                                       &dev->ipath_xmit_wait);
-       }
-       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
-               if (dev->pma_sample_interval == 0) {
-                       u64 ta, tb, tc, td, te;
-
-                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
-                       ipath_snapshot_counters(dev->dd, &ta, &tb,
-                                               &tc, &td, &te);
-
-                       dev->ipath_sword = ta - dev->ipath_sword;
-                       dev->ipath_rword = tb - dev->ipath_rword;
-                       dev->ipath_spkts = tc - dev->ipath_spkts;
-                       dev->ipath_rpkts = td - dev->ipath_rpkts;
-                       dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
-               } else {
-                       dev->pma_sample_interval--;
-               }
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       /* XXX What if timer fires again while this is running? */
-       while (resend != NULL) {
-               qp = resend;
-               resend = qp->timer_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (qp->s_last != qp->s_tail &&
-                   ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-                       dev->n_timeouts++;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-               }
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       while (rnr != NULL) {
-               qp = rnr;
-               rnr = qp->timer_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-}
-
-static void update_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       sge->vaddr += length;
-       sge->length -= length;
-       sge->sge_length -= length;
-       if (sge->sge_length == 0) {
-               if (--ss->num_sge)
-                       *sge = *ss->sg_list++;
-       } else if (sge->length == 0 && sge->mr != NULL) {
-               if (++sge->n >= IPATH_SEGSZ) {
-                       if (++sge->m >= sge->mr->mapsz)
-                               return;
-                       sge->n = 0;
-               }
-               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
-               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
-       }
-}
-
-#ifdef __LITTLE_ENDIAN
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-       return data >> shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-       return data << shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-       data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
-       data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-       return data;
-}
-#else
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-       return data << shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-       return data >> shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-       data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
-       data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-       return data;
-}
-#endif
-
-static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
-                   u32 length, unsigned flush_wc)
-{
-       u32 extra = 0;
-       u32 data = 0;
-       u32 last;
-
-       while (1) {
-               u32 len = ss->sge.length;
-               u32 off;
-
-               if (len > length)
-                       len = length;
-               if (len > ss->sge.sge_length)
-                       len = ss->sge.sge_length;
-               BUG_ON(len == 0);
-               /* If the source address is not aligned, try to align it. */
-               off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
-               if (off) {
-                       u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
-                                           ~(sizeof(u32) - 1));
-                       u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
-                       u32 y;
-
-                       y = sizeof(u32) - off;
-                       if (len > y)
-                               len = y;
-                       if (len + extra >= sizeof(u32)) {
-                               data |= set_upper_bits(v, extra *
-                                                      BITS_PER_BYTE);
-                               len = sizeof(u32) - extra;
-                               if (len == length) {
-                                       last = data;
-                                       break;
-                               }
-                               __raw_writel(data, piobuf);
-                               piobuf++;
-                               extra = 0;
-                               data = 0;
-                       } else {
-                               /* Clear unused upper bytes */
-                               data |= clear_upper_bytes(v, len, extra);
-                               if (len == length) {
-                                       last = data;
-                                       break;
-                               }
-                               extra += len;
-                       }
-               } else if (extra) {
-                       /* Source address is aligned. */
-                       u32 *addr = (u32 *) ss->sge.vaddr;
-                       int shift = extra * BITS_PER_BYTE;
-                       int ushift = 32 - shift;
-                       u32 l = len;
-
-                       while (l >= sizeof(u32)) {
-                               u32 v = *addr;
-
-                               data |= set_upper_bits(v, shift);
-                               __raw_writel(data, piobuf);
-                               data = get_upper_bits(v, ushift);
-                               piobuf++;
-                               addr++;
-                               l -= sizeof(u32);
-                       }
-                       /*
-                        * We still have 'extra' number of bytes leftover.
-                        */
-                       if (l) {
-                               u32 v = *addr;
-
-                               if (l + extra >= sizeof(u32)) {
-                                       data |= set_upper_bits(v, shift);
-                                       len -= l + extra - sizeof(u32);
-                                       if (len == length) {
-                                               last = data;
-                                               break;
-                                       }
-                                       __raw_writel(data, piobuf);
-                                       piobuf++;
-                                       extra = 0;
-                                       data = 0;
-                               } else {
-                                       /* Clear unused upper bytes */
-                                       data |= clear_upper_bytes(v, l,
-                                                                 extra);
-                                       if (len == length) {
-                                               last = data;
-                                               break;
-                                       }
-                                       extra += l;
-                               }
-                       } else if (len == length) {
-                               last = data;
-                               break;
-                       }
-               } else if (len == length) {
-                       u32 w;
-
-                       /*
-                        * Need to round up for the last dword in the
-                        * packet.
-                        */
-                       w = (len + 3) >> 2;
-                       __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
-                       piobuf += w - 1;
-                       last = ((u32 *) ss->sge.vaddr)[w - 1];
-                       break;
-               } else {
-                       u32 w = len >> 2;
-
-                       __iowrite32_copy(piobuf, ss->sge.vaddr, w);
-                       piobuf += w;
-
-                       extra = len & (sizeof(u32) - 1);
-                       if (extra) {
-                               u32 v = ((u32 *) ss->sge.vaddr)[w];
-
-                               /* Clear unused upper bytes */
-                               data = clear_upper_bytes(v, extra, 0);
-                       }
-               }
-               update_sge(ss, len);
-               length -= len;
-       }
-       /* Update address before sending packet. */
-       update_sge(ss, length);
-       if (flush_wc) {
-               /* must flush early everything before trigger word */
-               ipath_flush_wc();
-               __raw_writel(last, piobuf);
-               /* be sure trigger word is written */
-               ipath_flush_wc();
-       } else
-               __raw_writel(last, piobuf);
-}
-
-/*
- * Convert IB rate to delay multiplier.
- */
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
-{
-       switch (rate) {
-       case IB_RATE_2_5_GBPS: return 8;
-       case IB_RATE_5_GBPS:   return 4;
-       case IB_RATE_10_GBPS:  return 2;
-       case IB_RATE_20_GBPS:  return 1;
-       default:               return 0;
-       }
-}
-
-/*
- * Convert delay multiplier to IB rate
- */
-static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
-{
-       switch (mult) {
-       case 8:  return IB_RATE_2_5_GBPS;
-       case 4:  return IB_RATE_5_GBPS;
-       case 2:  return IB_RATE_10_GBPS;
-       case 1:  return IB_RATE_20_GBPS;
-       default: return IB_RATE_PORT_CURRENT;
-       }
-}
-
-static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
-{
-       struct ipath_verbs_txreq *tx = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (!list_empty(&dev->txreq_free)) {
-               struct list_head *l = dev->txreq_free.next;
-
-               list_del(l);
-               tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-       return tx;
-}
-
-static inline void put_txreq(struct ipath_ibdev *dev,
-                            struct ipath_verbs_txreq *tx)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       list_add(&tx->txreq.list, &dev->txreq_free);
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-}
-
-static void sdma_complete(void *cookie, int status)
-{
-       struct ipath_verbs_txreq *tx = cookie;
-       struct ipath_qp *qp = tx->qp;
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
-       enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
-               IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
-
-       if (atomic_dec_and_test(&qp->s_dma_busy)) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (tx->wqe)
-                       ipath_send_complete(qp, tx->wqe, ibs);
-               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-                    qp->s_last != qp->s_head) ||
-                   (qp->s_flags & IPATH_S_WAIT_DMA))
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               wake_up(&qp->wait_dma);
-       } else if (tx->wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               ipath_send_complete(qp, tx->wqe, ibs);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-               kfree(tx->txreq.map_addr);
-       put_txreq(dev, tx);
-
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static void decrement_dma_busy(struct ipath_qp *qp)
-{
-       unsigned long flags;
-
-       if (atomic_dec_and_test(&qp->s_dma_busy)) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-                    qp->s_last != qp->s_head) ||
-                   (qp->s_flags & IPATH_S_WAIT_DMA))
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               wake_up(&qp->wait_dma);
-       }
-}
-
-/*
- * Compute the number of clock cycles of delay before sending the next packet.
- * The multipliers reflect the number of clocks for the fastest rate so
- * one tick at 4xDDR is 8 ticks at 1xSDR.
- * If the destination port will take longer to receive a packet than
- * the outgoing link can send it, we need to delay sending the next packet
- * by the difference in time it takes the receiver to receive and the sender
- * to send this packet.
- * Note that this delay is always correct for UC and RC but not always
- * optimal for UD. For UD, the destination HCA can be different for each
- * packet, in which case, we could send packets to a different destination
- * while "waiting" for the delay. The overhead for doing this without
- * HW support is more than just paying the cost of delaying some packets
- * unnecessarily.
- */
-static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
-{
-       return (rcv_mult > snd_mult) ?
-               (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
-}
-
-static int ipath_verbs_send_dma(struct ipath_qp *qp,
-                               struct ipath_ib_header *hdr, u32 hdrwords,
-                               struct ipath_sge_state *ss, u32 len,
-                               u32 plen, u32 dwords)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_devdata *dd = dev->dd;
-       struct ipath_verbs_txreq *tx;
-       u32 *piobuf;
-       u32 control;
-       u32 ndesc;
-       int ret;
-
-       tx = qp->s_tx;
-       if (tx) {
-               qp->s_tx = NULL;
-               /* resend previously constructed packet */
-               atomic_inc(&qp->s_dma_busy);
-               ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
-               if (ret) {
-                       qp->s_tx = tx;
-                       decrement_dma_busy(qp);
-               }
-               goto bail;
-       }
-
-       tx = get_txreq(dev);
-       if (!tx) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       /*
-        * Get the saved delay count we computed for the previous packet
-        * and save the delay count for this packet to be used next time
-        * we get here.
-        */
-       control = qp->s_pkt_delay;
-       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-       tx->qp = qp;
-       atomic_inc(&qp->refcount);
-       tx->wqe = qp->s_wqe;
-       tx->txreq.callback = sdma_complete;
-       tx->txreq.callback_cookie = tx;
-       tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
-               IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
-       if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
-
-       /* VL15 packets bypass credit check */
-       if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
-               control |= 1ULL << 31;
-               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
-       }
-
-       if (len) {
-               /*
-                * Don't try to DMA if it takes more descriptors than
-                * the queue holds.
-                */
-               ndesc = ipath_count_sge(ss, len);
-               if (ndesc >= dd->ipath_sdma_descq_cnt)
-                       ndesc = 0;
-       } else
-               ndesc = 1;
-       if (ndesc) {
-               tx->hdr.pbc[0] = cpu_to_le32(plen);
-               tx->hdr.pbc[1] = cpu_to_le32(control);
-               memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
-               tx->txreq.sg_count = ndesc;
-               tx->map_len = (hdrwords + 2) << 2;
-               tx->txreq.map_addr = &tx->hdr;
-               atomic_inc(&qp->s_dma_busy);
-               ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
-               if (ret) {
-                       /* save ss and length in dwords */
-                       tx->ss = ss;
-                       tx->len = dwords;
-                       qp->s_tx = tx;
-                       decrement_dma_busy(qp);
-               }
-               goto bail;
-       }
-
-       /* Allocate a buffer and copy the header and payload to it. */
-       tx->map_len = (plen + 1) << 2;
-       piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
-       if (unlikely(piobuf == NULL)) {
-               ret = -EBUSY;
-               goto err_tx;
-       }
-       tx->txreq.map_addr = piobuf;
-       tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
-       tx->txreq.sg_count = 1;
-
-       *piobuf++ = (__force u32) cpu_to_le32(plen);
-       *piobuf++ = (__force u32) cpu_to_le32(control);
-       memcpy(piobuf, hdr, hdrwords << 2);
-       ipath_copy_from_sge(piobuf + hdrwords, ss, len);
-
-       atomic_inc(&qp->s_dma_busy);
-       ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
-       /*
-        * If we couldn't queue the DMA request, save the info
-        * and try again later rather than destroying the
-        * buffer and undoing the side effects of the copy.
-        */
-       if (ret) {
-               tx->ss = NULL;
-               tx->len = 0;
-               qp->s_tx = tx;
-               decrement_dma_busy(qp);
-       }
-       dev->n_unaligned++;
-       goto bail;
-
-err_tx:
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-       put_txreq(dev, tx);
-bail:
-       return ret;
-}
-
-static int ipath_verbs_send_pio(struct ipath_qp *qp,
-                               struct ipath_ib_header *ibhdr, u32 hdrwords,
-                               struct ipath_sge_state *ss, u32 len,
-                               u32 plen, u32 dwords)
-{
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-       u32 *hdr = (u32 *) ibhdr;
-       u32 __iomem *piobuf;
-       unsigned flush_wc;
-       u32 control;
-       int ret;
-       unsigned long flags;
-
-       piobuf = ipath_getpiobuf(dd, plen, NULL);
-       if (unlikely(piobuf == NULL)) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       /*
-        * Get the saved delay count we computed for the previous packet
-        * and save the delay count for this packet to be used next time
-        * we get here.
-        */
-       control = qp->s_pkt_delay;
-       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-       /* VL15 packets bypass credit check */
-       if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
-               control |= 1ULL << 31;
-
-       /*
-        * Write the length to the control qword plus any needed flags.
-        * We have to flush after the PBC for correctness on some cpus
-        * or WC buffer can be written out of order.
-        */
-       writeq(((u64) control << 32) | plen, piobuf);
-       piobuf += 2;
-
-       flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
-       if (len == 0) {
-               /*
-                * If there is just the header portion, must flush before
-                * writing last word of header for correctness, and after
-                * the last header word (trigger word).
-                */
-               if (flush_wc) {
-                       ipath_flush_wc();
-                       __iowrite32_copy(piobuf, hdr, hdrwords - 1);
-                       ipath_flush_wc();
-                       __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
-                       ipath_flush_wc();
-               } else
-                       __iowrite32_copy(piobuf, hdr, hdrwords);
-               goto done;
-       }
-
-       if (flush_wc)
-               ipath_flush_wc();
-       __iowrite32_copy(piobuf, hdr, hdrwords);
-       piobuf += hdrwords;
-
-       /* The common case is aligned and contained in one segment. */
-       if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
-                  !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
-               u32 *addr = (u32 *) ss->sge.vaddr;
-
-               /* Update address before sending packet. */
-               update_sge(ss, len);
-               if (flush_wc) {
-                       __iowrite32_copy(piobuf, addr, dwords - 1);
-                       /* must flush early everything before trigger word */
-                       ipath_flush_wc();
-                       __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
-                       /* be sure trigger word is written */
-                       ipath_flush_wc();
-               } else
-                       __iowrite32_copy(piobuf, addr, dwords);
-               goto done;
-       }
-       copy_io(piobuf, ss, len, flush_wc);
-done:
-       if (qp->s_wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_verbs_send - send a packet
- * @qp: the QP to send on
- * @hdr: the packet header
- * @hdrwords: the number of 32-bit words in the header
- * @ss: the SGE to send
- * @len: the length of the packet in bytes
- */
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-                    u32 hdrwords, struct ipath_sge_state *ss, u32 len)
-{
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-       u32 plen;
-       int ret;
-       u32 dwords = (len + 3) >> 2;
-
-       /*
-        * Calculate the send buffer trigger address.
-        * The +1 counts for the pbc control dword following the pbc length.
-        */
-       plen = hdrwords + dwords + 1;
-
-       /*
-        * VL15 packets (IB_QPT_SMI) will always use PIO, so we
-        * can defer SDMA restart until link goes ACTIVE without
-        * worrying about just how we got there.
-        */
-       if (qp->ibqp.qp_type == IB_QPT_SMI ||
-           !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-               ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
-                                          plen, dwords);
-       else
-               ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
-                                          plen, dwords);
-
-       return ret;
-}
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-                           u64 *rwords, u64 *spkts, u64 *rpkts,
-                           u64 *xmit_wait)
-{
-       int ret;
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -EINVAL;
-               goto bail;
-       }
-       *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
-       *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-       *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-       *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-       *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_counters - get various chip counters
- * @dd: the infinipath device
- * @cntrs: counters are placed here
- *
- * Return the counters needed by recv_pma_get_portcounters().
- */
-int ipath_get_counters(struct ipath_devdata *dd,
-                      struct ipath_verbs_counters *cntrs)
-{
-       struct ipath_cregs const *crp = dd->ipath_cregs;
-       int ret;
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -EINVAL;
-               goto bail;
-       }
-       cntrs->symbol_error_counter =
-               ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
-       cntrs->link_error_recovery_counter =
-               ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
-       /*
-        * The link downed counter counts when the other side downs the
-        * connection.  We add in the number of times we downed the link
-        * due to local link integrity errors to compensate.
-        */
-       cntrs->link_downed_counter =
-               ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
-       cntrs->port_rcv_errors =
-               ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
-               ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
-               ipath_snap_cntr(dd, crp->cr_portovflcnt) +
-               ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
-               ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
-               ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
-               ipath_snap_cntr(dd, crp->cr_erricrccnt) +
-               ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
-               ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
-               ipath_snap_cntr(dd, crp->cr_badformatcnt) +
-               dd->ipath_rxfc_unsupvl_errs;
-       if (crp->cr_rxotherlocalphyerrcnt)
-               cntrs->port_rcv_errors +=
-                       ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
-       if (crp->cr_rxvlerrcnt)
-               cntrs->port_rcv_errors +=
-                       ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
-       cntrs->port_rcv_remphys_errors =
-               ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
-       cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
-       cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
-       cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
-       cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
-       cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
-       cntrs->local_link_integrity_errors =
-               crp->cr_locallinkintegrityerrcnt ?
-               ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
-               ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-                dd->ipath_lli_errs : dd->ipath_lli_errors);
-       cntrs->excessive_buffer_overrun_errors =
-               crp->cr_excessbufferovflcnt ?
-               ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
-               dd->ipath_overrun_thresh_errs;
-       cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
-               ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_ib_piobufavail - callback when a PIO buffer is available
- * @arg: the device pointer
- *
- * This is called from ipath_intr() at interrupt level when a PIO buffer is
- * available after ipath_verbs_send() returned an error that no buffers were
- * available.  Return 1 if we consumed all the PIO buffers and we still have
- * QPs waiting for buffers (for now, just restart the send tasklet and
- * return zero).
- */
-int ipath_ib_piobufavail(struct ipath_ibdev *dev)
-{
-       struct list_head *list;
-       struct ipath_qp *qplist;
-       struct ipath_qp *qp;
-       unsigned long flags;
-
-       if (dev == NULL)
-               goto bail;
-
-       list = &dev->piowait;
-       qplist = NULL;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       while (!list_empty(list)) {
-               qp = list_entry(list->next, struct ipath_qp, piowait);
-               list_del_init(&qp->piowait);
-               qp->pio_next = qplist;
-               qplist = qp;
-               atomic_inc(&qp->refcount);
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       while (qplist != NULL) {
-               qp = qplist;
-               qplist = qp->pio_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-
-bail:
-       return 0;
-}
-
-static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                             struct ib_udata *uhw)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       memset(props, 0, sizeof(*props));
-
-       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
-               IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-               IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-               IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
-       props->page_size_cap = PAGE_SIZE;
-       props->vendor_id =
-               IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
-       props->vendor_part_id = dev->dd->ipath_deviceid;
-       props->hw_ver = dev->dd->ipath_pcirev;
-
-       props->sys_image_guid = dev->sys_image_guid;
-
-       props->max_mr_size = ~0ull;
-       props->max_qp = ib_ipath_max_qps;
-       props->max_qp_wr = ib_ipath_max_qp_wrs;
-       props->max_sge = ib_ipath_max_sges;
-       props->max_sge_rd = ib_ipath_max_sges;
-       props->max_cq = ib_ipath_max_cqs;
-       props->max_ah = ib_ipath_max_ahs;
-       props->max_cqe = ib_ipath_max_cqes;
-       props->max_mr = dev->lk_table.max;
-       props->max_fmr = dev->lk_table.max;
-       props->max_map_per_fmr = 32767;
-       props->max_pd = ib_ipath_max_pds;
-       props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
-       props->max_qp_init_rd_atom = 255;
-       /* props->max_res_rd_atom */
-       props->max_srq = ib_ipath_max_srqs;
-       props->max_srq_wr = ib_ipath_max_srq_wrs;
-       props->max_srq_sge = ib_ipath_max_srq_sges;
-       /* props->local_ca_ack_delay */
-       props->atomic_cap = IB_ATOMIC_GLOB;
-       props->max_pkeys = ipath_get_npkeys(dev->dd);
-       props->max_mcast_grp = ib_ipath_max_mcast_grps;
-       props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
-       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
-               props->max_mcast_grp;
-
-       return 0;
-}
-
-const u8 ipath_cvt_physportstate[32] = {
-       [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
-       [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
-       [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
-       [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
-       [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
-       [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
-       [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
-};
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
-{
-       return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
-}
-
-static int ipath_query_port(struct ib_device *ibdev,
-                           u8 port, struct ib_port_attr *props)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_devdata *dd = dev->dd;
-       enum ib_mtu mtu;
-       u16 lid = dd->ipath_lid;
-       u64 ibcstat;
-
-       memset(props, 0, sizeof(*props));
-       props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
-       props->lmc = dd->ipath_lmc;
-       props->sm_lid = dev->sm_lid;
-       props->sm_sl = dev->sm_sl;
-       ibcstat = dd->ipath_lastibcstat;
-       /* map LinkState to IB portinfo values.  */
-       props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
-
-       /* See phys_state_show() */
-       props->phys_state = /* MEA: assumes shift == 0 */
-               ipath_cvt_physportstate[dd->ipath_lastibcstat &
-               dd->ibcs_lts_mask];
-       props->port_cap_flags = dev->port_cap_flags;
-       props->gid_tbl_len = 1;
-       props->max_msg_sz = 0x80000000;
-       props->pkey_tbl_len = ipath_get_npkeys(dd);
-       props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
-               dev->z_pkey_violations;
-       props->qkey_viol_cntr = dev->qkey_violations;
-       props->active_width = dd->ipath_link_width_active;
-       /* See rate_show() */
-       props->active_speed = dd->ipath_link_speed_active;
-       props->max_vl_num = 1;          /* VLCap = VL0 */
-       props->init_type_reply = 0;
-
-       props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-       switch (dd->ipath_ibmtu) {
-       case 4096:
-               mtu = IB_MTU_4096;
-               break;
-       case 2048:
-               mtu = IB_MTU_2048;
-               break;
-       case 1024:
-               mtu = IB_MTU_1024;
-               break;
-       case 512:
-               mtu = IB_MTU_512;
-               break;
-       case 256:
-               mtu = IB_MTU_256;
-               break;
-       default:
-               mtu = IB_MTU_2048;
-       }
-       props->active_mtu = mtu;
-       props->subnet_timeout = dev->subnet_timeout;
-
-       return 0;
-}
-
-static int ipath_modify_device(struct ib_device *device,
-                              int device_modify_mask,
-                              struct ib_device_modify *device_modify)
-{
-       int ret;
-
-       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
-                                  IB_DEVICE_MODIFY_NODE_DESC)) {
-               ret = -EOPNOTSUPP;
-               goto bail;
-       }
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
-               memcpy(device->node_desc, device_modify->node_desc, 64);
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
-               to_idev(device)->sys_image_guid =
-                       cpu_to_be64(device_modify->sys_image_guid);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int ipath_modify_port(struct ib_device *ibdev,
-                            u8 port, int port_modify_mask,
-                            struct ib_port_modify *props)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-
-       dev->port_cap_flags |= props->set_port_cap_mask;
-       dev->port_cap_flags &= ~props->clr_port_cap_mask;
-       if (port_modify_mask & IB_PORT_SHUTDOWN)
-               ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
-       if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
-               dev->qkey_violations = 0;
-       return 0;
-}
-
-static int ipath_query_gid(struct ib_device *ibdev, u8 port,
-                          int index, union ib_gid *gid)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       if (index >= 1) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       gid->global.subnet_prefix = dev->gid_prefix;
-       gid->global.interface_id = dev->dd->ipath_guid;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
-                                   struct ib_ucontext *context,
-                                   struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_pd *pd;
-       struct ib_pd *ret;
-
-       /*
-        * This is actually totally arbitrary.  Some correctness tests
-        * assume there's a maximum number of PDs that can be allocated.
-        * We don't actually have this limit, but we fail the test if
-        * we allow allocations of more than we report for this value.
-        */
-
-       pd = kmalloc(sizeof *pd, GFP_KERNEL);
-       if (!pd) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       spin_lock(&dev->n_pds_lock);
-       if (dev->n_pds_allocated == ib_ipath_max_pds) {
-               spin_unlock(&dev->n_pds_lock);
-               kfree(pd);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       dev->n_pds_allocated++;
-       spin_unlock(&dev->n_pds_lock);
-
-       /* ib_alloc_pd() will initialize pd->ibpd. */
-       pd->user = udata != NULL;
-
-       ret = &pd->ibpd;
-
-bail:
-       return ret;
-}
-
-static int ipath_dealloc_pd(struct ib_pd *ibpd)
-{
-       struct ipath_pd *pd = to_ipd(ibpd);
-       struct ipath_ibdev *dev = to_idev(ibpd->device);
-
-       spin_lock(&dev->n_pds_lock);
-       dev->n_pds_allocated--;
-       spin_unlock(&dev->n_pds_lock);
-
-       kfree(pd);
-
-       return 0;
-}
-
-/**
- * ipath_create_ah - create an address handle
- * @pd: the protection domain
- * @ah_attr: the attributes of the AH
- *
- * This may be called from interrupt context.
- */
-static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
-                                    struct ib_ah_attr *ah_attr)
-{
-       struct ipath_ah *ah;
-       struct ib_ah *ret;
-       struct ipath_ibdev *dev = to_idev(pd->device);
-       unsigned long flags;
-
-       /* A multicast address requires a GRH (see ch. 8.4.1). */
-       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-           ah_attr->dlid != IPATH_PERMISSIVE_LID &&
-           !(ah_attr->ah_flags & IB_AH_GRH)) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (ah_attr->dlid == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (ah_attr->port_num < 1 ||
-           ah_attr->port_num > pd->device->phys_port_cnt) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-       if (!ah) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dev->n_ahs_lock, flags);
-       if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
-               spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-               kfree(ah);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       dev->n_ahs_allocated++;
-       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-       /* ib_create_ah() will initialize ah->ibah. */
-       ah->attr = *ah_attr;
-       ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
-
-       ret = &ah->ibah;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_destroy_ah - destroy an address handle
- * @ibah: the AH to destroy
- *
- * This may be called from interrupt context.
- */
-static int ipath_destroy_ah(struct ib_ah *ibah)
-{
-       struct ipath_ibdev *dev = to_idev(ibah->device);
-       struct ipath_ah *ah = to_iah(ibah);
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->n_ahs_lock, flags);
-       dev->n_ahs_allocated--;
-       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-       kfree(ah);
-
-       return 0;
-}
-
-static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
-{
-       struct ipath_ah *ah = to_iah(ibah);
-
-       *ah_attr = ah->attr;
-       ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
-
-       return 0;
-}
-
-/**
- * ipath_get_npkeys - return the size of the PKEY table for port 0
- * @dd: the infinipath device
- */
-unsigned ipath_get_npkeys(struct ipath_devdata *dd)
-{
-       return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
-}
-
-/**
- * ipath_get_pkey - return the indexed PKEY from the port PKEY table
- * @dd: the infinipath device
- * @index: the PKEY index
- */
-unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
-{
-       unsigned ret;
-
-       /* always a kernel port, no locking needed */
-       if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
-               ret = 0;
-       else
-               ret = dd->ipath_pd[0]->port_pkeys[index];
-
-       return ret;
-}
-
-static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-                           u16 *pkey)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       if (index >= ipath_get_npkeys(dev->dd)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       *pkey = ipath_get_pkey(dev->dd, index);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_alloc_ucontext - allocate a ucontest
- * @ibdev: the infiniband device
- * @udata: not used by the InfiniPath driver
- */
-
-static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
-                                               struct ib_udata *udata)
-{
-       struct ipath_ucontext *context;
-       struct ib_ucontext *ret;
-
-       context = kmalloc(sizeof *context, GFP_KERNEL);
-       if (!context) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       ret = &context->ibucontext;
-
-bail:
-       return ret;
-}
-
-static int ipath_dealloc_ucontext(struct ib_ucontext *context)
-{
-       kfree(to_iucontext(context));
-       return 0;
-}
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev);
-
-static void __verbs_timer(unsigned long arg)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) arg;
-
-       /* Handle verbs layer timeouts. */
-       ipath_ib_timer(dd->verbs_dev);
-
-       mod_timer(&dd->verbs_timer, jiffies + 1);
-}
-
-static int enable_timer(struct ipath_devdata *dd)
-{
-       /*
-        * Early chips had a design flaw where the chip and kernel idea
-        * of the tail register don't always agree, and therefore we won't
-        * get an interrupt on the next packet received.
-        * If the board supports per packet receive interrupts, use it.
-        * Otherwise, the timer function periodically checks for packets
-        * to cover this case.
-        * Either way, the timer is needed for verbs layer related
-        * processing.
-        */
-       if (dd->ipath_flags & IPATH_GPIO_INTR) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
-                                0x2074076542310ULL);
-               /* Enable GPIO bit 2 interrupt */
-               dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-       }
-
-       setup_timer(&dd->verbs_timer, __verbs_timer, (unsigned long)dd);
-
-       dd->verbs_timer.expires = jiffies + 1;
-       add_timer(&dd->verbs_timer);
-
-       return 0;
-}
-
-static int disable_timer(struct ipath_devdata *dd)
-{
-       /* Disable GPIO bit 2 interrupt */
-       if (dd->ipath_flags & IPATH_GPIO_INTR) {
-                /* Disable GPIO bit 2 interrupt */
-               dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-               /*
-                * We might want to undo changes to debugportselect,
-                * but how?
-                */
-       }
-
-       del_timer_sync(&dd->verbs_timer);
-
-       return 0;
-}
-
-static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
-                               struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = ipath_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-       return 0;
-}
-
-/**
- * ipath_register_ib_device - register our device with the infiniband core
- * @dd: the device data structure
- * Return the allocated ipath_ibdev pointer or NULL on error.
- */
-int ipath_register_ib_device(struct ipath_devdata *dd)
-{
-       struct ipath_verbs_counters cntrs;
-       struct ipath_ibdev *idev;
-       struct ib_device *dev;
-       struct ipath_verbs_txreq *tx;
-       unsigned i;
-       int ret;
-
-       idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
-       if (idev == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       dev = &idev->ibdev;
-
-       if (dd->ipath_sdma_descq_cnt) {
-               tx = kmalloc_array(dd->ipath_sdma_descq_cnt, sizeof *tx,
-                                  GFP_KERNEL);
-               if (tx == NULL) {
-                       ret = -ENOMEM;
-                       goto err_tx;
-               }
-       } else
-               tx = NULL;
-       idev->txreq_bufs = tx;
-
-       /* Only need to initialize non-zero fields. */
-       spin_lock_init(&idev->n_pds_lock);
-       spin_lock_init(&idev->n_ahs_lock);
-       spin_lock_init(&idev->n_cqs_lock);
-       spin_lock_init(&idev->n_qps_lock);
-       spin_lock_init(&idev->n_srqs_lock);
-       spin_lock_init(&idev->n_mcast_grps_lock);
-
-       spin_lock_init(&idev->qp_table.lock);
-       spin_lock_init(&idev->lk_table.lock);
-       idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
-       /* Set the prefix to the default value (see ch. 4.1.1) */
-       idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
-
-       ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
-       if (ret)
-               goto err_qp;
-
-       /*
-        * The top ib_ipath_lkey_table_size bits are used to index the
-        * table.  The lower 8 bits can be owned by the user (copied from
-        * the LKEY).  The remaining bits act as a generation number or tag.
-        */
-       idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
-       idev->lk_table.table = kcalloc(idev->lk_table.max,
-                                      sizeof(*idev->lk_table.table),
-                                      GFP_KERNEL);
-       if (idev->lk_table.table == NULL) {
-               ret = -ENOMEM;
-               goto err_lk;
-       }
-       INIT_LIST_HEAD(&idev->pending_mmaps);
-       spin_lock_init(&idev->pending_lock);
-       idev->mmap_offset = PAGE_SIZE;
-       spin_lock_init(&idev->mmap_offset_lock);
-       INIT_LIST_HEAD(&idev->pending[0]);
-       INIT_LIST_HEAD(&idev->pending[1]);
-       INIT_LIST_HEAD(&idev->pending[2]);
-       INIT_LIST_HEAD(&idev->piowait);
-       INIT_LIST_HEAD(&idev->rnrwait);
-       INIT_LIST_HEAD(&idev->txreq_free);
-       idev->pending_index = 0;
-       idev->port_cap_flags =
-               IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
-       if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
-               idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
-       idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
-       idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
-       idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
-       idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
-       idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
-
-       /* Snapshot current HW counters to "clear" them. */
-       ipath_get_counters(dd, &cntrs);
-       idev->z_symbol_error_counter = cntrs.symbol_error_counter;
-       idev->z_link_error_recovery_counter =
-               cntrs.link_error_recovery_counter;
-       idev->z_link_downed_counter = cntrs.link_downed_counter;
-       idev->z_port_rcv_errors = cntrs.port_rcv_errors;
-       idev->z_port_rcv_remphys_errors =
-               cntrs.port_rcv_remphys_errors;
-       idev->z_port_xmit_discards = cntrs.port_xmit_discards;
-       idev->z_port_xmit_data = cntrs.port_xmit_data;
-       idev->z_port_rcv_data = cntrs.port_rcv_data;
-       idev->z_port_xmit_packets = cntrs.port_xmit_packets;
-       idev->z_port_rcv_packets = cntrs.port_rcv_packets;
-       idev->z_local_link_integrity_errors =
-               cntrs.local_link_integrity_errors;
-       idev->z_excessive_buffer_overrun_errors =
-               cntrs.excessive_buffer_overrun_errors;
-       idev->z_vl15_dropped = cntrs.vl15_dropped;
-
-       for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
-               list_add(&tx->txreq.list, &idev->txreq_free);
-
-       /*
-        * The system image GUID is supposed to be the same for all
-        * IB HCAs in a single system but since there can be other
-        * device types in the system, we can't be sure this is unique.
-        */
-       if (!sys_image_guid)
-               sys_image_guid = dd->ipath_guid;
-       idev->sys_image_guid = sys_image_guid;
-       idev->ib_unit = dd->ipath_unit;
-       idev->dd = dd;
-
-       strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
-       dev->owner = THIS_MODULE;
-       dev->node_guid = dd->ipath_guid;
-       dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
-       dev->uverbs_cmd_mask =
-               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
-               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
-               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
-               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
-               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
-               (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
-               (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
-               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
-               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
-               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
-               (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
-               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
-               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
-               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
-               (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
-               (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
-               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
-               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
-               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
-       dev->node_type = RDMA_NODE_IB_CA;
-       dev->phys_port_cnt = 1;
-       dev->num_comp_vectors = 1;
-       dev->dma_device = &dd->pcidev->dev;
-       dev->query_device = ipath_query_device;
-       dev->modify_device = ipath_modify_device;
-       dev->query_port = ipath_query_port;
-       dev->modify_port = ipath_modify_port;
-       dev->query_pkey = ipath_query_pkey;
-       dev->query_gid = ipath_query_gid;
-       dev->alloc_ucontext = ipath_alloc_ucontext;
-       dev->dealloc_ucontext = ipath_dealloc_ucontext;
-       dev->alloc_pd = ipath_alloc_pd;
-       dev->dealloc_pd = ipath_dealloc_pd;
-       dev->create_ah = ipath_create_ah;
-       dev->destroy_ah = ipath_destroy_ah;
-       dev->query_ah = ipath_query_ah;
-       dev->create_srq = ipath_create_srq;
-       dev->modify_srq = ipath_modify_srq;
-       dev->query_srq = ipath_query_srq;
-       dev->destroy_srq = ipath_destroy_srq;
-       dev->create_qp = ipath_create_qp;
-       dev->modify_qp = ipath_modify_qp;
-       dev->query_qp = ipath_query_qp;
-       dev->destroy_qp = ipath_destroy_qp;
-       dev->post_send = ipath_post_send;
-       dev->post_recv = ipath_post_receive;
-       dev->post_srq_recv = ipath_post_srq_receive;
-       dev->create_cq = ipath_create_cq;
-       dev->destroy_cq = ipath_destroy_cq;
-       dev->resize_cq = ipath_resize_cq;
-       dev->poll_cq = ipath_poll_cq;
-       dev->req_notify_cq = ipath_req_notify_cq;
-       dev->get_dma_mr = ipath_get_dma_mr;
-       dev->reg_user_mr = ipath_reg_user_mr;
-       dev->dereg_mr = ipath_dereg_mr;
-       dev->alloc_fmr = ipath_alloc_fmr;
-       dev->map_phys_fmr = ipath_map_phys_fmr;
-       dev->unmap_fmr = ipath_unmap_fmr;
-       dev->dealloc_fmr = ipath_dealloc_fmr;
-       dev->attach_mcast = ipath_multicast_attach;
-       dev->detach_mcast = ipath_multicast_detach;
-       dev->process_mad = ipath_process_mad;
-       dev->mmap = ipath_mmap;
-       dev->dma_ops = &ipath_dma_mapping_ops;
-       dev->get_port_immutable = ipath_port_immutable;
-
-       snprintf(dev->node_desc, sizeof(dev->node_desc),
-                IPATH_IDSTR " %s", init_utsname()->nodename);
-
-       ret = ib_register_device(dev, NULL);
-       if (ret)
-               goto err_reg;
-
-       ret = ipath_verbs_register_sysfs(dev);
-       if (ret)
-               goto err_class;
-
-       enable_timer(dd);
-
-       goto bail;
-
-err_class:
-       ib_unregister_device(dev);
-err_reg:
-       kfree(idev->lk_table.table);
-err_lk:
-       kfree(idev->qp_table.table);
-err_qp:
-       kfree(idev->txreq_bufs);
-err_tx:
-       ib_dealloc_device(dev);
-       ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
-       idev = NULL;
-
-bail:
-       dd->verbs_dev = idev;
-       return ret;
-}
-
-void ipath_unregister_ib_device(struct ipath_ibdev *dev)
-{
-       struct ib_device *ibdev = &dev->ibdev;
-       u32 qps_inuse;
-
-       ib_unregister_device(ibdev);
-
-       disable_timer(dev->dd);
-
-       if (!list_empty(&dev->pending[0]) ||
-           !list_empty(&dev->pending[1]) ||
-           !list_empty(&dev->pending[2]))
-               ipath_dev_err(dev->dd, "pending list not empty!\n");
-       if (!list_empty(&dev->piowait))
-               ipath_dev_err(dev->dd, "piowait list not empty!\n");
-       if (!list_empty(&dev->rnrwait))
-               ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
-       if (!ipath_mcast_tree_empty())
-               ipath_dev_err(dev->dd, "multicast table memory leak!\n");
-       /*
-        * Note that ipath_unregister_ib_device() can be called before all
-        * the QPs are destroyed!
-        */
-       qps_inuse = ipath_free_all_qps(&dev->qp_table);
-       if (qps_inuse)
-               ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
-                       qps_inuse);
-       kfree(dev->qp_table.table);
-       kfree(dev->lk_table.table);
-       kfree(dev->txreq_bufs);
-       ib_dealloc_device(ibdev);
-}
-
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-
-       return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
-}
-
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-       int ret;
-
-       ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
-       if (ret < 0)
-               goto bail;
-       strcat(buf, "\n");
-       ret = strlen(buf);
-
-bail:
-       return ret;
-}
-
-static ssize_t show_stats(struct device *device, struct device_attribute *attr,
-                         char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-       int i;
-       int len;
-
-       len = sprintf(buf,
-                     "RC resends  %d\n"
-                     "RC no QACK  %d\n"
-                     "RC ACKs     %d\n"
-                     "RC SEQ NAKs %d\n"
-                     "RC RDMA seq %d\n"
-                     "RC RNR NAKs %d\n"
-                     "RC OTH NAKs %d\n"
-                     "RC timeouts %d\n"
-                     "RC RDMA dup %d\n"
-                     "piobuf wait %d\n"
-                     "unaligned   %d\n"
-                     "PKT drops   %d\n"
-                     "WQE errs    %d\n",
-                     dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
-                     dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
-                     dev->n_other_naks, dev->n_timeouts,
-                     dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
-                     dev->n_pkt_drops, dev->n_wqe_errs);
-       for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
-               const struct ipath_opcode_stats *si = &dev->opstats[i];
-
-               if (!si->n_packets && !si->n_bytes)
-                       continue;
-               len += sprintf(buf + len, "%02x %llu/%llu\n", i,
-                              (unsigned long long) si->n_packets,
-                              (unsigned long long) si->n_bytes);
-       }
-       return len;
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
-
-static struct device_attribute *ipath_class_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_hca_type,
-       &dev_attr_board_id,
-       &dev_attr_stats
-};
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev)
-{
-       int i;
-       int ret;
-
-       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
-               ret = device_create_file(&dev->dev,
-                                      ipath_class_attributes[i]);
-               if (ret)
-                       goto bail;
-       }
-       return 0;
-bail:
-       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
-               device_remove_file(&dev->dev, ipath_class_attributes[i]);
-       return ret;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
deleted file mode 100644 (file)
index 6c70a89..0000000
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IPATH_VERBS_H
-#define IPATH_VERBS_H
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/kref.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_user_verbs.h>
-
-#include "ipath_kernel.h"
-
-#define IPATH_MAX_RDMA_ATOMIC  4
-
-#define QPN_MAX                 (1 << 24)
-#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define IPATH_UVERBS_ABI_VERSION       2
-
-/*
- * Define an ib_cq_notify value that is not valid so we know when CQ
- * notifications are armed.
- */
-#define IB_CQ_NONE     (IB_CQ_NEXT_COMP + 1)
-
-/* AETH NAK opcode values */
-#define IB_RNR_NAK                     0x20
-#define IB_NAK_PSN_ERROR               0x60
-#define IB_NAK_INVALID_REQUEST         0x61
-#define IB_NAK_REMOTE_ACCESS_ERROR     0x62
-#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
-#define IB_NAK_INVALID_RD_REQUEST      0x64
-
-/* Flags for checking QP state (see ib_ipath_state_ops[]) */
-#define IPATH_POST_SEND_OK             0x01
-#define IPATH_POST_RECV_OK             0x02
-#define IPATH_PROCESS_RECV_OK          0x04
-#define IPATH_PROCESS_SEND_OK          0x08
-#define IPATH_PROCESS_NEXT_SEND_OK     0x10
-#define IPATH_FLUSH_SEND               0x20
-#define IPATH_FLUSH_RECV               0x40
-#define IPATH_PROCESS_OR_FLUSH_SEND \
-       (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
-
-/* IB Performance Manager status values */
-#define IB_PMA_SAMPLE_STATUS_DONE      0x00
-#define IB_PMA_SAMPLE_STATUS_STARTED   0x01
-#define IB_PMA_SAMPLE_STATUS_RUNNING   0x02
-
-/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA  cpu_to_be16(0x0001)
-#define IB_PMA_PORT_RCV_DATA   cpu_to_be16(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS  cpu_to_be16(0x0003)
-#define IB_PMA_PORT_RCV_PKTS   cpu_to_be16(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT  cpu_to_be16(0x0005)
-
-struct ib_reth {
-       __be64 vaddr;
-       __be32 rkey;
-       __be32 length;
-} __attribute__ ((packed));
-
-struct ib_atomic_eth {
-       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
-       __be32 rkey;
-       __be64 swap_data;
-       __be64 compare_data;
-} __attribute__ ((packed));
-
-struct ipath_other_headers {
-       __be32 bth[3];
-       union {
-               struct {
-                       __be32 deth[2];
-                       __be32 imm_data;
-               } ud;
-               struct {
-                       struct ib_reth reth;
-                       __be32 imm_data;
-               } rc;
-               struct {
-                       __be32 aeth;
-                       __be32 atomic_ack_eth[2];
-               } at;
-               __be32 imm_data;
-               __be32 aeth;
-               struct ib_atomic_eth atomic_eth;
-       } u;
-} __attribute__ ((packed));
-
-/*
- * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
- * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
- * will be in the eager header buffer.  The remaining 12 or 16 bytes
- * are in the data buffer.
- */
-struct ipath_ib_header {
-       __be16 lrh[4];
-       union {
-               struct {
-                       struct ib_grh grh;
-                       struct ipath_other_headers oth;
-               } l;
-               struct ipath_other_headers oth;
-       } u;
-} __attribute__ ((packed));
-
-struct ipath_pio_header {
-       __le32 pbc[2];
-       struct ipath_ib_header hdr;
-} __attribute__ ((packed));
-
-/*
- * There is one struct ipath_mcast for each multicast GID.
- * All attached QPs are then stored as a list of
- * struct ipath_mcast_qp.
- */
-struct ipath_mcast_qp {
-       struct list_head list;
-       struct ipath_qp *qp;
-};
-
-struct ipath_mcast {
-       struct rb_node rb_node;
-       union ib_gid mgid;
-       struct list_head qp_list;
-       wait_queue_head_t wait;
-       atomic_t refcount;
-       int n_attached;
-};
-
-/* Protection domain */
-struct ipath_pd {
-       struct ib_pd ibpd;
-       int user;               /* non-zero if created from user space */
-};
-
-/* Address Handle */
-struct ipath_ah {
-       struct ib_ah ibah;
-       struct ib_ah_attr attr;
-};
-
-/*
- * This structure is used by ipath_mmap() to validate an offset
- * when an mmap() request is made.  The vm_area_struct then uses
- * this as its vm_private_data.
- */
-struct ipath_mmap_info {
-       struct list_head pending_mmaps;
-       struct ib_ucontext *context;
-       void *obj;
-       __u64 offset;
-       struct kref ref;
-       unsigned size;
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and completion queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- */
-struct ipath_cq_wc {
-       u32 head;               /* index of next entry to fill */
-       u32 tail;               /* index of next ib_poll_cq() entry */
-       union {
-               /* these are actually size ibcq.cqe + 1 */
-               struct ib_uverbs_wc uqueue[0];
-               struct ib_wc kqueue[0];
-       };
-};
-
-/*
- * The completion queue structure.
- */
-struct ipath_cq {
-       struct ib_cq ibcq;
-       struct tasklet_struct comptask;
-       spinlock_t lock;
-       u8 notify;
-       u8 triggered;
-       struct ipath_cq_wc *queue;
-       struct ipath_mmap_info *ip;
-};
-
-/*
- * A segment is a linear region of low physical memory.
- * XXX Maybe we should use phys addr here and kmap()/kunmap().
- * Used by the verbs layer.
- */
-struct ipath_seg {
-       void *vaddr;
-       size_t length;
-};
-
-/* The number of ipath_segs that fit in a page. */
-#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
-
-struct ipath_segarray {
-       struct ipath_seg segs[IPATH_SEGSZ];
-};
-
-struct ipath_mregion {
-       struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
-       u64 user_base;          /* User's address for this region */
-       u64 iova;               /* IB start address of this region */
-       size_t length;
-       u32 lkey;
-       u32 offset;             /* offset (bytes) to start of region */
-       int access_flags;
-       u32 max_segs;           /* number of ipath_segs in all the arrays */
-       u32 mapsz;              /* size of the map array */
-       struct ipath_segarray *map[0];  /* the segments */
-};
-
-/*
- * These keep track of the copy progress within a memory region.
- * Used by the verbs layer.
- */
-struct ipath_sge {
-       struct ipath_mregion *mr;
-       void *vaddr;            /* kernel virtual address of segment */
-       u32 sge_length;         /* length of the SGE */
-       u32 length;             /* remaining length of the segment */
-       u16 m;                  /* current index: mr->map[m] */
-       u16 n;                  /* current index: mr->map[m]->segs[n] */
-};
-
-/* Memory region */
-struct ipath_mr {
-       struct ib_mr ibmr;
-       struct ib_umem *umem;
-       struct ipath_mregion mr;        /* must be last */
-};
-
-/*
- * Send work request queue entry.
- * The size of the sg_list is determined when the QP is created and stored
- * in qp->s_max_sge.
- */
-struct ipath_swqe {
-       union {
-               struct ib_send_wr wr;   /* don't use wr.sg_list */
-               struct ib_ud_wr ud_wr;
-               struct ib_rdma_wr rdma_wr;
-               struct ib_atomic_wr atomic_wr;
-       };
-
-       u32 psn;                /* first packet sequence number */
-       u32 lpsn;               /* last packet sequence number */
-       u32 ssn;                /* send sequence number */
-       u32 length;             /* total length of data in sg_list */
-       struct ipath_sge sg_list[0];
-};
-
-/*
- * Receive work request queue entry.
- * The size of the sg_list is determined when the QP (or SRQ) is created
- * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
- */
-struct ipath_rwqe {
-       u64 wr_id;
-       u8 num_sge;
-       struct ib_sge sg_list[0];
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and receive work queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- * Note that the wq array elements are variable size so you can't
- * just index into the array to get the N'th element;
- * use get_rwqe_ptr() instead.
- */
-struct ipath_rwq {
-       u32 head;               /* new work requests posted to the head */
-       u32 tail;               /* receives pull requests from here. */
-       struct ipath_rwqe wq[0];
-};
-
-struct ipath_rq {
-       struct ipath_rwq *wq;
-       spinlock_t lock;
-       u32 size;               /* size of RWQE array */
-       u8 max_sge;
-};
-
-struct ipath_srq {
-       struct ib_srq ibsrq;
-       struct ipath_rq rq;
-       struct ipath_mmap_info *ip;
-       /* send signal when number of RWQEs < limit */
-       u32 limit;
-};
-
-struct ipath_sge_state {
-       struct ipath_sge *sg_list;      /* next SGE to be used if any */
-       struct ipath_sge sge;   /* progress state for the current SGE */
-       u8 num_sge;
-       u8 static_rate;
-};
-
-/*
- * This structure holds the information that the send tasklet needs
- * to send a RDMA read response or atomic operation.
- */
-struct ipath_ack_entry {
-       u8 opcode;
-       u8 sent;
-       u32 psn;
-       union {
-               struct ipath_sge_state rdma_sge;
-               u64 atomic_data;
-       };
-};
-
-/*
- * Variables prefixed with s_ are for the requester (sender).
- * Variables prefixed with r_ are for the responder (receiver).
- * Variables prefixed with ack_ are for responder replies.
- *
- * Common variables are protected by both r_rq.lock and s_lock in that order
- * which only happens in modify_qp() or changing the QP 'state'.
- */
-struct ipath_qp {
-       struct ib_qp ibqp;
-       struct ipath_qp *next;          /* link list for QPN hash table */
-       struct ipath_qp *timer_next;    /* link list for ipath_ib_timer() */
-       struct ipath_qp *pio_next;      /* link for ipath_ib_piobufavail() */
-       struct list_head piowait;       /* link for wait PIO buf */
-       struct list_head timerwait;     /* link for waiting for timeouts */
-       struct ib_ah_attr remote_ah_attr;
-       struct ipath_ib_header s_hdr;   /* next packet header to send */
-       atomic_t refcount;
-       wait_queue_head_t wait;
-       wait_queue_head_t wait_dma;
-       struct tasklet_struct s_task;
-       struct ipath_mmap_info *ip;
-       struct ipath_sge_state *s_cur_sge;
-       struct ipath_verbs_txreq *s_tx;
-       struct ipath_sge_state s_sge;   /* current send request data */
-       struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
-       struct ipath_sge_state s_ack_rdma_sge;
-       struct ipath_sge_state s_rdma_read_sge;
-       struct ipath_sge_state r_sge;   /* current receive data */
-       spinlock_t s_lock;
-       atomic_t s_dma_busy;
-       u16 s_pkt_delay;
-       u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
-       u32 s_cur_size;         /* size of send packet in bytes */
-       u32 s_len;              /* total length of s_sge */
-       u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
-       u32 s_next_psn;         /* PSN for next request */
-       u32 s_last_psn;         /* last response PSN processed */
-       u32 s_psn;              /* current packet sequence number */
-       u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
-       u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
-       u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
-       u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
-       u64 r_wr_id;            /* ID for current receive WQE */
-       unsigned long r_aflags;
-       u32 r_len;              /* total length of r_sge */
-       u32 r_rcv_len;          /* receive data len processed */
-       u32 r_psn;              /* expected rcv packet sequence number */
-       u32 r_msn;              /* message sequence number */
-       u8 state;               /* QP state */
-       u8 s_state;             /* opcode of last packet sent */
-       u8 s_ack_state;         /* opcode of packet to ACK */
-       u8 s_nak_state;         /* non-zero if NAK is pending */
-       u8 r_state;             /* opcode of last packet received */
-       u8 r_nak_state;         /* non-zero if NAK is pending */
-       u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
-       u8 r_flags;
-       u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
-       u8 r_head_ack_queue;    /* index into s_ack_queue[] */
-       u8 qp_access_flags;
-       u8 s_max_sge;           /* size of s_wq->sg_list */
-       u8 s_retry_cnt;         /* number of times to retry */
-       u8 s_rnr_retry_cnt;
-       u8 s_retry;             /* requester retry counter */
-       u8 s_rnr_retry;         /* requester RNR retry counter */
-       u8 s_pkey_index;        /* PKEY index to use */
-       u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
-       u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
-       u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
-       u8 s_flags;
-       u8 s_dmult;
-       u8 s_draining;
-       u8 timeout;             /* Timeout for this QP */
-       enum ib_mtu path_mtu;
-       u32 remote_qpn;
-       u32 qkey;               /* QKEY for this QP (for UD or RD) */
-       u32 s_size;             /* send work queue size */
-       u32 s_head;             /* new entries added here */
-       u32 s_tail;             /* next entry to process */
-       u32 s_cur;              /* current work queue entry */
-       u32 s_last;             /* last un-ACK'ed entry */
-       u32 s_ssn;              /* SSN of tail entry */
-       u32 s_lsn;              /* limit sequence number (credit) */
-       struct ipath_swqe *s_wq;        /* send work queue */
-       struct ipath_swqe *s_wqe;
-       struct ipath_sge *r_ud_sg_list;
-       struct ipath_rq r_rq;           /* receive work queue */
-       struct ipath_sge r_sg_list[0];  /* verified SGEs */
-};
-
-/*
- * Atomic bit definitions for r_aflags.
- */
-#define IPATH_R_WRID_VALID     0
-
-/*
- * Bit definitions for r_flags.
- */
-#define IPATH_R_REUSE_SGE      0x01
-#define IPATH_R_RDMAR_SEQ      0x02
-
-/*
- * Bit definitions for s_flags.
- *
- * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
- *                        before processing the next SWQE
- * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
- *                        before processing the next SWQE
- * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
- * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
- * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
- *                   next send completion entry not via send DMA.
- */
-#define IPATH_S_SIGNAL_REQ_WR  0x01
-#define IPATH_S_FENCE_PENDING  0x02
-#define IPATH_S_RDMAR_PENDING  0x04
-#define IPATH_S_ACK_PENDING    0x08
-#define IPATH_S_BUSY           0x10
-#define IPATH_S_WAITING                0x20
-#define IPATH_S_WAIT_SSN_CREDIT        0x40
-#define IPATH_S_WAIT_DMA       0x80
-
-#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
-       IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
-
-#define IPATH_PSN_CREDIT       512
-
-/*
- * Since struct ipath_swqe is not a fixed size, we can't simply index into
- * struct ipath_qp.s_wq.  This function does the array index computation.
- */
-static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
-                                             unsigned n)
-{
-       return (struct ipath_swqe *)((char *)qp->s_wq +
-                                    (sizeof(struct ipath_swqe) +
-                                     qp->s_max_sge *
-                                     sizeof(struct ipath_sge)) * n);
-}
-
-/*
- * Since struct ipath_rwqe is not a fixed size, we can't simply index into
- * struct ipath_rwq.wq.  This function does the array index computation.
- */
-static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
-                                             unsigned n)
-{
-       return (struct ipath_rwqe *)
-               ((char *) rq->wq->wq +
-                (sizeof(struct ipath_rwqe) +
-                 rq->max_sge * sizeof(struct ib_sge)) * n);
-}
-
-/*
- * QPN-map pages start out as NULL, they get allocated upon
- * first use and are never deallocated. This way,
- * large bitmaps are not allocated unless large numbers of QPs are used.
- */
-struct qpn_map {
-       atomic_t n_free;
-       void *page;
-};
-
-struct ipath_qp_table {
-       spinlock_t lock;
-       u32 last;               /* last QP number allocated */
-       u32 max;                /* size of the hash table */
-       u32 nmaps;              /* size of the map table */
-       struct ipath_qp **table;
-       /* bit map of free numbers */
-       struct qpn_map map[QPNMAP_ENTRIES];
-};
-
-struct ipath_lkey_table {
-       spinlock_t lock;
-       u32 next;               /* next unused index (speeds search) */
-       u32 gen;                /* generation count */
-       u32 max;                /* size of the table */
-       struct ipath_mregion **table;
-};
-
-struct ipath_opcode_stats {
-       u64 n_packets;          /* number of packets */
-       u64 n_bytes;            /* total number of bytes */
-};
-
-struct ipath_ibdev {
-       struct ib_device ibdev;
-       struct ipath_devdata *dd;
-       struct list_head pending_mmaps;
-       spinlock_t mmap_offset_lock;
-       u32 mmap_offset;
-       int ib_unit;            /* This is the device number */
-       u16 sm_lid;             /* in host order */
-       u8 sm_sl;
-       u8 mkeyprot;
-       /* non-zero when timer is set */
-       unsigned long mkey_lease_timeout;
-
-       /* The following fields are really per port. */
-       struct ipath_qp_table qp_table;
-       struct ipath_lkey_table lk_table;
-       struct list_head pending[3];    /* FIFO of QPs waiting for ACKs */
-       struct list_head piowait;       /* list for wait PIO buf */
-       struct list_head txreq_free;
-       void *txreq_bufs;
-       /* list of QPs waiting for RNR timer */
-       struct list_head rnrwait;
-       spinlock_t pending_lock;
-       __be64 sys_image_guid;  /* in network order */
-       __be64 gid_prefix;      /* in network order */
-       __be64 mkey;
-
-       u32 n_pds_allocated;    /* number of PDs allocated for device */
-       spinlock_t n_pds_lock;
-       u32 n_ahs_allocated;    /* number of AHs allocated for device */
-       spinlock_t n_ahs_lock;
-       u32 n_cqs_allocated;    /* number of CQs allocated for device */
-       spinlock_t n_cqs_lock;
-       u32 n_qps_allocated;    /* number of QPs allocated for device */
-       spinlock_t n_qps_lock;
-       u32 n_srqs_allocated;   /* number of SRQs allocated for device */
-       spinlock_t n_srqs_lock;
-       u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
-       spinlock_t n_mcast_grps_lock;
-
-       u64 ipath_sword;        /* total dwords sent (sample result) */
-       u64 ipath_rword;        /* total dwords received (sample result) */
-       u64 ipath_spkts;        /* total packets sent (sample result) */
-       u64 ipath_rpkts;        /* total packets received (sample result) */
-       /* # of ticks no data sent (sample result) */
-       u64 ipath_xmit_wait;
-       u64 rcv_errors;         /* # of packets with SW detected rcv errs */
-       u64 n_unicast_xmit;     /* total unicast packets sent */
-       u64 n_unicast_rcv;      /* total unicast packets received */
-       u64 n_multicast_xmit;   /* total multicast packets sent */
-       u64 n_multicast_rcv;    /* total multicast packets received */
-       u64 z_symbol_error_counter;             /* starting count for PMA */
-       u64 z_link_error_recovery_counter;      /* starting count for PMA */
-       u64 z_link_downed_counter;              /* starting count for PMA */
-       u64 z_port_rcv_errors;                  /* starting count for PMA */
-       u64 z_port_rcv_remphys_errors;          /* starting count for PMA */
-       u64 z_port_xmit_discards;               /* starting count for PMA */
-       u64 z_port_xmit_data;                   /* starting count for PMA */
-       u64 z_port_rcv_data;                    /* starting count for PMA */
-       u64 z_port_xmit_packets;                /* starting count for PMA */
-       u64 z_port_rcv_packets;                 /* starting count for PMA */
-       u32 z_pkey_violations;                  /* starting count for PMA */
-       u32 z_local_link_integrity_errors;      /* starting count for PMA */
-       u32 z_excessive_buffer_overrun_errors;  /* starting count for PMA */
-       u32 z_vl15_dropped;                     /* starting count for PMA */
-       u32 n_rc_resends;
-       u32 n_rc_acks;
-       u32 n_rc_qacks;
-       u32 n_seq_naks;
-       u32 n_rdma_seq;
-       u32 n_rnr_naks;
-       u32 n_other_naks;
-       u32 n_timeouts;
-       u32 n_pkt_drops;
-       u32 n_vl15_dropped;
-       u32 n_wqe_errs;
-       u32 n_rdma_dup_busy;
-       u32 n_piowait;
-       u32 n_unaligned;
-       u32 port_cap_flags;
-       u32 pma_sample_start;
-       u32 pma_sample_interval;
-       __be16 pma_counter_select[5];
-       u16 pma_tag;
-       u16 qkey_violations;
-       u16 mkey_violations;
-       u16 mkey_lease_period;
-       u16 pending_index;      /* which pending queue is active */
-       u8 pma_sample_status;
-       u8 subnet_timeout;
-       u8 vl_high_limit;
-       struct ipath_opcode_stats opstats[128];
-};
-
-struct ipath_verbs_counters {
-       u64 symbol_error_counter;
-       u64 link_error_recovery_counter;
-       u64 link_downed_counter;
-       u64 port_rcv_errors;
-       u64 port_rcv_remphys_errors;
-       u64 port_xmit_discards;
-       u64 port_xmit_data;
-       u64 port_rcv_data;
-       u64 port_xmit_packets;
-       u64 port_rcv_packets;
-       u32 local_link_integrity_errors;
-       u32 excessive_buffer_overrun_errors;
-       u32 vl15_dropped;
-};
-
-struct ipath_verbs_txreq {
-       struct ipath_qp         *qp;
-       struct ipath_swqe       *wqe;
-       u32                      map_len;
-       u32                      len;
-       struct ipath_sge_state  *ss;
-       struct ipath_pio_header  hdr;
-       struct ipath_sdma_txreq  txreq;
-};
-
-static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
-{
-       return container_of(ibmr, struct ipath_mr, ibmr);
-}
-
-static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
-{
-       return container_of(ibpd, struct ipath_pd, ibpd);
-}
-
-static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
-{
-       return container_of(ibah, struct ipath_ah, ibah);
-}
-
-static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
-{
-       return container_of(ibcq, struct ipath_cq, ibcq);
-}
-
-static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
-{
-       return container_of(ibsrq, struct ipath_srq, ibsrq);
-}
-
-static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct ipath_qp, ibqp);
-}
-
-static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
-{
-       return container_of(ibdev, struct ipath_ibdev, ibdev);
-}
-
-/*
- * This must be called with s_lock held.
- */
-static inline void ipath_schedule_send(struct ipath_qp *qp)
-{
-       if (qp->s_flags & IPATH_S_ANY_WAIT)
-               qp->s_flags &= ~IPATH_S_ANY_WAIT;
-       if (!(qp->s_flags & IPATH_S_BUSY))
-               tasklet_hi_schedule(&qp->s_task);
-}
-
-int ipath_process_mad(struct ib_device *ibdev,
-                     int mad_flags,
-                     u8 port_num,
-                     const struct ib_wc *in_wc,
-                     const struct ib_grh *in_grh,
-                     const struct ib_mad_hdr *in, size_t in_mad_size,
-                     struct ib_mad_hdr *out, size_t *out_mad_size,
-                     u16 *out_mad_pkey_index);
-
-/*
- * Compare the lower 24 bits of the two values.
- * Returns an integer <, ==, or > than zero.
- */
-static inline int ipath_cmp24(u32 a, u32 b)
-{
-       return (((int) a) - ((int) b)) << 8;
-}
-
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-                           u64 *rwords, u64 *spkts, u64 *rpkts,
-                           u64 *xmit_wait);
-
-int ipath_get_counters(struct ipath_devdata *dd,
-                      struct ipath_verbs_counters *cntrs);
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_mcast_tree_empty(void);
-
-__be32 ipath_compute_aeth(struct ipath_qp *qp);
-
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
-
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-                             struct ib_qp_init_attr *init_attr,
-                             struct ib_udata *udata);
-
-int ipath_destroy_qp(struct ib_qp *ibqp);
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata);
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                  int attr_mask, struct ib_qp_init_attr *init_attr);
-
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
-
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
-
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
-
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
-
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-                    u32 hdrwords, struct ipath_sge_state *ss, u32 len);
-
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
-
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
-
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
-
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
-                    struct ipath_mregion *mr);
-
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
-
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-                 struct ib_sge *sge, int acc);
-
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-                 u32 len, u64 vaddr, u32 rkey, int acc);
-
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-                          struct ib_recv_wr **bad_wr);
-
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-                               struct ib_srq_init_attr *srq_init_attr,
-                               struct ib_udata *udata);
-
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                    enum ib_srq_attr_mask attr_mask,
-                    struct ib_udata *udata);
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-
-int ipath_destroy_srq(struct ib_srq *ibsrq);
-
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
-
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-                             const struct ib_cq_init_attr *attr,
-                             struct ib_ucontext *context,
-                             struct ib_udata *udata);
-
-int ipath_destroy_cq(struct ib_cq *ibcq);
-
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
-
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
-
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
-
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                               u64 virt_addr, int mr_access_flags,
-                               struct ib_udata *udata);
-
-int ipath_dereg_mr(struct ib_mr *ibmr);
-
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-                              struct ib_fmr_attr *fmr_attr);
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-                      int list_len, u64 iova);
-
-int ipath_unmap_fmr(struct list_head *fmr_list);
-
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
-
-void ipath_release_mmap_info(struct kref *ref);
-
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-                                              u32 size,
-                                              struct ib_ucontext *context,
-                                              void *obj);
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-                           struct ipath_mmap_info *ip,
-                           u32 size, void *obj);
-
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-void ipath_insert_rnr_queue(struct ipath_qp *qp);
-
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-                  u32 *lengthp, struct ipath_sge_state *ss);
-
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
-
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-                  struct ib_global_route *grh, u32 hwords, u32 nwords);
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                          struct ipath_other_headers *ohdr,
-                          u32 bth0, u32 bth2);
-
-void ipath_do_send(unsigned long data);
-
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-                        enum ib_wc_status status);
-
-int ipath_make_rc_req(struct ipath_qp *qp);
-
-int ipath_make_uc_req(struct ipath_qp *qp);
-
-int ipath_make_ud_req(struct ipath_qp *qp);
-
-int ipath_register_ib_device(struct ipath_devdata *);
-
-void ipath_unregister_ib_device(struct ipath_ibdev *);
-
-void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
-
-int ipath_ib_piobufavail(struct ipath_ibdev *);
-
-unsigned ipath_get_npkeys(struct ipath_devdata *);
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *);
-
-unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
-
-extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
-
-/*
- * Below converts HCA-specific LinkTrainingState to IB PhysPortState
- * values.
- */
-extern const u8 ipath_cvt_physportstate[];
-#define IB_PHYSPORTSTATE_SLEEP 1
-#define IB_PHYSPORTSTATE_POLL 2
-#define IB_PHYSPORTSTATE_DISABLED 3
-#define IB_PHYSPORTSTATE_CFG_TRAIN 4
-#define IB_PHYSPORTSTATE_LINKUP 5
-#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
-
-extern const int ib_ipath_state_ops[];
-
-extern unsigned int ib_ipath_lkey_table_size;
-
-extern unsigned int ib_ipath_max_cqes;
-
-extern unsigned int ib_ipath_max_cqs;
-
-extern unsigned int ib_ipath_max_qp_wrs;
-
-extern unsigned int ib_ipath_max_qps;
-
-extern unsigned int ib_ipath_max_sges;
-
-extern unsigned int ib_ipath_max_mcast_grps;
-
-extern unsigned int ib_ipath_max_mcast_qp_attached;
-
-extern unsigned int ib_ipath_max_srqs;
-
-extern unsigned int ib_ipath_max_srq_sges;
-
-extern unsigned int ib_ipath_max_srq_wrs;
-
-extern const u32 ib_ipath_rnr_table[];
-
-extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
-
-#endif                         /* IPATH_VERBS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
deleted file mode 100644 (file)
index 72d476f..0000000
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/rculist.h>
-#include <linux/slab.h>
-
-#include "ipath_verbs.h"
-
-/*
- * Global table of GID to attached QPs.
- * The table is global to all ipath devices since a send from one QP/device
- * needs to be locally routed to any locally attached QPs on the same
- * or different device.
- */
-static struct rb_root mcast_tree;
-static DEFINE_SPINLOCK(mcast_lock);
-
-/**
- * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
- * @qp: the QP to link
- */
-static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
-{
-       struct ipath_mcast_qp *mqp;
-
-       mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
-       if (!mqp)
-               goto bail;
-
-       mqp->qp = qp;
-       atomic_inc(&qp->refcount);
-
-bail:
-       return mqp;
-}
-
-static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
-{
-       struct ipath_qp *qp = mqp->qp;
-
-       /* Notify ipath_destroy_qp() if it is waiting. */
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-
-       kfree(mqp);
-}
-
-/**
- * ipath_mcast_alloc - allocate the multicast GID structure
- * @mgid: the multicast GID
- *
- * A list of QPs will be attached to this structure.
- */
-static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
-{
-       struct ipath_mcast *mcast;
-
-       mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
-       if (!mcast)
-               goto bail;
-
-       mcast->mgid = *mgid;
-       INIT_LIST_HEAD(&mcast->qp_list);
-       init_waitqueue_head(&mcast->wait);
-       atomic_set(&mcast->refcount, 0);
-       mcast->n_attached = 0;
-
-bail:
-       return mcast;
-}
-
-static void ipath_mcast_free(struct ipath_mcast *mcast)
-{
-       struct ipath_mcast_qp *p, *tmp;
-
-       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
-               ipath_mcast_qp_free(p);
-
-       kfree(mcast);
-}
-
-/**
- * ipath_mcast_find - search the global table for the given multicast GID
- * @mgid: the multicast GID to search for
- *
- * Returns NULL if not found.
- *
- * The caller is responsible for decrementing the reference count if found.
- */
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
-{
-       struct rb_node *n;
-       unsigned long flags;
-       struct ipath_mcast *mcast;
-
-       spin_lock_irqsave(&mcast_lock, flags);
-       n = mcast_tree.rb_node;
-       while (n) {
-               int ret;
-
-               mcast = rb_entry(n, struct ipath_mcast, rb_node);
-
-               ret = memcmp(mgid->raw, mcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0)
-                       n = n->rb_left;
-               else if (ret > 0)
-                       n = n->rb_right;
-               else {
-                       atomic_inc(&mcast->refcount);
-                       spin_unlock_irqrestore(&mcast_lock, flags);
-                       goto bail;
-               }
-       }
-       spin_unlock_irqrestore(&mcast_lock, flags);
-
-       mcast = NULL;
-
-bail:
-       return mcast;
-}
-
-/**
- * ipath_mcast_add - insert mcast GID into table and attach QP struct
- * @mcast: the mcast GID table
- * @mqp: the QP to attach
- *
- * Return zero if both were added.  Return EEXIST if the GID was already in
- * the table but the QP was added.  Return ESRCH if the QP was already
- * attached and neither structure was added.
- */
-static int ipath_mcast_add(struct ipath_ibdev *dev,
-                          struct ipath_mcast *mcast,
-                          struct ipath_mcast_qp *mqp)
-{
-       struct rb_node **n = &mcast_tree.rb_node;
-       struct rb_node *pn = NULL;
-       int ret;
-
-       spin_lock_irq(&mcast_lock);
-
-       while (*n) {
-               struct ipath_mcast *tmcast;
-               struct ipath_mcast_qp *p;
-
-               pn = *n;
-               tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
-
-               ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0) {
-                       n = &pn->rb_left;
-                       continue;
-               }
-               if (ret > 0) {
-                       n = &pn->rb_right;
-                       continue;
-               }
-
-               /* Search the QP list to see if this is already there. */
-               list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
-                       if (p->qp == mqp->qp) {
-                               ret = ESRCH;
-                               goto bail;
-                       }
-               }
-               if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
-                       ret = ENOMEM;
-                       goto bail;
-               }
-
-               tmcast->n_attached++;
-
-               list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
-               ret = EEXIST;
-               goto bail;
-       }
-
-       spin_lock(&dev->n_mcast_grps_lock);
-       if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
-               spin_unlock(&dev->n_mcast_grps_lock);
-               ret = ENOMEM;
-               goto bail;
-       }
-
-       dev->n_mcast_grps_allocated++;
-       spin_unlock(&dev->n_mcast_grps_lock);
-
-       mcast->n_attached++;
-
-       list_add_tail_rcu(&mqp->list, &mcast->qp_list);
-
-       atomic_inc(&mcast->refcount);
-       rb_link_node(&mcast->rb_node, pn, n);
-       rb_insert_color(&mcast->rb_node, &mcast_tree);
-
-       ret = 0;
-
-bail:
-       spin_unlock_irq(&mcast_lock);
-
-       return ret;
-}
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_mcast *mcast;
-       struct ipath_mcast_qp *mqp;
-       int ret;
-
-       /*
-        * Allocate data structures since its better to do this outside of
-        * spin locks and it will most likely be needed.
-        */
-       mcast = ipath_mcast_alloc(gid);
-       if (mcast == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       mqp = ipath_mcast_qp_alloc(qp);
-       if (mqp == NULL) {
-               ipath_mcast_free(mcast);
-               ret = -ENOMEM;
-               goto bail;
-       }
-       switch (ipath_mcast_add(dev, mcast, mqp)) {
-       case ESRCH:
-               /* Neither was used: can't attach the same QP twice. */
-               ipath_mcast_qp_free(mqp);
-               ipath_mcast_free(mcast);
-               ret = -EINVAL;
-               goto bail;
-       case EEXIST:            /* The mcast wasn't used */
-               ipath_mcast_free(mcast);
-               break;
-       case ENOMEM:
-               /* Exceeded the maximum number of mcast groups. */
-               ipath_mcast_qp_free(mqp);
-               ipath_mcast_free(mcast);
-               ret = -ENOMEM;
-               goto bail;
-       default:
-               break;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_mcast *mcast = NULL;
-       struct ipath_mcast_qp *p, *tmp;
-       struct rb_node *n;
-       int last = 0;
-       int ret;
-
-       spin_lock_irq(&mcast_lock);
-
-       /* Find the GID in the mcast table. */
-       n = mcast_tree.rb_node;
-       while (1) {
-               if (n == NULL) {
-                       spin_unlock_irq(&mcast_lock);
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               mcast = rb_entry(n, struct ipath_mcast, rb_node);
-               ret = memcmp(gid->raw, mcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0)
-                       n = n->rb_left;
-               else if (ret > 0)
-                       n = n->rb_right;
-               else
-                       break;
-       }
-
-       /* Search the QP list. */
-       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
-               if (p->qp != qp)
-                       continue;
-               /*
-                * We found it, so remove it, but don't poison the forward
-                * link until we are sure there are no list walkers.
-                */
-               list_del_rcu(&p->list);
-               mcast->n_attached--;
-
-               /* If this was the last attached QP, remove the GID too. */
-               if (list_empty(&mcast->qp_list)) {
-                       rb_erase(&mcast->rb_node, &mcast_tree);
-                       last = 1;
-               }
-               break;
-       }
-
-       spin_unlock_irq(&mcast_lock);
-
-       if (p) {
-               /*
-                * Wait for any list walkers to finish before freeing the
-                * list element.
-                */
-               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
-               ipath_mcast_qp_free(p);
-       }
-       if (last) {
-               atomic_dec(&mcast->refcount);
-               wait_event(mcast->wait, !atomic_read(&mcast->refcount));
-               ipath_mcast_free(mcast);
-               spin_lock_irq(&dev->n_mcast_grps_lock);
-               dev->n_mcast_grps_allocated--;
-               spin_unlock_irq(&dev->n_mcast_grps_lock);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_mcast_tree_empty(void)
-{
-       return mcast_tree.rb_node == NULL;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
deleted file mode 100644 (file)
index 1a7e20a..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on PowerPC only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * Nothing to do on PowerPC, so just return without error.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-       return 0;
-}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
deleted file mode 100644 (file)
index 7b6e4c8..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on x86_64 only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include <linux/pci.h>
-#include <asm/processor.h>
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
- * write combining.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-       int ret = 0;
-       u64 pioaddr, piolen;
-       unsigned bits;
-       const unsigned long addr = pci_resource_start(dd->pcidev, 0);
-       const size_t len = pci_resource_len(dd->pcidev, 0);
-
-       /*
-        * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
-        * chip.  Linux (possibly the hardware) requires it to be on a power
-        * of 2 address matching the length (which has to be a power of 2).
-        * For rev1, that means the base address, for rev2, it will be just
-        * the PIO buffers themselves.
-        * For chips with two sets of buffers, the calculations are
-        * somewhat more complicated; we need to sum, and the piobufbase
-        * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
-        * The buffers are still packed, so a single range covers both.
-        */
-       if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
-               unsigned long pio2kbase, pio4kbase;
-               pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
-               pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
-               if (pio2kbase < pio4kbase) { /* all, for now */
-                       pioaddr = addr + pio2kbase;
-                       piolen = pio4kbase - pio2kbase +
-                               dd->ipath_piobcnt4k * dd->ipath_4kalign;
-               } else {
-                       pioaddr = addr + pio4kbase;
-                       piolen = pio2kbase - pio4kbase +
-                               dd->ipath_piobcnt2k * dd->ipath_palign;
-               }
-       } else {  /* single buffer size (2K, currently) */
-               pioaddr = addr + dd->ipath_piobufbase;
-               piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
-                       dd->ipath_piobcnt4k * dd->ipath_4kalign;
-       }
-
-       for (bits = 0; !(piolen & (1ULL << bits)); bits++)
-               /* do nothing */ ;
-
-       if (piolen != (1ULL << bits)) {
-               piolen >>= bits;
-               while (piolen >>= 1)
-                       bits++;
-               piolen = 1ULL << (bits + 1);
-       }
-       if (pioaddr & (piolen - 1)) {
-               u64 atmp;
-               ipath_dbg("pioaddr %llx not on right boundary for size "
-                         "%llx, fixing\n",
-                         (unsigned long long) pioaddr,
-                         (unsigned long long) piolen);
-               atmp = pioaddr & ~(piolen - 1);
-               if (atmp < addr || (atmp + piolen) > (addr + len)) {
-                       ipath_dev_err(dd, "No way to align address/size "
-                                     "(%llx/%llx), no WC mtrr\n",
-                                     (unsigned long long) atmp,
-                                     (unsigned long long) piolen << 1);
-                       ret = -ENODEV;
-               } else {
-                       ipath_dbg("changing WC base from %llx to %llx, "
-                                 "len from %llx to %llx\n",
-                                 (unsigned long long) pioaddr,
-                                 (unsigned long long) atmp,
-                                 (unsigned long long) piolen,
-                                 (unsigned long long) piolen << 1);
-                       pioaddr = atmp;
-                       piolen <<= 1;
-               }
-       }
-
-       if (!ret) {
-               dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
-               if (dd->wc_cookie < 0) {
-                       ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n");
-                       ret = -ENODEV;
-               } else if (dd->wc_cookie == 0)
-                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
-               else
-                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
-       }
-
-       return ret;
-}
-
-/**
- * ipath_disable_wc - disable write combining for MMIO writes to the device
- * @dd: infinipath device
- */
-void ipath_disable_wc(struct ipath_devdata *dd)
-{
-       arch_phys_wc_del(dd->wc_cookie);
-}
index efd6f45..7e8037e 100644 (file)
@@ -1,7 +1,7 @@
 menu "Speakup console speech"
 
 config SPEAKUP
-       depends on VT
+       depends on VT && !MN10300
        tristate "Speakup core"
        ---help---
                This is the Speakup screen reader.  Think of it as a
index 63c59bc..30cf973 100644 (file)
@@ -264,8 +264,9 @@ static struct notifier_block vt_notifier_block = {
        .notifier_call = vt_notifier_call,
 };
 
-static unsigned char get_attributes(u16 *pos)
+static unsigned char get_attributes(struct vc_data *vc, u16 *pos)
 {
+       pos = screen_pos(vc, pos - (u16 *)vc->vc_origin, 1);
        return (u_char) (scr_readw(pos) >> 8);
 }
 
@@ -275,7 +276,7 @@ static void speakup_date(struct vc_data *vc)
        spk_y = spk_cy = vc->vc_y;
        spk_pos = spk_cp = vc->vc_pos;
        spk_old_attr = spk_attr;
-       spk_attr = get_attributes((u_short *) spk_pos);
+       spk_attr = get_attributes(vc, (u_short *)spk_pos);
 }
 
 static void bleep(u_short val)
@@ -469,8 +470,12 @@ static u16 get_char(struct vc_data *vc, u16 *pos, u_char *attribs)
        u16 ch = ' ';
 
        if (vc && pos) {
-               u16 w = scr_readw(pos);
-               u16 c = w & 0xff;
+               u16 w;
+               u16 c;
+
+               pos = screen_pos(vc, pos - (u16 *)vc->vc_origin, 1);
+               w = scr_readw(pos);
+               c = w & 0xff;
 
                if (w & vc->vc_hi_font_mask)
                        c |= 0x100;
@@ -746,7 +751,7 @@ static int get_line(struct vc_data *vc)
        u_char tmp2;
 
        spk_old_attr = spk_attr;
-       spk_attr = get_attributes((u_short *) spk_pos);
+       spk_attr = get_attributes(vc, (u_short *)spk_pos);
        for (i = 0; i < vc->vc_cols; i++) {
                buf[i] = (u_char) get_char(vc, (u_short *) tmp, &tmp2);
                tmp += 2;
@@ -811,7 +816,7 @@ static int say_from_to(struct vc_data *vc, u_long from, u_long to,
        u_short saved_punc_mask = spk_punc_mask;
 
        spk_old_attr = spk_attr;
-       spk_attr = get_attributes((u_short *) from);
+       spk_attr = get_attributes(vc, (u_short *)from);
        while (from < to) {
                buf[i++] = (char)get_char(vc, (u_short *) from, &tmp);
                from += 2;
@@ -886,7 +891,7 @@ static int get_sentence_buf(struct vc_data *vc, int read_punc)
        sentmarks[bn][0] = &sentbuf[bn][0];
        i = 0;
        spk_old_attr = spk_attr;
-       spk_attr = get_attributes((u_short *) start);
+       spk_attr = get_attributes(vc, (u_short *)start);
 
        while (start < end) {
                sentbuf[bn][i] = (char)get_char(vc, (u_short *) start, &tmp);
@@ -1585,7 +1590,7 @@ static int count_highlight_color(struct vc_data *vc)
                u16 *ptr;
 
                for (ptr = start; ptr < end; ptr++) {
-                       ch = get_attributes(ptr);
+                       ch = get_attributes(vc, ptr);
                        bg = (ch & 0x70) >> 4;
                        speakup_console[vc_num]->ht.bgcount[bg]++;
                }
index aa5ab6c..41ef099 100644 (file)
@@ -142,7 +142,9 @@ static void __speakup_paste_selection(struct work_struct *work)
        struct tty_ldisc *ld;
        DECLARE_WAITQUEUE(wait, current);
 
-       ld = tty_ldisc_ref_wait(tty);
+       ld = tty_ldisc_ref(tty);
+       if (!ld)
+               goto tty_unref;
        tty_buffer_lock_exclusive(&vc->port);
 
        add_wait_queue(&vc->paste_wait, &wait);
@@ -162,6 +164,7 @@ static void __speakup_paste_selection(struct work_struct *work)
 
        tty_buffer_unlock_exclusive(&vc->port);
        tty_ldisc_deref(ld);
+tty_unref:
        tty_kref_put(tty);
 }
 
index 3b5835b..a5bbb33 100644 (file)
@@ -6,6 +6,11 @@
 #include "spk_priv.h"
 #include "serialio.h"
 
+#include <linux/serial_core.h>
+/* WARNING:  Do not change this to <linux/serial.h> without testing that
+ * SERIAL_PORT_DFNS does get defined to the appropriate value. */
+#include <asm/serial.h>
+
 #ifndef SERIAL_PORT_DFNS
 #define SERIAL_PORT_DFNS
 #endif
@@ -23,9 +28,15 @@ const struct old_serial_port *spk_serial_init(int index)
        int baud = 9600, quot = 0;
        unsigned int cval = 0;
        int cflag = CREAD | HUPCL | CLOCAL | B9600 | CS8;
-       const struct old_serial_port *ser = rs_table + index;
+       const struct old_serial_port *ser;
        int err;
 
+       if (index >= ARRAY_SIZE(rs_table)) {
+               pr_info("no port info for ttyS%d\n", index);
+               return NULL;
+       }
+       ser = rs_table + index;
+
        /*      Divisor, bytesize and parity */
        quot = ser->baud_base / baud;
        cval = cflag & (CSIZE | CSTOPB);
index 3327c49..713c63d 100644 (file)
@@ -898,7 +898,7 @@ static ssize_t unmap_zeroes_data_store(struct config_item *item,
        da->unmap_zeroes_data = flag;
        pr_debug("dev[%p]: SE Device Thin Provisioning LBPRZ bit: %d\n",
                 da->da_dev, flag);
-       return 0;
+       return count;
 }
 
 /*
index cacd97a..da457e2 100644 (file)
@@ -828,6 +828,50 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        return dev;
 }
 
+/*
+ * Check if the underlying struct block_device request_queue supports
+ * the QUEUE_FLAG_DISCARD bit for UNMAP/WRITE_SAME in SCSI + TRIM
+ * in ATA and we need to set TPE=1
+ */
+bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
+                                      struct request_queue *q, int block_size)
+{
+       if (!blk_queue_discard(q))
+               return false;
+
+       attrib->max_unmap_lba_count = (q->limits.max_discard_sectors << 9) /
+                                                               block_size;
+       /*
+        * Currently hardcoded to 1 in Linux/SCSI code..
+        */
+       attrib->max_unmap_block_desc_count = 1;
+       attrib->unmap_granularity = q->limits.discard_granularity / block_size;
+       attrib->unmap_granularity_alignment = q->limits.discard_alignment /
+                                                               block_size;
+       attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+       return true;
+}
+EXPORT_SYMBOL(target_configure_unmap_from_queue);
+
+/*
+ * Convert from blocksize advertised to the initiator to the 512 byte
+ * units unconditionally used by the Linux block layer.
+ */
+sector_t target_to_linux_sector(struct se_device *dev, sector_t lb)
+{
+       switch (dev->dev_attrib.block_size) {
+       case 4096:
+               return lb << 3;
+       case 2048:
+               return lb << 2;
+       case 1024:
+               return lb << 1;
+       default:
+               return lb;
+       }
+}
+EXPORT_SYMBOL(target_to_linux_sector);
+
 int target_configure_device(struct se_device *dev)
 {
        struct se_hba *hba = dev->se_hba;
index e319570..75f0f08 100644 (file)
@@ -160,25 +160,11 @@ static int fd_configure_device(struct se_device *dev)
                        " block_device blocks: %llu logical_block_size: %d\n",
                        dev_size, div_u64(dev_size, fd_dev->fd_block_size),
                        fd_dev->fd_block_size);
-               /*
-                * Check if the underlying struct block_device request_queue supports
-                * the QUEUE_FLAG_DISCARD bit for UNMAP/WRITE_SAME in SCSI + TRIM
-                * in ATA and we need to set TPE=1
-                */
-               if (blk_queue_discard(q)) {
-                       dev->dev_attrib.max_unmap_lba_count =
-                               q->limits.max_discard_sectors;
-                       /*
-                        * Currently hardcoded to 1 in Linux/SCSI code..
-                        */
-                       dev->dev_attrib.max_unmap_block_desc_count = 1;
-                       dev->dev_attrib.unmap_granularity =
-                               q->limits.discard_granularity >> 9;
-                       dev->dev_attrib.unmap_granularity_alignment =
-                               q->limits.discard_alignment;
+
+               if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
+                                                     fd_dev->fd_block_size))
                        pr_debug("IFILE: BLOCK Discard support available,"
-                                       " disabled by default\n");
-               }
+                                " disabled by default\n");
                /*
                 * Enable write same emulation for IBLOCK and use 0xFFFF as
                 * the smaller WRITE_SAME(10) only has a two-byte block count.
@@ -490,9 +476,12 @@ fd_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb)
        if (S_ISBLK(inode->i_mode)) {
                /* The backend is block device, use discard */
                struct block_device *bdev = inode->i_bdev;
+               struct se_device *dev = cmd->se_dev;
 
-               ret = blkdev_issue_discard(bdev, lba,
-                               nolb, GFP_KERNEL, 0);
+               ret = blkdev_issue_discard(bdev,
+                                          target_to_linux_sector(dev, lba),
+                                          target_to_linux_sector(dev,  nolb),
+                                          GFP_KERNEL, 0);
                if (ret < 0) {
                        pr_warn("FILEIO: blkdev_issue_discard() failed: %d\n",
                                ret);
index 5a2899f..abe4eb9 100644 (file)
@@ -121,29 +121,11 @@ static int iblock_configure_device(struct se_device *dev)
        dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
        dev->dev_attrib.hw_queue_depth = q->nr_requests;
 
-       /*
-        * Check if the underlying struct block_device request_queue supports
-        * the QUEUE_FLAG_DISCARD bit for UNMAP/WRITE_SAME in SCSI + TRIM
-        * in ATA and we need to set TPE=1
-        */
-       if (blk_queue_discard(q)) {
-               dev->dev_attrib.max_unmap_lba_count =
-                               q->limits.max_discard_sectors;
-
-               /*
-                * Currently hardcoded to 1 in Linux/SCSI code..
-                */
-               dev->dev_attrib.max_unmap_block_desc_count = 1;
-               dev->dev_attrib.unmap_granularity =
-                               q->limits.discard_granularity >> 9;
-               dev->dev_attrib.unmap_granularity_alignment =
-                               q->limits.discard_alignment;
-               dev->dev_attrib.unmap_zeroes_data =
-                               q->limits.discard_zeroes_data;
-
+       if (target_configure_unmap_from_queue(&dev->dev_attrib, q,
+                                             dev->dev_attrib.hw_block_size))
                pr_debug("IBLOCK: BLOCK Discard support available,"
-                               " disabled by default\n");
-       }
+                        " disabled by default\n");
+
        /*
         * Enable write same emulation for IBLOCK and use 0xFFFF as
         * the smaller WRITE_SAME(10) only has a two-byte block count.
@@ -415,9 +397,13 @@ static sense_reason_t
 iblock_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb)
 {
        struct block_device *bdev = IBLOCK_DEV(cmd->se_dev)->ibd_bd;
+       struct se_device *dev = cmd->se_dev;
        int ret;
 
-       ret = blkdev_issue_discard(bdev, lba, nolb, GFP_KERNEL, 0);
+       ret = blkdev_issue_discard(bdev,
+                                  target_to_linux_sector(dev, lba),
+                                  target_to_linux_sector(dev,  nolb),
+                                  GFP_KERNEL, 0);
        if (ret < 0) {
                pr_err("blkdev_issue_discard() failed: %d\n", ret);
                return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
@@ -433,8 +419,10 @@ iblock_execute_write_same(struct se_cmd *cmd)
        struct scatterlist *sg;
        struct bio *bio;
        struct bio_list list;
-       sector_t block_lba = cmd->t_task_lba;
-       sector_t sectors = sbc_get_write_same_sectors(cmd);
+       struct se_device *dev = cmd->se_dev;
+       sector_t block_lba = target_to_linux_sector(dev, cmd->t_task_lba);
+       sector_t sectors = target_to_linux_sector(dev,
+                                       sbc_get_write_same_sectors(cmd));
 
        if (cmd->prot_op) {
                pr_err("WRITE_SAME: Protection information with IBLOCK"
@@ -648,12 +636,12 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
                  enum dma_data_direction data_direction)
 {
        struct se_device *dev = cmd->se_dev;
+       sector_t block_lba = target_to_linux_sector(dev, cmd->t_task_lba);
        struct iblock_req *ibr;
        struct bio *bio, *bio_start;
        struct bio_list list;
        struct scatterlist *sg;
        u32 sg_num = sgl_nents;
-       sector_t block_lba;
        unsigned bio_cnt;
        int rw = 0;
        int i;
@@ -679,24 +667,6 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
                rw = READ;
        }
 
-       /*
-        * Convert the blocksize advertised to the initiator to the 512 byte
-        * units unconditionally used by the Linux block layer.
-        */
-       if (dev->dev_attrib.block_size == 4096)
-               block_lba = (cmd->t_task_lba << 3);
-       else if (dev->dev_attrib.block_size == 2048)
-               block_lba = (cmd->t_task_lba << 2);
-       else if (dev->dev_attrib.block_size == 1024)
-               block_lba = (cmd->t_task_lba << 1);
-       else if (dev->dev_attrib.block_size == 512)
-               block_lba = cmd->t_task_lba;
-       else {
-               pr_err("Unsupported SCSI -> BLOCK LBA conversion:"
-                               " %u\n", dev->dev_attrib.block_size);
-               return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-       }
-
        ibr = kzalloc(sizeof(struct iblock_req), GFP_KERNEL);
        if (!ibr)
                goto fail;
index dae0750..db4412f 100644 (file)
@@ -141,7 +141,6 @@ void        transport_dump_vpd_proto_id(struct t10_vpd *, unsigned char *, int);
 int    transport_dump_vpd_assoc(struct t10_vpd *, unsigned char *, int);
 int    transport_dump_vpd_ident_type(struct t10_vpd *, unsigned char *, int);
 int    transport_dump_vpd_ident(struct t10_vpd *, unsigned char *, int);
-bool   target_stop_cmd(struct se_cmd *cmd, unsigned long *flags);
 void   transport_clear_lun_ref(struct se_lun *);
 void   transport_send_task_abort(struct se_cmd *);
 sense_reason_t target_cmd_size_check(struct se_cmd *cmd, unsigned int size);
index fcdcb11..4f229e7 100644 (file)
@@ -68,23 +68,25 @@ void core_tmr_release_req(struct se_tmr_req *tmr)
 
        if (dev) {
                spin_lock_irqsave(&dev->se_tmr_lock, flags);
-               list_del(&tmr->tmr_list);
+               list_del_init(&tmr->tmr_list);
                spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
        }
 
        kfree(tmr);
 }
 
-static void core_tmr_handle_tas_abort(
-       struct se_node_acl *tmr_nacl,
-       struct se_cmd *cmd,
-       int tas)
+static void core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas)
 {
-       bool remove = true;
+       unsigned long flags;
+       bool remove = true, send_tas;
        /*
         * TASK ABORTED status (TAS) bit support
         */
-       if ((tmr_nacl && (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) {
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       send_tas = (cmd->transport_state & CMD_T_TAS);
+       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+
+       if (send_tas) {
                remove = false;
                transport_send_task_abort(cmd);
        }
@@ -107,6 +109,46 @@ static int target_check_cdb_and_preempt(struct list_head *list,
        return 1;
 }
 
+static bool __target_check_io_state(struct se_cmd *se_cmd,
+                                   struct se_session *tmr_sess, int tas)
+{
+       struct se_session *sess = se_cmd->se_sess;
+
+       assert_spin_locked(&sess->sess_cmd_lock);
+       WARN_ON_ONCE(!irqs_disabled());
+       /*
+        * If command already reached CMD_T_COMPLETE state within
+        * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
+        * this se_cmd has been passed to fabric driver and will
+        * not be aborted.
+        *
+        * Otherwise, obtain a local se_cmd->cmd_kref now for TMR
+        * ABORT_TASK + LUN_RESET for CMD_T_ABORTED processing as
+        * long as se_cmd->cmd_kref is still active unless zero.
+        */
+       spin_lock(&se_cmd->t_state_lock);
+       if (se_cmd->transport_state & (CMD_T_COMPLETE | CMD_T_FABRIC_STOP)) {
+               pr_debug("Attempted to abort io tag: %llu already complete or"
+                       " fabric stop, skipping\n", se_cmd->tag);
+               spin_unlock(&se_cmd->t_state_lock);
+               return false;
+       }
+       if (sess->sess_tearing_down || se_cmd->cmd_wait_set) {
+               pr_debug("Attempted to abort io tag: %llu already shutdown,"
+                       " skipping\n", se_cmd->tag);
+               spin_unlock(&se_cmd->t_state_lock);
+               return false;
+       }
+       se_cmd->transport_state |= CMD_T_ABORTED;
+
+       if ((tmr_sess != se_cmd->se_sess) && tas)
+               se_cmd->transport_state |= CMD_T_TAS;
+
+       spin_unlock(&se_cmd->t_state_lock);
+
+       return kref_get_unless_zero(&se_cmd->cmd_kref);
+}
+
 void core_tmr_abort_task(
        struct se_device *dev,
        struct se_tmr_req *tmr,
@@ -130,34 +172,21 @@ void core_tmr_abort_task(
                if (tmr->ref_task_tag != ref_tag)
                        continue;
 
-               if (!kref_get_unless_zero(&se_cmd->cmd_kref))
-                       continue;
-
                printk("ABORT_TASK: Found referenced %s task_tag: %llu\n",
                        se_cmd->se_tfo->get_fabric_name(), ref_tag);
 
-               spin_lock(&se_cmd->t_state_lock);
-               if (se_cmd->transport_state & CMD_T_COMPLETE) {
-                       printk("ABORT_TASK: ref_tag: %llu already complete,"
-                              " skipping\n", ref_tag);
-                       spin_unlock(&se_cmd->t_state_lock);
+               if (!__target_check_io_state(se_cmd, se_sess, 0)) {
                        spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
-
-                       target_put_sess_cmd(se_cmd);
-
                        goto out;
                }
-               se_cmd->transport_state |= CMD_T_ABORTED;
-               spin_unlock(&se_cmd->t_state_lock);
-
                list_del_init(&se_cmd->se_cmd_list);
                spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
 
                cancel_work_sync(&se_cmd->work);
                transport_wait_for_tasks(se_cmd);
 
-               target_put_sess_cmd(se_cmd);
                transport_cmd_finish_abort(se_cmd, true);
+               target_put_sess_cmd(se_cmd);
 
                printk("ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for"
                                " ref_tag: %llu\n", ref_tag);
@@ -178,9 +207,11 @@ static void core_tmr_drain_tmr_list(
        struct list_head *preempt_and_abort_list)
 {
        LIST_HEAD(drain_tmr_list);
+       struct se_session *sess;
        struct se_tmr_req *tmr_p, *tmr_pp;
        struct se_cmd *cmd;
        unsigned long flags;
+       bool rc;
        /*
         * Release all pending and outgoing TMRs aside from the received
         * LUN_RESET tmr..
@@ -206,17 +237,39 @@ static void core_tmr_drain_tmr_list(
                if (target_check_cdb_and_preempt(preempt_and_abort_list, cmd))
                        continue;
 
+               sess = cmd->se_sess;
+               if (WARN_ON_ONCE(!sess))
+                       continue;
+
+               spin_lock(&sess->sess_cmd_lock);
                spin_lock(&cmd->t_state_lock);
-               if (!(cmd->transport_state & CMD_T_ACTIVE)) {
+               if (!(cmd->transport_state & CMD_T_ACTIVE) ||
+                    (cmd->transport_state & CMD_T_FABRIC_STOP)) {
                        spin_unlock(&cmd->t_state_lock);
+                       spin_unlock(&sess->sess_cmd_lock);
                        continue;
                }
                if (cmd->t_state == TRANSPORT_ISTATE_PROCESSING) {
                        spin_unlock(&cmd->t_state_lock);
+                       spin_unlock(&sess->sess_cmd_lock);
                        continue;
                }
+               if (sess->sess_tearing_down || cmd->cmd_wait_set) {
+                       spin_unlock(&cmd->t_state_lock);
+                       spin_unlock(&sess->sess_cmd_lock);
+                       continue;
+               }
+               cmd->transport_state |= CMD_T_ABORTED;
                spin_unlock(&cmd->t_state_lock);
 
+               rc = kref_get_unless_zero(&cmd->cmd_kref);
+               if (!rc) {
+                       printk("LUN_RESET TMR: non-zero kref_get_unless_zero\n");
+                       spin_unlock(&sess->sess_cmd_lock);
+                       continue;
+               }
+               spin_unlock(&sess->sess_cmd_lock);
+
                list_move_tail(&tmr_p->tmr_list, &drain_tmr_list);
        }
        spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
@@ -230,20 +283,26 @@ static void core_tmr_drain_tmr_list(
                        (preempt_and_abort_list) ? "Preempt" : "", tmr_p,
                        tmr_p->function, tmr_p->response, cmd->t_state);
 
+               cancel_work_sync(&cmd->work);
+               transport_wait_for_tasks(cmd);
+
                transport_cmd_finish_abort(cmd, 1);
+               target_put_sess_cmd(cmd);
        }
 }
 
 static void core_tmr_drain_state_list(
        struct se_device *dev,
        struct se_cmd *prout_cmd,
-       struct se_node_acl *tmr_nacl,
+       struct se_session *tmr_sess,
        int tas,
        struct list_head *preempt_and_abort_list)
 {
        LIST_HEAD(drain_task_list);
+       struct se_session *sess;
        struct se_cmd *cmd, *next;
        unsigned long flags;
+       int rc;
 
        /*
         * Complete outstanding commands with TASK_ABORTED SAM status.
@@ -282,6 +341,16 @@ static void core_tmr_drain_state_list(
                if (prout_cmd == cmd)
                        continue;
 
+               sess = cmd->se_sess;
+               if (WARN_ON_ONCE(!sess))
+                       continue;
+
+               spin_lock(&sess->sess_cmd_lock);
+               rc = __target_check_io_state(cmd, tmr_sess, tas);
+               spin_unlock(&sess->sess_cmd_lock);
+               if (!rc)
+                       continue;
+
                list_move_tail(&cmd->state_list, &drain_task_list);
                cmd->state_active = false;
        }
@@ -289,7 +358,7 @@ static void core_tmr_drain_state_list(
 
        while (!list_empty(&drain_task_list)) {
                cmd = list_entry(drain_task_list.next, struct se_cmd, state_list);
-               list_del(&cmd->state_list);
+               list_del_init(&cmd->state_list);
 
                pr_debug("LUN_RESET: %s cmd: %p"
                        " ITT/CmdSN: 0x%08llx/0x%08x, i_state: %d, t_state: %d"
@@ -313,16 +382,11 @@ static void core_tmr_drain_state_list(
                 * loop above, but we do it down here given that
                 * cancel_work_sync may block.
                 */
-               if (cmd->t_state == TRANSPORT_COMPLETE)
-                       cancel_work_sync(&cmd->work);
-
-               spin_lock_irqsave(&cmd->t_state_lock, flags);
-               target_stop_cmd(cmd, &flags);
-
-               cmd->transport_state |= CMD_T_ABORTED;
-               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+               cancel_work_sync(&cmd->work);
+               transport_wait_for_tasks(cmd);
 
-               core_tmr_handle_tas_abort(tmr_nacl, cmd, tas);
+               core_tmr_handle_tas_abort(cmd, tas);
+               target_put_sess_cmd(cmd);
        }
 }
 
@@ -334,6 +398,7 @@ int core_tmr_lun_reset(
 {
        struct se_node_acl *tmr_nacl = NULL;
        struct se_portal_group *tmr_tpg = NULL;
+       struct se_session *tmr_sess = NULL;
        int tas;
         /*
         * TASK_ABORTED status bit, this is configurable via ConfigFS
@@ -352,8 +417,9 @@ int core_tmr_lun_reset(
         * or struct se_device passthrough..
         */
        if (tmr && tmr->task_cmd && tmr->task_cmd->se_sess) {
-               tmr_nacl = tmr->task_cmd->se_sess->se_node_acl;
-               tmr_tpg = tmr->task_cmd->se_sess->se_tpg;
+               tmr_sess = tmr->task_cmd->se_sess;
+               tmr_nacl = tmr_sess->se_node_acl;
+               tmr_tpg = tmr_sess->se_tpg;
                if (tmr_nacl && tmr_tpg) {
                        pr_debug("LUN_RESET: TMR caller fabric: %s"
                                " initiator port %s\n",
@@ -366,7 +432,7 @@ int core_tmr_lun_reset(
                dev->transport->name, tas);
 
        core_tmr_drain_tmr_list(dev, tmr, preempt_and_abort_list);
-       core_tmr_drain_state_list(dev, prout_cmd, tmr_nacl, tas,
+       core_tmr_drain_state_list(dev, prout_cmd, tmr_sess, tas,
                                preempt_and_abort_list);
 
        /*
index 9f3608e..867bc6d 100644 (file)
@@ -534,9 +534,6 @@ void transport_deregister_session(struct se_session *se_sess)
 }
 EXPORT_SYMBOL(transport_deregister_session);
 
-/*
- * Called with cmd->t_state_lock held.
- */
 static void target_remove_from_state_list(struct se_cmd *cmd)
 {
        struct se_device *dev = cmd->se_dev;
@@ -561,10 +558,6 @@ static int transport_cmd_check_stop(struct se_cmd *cmd, bool remove_from_lists,
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&cmd->t_state_lock, flags);
-       if (write_pending)
-               cmd->t_state = TRANSPORT_WRITE_PENDING;
-
        if (remove_from_lists) {
                target_remove_from_state_list(cmd);
 
@@ -574,6 +567,10 @@ static int transport_cmd_check_stop(struct se_cmd *cmd, bool remove_from_lists,
                cmd->se_lun = NULL;
        }
 
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       if (write_pending)
+               cmd->t_state = TRANSPORT_WRITE_PENDING;
+
        /*
         * Determine if frontend context caller is requesting the stopping of
         * this command for frontend exceptions.
@@ -627,6 +624,8 @@ static void transport_lun_remove_cmd(struct se_cmd *cmd)
 
 void transport_cmd_finish_abort(struct se_cmd *cmd, int remove)
 {
+       bool ack_kref = (cmd->se_cmd_flags & SCF_ACK_KREF);
+
        if (cmd->se_cmd_flags & SCF_SE_LUN_CMD)
                transport_lun_remove_cmd(cmd);
        /*
@@ -638,7 +637,7 @@ void transport_cmd_finish_abort(struct se_cmd *cmd, int remove)
 
        if (transport_cmd_check_stop_to_fabric(cmd))
                return;
-       if (remove)
+       if (remove && ack_kref)
                transport_put_cmd(cmd);
 }
 
@@ -693,20 +692,11 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status)
                        success = 1;
        }
 
-       /*
-        * See if we are waiting to complete for an exception condition.
-        */
-       if (cmd->transport_state & CMD_T_REQUEST_STOP) {
-               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
-               complete(&cmd->task_stop_comp);
-               return;
-       }
-
        /*
         * Check for case where an explicit ABORT_TASK has been received
         * and transport_wait_for_tasks() will be waiting for completion..
         */
-       if (cmd->transport_state & CMD_T_ABORTED &&
+       if (cmd->transport_state & CMD_T_ABORTED ||
            cmd->transport_state & CMD_T_STOP) {
                spin_unlock_irqrestore(&cmd->t_state_lock, flags);
                complete_all(&cmd->t_transport_stop_comp);
@@ -721,10 +711,10 @@ void target_complete_cmd(struct se_cmd *cmd, u8 scsi_status)
        cmd->transport_state |= (CMD_T_COMPLETE | CMD_T_ACTIVE);
        spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
-       if (cmd->cpuid == -1)
-               queue_work(target_completion_wq, &cmd->work);
-       else
+       if (cmd->se_cmd_flags & SCF_USE_CPUID)
                queue_work_on(cmd->cpuid, target_completion_wq, &cmd->work);
+       else
+               queue_work(target_completion_wq, &cmd->work);
 }
 EXPORT_SYMBOL(target_complete_cmd);
 
@@ -1203,7 +1193,6 @@ void transport_init_se_cmd(
        INIT_LIST_HEAD(&cmd->state_list);
        init_completion(&cmd->t_transport_stop_comp);
        init_completion(&cmd->cmd_wait_comp);
-       init_completion(&cmd->task_stop_comp);
        spin_lock_init(&cmd->t_state_lock);
        kref_init(&cmd->cmd_kref);
        cmd->transport_state = CMD_T_DEV_ACTIVE;
@@ -1437,6 +1426,12 @@ int target_submit_cmd_map_sgls(struct se_cmd *se_cmd, struct se_session *se_sess
         */
        transport_init_se_cmd(se_cmd, se_tpg->se_tpg_tfo, se_sess,
                                data_length, data_dir, task_attr, sense);
+
+       if (flags & TARGET_SCF_USE_CPUID)
+               se_cmd->se_cmd_flags |= SCF_USE_CPUID;
+       else
+               se_cmd->cpuid = WORK_CPU_UNBOUND;
+
        if (flags & TARGET_SCF_UNKNOWN_SIZE)
                se_cmd->unknown_data_length = 1;
        /*
@@ -1634,33 +1629,6 @@ int target_submit_tmr(struct se_cmd *se_cmd, struct se_session *se_sess,
 }
 EXPORT_SYMBOL(target_submit_tmr);
 
-/*
- * If the cmd is active, request it to be stopped and sleep until it
- * has completed.
- */
-bool target_stop_cmd(struct se_cmd *cmd, unsigned long *flags)
-       __releases(&cmd->t_state_lock)
-       __acquires(&cmd->t_state_lock)
-{
-       bool was_active = false;
-
-       if (cmd->transport_state & CMD_T_BUSY) {
-               cmd->transport_state |= CMD_T_REQUEST_STOP;
-               spin_unlock_irqrestore(&cmd->t_state_lock, *flags);
-
-               pr_debug("cmd %p waiting to complete\n", cmd);
-               wait_for_completion(&cmd->task_stop_comp);
-               pr_debug("cmd %p stopped successfully\n", cmd);
-
-               spin_lock_irqsave(&cmd->t_state_lock, *flags);
-               cmd->transport_state &= ~CMD_T_REQUEST_STOP;
-               cmd->transport_state &= ~CMD_T_BUSY;
-               was_active = true;
-       }
-
-       return was_active;
-}
-
 /*
  * Handle SAM-esque emulation for generic transport request failures.
  */
@@ -1859,19 +1827,21 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
        return true;
 }
 
+static int __transport_check_aborted_status(struct se_cmd *, int);
+
 void target_execute_cmd(struct se_cmd *cmd)
 {
-       /*
-        * If the received CDB has aleady been aborted stop processing it here.
-        */
-       if (transport_check_aborted_status(cmd, 1))
-               return;
-
        /*
         * Determine if frontend context caller is requesting the stopping of
         * this command for frontend exceptions.
+        *
+        * If the received CDB has aleady been aborted stop processing it here.
         */
        spin_lock_irq(&cmd->t_state_lock);
+       if (__transport_check_aborted_status(cmd, 1)) {
+               spin_unlock_irq(&cmd->t_state_lock);
+               return;
+       }
        if (cmd->transport_state & CMD_T_STOP) {
                pr_debug("%s:%d CMD_T_STOP for ITT: 0x%08llx\n",
                        __func__, __LINE__, cmd->tag);
@@ -2222,20 +2192,14 @@ static inline void transport_free_pages(struct se_cmd *cmd)
 }
 
 /**
- * transport_release_cmd - free a command
- * @cmd:       command to free
+ * transport_put_cmd - release a reference to a command
+ * @cmd:       command to release
  *
- * This routine unconditionally frees a command, and reference counting
- * or list removal must be done in the caller.
+ * This routine releases our reference to the command and frees it if possible.
  */
-static int transport_release_cmd(struct se_cmd *cmd)
+static int transport_put_cmd(struct se_cmd *cmd)
 {
        BUG_ON(!cmd->se_tfo);
-
-       if (cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)
-               core_tmr_release_req(cmd->se_tmr_req);
-       if (cmd->t_task_cdb != cmd->__t_task_cdb)
-               kfree(cmd->t_task_cdb);
        /*
         * If this cmd has been setup with target_get_sess_cmd(), drop
         * the kref and call ->release_cmd() in kref callback.
@@ -2243,18 +2207,6 @@ static int transport_release_cmd(struct se_cmd *cmd)
        return target_put_sess_cmd(cmd);
 }
 
-/**
- * transport_put_cmd - release a reference to a command
- * @cmd:       command to release
- *
- * This routine releases our reference to the command and frees it if possible.
- */
-static int transport_put_cmd(struct se_cmd *cmd)
-{
-       transport_free_pages(cmd);
-       return transport_release_cmd(cmd);
-}
-
 void *transport_kmap_data_sg(struct se_cmd *cmd)
 {
        struct scatterlist *sg = cmd->t_data_sg;
@@ -2450,34 +2402,58 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
        }
 }
 
-int transport_generic_free_cmd(struct se_cmd *cmd, int wait_for_tasks)
+static bool
+__transport_wait_for_tasks(struct se_cmd *, bool, bool *, bool *,
+                          unsigned long *flags);
+
+static void target_wait_free_cmd(struct se_cmd *cmd, bool *aborted, bool *tas)
 {
        unsigned long flags;
+
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       __transport_wait_for_tasks(cmd, true, aborted, tas, &flags);
+       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+}
+
+int transport_generic_free_cmd(struct se_cmd *cmd, int wait_for_tasks)
+{
        int ret = 0;
+       bool aborted = false, tas = false;
 
        if (!(cmd->se_cmd_flags & SCF_SE_LUN_CMD)) {
                if (wait_for_tasks && (cmd->se_cmd_flags & SCF_SCSI_TMR_CDB))
-                        transport_wait_for_tasks(cmd);
+                       target_wait_free_cmd(cmd, &aborted, &tas);
 
-               ret = transport_release_cmd(cmd);
+               if (!aborted || tas)
+                       ret = transport_put_cmd(cmd);
        } else {
                if (wait_for_tasks)
-                       transport_wait_for_tasks(cmd);
+                       target_wait_free_cmd(cmd, &aborted, &tas);
                /*
                 * Handle WRITE failure case where transport_generic_new_cmd()
                 * has already added se_cmd to state_list, but fabric has
                 * failed command before I/O submission.
                 */
-               if (cmd->state_active) {
-                       spin_lock_irqsave(&cmd->t_state_lock, flags);
+               if (cmd->state_active)
                        target_remove_from_state_list(cmd);
-                       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
-               }
 
                if (cmd->se_lun)
                        transport_lun_remove_cmd(cmd);
 
-               ret = transport_put_cmd(cmd);
+               if (!aborted || tas)
+                       ret = transport_put_cmd(cmd);
+       }
+       /*
+        * If the task has been internally aborted due to TMR ABORT_TASK
+        * or LUN_RESET, target_core_tmr.c is responsible for performing
+        * the remaining calls to target_put_sess_cmd(), and not the
+        * callers of this function.
+        */
+       if (aborted) {
+               pr_debug("Detected CMD_T_ABORTED for ITT: %llu\n", cmd->tag);
+               wait_for_completion(&cmd->cmd_wait_comp);
+               cmd->se_tfo->release_cmd(cmd);
+               ret = 1;
        }
        return ret;
 }
@@ -2517,26 +2493,46 @@ out:
 }
 EXPORT_SYMBOL(target_get_sess_cmd);
 
+static void target_free_cmd_mem(struct se_cmd *cmd)
+{
+       transport_free_pages(cmd);
+
+       if (cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)
+               core_tmr_release_req(cmd->se_tmr_req);
+       if (cmd->t_task_cdb != cmd->__t_task_cdb)
+               kfree(cmd->t_task_cdb);
+}
+
 static void target_release_cmd_kref(struct kref *kref)
 {
        struct se_cmd *se_cmd = container_of(kref, struct se_cmd, cmd_kref);
        struct se_session *se_sess = se_cmd->se_sess;
        unsigned long flags;
+       bool fabric_stop;
 
        spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
        if (list_empty(&se_cmd->se_cmd_list)) {
                spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
+               target_free_cmd_mem(se_cmd);
                se_cmd->se_tfo->release_cmd(se_cmd);
                return;
        }
-       if (se_sess->sess_tearing_down && se_cmd->cmd_wait_set) {
+
+       spin_lock(&se_cmd->t_state_lock);
+       fabric_stop = (se_cmd->transport_state & CMD_T_FABRIC_STOP);
+       spin_unlock(&se_cmd->t_state_lock);
+
+       if (se_cmd->cmd_wait_set || fabric_stop) {
+               list_del_init(&se_cmd->se_cmd_list);
                spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
+               target_free_cmd_mem(se_cmd);
                complete(&se_cmd->cmd_wait_comp);
                return;
        }
-       list_del(&se_cmd->se_cmd_list);
+       list_del_init(&se_cmd->se_cmd_list);
        spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
 
+       target_free_cmd_mem(se_cmd);
        se_cmd->se_tfo->release_cmd(se_cmd);
 }
 
@@ -2548,6 +2544,7 @@ int target_put_sess_cmd(struct se_cmd *se_cmd)
        struct se_session *se_sess = se_cmd->se_sess;
 
        if (!se_sess) {
+               target_free_cmd_mem(se_cmd);
                se_cmd->se_tfo->release_cmd(se_cmd);
                return 1;
        }
@@ -2564,6 +2561,7 @@ void target_sess_cmd_list_set_waiting(struct se_session *se_sess)
 {
        struct se_cmd *se_cmd;
        unsigned long flags;
+       int rc;
 
        spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
        if (se_sess->sess_tearing_down) {
@@ -2573,8 +2571,15 @@ void target_sess_cmd_list_set_waiting(struct se_session *se_sess)
        se_sess->sess_tearing_down = 1;
        list_splice_init(&se_sess->sess_cmd_list, &se_sess->sess_wait_list);
 
-       list_for_each_entry(se_cmd, &se_sess->sess_wait_list, se_cmd_list)
-               se_cmd->cmd_wait_set = 1;
+       list_for_each_entry(se_cmd, &se_sess->sess_wait_list, se_cmd_list) {
+               rc = kref_get_unless_zero(&se_cmd->cmd_kref);
+               if (rc) {
+                       se_cmd->cmd_wait_set = 1;
+                       spin_lock(&se_cmd->t_state_lock);
+                       se_cmd->transport_state |= CMD_T_FABRIC_STOP;
+                       spin_unlock(&se_cmd->t_state_lock);
+               }
+       }
 
        spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
 }
@@ -2587,15 +2592,25 @@ void target_wait_for_sess_cmds(struct se_session *se_sess)
 {
        struct se_cmd *se_cmd, *tmp_cmd;
        unsigned long flags;
+       bool tas;
 
        list_for_each_entry_safe(se_cmd, tmp_cmd,
                                &se_sess->sess_wait_list, se_cmd_list) {
-               list_del(&se_cmd->se_cmd_list);
+               list_del_init(&se_cmd->se_cmd_list);
 
                pr_debug("Waiting for se_cmd: %p t_state: %d, fabric state:"
                        " %d\n", se_cmd, se_cmd->t_state,
                        se_cmd->se_tfo->get_cmd_state(se_cmd));
 
+               spin_lock_irqsave(&se_cmd->t_state_lock, flags);
+               tas = (se_cmd->transport_state & CMD_T_TAS);
+               spin_unlock_irqrestore(&se_cmd->t_state_lock, flags);
+
+               if (!target_put_sess_cmd(se_cmd)) {
+                       if (tas)
+                               target_put_sess_cmd(se_cmd);
+               }
+
                wait_for_completion(&se_cmd->cmd_wait_comp);
                pr_debug("After cmd_wait_comp: se_cmd: %p t_state: %d"
                        " fabric state: %d\n", se_cmd, se_cmd->t_state,
@@ -2617,53 +2632,75 @@ void transport_clear_lun_ref(struct se_lun *lun)
        wait_for_completion(&lun->lun_ref_comp);
 }
 
-/**
- * transport_wait_for_tasks - wait for completion to occur
- * @cmd:       command to wait
- *
- * Called from frontend fabric context to wait for storage engine
- * to pause and/or release frontend generated struct se_cmd.
- */
-bool transport_wait_for_tasks(struct se_cmd *cmd)
+static bool
+__transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
+                          bool *aborted, bool *tas, unsigned long *flags)
+       __releases(&cmd->t_state_lock)
+       __acquires(&cmd->t_state_lock)
 {
-       unsigned long flags;
 
-       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       assert_spin_locked(&cmd->t_state_lock);
+       WARN_ON_ONCE(!irqs_disabled());
+
+       if (fabric_stop)
+               cmd->transport_state |= CMD_T_FABRIC_STOP;
+
+       if (cmd->transport_state & CMD_T_ABORTED)
+               *aborted = true;
+
+       if (cmd->transport_state & CMD_T_TAS)
+               *tas = true;
+
        if (!(cmd->se_cmd_flags & SCF_SE_LUN_CMD) &&
-           !(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)) {
-               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+           !(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB))
                return false;
-       }
 
        if (!(cmd->se_cmd_flags & SCF_SUPPORTED_SAM_OPCODE) &&
-           !(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)) {
-               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+           !(cmd->se_cmd_flags & SCF_SCSI_TMR_CDB))
                return false;
-       }
 
-       if (!(cmd->transport_state & CMD_T_ACTIVE)) {
-               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+       if (!(cmd->transport_state & CMD_T_ACTIVE))
+               return false;
+
+       if (fabric_stop && *aborted)
                return false;
-       }
 
        cmd->transport_state |= CMD_T_STOP;
 
-       pr_debug("wait_for_tasks: Stopping %p ITT: 0x%08llx i_state: %d, t_state: %d, CMD_T_STOP\n",
-               cmd, cmd->tag, cmd->se_tfo->get_cmd_state(cmd), cmd->t_state);
+       pr_debug("wait_for_tasks: Stopping %p ITT: 0x%08llx i_state: %d,"
+                " t_state: %d, CMD_T_STOP\n", cmd, cmd->tag,
+                cmd->se_tfo->get_cmd_state(cmd), cmd->t_state);
 
-       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+       spin_unlock_irqrestore(&cmd->t_state_lock, *flags);
 
        wait_for_completion(&cmd->t_transport_stop_comp);
 
-       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       spin_lock_irqsave(&cmd->t_state_lock, *flags);
        cmd->transport_state &= ~(CMD_T_ACTIVE | CMD_T_STOP);
 
-       pr_debug("wait_for_tasks: Stopped wait_for_completion(&cmd->t_transport_stop_comp) for ITT: 0x%08llx\n",
-               cmd->tag);
+       pr_debug("wait_for_tasks: Stopped wait_for_completion(&cmd->"
+                "t_transport_stop_comp) for ITT: 0x%08llx\n", cmd->tag);
 
+       return true;
+}
+
+/**
+ * transport_wait_for_tasks - wait for completion to occur
+ * @cmd:       command to wait
+ *
+ * Called from frontend fabric context to wait for storage engine
+ * to pause and/or release frontend generated struct se_cmd.
+ */
+bool transport_wait_for_tasks(struct se_cmd *cmd)
+{
+       unsigned long flags;
+       bool ret, aborted = false, tas = false;
+
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       ret = __transport_wait_for_tasks(cmd, false, &aborted, &tas, &flags);
        spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
-       return true;
+       return ret;
 }
 EXPORT_SYMBOL(transport_wait_for_tasks);
 
@@ -2845,28 +2882,49 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd,
 }
 EXPORT_SYMBOL(transport_send_check_condition_and_sense);
 
-int transport_check_aborted_status(struct se_cmd *cmd, int send_status)
+static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
+       __releases(&cmd->t_state_lock)
+       __acquires(&cmd->t_state_lock)
 {
+       assert_spin_locked(&cmd->t_state_lock);
+       WARN_ON_ONCE(!irqs_disabled());
+
        if (!(cmd->transport_state & CMD_T_ABORTED))
                return 0;
-
        /*
         * If cmd has been aborted but either no status is to be sent or it has
         * already been sent, just return
         */
-       if (!send_status || !(cmd->se_cmd_flags & SCF_SEND_DELAYED_TAS))
+       if (!send_status || !(cmd->se_cmd_flags & SCF_SEND_DELAYED_TAS)) {
+               if (send_status)
+                       cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS;
                return 1;
+       }
 
-       pr_debug("Sending delayed SAM_STAT_TASK_ABORTED status for CDB: 0x%02x ITT: 0x%08llx\n",
-                cmd->t_task_cdb[0], cmd->tag);
+       pr_debug("Sending delayed SAM_STAT_TASK_ABORTED status for CDB:"
+               " 0x%02x ITT: 0x%08llx\n", cmd->t_task_cdb[0], cmd->tag);
 
        cmd->se_cmd_flags &= ~SCF_SEND_DELAYED_TAS;
        cmd->scsi_status = SAM_STAT_TASK_ABORTED;
        trace_target_cmd_complete(cmd);
+
+       spin_unlock_irq(&cmd->t_state_lock);
        cmd->se_tfo->queue_status(cmd);
+       spin_lock_irq(&cmd->t_state_lock);
 
        return 1;
 }
+
+int transport_check_aborted_status(struct se_cmd *cmd, int send_status)
+{
+       int ret;
+
+       spin_lock_irq(&cmd->t_state_lock);
+       ret = __transport_check_aborted_status(cmd, send_status);
+       spin_unlock_irq(&cmd->t_state_lock);
+
+       return ret;
+}
 EXPORT_SYMBOL(transport_check_aborted_status);
 
 void transport_send_task_abort(struct se_cmd *cmd)
@@ -2888,11 +2946,17 @@ void transport_send_task_abort(struct se_cmd *cmd)
         */
        if (cmd->data_direction == DMA_TO_DEVICE) {
                if (cmd->se_tfo->write_pending_status(cmd) != 0) {
-                       cmd->transport_state |= CMD_T_ABORTED;
+                       spin_lock_irqsave(&cmd->t_state_lock, flags);
+                       if (cmd->se_cmd_flags & SCF_SEND_DELAYED_TAS) {
+                               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+                               goto send_abort;
+                       }
                        cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS;
+                       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
                        return;
                }
        }
+send_abort:
        cmd->scsi_status = SAM_STAT_TASK_ABORTED;
 
        transport_lun_remove_cmd(cmd);
@@ -2909,8 +2973,17 @@ static void target_tmr_work(struct work_struct *work)
        struct se_cmd *cmd = container_of(work, struct se_cmd, work);
        struct se_device *dev = cmd->se_dev;
        struct se_tmr_req *tmr = cmd->se_tmr_req;
+       unsigned long flags;
        int ret;
 
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       if (cmd->transport_state & CMD_T_ABORTED) {
+               tmr->response = TMR_FUNCTION_REJECTED;
+               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+               goto check_stop;
+       }
+       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+
        switch (tmr->function) {
        case TMR_ABORT_TASK:
                core_tmr_abort_task(dev, tmr, cmd->se_sess);
@@ -2943,9 +3016,17 @@ static void target_tmr_work(struct work_struct *work)
                break;
        }
 
+       spin_lock_irqsave(&cmd->t_state_lock, flags);
+       if (cmd->transport_state & CMD_T_ABORTED) {
+               spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+               goto check_stop;
+       }
        cmd->t_state = TRANSPORT_ISTATE_PROCESSING;
+       spin_unlock_irqrestore(&cmd->t_state_lock, flags);
+
        cmd->se_tfo->queue_tm_rsp(cmd);
 
+check_stop:
        transport_cmd_check_stop_to_fabric(cmd);
 }
 
index dd600e5..94f5154 100644 (file)
@@ -903,7 +903,7 @@ static int tcmu_configure_device(struct se_device *dev)
        info->version = __stringify(TCMU_MAILBOX_VERSION);
 
        info->mem[0].name = "tcm-user command & data buffer";
-       info->mem[0].addr = (phys_addr_t) udev->mb_addr;
+       info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr;
        info->mem[0].size = TCMU_RING_SIZE;
        info->mem[0].memtype = UIO_MEM_VIRTUAL;
 
index 8cc4ac6..7c92c09 100644 (file)
@@ -195,7 +195,7 @@ config IMX_THERMAL
          passive trip is crossed.
 
 config SPEAR_THERMAL
-       bool "SPEAr thermal sensor driver"
+       tristate "SPEAr thermal sensor driver"
        depends on PLAT_SPEAR || COMPILE_TEST
        depends on OF
        help
@@ -237,8 +237,8 @@ config DOVE_THERMAL
          framework.
 
 config DB8500_THERMAL
-       bool "DB8500 thermal management"
-       depends on ARCH_U8500
+       tristate "DB8500 thermal management"
+       depends on MFD_DB8500_PRCMU
        default y
        help
          Adds DB8500 thermal management implementation according to the thermal
index e3fbc5a..6ceac4f 100644 (file)
@@ -377,26 +377,28 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
  * get_load() - get load for a cpu since last updated
  * @cpufreq_device:    &struct cpufreq_cooling_device for this cpu
  * @cpu:       cpu number
+ * @cpu_idx:   index of the cpu in cpufreq_device->allowed_cpus
  *
  * Return: The average load of cpu @cpu in percentage since this
  * function was last called.
  */
-static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu)
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu,
+                   int cpu_idx)
 {
        u32 load;
        u64 now, now_idle, delta_time, delta_idle;
 
        now_idle = get_cpu_idle_time(cpu, &now, 0);
-       delta_idle = now_idle - cpufreq_device->time_in_idle[cpu];
-       delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu];
+       delta_idle = now_idle - cpufreq_device->time_in_idle[cpu_idx];
+       delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu_idx];
 
        if (delta_time <= delta_idle)
                load = 0;
        else
                load = div64_u64(100 * (delta_time - delta_idle), delta_time);
 
-       cpufreq_device->time_in_idle[cpu] = now_idle;
-       cpufreq_device->time_in_idle_timestamp[cpu] = now;
+       cpufreq_device->time_in_idle[cpu_idx] = now_idle;
+       cpufreq_device->time_in_idle_timestamp[cpu_idx] = now;
 
        return load;
 }
@@ -598,7 +600,7 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
                u32 load;
 
                if (cpu_online(cpu))
-                       load = get_load(cpufreq_device, cpu);
+                       load = get_load(cpufreq_device, cpu, i);
                else
                        load = 0;
 
index be4eedc..9043f8f 100644 (file)
@@ -475,14 +475,10 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
 
        sensor_np = of_node_get(dev->of_node);
 
-       for_each_child_of_node(np, child) {
+       for_each_available_child_of_node(np, child) {
                struct of_phandle_args sensor_specs;
                int ret, id;
 
-               /* Check whether child is enabled or not */
-               if (!of_device_is_available(child))
-                       continue;
-
                /* For now, thermal framework supports only 1 sensor per zone */
                ret = of_parse_phandle_with_args(child, "thermal-sensors",
                                                 "#thermal-sensor-cells",
@@ -881,16 +877,12 @@ int __init of_parse_thermal_zones(void)
                return 0; /* Run successfully on systems without thermal DT */
        }
 
-       for_each_child_of_node(np, child) {
+       for_each_available_child_of_node(np, child) {
                struct thermal_zone_device *zone;
                struct thermal_zone_params *tzp;
                int i, mask = 0;
                u32 prop;
 
-               /* Check whether child is enabled or not */
-               if (!of_device_is_available(child))
-                       continue;
-
                tz = thermal_of_build_thermal_zone(child);
                if (IS_ERR(tz)) {
                        pr_err("failed to build thermal zone %s: %ld\n",
@@ -968,13 +960,9 @@ void of_thermal_destroy_zones(void)
                return;
        }
 
-       for_each_child_of_node(np, child) {
+       for_each_available_child_of_node(np, child) {
                struct thermal_zone_device *zone;
 
-               /* Check whether child is enabled or not */
-               if (!of_device_is_available(child))
-                       continue;
-
                zone = thermal_zone_get_zone_by_name(child->name);
                if (IS_ERR(zone))
                        continue;
index 44b9c48..0e735ac 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/reboot.h>
@@ -75,8 +76,10 @@ struct rcar_thermal_priv {
 #define rcar_has_irq_support(priv)     ((priv)->common->base)
 #define rcar_id_to_shift(priv)         ((priv)->id * 8)
 
+#define USE_OF_THERMAL 1
 static const struct of_device_id rcar_thermal_dt_ids[] = {
        { .compatible = "renesas,rcar-thermal", },
+       { .compatible = "renesas,rcar-gen2-thermal", .data = (void *)USE_OF_THERMAL },
        {},
 };
 MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids);
@@ -200,9 +203,9 @@ err_out_unlock:
        return ret;
 }
 
-static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
+static int rcar_thermal_get_current_temp(struct rcar_thermal_priv *priv,
+                                        int *temp)
 {
-       struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
        int tmp;
        int ret;
 
@@ -226,6 +229,20 @@ static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
        return 0;
 }
 
+static int rcar_thermal_of_get_temp(void *data, int *temp)
+{
+       struct rcar_thermal_priv *priv = data;
+
+       return rcar_thermal_get_current_temp(priv, temp);
+}
+
+static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
+{
+       struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
+
+       return rcar_thermal_get_current_temp(priv, temp);
+}
+
 static int rcar_thermal_get_trip_type(struct thermal_zone_device *zone,
                                      int trip, enum thermal_trip_type *type)
 {
@@ -282,6 +299,10 @@ static int rcar_thermal_notify(struct thermal_zone_device *zone,
        return 0;
 }
 
+static const struct thermal_zone_of_device_ops rcar_thermal_zone_of_ops = {
+       .get_temp       = rcar_thermal_of_get_temp,
+};
+
 static struct thermal_zone_device_ops rcar_thermal_zone_ops = {
        .get_temp       = rcar_thermal_get_temp,
        .get_trip_type  = rcar_thermal_get_trip_type,
@@ -318,14 +339,20 @@ static void rcar_thermal_work(struct work_struct *work)
 
        priv = container_of(work, struct rcar_thermal_priv, work.work);
 
-       rcar_thermal_get_temp(priv->zone, &cctemp);
+       ret = rcar_thermal_get_current_temp(priv, &cctemp);
+       if (ret < 0)
+               return;
+
        ret = rcar_thermal_update_temp(priv);
        if (ret < 0)
                return;
 
        rcar_thermal_irq_enable(priv);
 
-       rcar_thermal_get_temp(priv->zone, &nctemp);
+       ret = rcar_thermal_get_current_temp(priv, &nctemp);
+       if (ret < 0)
+               return;
+
        if (nctemp != cctemp)
                thermal_zone_device_update(priv->zone);
 }
@@ -403,6 +430,8 @@ static int rcar_thermal_probe(struct platform_device *pdev)
        struct rcar_thermal_priv *priv;
        struct device *dev = &pdev->dev;
        struct resource *res, *irq;
+       const struct of_device_id *of_id = of_match_device(rcar_thermal_dt_ids, dev);
+       unsigned long of_data = (unsigned long)of_id->data;
        int mres = 0;
        int i;
        int ret = -ENODEV;
@@ -463,7 +492,13 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                if (ret < 0)
                        goto error_unregister;
 
-               priv->zone = thermal_zone_device_register("rcar_thermal",
+               if (of_data == USE_OF_THERMAL)
+                       priv->zone = thermal_zone_of_sensor_register(
+                                               dev, i, priv,
+                                               &rcar_thermal_zone_of_ops);
+               else
+                       priv->zone = thermal_zone_device_register(
+                                               "rcar_thermal",
                                                1, 0, priv,
                                                &rcar_thermal_zone_ops, NULL, 0,
                                                idle);
index 534dd91..81b35aa 100644 (file)
@@ -54,8 +54,7 @@ static struct thermal_zone_device_ops ops = {
        .get_temp = thermal_get_temp,
 };
 
-#ifdef CONFIG_PM
-static int spear_thermal_suspend(struct device *dev)
+static int __maybe_unused spear_thermal_suspend(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct thermal_zone_device *spear_thermal = platform_get_drvdata(pdev);
@@ -72,7 +71,7 @@ static int spear_thermal_suspend(struct device *dev)
        return 0;
 }
 
-static int spear_thermal_resume(struct device *dev)
+static int __maybe_unused spear_thermal_resume(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct thermal_zone_device *spear_thermal = platform_get_drvdata(pdev);
@@ -94,7 +93,6 @@ static int spear_thermal_resume(struct device *dev)
 
        return 0;
 }
-#endif
 
 static SIMPLE_DEV_PM_OPS(spear_thermal_pm_ops, spear_thermal_suspend,
                spear_thermal_resume);
index d9a5fc2..b280aba 100644 (file)
@@ -269,16 +269,13 @@ static void n_tty_check_throttle(struct tty_struct *tty)
 
 static void n_tty_check_unthrottle(struct tty_struct *tty)
 {
-       if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
-           tty->link->ldisc->ops->write_wakeup == n_tty_write_wakeup) {
+       if (tty->driver->type == TTY_DRIVER_TYPE_PTY) {
                if (chars_in_buffer(tty) > TTY_THRESHOLD_UNTHROTTLE)
                        return;
                if (!tty->count)
                        return;
                n_tty_kick_worker(tty);
-               n_tty_write_wakeup(tty->link);
-               if (waitqueue_active(&tty->link->write_wait))
-                       wake_up_interruptible_poll(&tty->link->write_wait, POLLOUT);
+               tty_wakeup(tty->link);
                return;
        }
 
index b311004..2348fa6 100644 (file)
@@ -681,7 +681,14 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
 /* this is called once with whichever end is closed last */
 static void pty_unix98_shutdown(struct tty_struct *tty)
 {
-       devpts_kill_index(tty->driver_data, tty->index);
+       struct inode *ptmx_inode;
+
+       if (tty->driver->subtype == PTY_TYPE_MASTER)
+               ptmx_inode = tty->driver_data;
+       else
+               ptmx_inode = tty->link->driver_data;
+       devpts_kill_index(ptmx_inode, tty->index);
+       devpts_del_ref(ptmx_inode);
 }
 
 static const struct tty_operations ptm_unix98_ops = {
@@ -773,6 +780,18 @@ static int ptmx_open(struct inode *inode, struct file *filp)
        set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
        tty->driver_data = inode;
 
+       /*
+        * In the case where all references to ptmx inode are dropped and we
+        * still have /dev/tty opened pointing to the master/slave pair (ptmx
+        * is closed/released before /dev/tty), we must make sure that the inode
+        * is still valid when we call the final pty_unix98_shutdown, thus we
+        * hold an additional reference to the ptmx inode. For the same /dev/tty
+        * last close case, we also need to make sure the super_block isn't
+        * destroyed (devpts instance unmounted), before /dev/tty is closed and
+        * on its release devpts_kill_index is called.
+        */
+       devpts_add_ref(inode);
+
        tty_add_file(tty, filp);
 
        slave_inode = devpts_pty_new(inode,
index 4097f3f..7cd6f9a 100644 (file)
@@ -1379,6 +1379,9 @@ ce4100_serial_setup(struct serial_private *priv,
 #define PCI_DEVICE_ID_INTEL_BSW_UART1  0x228a
 #define PCI_DEVICE_ID_INTEL_BSW_UART2  0x228c
 
+#define PCI_DEVICE_ID_INTEL_BDW_UART1  0x9ce3
+#define PCI_DEVICE_ID_INTEL_BDW_UART2  0x9ce4
+
 #define BYT_PRV_CLK                    0x800
 #define BYT_PRV_CLK_EN                 (1 << 0)
 #define BYT_PRV_CLK_M_VAL_SHIFT                1
@@ -1461,11 +1464,13 @@ byt_serial_setup(struct serial_private *priv,
        switch (pdev->device) {
        case PCI_DEVICE_ID_INTEL_BYT_UART1:
        case PCI_DEVICE_ID_INTEL_BSW_UART1:
+       case PCI_DEVICE_ID_INTEL_BDW_UART1:
                rx_param->src_id = 3;
                tx_param->dst_id = 2;
                break;
        case PCI_DEVICE_ID_INTEL_BYT_UART2:
        case PCI_DEVICE_ID_INTEL_BSW_UART2:
+       case PCI_DEVICE_ID_INTEL_BDW_UART2:
                rx_param->src_id = 5;
                tx_param->dst_id = 4;
                break;
@@ -1936,6 +1941,7 @@ pci_wch_ch38x_setup(struct serial_private *priv,
 #define PCIE_VENDOR_ID_WCH             0x1c00
 #define PCIE_DEVICE_ID_WCH_CH382_2S1P  0x3250
 #define PCIE_DEVICE_ID_WCH_CH384_4S    0x3470
+#define PCIE_DEVICE_ID_WCH_CH382_2S    0x3253
 
 #define PCI_VENDOR_ID_PERICOM                  0x12D8
 #define PCI_DEVICE_ID_PERICOM_PI7C9X7951       0x7951
@@ -2062,6 +2068,20 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = {
                .subdevice      = PCI_ANY_ID,
                .setup          = byt_serial_setup,
        },
+       {
+               .vendor         = PCI_VENDOR_ID_INTEL,
+               .device         = PCI_DEVICE_ID_INTEL_BDW_UART1,
+               .subvendor      = PCI_ANY_ID,
+               .subdevice      = PCI_ANY_ID,
+               .setup          = byt_serial_setup,
+       },
+       {
+               .vendor         = PCI_VENDOR_ID_INTEL,
+               .device         = PCI_DEVICE_ID_INTEL_BDW_UART2,
+               .subvendor      = PCI_ANY_ID,
+               .subdevice      = PCI_ANY_ID,
+               .setup          = byt_serial_setup,
+       },
        /*
         * ITE
         */
@@ -2618,6 +2638,14 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = {
                .subdevice      = PCI_ANY_ID,
                .setup          = pci_wch_ch353_setup,
        },
+       /* WCH CH382 2S card (16850 clone) */
+       {
+               .vendor         = PCIE_VENDOR_ID_WCH,
+               .device         = PCIE_DEVICE_ID_WCH_CH382_2S,
+               .subvendor      = PCI_ANY_ID,
+               .subdevice      = PCI_ANY_ID,
+               .setup          = pci_wch_ch38x_setup,
+       },
        /* WCH CH382 2S1P card (16850 clone) */
        {
                .vendor         = PCIE_VENDOR_ID_WCH,
@@ -2936,6 +2964,7 @@ enum pci_board_num_t {
        pbn_fintek_4,
        pbn_fintek_8,
        pbn_fintek_12,
+       pbn_wch382_2,
        pbn_wch384_4,
        pbn_pericom_PI7C9X7951,
        pbn_pericom_PI7C9X7952,
@@ -3756,6 +3785,13 @@ static struct pciserial_board pci_boards[] = {
                .base_baud      = 115200,
                .first_offset   = 0x40,
        },
+       [pbn_wch382_2] = {
+               .flags          = FL_BASE0,
+               .num_ports      = 2,
+               .base_baud      = 115200,
+               .uart_offset    = 8,
+               .first_offset   = 0xC0,
+       },
        [pbn_wch384_4] = {
                .flags          = FL_BASE0,
                .num_ports      = 4,
@@ -5506,6 +5542,16 @@ static struct pci_device_id serial_pci_tbl[] = {
                PCI_CLASS_COMMUNICATION_SERIAL << 8, 0xff0000,
                pbn_byt },
 
+       /* Intel Broadwell */
+       {       PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_UART1,
+               PCI_ANY_ID,  PCI_ANY_ID,
+               PCI_CLASS_COMMUNICATION_SERIAL << 8, 0xff0000,
+               pbn_byt },
+       {       PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_UART2,
+               PCI_ANY_ID,  PCI_ANY_ID,
+               PCI_CLASS_COMMUNICATION_SERIAL << 8, 0xff0000,
+               pbn_byt },
+
        /*
         * Intel Quark x1000
         */
@@ -5545,6 +5591,10 @@ static struct pci_device_id serial_pci_tbl[] = {
                PCI_ANY_ID, PCI_ANY_ID,
                0, 0, pbn_b0_bt_2_115200 },
 
+       {       PCIE_VENDOR_ID_WCH, PCIE_DEVICE_ID_WCH_CH382_2S,
+               PCI_ANY_ID, PCI_ANY_ID,
+               0, 0, pbn_wch382_2 },
+
        {       PCIE_VENDOR_ID_WCH, PCIE_DEVICE_ID_WCH_CH384_4S,
                PCI_ANY_ID, PCI_ANY_ID,
                0, 0, pbn_wch384_4 },
index b645f92..fa49eb1 100644 (file)
@@ -1165,7 +1165,7 @@ serial_omap_type(struct uart_port *port)
 
 #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
 
-static void wait_for_xmitr(struct uart_omap_port *up)
+static void __maybe_unused wait_for_xmitr(struct uart_omap_port *up)
 {
        unsigned int status, tmout = 10000;
 
@@ -1343,7 +1343,7 @@ static inline void serial_omap_add_console_port(struct uart_omap_port *up)
 
 /* Enable or disable the rs485 support */
 static int
-serial_omap_config_rs485(struct uart_port *port, struct serial_rs485 *rs485conf)
+serial_omap_config_rs485(struct uart_port *port, struct serial_rs485 *rs485)
 {
        struct uart_omap_port *up = to_uart_omap_port(port);
        unsigned int mode;
@@ -1356,8 +1356,12 @@ serial_omap_config_rs485(struct uart_port *port, struct serial_rs485 *rs485conf)
        up->ier = 0;
        serial_out(up, UART_IER, 0);
 
+       /* Clamp the delays to [0, 100ms] */
+       rs485->delay_rts_before_send = min(rs485->delay_rts_before_send, 100U);
+       rs485->delay_rts_after_send  = min(rs485->delay_rts_after_send, 100U);
+
        /* store new config */
-       port->rs485 = *rs485conf;
+       port->rs485 = *rs485;
 
        /*
         * Just as a precaution, only allow rs485
index 892c923..a7eacef 100644 (file)
@@ -1463,13 +1463,13 @@ static int tty_reopen(struct tty_struct *tty)
 {
        struct tty_driver *driver = tty->driver;
 
-       if (!tty->count)
-               return -EIO;
-
        if (driver->type == TTY_DRIVER_TYPE_PTY &&
            driver->subtype == PTY_TYPE_MASTER)
                return -EIO;
 
+       if (!tty->count)
+               return -EAGAIN;
+
        if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN))
                return -EBUSY;
 
@@ -2065,9 +2065,13 @@ retry_open:
 
                if (tty) {
                        mutex_unlock(&tty_mutex);
-                       tty_lock(tty);
-                       /* safe to drop the kref from tty_driver_lookup_tty() */
-                       tty_kref_put(tty);
+                       retval = tty_lock_interruptible(tty);
+                       tty_kref_put(tty);  /* drop kref from tty_driver_lookup_tty() */
+                       if (retval) {
+                               if (retval == -EINTR)
+                                       retval = -ERESTARTSYS;
+                               goto err_unref;
+                       }
                        retval = tty_reopen(tty);
                        if (retval < 0) {
                                tty_unlock(tty);
@@ -2083,7 +2087,11 @@ retry_open:
 
        if (IS_ERR(tty)) {
                retval = PTR_ERR(tty);
-               goto err_file;
+               if (retval != -EAGAIN || signal_pending(current))
+                       goto err_file;
+               tty_free_file(filp);
+               schedule();
+               goto retry_open;
        }
 
        tty_add_file(tty, filp);
@@ -2152,6 +2160,7 @@ retry_open:
        return 0;
 err_unlock:
        mutex_unlock(&tty_mutex);
+err_unref:
        /* after locks to avoid deadlock */
        if (!IS_ERR_OR_NULL(driver))
                tty_driver_kref_put(driver);
@@ -2648,6 +2657,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
        return ret;
 }
 
+/**
+ *     tiocgetd        -       get line discipline
+ *     @tty: tty device
+ *     @p: pointer to user data
+ *
+ *     Retrieves the line discipline id directly from the ldisc.
+ *
+ *     Locking: waits for ldisc reference (in case the line discipline
+ *             is changing or the tty is being hungup)
+ */
+
+static int tiocgetd(struct tty_struct *tty, int __user *p)
+{
+       struct tty_ldisc *ld;
+       int ret;
+
+       ld = tty_ldisc_ref_wait(tty);
+       ret = put_user(ld->ops->num, p);
+       tty_ldisc_deref(ld);
+       return ret;
+}
+
 /**
  *     send_break      -       performed time break
  *     @tty: device to break on
@@ -2874,7 +2905,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case TIOCGSID:
                return tiocgsid(tty, real_tty, p);
        case TIOCGETD:
-               return put_user(tty->ldisc->ops->num, (int __user *)p);
+               return tiocgetd(tty, p);
        case TIOCSETD:
                return tiocsetd(tty, p);
        case TIOCVHANGUP:
index 77703a3..dfa9ec0 100644 (file)
@@ -19,6 +19,19 @@ void __lockfunc tty_lock(struct tty_struct *tty)
 }
 EXPORT_SYMBOL(tty_lock);
 
+int tty_lock_interruptible(struct tty_struct *tty)
+{
+       int ret;
+
+       if (WARN(tty->magic != TTY_MAGIC, "L Bad %p\n", tty))
+               return -EIO;
+       tty_kref_get(tty);
+       ret = mutex_lock_interruptible(&tty->legacy_mutex);
+       if (ret)
+               tty_kref_put(tty);
+       return ret;
+}
+
 void __lockfunc tty_unlock(struct tty_struct *tty)
 {
        if (WARN(tty->magic != TTY_MAGIC, "U Bad %p\n", tty))
index e7cbc44..bd51bdd 100644 (file)
@@ -4250,6 +4250,7 @@ unsigned short *screen_pos(struct vc_data *vc, int w_offset, int viewed)
 {
        return screenpos(vc, 2 * w_offset, viewed);
 }
+EXPORT_SYMBOL_GPL(screen_pos);
 
 void getconsxy(struct vc_data *vc, unsigned char *p)
 {
index b59195e..b635ab6 100644 (file)
@@ -85,8 +85,8 @@ static int ci_hdrc_pci_probe(struct pci_dev *pdev,
 
        /* register a nop PHY */
        ci->phy = usb_phy_generic_register();
-       if (!ci->phy)
-               return -ENOMEM;
+       if (IS_ERR(ci->phy))
+               return PTR_ERR(ci->phy);
 
        memset(res, 0, sizeof(res));
        res[0].start    = pci_resource_start(pdev, 0);
index a4f7db2..df47110 100644 (file)
@@ -100,6 +100,9 @@ static ssize_t ci_port_test_write(struct file *file, const char __user *ubuf,
        if (sscanf(buf, "%u", &mode) != 1)
                return -EINVAL;
 
+       if (mode > 255)
+               return -EBADRQC;
+
        pm_runtime_get_sync(ci->dev);
        spin_lock_irqsave(&ci->lock, flags);
        ret = hw_port_test_set(ci, mode);
index 45f86da..03b6743 100644 (file)
@@ -158,7 +158,7 @@ static void ci_otg_work(struct work_struct *work)
 int ci_hdrc_otg_init(struct ci_hdrc *ci)
 {
        INIT_WORK(&ci->work, ci_otg_work);
-       ci->wq = create_singlethread_workqueue("ci_otg");
+       ci->wq = create_freezable_workqueue("ci_otg");
        if (!ci->wq) {
                dev_err(ci->dev, "can't create workqueue\n");
                return -ENODEV;
index 26ca4f9..fa4e239 100644 (file)
@@ -428,7 +428,8 @@ static void acm_read_bulk_callback(struct urb *urb)
                set_bit(rb->index, &acm->read_urbs_free);
                dev_dbg(&acm->data->dev, "%s - non-zero urb status: %d\n",
                                                        __func__, status);
-               return;
+               if ((status != -ENOENT) || (urb->actual_length == 0))
+                       return;
        }
 
        usb_mark_last_busy(acm->dev);
@@ -1404,6 +1405,8 @@ made_compressed_probe:
                                usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
                                NULL, acm->writesize, acm_write_bulk, snd);
                snd->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+               if (quirks & SEND_ZERO_PACKET)
+                       snd->urb->transfer_flags |= URB_ZERO_PACKET;
                snd->instance = acm;
        }
 
@@ -1838,6 +1841,11 @@ static const struct usb_device_id acm_ids[] = {
        },
 #endif
 
+       /*Samsung phone in firmware update mode */
+       { USB_DEVICE(0x04e8, 0x685d),
+       .driver_info = IGNORE_DEVICE,
+       },
+
        /* Exclude Infineon Flash Loader utility */
        { USB_DEVICE(0x058b, 0x0041),
        .driver_info = IGNORE_DEVICE,
@@ -1861,6 +1869,10 @@ static const struct usb_device_id acm_ids[] = {
        { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
                USB_CDC_ACM_PROTO_AT_CDMA) },
 
+       { USB_DEVICE(0x1519, 0x0452), /* Intel 7260 modem */
+       .driver_info = SEND_ZERO_PACKET,
+       },
+
        { }
 };
 
index dd9af38..ccfaba9 100644 (file)
@@ -134,3 +134,4 @@ struct acm {
 #define IGNORE_DEVICE                  BIT(5)
 #define QUIRK_CONTROL_LINE_STATE       BIT(6)
 #define CLEAR_HALT_CONDITIONS          BIT(7)
+#define SEND_ZERO_PACKET               BIT(8)
index fd95ba6..f0decc0 100644 (file)
@@ -1,5 +1,6 @@
 config USB_DWC2
        tristate "DesignWare USB2 DRD Core Support"
+       depends on HAS_DMA
        depends on USB || USB_GADGET
        help
          Say Y here if your system has a Dual Role Hi-Speed USB
index 39a0fa8..46c4ba7 100644 (file)
@@ -572,12 +572,6 @@ static bool dwc2_force_mode(struct dwc2_hsotg *hsotg, bool host)
        set = host ? GUSBCFG_FORCEHOSTMODE : GUSBCFG_FORCEDEVMODE;
        clear = host ? GUSBCFG_FORCEDEVMODE : GUSBCFG_FORCEHOSTMODE;
 
-       /*
-        * If the force mode bit is already set, don't set it.
-        */
-       if ((gusbcfg & set) && !(gusbcfg & clear))
-               return false;
-
        gusbcfg &= ~clear;
        gusbcfg |= set;
        dwc2_writel(gusbcfg, hsotg->regs + GUSBCFG);
@@ -625,6 +619,12 @@ void dwc2_force_dr_mode(struct dwc2_hsotg *hsotg)
                         __func__, hsotg->dr_mode);
                break;
        }
+
+       /*
+        * NOTE: This is required for some rockchip soc based
+        * platforms.
+        */
+       msleep(50);
 }
 
 /*
@@ -3278,9 +3278,6 @@ static void dwc2_get_dev_hwparams(struct dwc2_hsotg *hsotg)
 /**
  * During device initialization, read various hardware configuration
  * registers and interpret the contents.
- *
- * This should be called during driver probe. It will perform a core
- * soft reset in order to get the reset values of the parameters.
  */
 int dwc2_get_hwparams(struct dwc2_hsotg *hsotg)
 {
@@ -3288,7 +3285,6 @@ int dwc2_get_hwparams(struct dwc2_hsotg *hsotg)
        unsigned width;
        u32 hwcfg1, hwcfg2, hwcfg3, hwcfg4;
        u32 grxfsiz;
-       int retval;
 
        /*
         * Attempt to ensure this device is really a DWC_otg Controller.
@@ -3308,10 +3304,6 @@ int dwc2_get_hwparams(struct dwc2_hsotg *hsotg)
                hw->snpsid >> 12 & 0xf, hw->snpsid >> 8 & 0xf,
                hw->snpsid >> 4 & 0xf, hw->snpsid & 0xf, hw->snpsid);
 
-       retval = dwc2_core_reset(hsotg);
-       if (retval)
-               return retval;
-
        hwcfg1 = dwc2_readl(hsotg->regs + GHWCFG1);
        hwcfg2 = dwc2_readl(hsotg->regs + GHWCFG2);
        hwcfg3 = dwc2_readl(hsotg->regs + GHWCFG3);
index 36606fc..a41274a 100644 (file)
@@ -1174,14 +1174,11 @@ static int dwc2_process_non_isoc_desc(struct dwc2_hsotg *hsotg,
        failed = dwc2_update_non_isoc_urb_state_ddma(hsotg, chan, qtd, dma_desc,
                                                     halt_status, n_bytes,
                                                     xfer_done);
-       if (*xfer_done && urb->status != -EINPROGRESS)
-               failed = 1;
-
-       if (failed) {
+       if (failed || (*xfer_done && urb->status != -EINPROGRESS)) {
                dwc2_host_complete(hsotg, qtd, urb->status);
                dwc2_hcd_qtd_unlink_and_free(hsotg, qtd, qh);
-               dev_vdbg(hsotg->dev, "failed=%1x xfer_done=%1x status=%08x\n",
-                        failed, *xfer_done, urb->status);
+               dev_vdbg(hsotg->dev, "failed=%1x xfer_done=%1x\n",
+                        failed, *xfer_done);
                return failed;
        }
 
@@ -1236,21 +1233,23 @@ static void dwc2_complete_non_isoc_xfer_ddma(struct dwc2_hsotg *hsotg,
 
        list_for_each_safe(qtd_item, qtd_tmp, &qh->qtd_list) {
                int i;
+               int qtd_desc_count;
 
                qtd = list_entry(qtd_item, struct dwc2_qtd, qtd_list_entry);
                xfer_done = 0;
+               qtd_desc_count = qtd->n_desc;
 
-               for (i = 0; i < qtd->n_desc; i++) {
+               for (i = 0; i < qtd_desc_count; i++) {
                        if (dwc2_process_non_isoc_desc(hsotg, chan, chnum, qtd,
                                                       desc_num, halt_status,
-                                                      &xfer_done)) {
-                               qtd = NULL;
-                               break;
-                       }
+                                                      &xfer_done))
+                               goto stop_scan;
+
                        desc_num++;
                }
        }
 
+stop_scan:
        if (qh->ep_type != USB_ENDPOINT_XFER_CONTROL) {
                /*
                 * Resetting the data toggle for bulk and interrupt endpoints
@@ -1258,7 +1257,7 @@ static void dwc2_complete_non_isoc_xfer_ddma(struct dwc2_hsotg *hsotg,
                 */
                if (halt_status == DWC2_HC_XFER_STALL)
                        qh->data_toggle = DWC2_HC_PID_DATA0;
-               else if (qtd)
+               else
                        dwc2_hcd_save_data_toggle(hsotg, chan, chnum, qtd);
        }
 
index f825380..cadba8b 100644 (file)
@@ -525,11 +525,19 @@ void dwc2_hcd_save_data_toggle(struct dwc2_hsotg *hsotg,
        u32 pid = (hctsiz & TSIZ_SC_MC_PID_MASK) >> TSIZ_SC_MC_PID_SHIFT;
 
        if (chan->ep_type != USB_ENDPOINT_XFER_CONTROL) {
+               if (WARN(!chan || !chan->qh,
+                        "chan->qh must be specified for non-control eps\n"))
+                       return;
+
                if (pid == TSIZ_SC_MC_PID_DATA0)
                        chan->qh->data_toggle = DWC2_HC_PID_DATA0;
                else
                        chan->qh->data_toggle = DWC2_HC_PID_DATA1;
        } else {
+               if (WARN(!qtd,
+                        "qtd must be specified for control eps\n"))
+                       return;
+
                if (pid == TSIZ_SC_MC_PID_DATA0)
                        qtd->data_toggle = DWC2_HC_PID_DATA0;
                else
index 510f787..690b9fd 100644 (file)
@@ -530,7 +530,13 @@ static int dwc2_driver_probe(struct platform_device *dev)
        if (retval)
                return retval;
 
-       /* Reset the controller and detect hardware config values */
+       /*
+        * Reset before dwc2_get_hwparams() then it could get power-on real
+        * reset value form registers.
+        */
+       dwc2_core_reset_and_force_dr_mode(hsotg);
+
+       /* Detect config values from hardware */
        retval = dwc2_get_hwparams(hsotg);
        if (retval)
                goto error;
index 2913068..e4f8b90 100644 (file)
@@ -856,7 +856,6 @@ struct dwc3 {
        unsigned                pullups_connected:1;
        unsigned                resize_fifos:1;
        unsigned                setup_packet_pending:1;
-       unsigned                start_config_issued:1;
        unsigned                three_stage_setup:1;
        unsigned                usb3_lpm_capable:1;
 
index 3a9354a..8d6b75c 100644 (file)
@@ -555,7 +555,6 @@ static int dwc3_ep0_set_config(struct dwc3 *dwc, struct usb_ctrlrequest *ctrl)
        int ret;
        u32 reg;
 
-       dwc->start_config_issued = false;
        cfg = le16_to_cpu(ctrl->wValue);
 
        switch (state) {
@@ -737,10 +736,6 @@ static int dwc3_ep0_std_request(struct dwc3 *dwc, struct usb_ctrlrequest *ctrl)
                dwc3_trace(trace_dwc3_ep0, "USB_REQ_SET_ISOCH_DELAY");
                ret = dwc3_ep0_set_isoch_delay(dwc, ctrl);
                break;
-       case USB_REQ_SET_INTERFACE:
-               dwc3_trace(trace_dwc3_ep0, "USB_REQ_SET_INTERFACE");
-               dwc->start_config_issued = false;
-               /* Fall through */
        default:
                dwc3_trace(trace_dwc3_ep0, "Forwarding to gadget driver");
                ret = dwc3_ep0_delegate_req(dwc, ctrl);
index af023a8..2363bad 100644 (file)
@@ -385,24 +385,66 @@ static void dwc3_free_trb_pool(struct dwc3_ep *dep)
        dep->trb_pool_dma = 0;
 }
 
+static int dwc3_gadget_set_xfer_resource(struct dwc3 *dwc, struct dwc3_ep *dep);
+
+/**
+ * dwc3_gadget_start_config - Configure EP resources
+ * @dwc: pointer to our controller context structure
+ * @dep: endpoint that is being enabled
+ *
+ * The assignment of transfer resources cannot perfectly follow the
+ * data book due to the fact that the controller driver does not have
+ * all knowledge of the configuration in advance. It is given this
+ * information piecemeal by the composite gadget framework after every
+ * SET_CONFIGURATION and SET_INTERFACE. Trying to follow the databook
+ * programming model in this scenario can cause errors. For two
+ * reasons:
+ *
+ * 1) The databook says to do DEPSTARTCFG for every SET_CONFIGURATION
+ * and SET_INTERFACE (8.1.5). This is incorrect in the scenario of
+ * multiple interfaces.
+ *
+ * 2) The databook does not mention doing more DEPXFERCFG for new
+ * endpoint on alt setting (8.1.6).
+ *
+ * The following simplified method is used instead:
+ *
+ * All hardware endpoints can be assigned a transfer resource and this
+ * setting will stay persistent until either a core reset or
+ * hibernation. So whenever we do a DEPSTARTCFG(0) we can go ahead and
+ * do DEPXFERCFG for every hardware endpoint as well. We are
+ * guaranteed that there are as many transfer resources as endpoints.
+ *
+ * This function is called for each endpoint when it is being enabled
+ * but is triggered only when called for EP0-out, which always happens
+ * first, and which should only happen in one of the above conditions.
+ */
 static int dwc3_gadget_start_config(struct dwc3 *dwc, struct dwc3_ep *dep)
 {
        struct dwc3_gadget_ep_cmd_params params;
        u32                     cmd;
+       int                     i;
+       int                     ret;
+
+       if (dep->number)
+               return 0;
 
        memset(&params, 0x00, sizeof(params));
+       cmd = DWC3_DEPCMD_DEPSTARTCFG;
 
-       if (dep->number != 1) {
-               cmd = DWC3_DEPCMD_DEPSTARTCFG;
-               /* XferRscIdx == 0 for ep0 and 2 for the remaining */
-               if (dep->number > 1) {
-                       if (dwc->start_config_issued)
-                               return 0;
-                       dwc->start_config_issued = true;
-                       cmd |= DWC3_DEPCMD_PARAM(2);
-               }
+       ret = dwc3_send_gadget_ep_cmd(dwc, 0, cmd, &params);
+       if (ret)
+               return ret;
 
-               return dwc3_send_gadget_ep_cmd(dwc, 0, cmd, &params);
+       for (i = 0; i < DWC3_ENDPOINTS_NUM; i++) {
+               struct dwc3_ep *dep = dwc->eps[i];
+
+               if (!dep)
+                       continue;
+
+               ret = dwc3_gadget_set_xfer_resource(dwc, dep);
+               if (ret)
+                       return ret;
        }
 
        return 0;
@@ -516,10 +558,6 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep *dep,
                struct dwc3_trb *trb_st_hw;
                struct dwc3_trb *trb_link;
 
-               ret = dwc3_gadget_set_xfer_resource(dwc, dep);
-               if (ret)
-                       return ret;
-
                dep->endpoint.desc = desc;
                dep->comp_desc = comp_desc;
                dep->type = usb_endpoint_type(desc);
@@ -1636,8 +1674,6 @@ static int dwc3_gadget_start(struct usb_gadget *g,
        }
        dwc3_writel(dwc->regs, DWC3_DCFG, reg);
 
-       dwc->start_config_issued = false;
-
        /* Start with SuperSpeed Default */
        dwc3_gadget_ep0_desc.wMaxPacketSize = cpu_to_le16(512);
 
@@ -2237,7 +2273,6 @@ static void dwc3_gadget_disconnect_interrupt(struct dwc3 *dwc)
        dwc3_writel(dwc->regs, DWC3_DCTL, reg);
 
        dwc3_disconnect_gadget(dwc);
-       dwc->start_config_issued = false;
 
        dwc->gadget.speed = USB_SPEED_UNKNOWN;
        dwc->setup_packet_pending = false;
@@ -2288,7 +2323,6 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc)
 
        dwc3_stop_active_transfers(dwc);
        dwc3_clear_stall_all_ep(dwc);
-       dwc->start_config_issued = false;
 
        /* Reset device address to zero */
        reg = dwc3_readl(dwc->regs, DWC3_DCFG);
@@ -2789,6 +2823,7 @@ int dwc3_gadget_init(struct dwc3 *dwc)
        dwc->gadget.speed               = USB_SPEED_UNKNOWN;
        dwc->gadget.sg_supported        = true;
        dwc->gadget.name                = "dwc3-gadget";
+       dwc->gadget.is_otg              = dwc->dr_mode == USB_DR_MODE_OTG;
 
        /*
         * FIXME We might be setting max_speed to <SUPER, however versions
index 7e179f8..87fb0fd 100644 (file)
@@ -130,7 +130,8 @@ struct dev_data {
                                        setup_can_stall : 1,
                                        setup_out_ready : 1,
                                        setup_out_error : 1,
-                                       setup_abort : 1;
+                                       setup_abort : 1,
+                                       gadget_registered : 1;
        unsigned                        setup_wLength;
 
        /* the rest is basically write-once */
@@ -1179,7 +1180,8 @@ dev_release (struct inode *inode, struct file *fd)
 
        /* closing ep0 === shutdown all */
 
-       usb_gadget_unregister_driver (&gadgetfs_driver);
+       if (dev->gadget_registered)
+               usb_gadget_unregister_driver (&gadgetfs_driver);
 
        /* at this point "good" hardware has disconnected the
         * device from USB; the host won't see it any more.
@@ -1847,6 +1849,7 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
                 * kick in after the ep0 descriptor is closed.
                 */
                value = len;
+               dev->gadget_registered = true;
        }
        return value;
 
index 53c0692..93d28cb 100644 (file)
@@ -2340,7 +2340,7 @@ static struct qe_udc *qe_udc_config(struct platform_device *ofdev)
 {
        struct qe_udc *udc;
        struct device_node *np = ofdev->dev.of_node;
-       unsigned int tmp_addr = 0;
+       unsigned long tmp_addr = 0;
        struct usb_device_para __iomem *usbpram;
        unsigned int i;
        u64 size;
index 4dff60d..0d32052 100644 (file)
@@ -369,9 +369,20 @@ static inline void set_max_speed(struct net2280_ep *ep, u32 max)
        static const u32 ep_enhanced[9] = { 0x10, 0x60, 0x30, 0x80,
                                          0x50, 0x20, 0x70, 0x40, 0x90 };
 
-       if (ep->dev->enhanced_mode)
+       if (ep->dev->enhanced_mode) {
                reg = ep_enhanced[ep->num];
-       else{
+               switch (ep->dev->gadget.speed) {
+               case USB_SPEED_SUPER:
+                       reg += 2;
+                       break;
+               case USB_SPEED_FULL:
+                       reg += 1;
+                       break;
+               case USB_SPEED_HIGH:
+               default:
+                       break;
+               }
+       } else {
                reg = (ep->num + 1) * 0x10;
                if (ep->dev->gadget.speed != USB_SPEED_HIGH)
                        reg += 1;
index fd73a3e..b86a6f0 100644 (file)
@@ -413,9 +413,10 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
                if (!driver->udc_name || strcmp(driver->udc_name,
                                                dev_name(&udc->dev)) == 0) {
                        ret = udc_bind_to_driver(udc, driver);
+                       if (ret != -EPROBE_DEFER)
+                               list_del(&driver->pending);
                        if (ret)
                                goto err4;
-                       list_del(&driver->pending);
                        break;
                }
        }
index 04ce6b1..e0244fb 100644 (file)
@@ -112,12 +112,16 @@ static inline int xhci_find_next_ext_cap(void __iomem *base, u32 start, int id)
        offset = start;
        if (!start || start == XHCI_HCC_PARAMS_OFFSET) {
                val = readl(base + XHCI_HCC_PARAMS_OFFSET);
+               if (val == ~0)
+                       return 0;
                offset = XHCI_HCC_EXT_CAPS(val) << 2;
                if (!offset)
                        return 0;
        };
        do {
                val = readl(base + offset);
+               if (val == ~0)
+                       return 0;
                if (XHCI_EXT_CAPS_ID(val) == id && offset != start)
                        return offset;
 
index c30de7c..73f763c 100644 (file)
@@ -275,8 +275,9 @@ static bool need_bw_sch(struct usb_host_endpoint *ep,
                return false;
 
        /*
-        * for LS & FS periodic endpoints which its device don't attach
-        * to TT are also ignored, root-hub will schedule them directly
+        * for LS & FS periodic endpoints which its device is not behind
+        * a TT are also ignored, root-hub will schedule them directly,
+        * but need set @bpkts field of endpoint context to 1.
         */
        if (is_fs_or_ls(speed) && !has_tt)
                return false;
@@ -339,8 +340,17 @@ int xhci_mtk_add_ep_quirk(struct usb_hcd *hcd, struct usb_device *udev,
                GET_MAX_PACKET(usb_endpoint_maxp(&ep->desc)),
                usb_endpoint_dir_in(&ep->desc), ep);
 
-       if (!need_bw_sch(ep, udev->speed, slot_ctx->tt_info & TT_SLOT))
+       if (!need_bw_sch(ep, udev->speed, slot_ctx->tt_info & TT_SLOT)) {
+               /*
+                * set @bpkts to 1 if it is LS or FS periodic endpoint, and its
+                * device does not connected through an external HS hub
+                */
+               if (usb_endpoint_xfer_int(&ep->desc)
+                       || usb_endpoint_xfer_isoc(&ep->desc))
+                       ep_ctx->reserved[0] |= cpu_to_le32(EP_BPKTS(1));
+
                return 0;
+       }
 
        bw_index = get_bw_index(xhci, udev, ep);
        sch_bw = &sch_array[bw_index];
index c9ab6a4..9532f5a 100644 (file)
@@ -696,9 +696,24 @@ static int xhci_mtk_remove(struct platform_device *dev)
 }
 
 #ifdef CONFIG_PM_SLEEP
+/*
+ * if ip sleep fails, and all clocks are disabled, access register will hang
+ * AHB bus, so stop polling roothubs to avoid regs access on bus suspend.
+ * and no need to check whether ip sleep failed or not; this will cause SPM
+ * to wake up system immediately after system suspend complete if ip sleep
+ * fails, it is what we wanted.
+ */
 static int xhci_mtk_suspend(struct device *dev)
 {
        struct xhci_hcd_mtk *mtk = dev_get_drvdata(dev);
+       struct usb_hcd *hcd = mtk->hcd;
+       struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+
+       xhci_dbg(xhci, "%s: stop port polling\n", __func__);
+       clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
+       del_timer_sync(&hcd->rh_timer);
+       clear_bit(HCD_FLAG_POLL_RH, &xhci->shared_hcd->flags);
+       del_timer_sync(&xhci->shared_hcd->rh_timer);
 
        xhci_mtk_host_disable(mtk);
        xhci_mtk_phy_power_off(mtk);
@@ -710,11 +725,19 @@ static int xhci_mtk_suspend(struct device *dev)
 static int xhci_mtk_resume(struct device *dev)
 {
        struct xhci_hcd_mtk *mtk = dev_get_drvdata(dev);
+       struct usb_hcd *hcd = mtk->hcd;
+       struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 
        usb_wakeup_disable(mtk);
        xhci_mtk_clks_enable(mtk);
        xhci_mtk_phy_power_on(mtk);
        xhci_mtk_host_enable(mtk);
+
+       xhci_dbg(xhci, "%s: restart port polling\n", __func__);
+       set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
+       usb_hcd_poll_rh_status(hcd);
+       set_bit(HCD_FLAG_POLL_RH, &xhci->shared_hcd->flags);
+       usb_hcd_poll_rh_status(xhci->shared_hcd);
        return 0;
 }
 
index 58c43ed..f0640b7 100644 (file)
@@ -28,7 +28,9 @@
 #include "xhci.h"
 #include "xhci-trace.h"
 
-#define PORT2_SSIC_CONFIG_REG2 0x883c
+#define SSIC_PORT_NUM          2
+#define SSIC_PORT_CFG2         0x880c
+#define SSIC_PORT_CFG2_OFFSET  0x30
 #define PROG_DONE              (1 << 30)
 #define SSIC_PORT_UNUSED       (1 << 31)
 
@@ -45,6 +47,7 @@
 #define PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI            0x22b5
 #define PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_XHCI                0xa12f
 #define PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI       0x9d2f
+#define PCI_DEVICE_ID_INTEL_BROXTON_M_XHCI             0x0aa8
 
 static const char hcd_name[] = "xhci_hcd";
 
@@ -151,9 +154,14 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
        if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
                (pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI ||
                 pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_XHCI ||
-                pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI)) {
+                pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI ||
+                pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_M_XHCI)) {
                xhci->quirks |= XHCI_PME_STUCK_QUIRK;
        }
+       if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
+                pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI) {
+               xhci->quirks |= XHCI_SSIC_PORT_UNUSED;
+       }
        if (pdev->vendor == PCI_VENDOR_ID_ETRON &&
                        pdev->device == PCI_DEVICE_ID_EJ168) {
                xhci->quirks |= XHCI_RESET_ON_RESUME;
@@ -312,22 +320,20 @@ static void xhci_pci_remove(struct pci_dev *dev)
  * SSIC PORT need to be marked as "unused" before putting xHCI
  * into D3. After D3 exit, the SSIC port need to be marked as "used".
  * Without this change, xHCI might not enter D3 state.
- * Make sure PME works on some Intel xHCI controllers by writing 1 to clear
- * the Internal PME flag bit in vendor specific PMCTRL register at offset 0x80a4
  */
-static void xhci_pme_quirk(struct usb_hcd *hcd, bool suspend)
+static void xhci_ssic_port_unused_quirk(struct usb_hcd *hcd, bool suspend)
 {
        struct xhci_hcd *xhci = hcd_to_xhci(hcd);
-       struct pci_dev          *pdev = to_pci_dev(hcd->self.controller);
        u32 val;
        void __iomem *reg;
+       int i;
 
-       if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
-                pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI) {
-
-               reg = (void __iomem *) xhci->cap_regs + PORT2_SSIC_CONFIG_REG2;
+       for (i = 0; i < SSIC_PORT_NUM; i++) {
+               reg = (void __iomem *) xhci->cap_regs +
+                               SSIC_PORT_CFG2 +
+                               i * SSIC_PORT_CFG2_OFFSET;
 
-               /* Notify SSIC that SSIC profile programming is not done */
+               /* Notify SSIC that SSIC profile programming is not done. */
                val = readl(reg) & ~PROG_DONE;
                writel(val, reg);
 
@@ -344,6 +350,17 @@ static void xhci_pme_quirk(struct usb_hcd *hcd, bool suspend)
                writel(val, reg);
                readl(reg);
        }
+}
+
+/*
+ * Make sure PME works on some Intel xHCI controllers by writing 1 to clear
+ * the Internal PME flag bit in vendor specific PMCTRL register at offset 0x80a4
+ */
+static void xhci_pme_quirk(struct usb_hcd *hcd)
+{
+       struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+       void __iomem *reg;
+       u32 val;
 
        reg = (void __iomem *) xhci->cap_regs + 0x80a4;
        val = readl(reg);
@@ -355,6 +372,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 {
        struct xhci_hcd *xhci = hcd_to_xhci(hcd);
        struct pci_dev          *pdev = to_pci_dev(hcd->self.controller);
+       int                     ret;
 
        /*
         * Systems with the TI redriver that loses port status change events
@@ -364,9 +382,16 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
                pdev->no_d3cold = true;
 
        if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
-               xhci_pme_quirk(hcd, true);
+               xhci_pme_quirk(hcd);
+
+       if (xhci->quirks & XHCI_SSIC_PORT_UNUSED)
+               xhci_ssic_port_unused_quirk(hcd, true);
 
-       return xhci_suspend(xhci, do_wakeup);
+       ret = xhci_suspend(xhci, do_wakeup);
+       if (ret && (xhci->quirks & XHCI_SSIC_PORT_UNUSED))
+               xhci_ssic_port_unused_quirk(hcd, false);
+
+       return ret;
 }
 
 static int xhci_pci_resume(struct usb_hcd *hcd, bool hibernated)
@@ -396,8 +421,11 @@ static int xhci_pci_resume(struct usb_hcd *hcd, bool hibernated)
        if (pdev->vendor == PCI_VENDOR_ID_INTEL)
                usb_enable_intel_xhci_ports(pdev);
 
+       if (xhci->quirks & XHCI_SSIC_PORT_UNUSED)
+               xhci_ssic_port_unused_quirk(hcd, false);
+
        if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
-               xhci_pme_quirk(hcd, false);
+               xhci_pme_quirk(hcd);
 
        retval = xhci_resume(xhci, hibernated);
        return retval;
index 770b6b0..d39d6bf 100644 (file)
@@ -184,7 +184,8 @@ static int xhci_plat_probe(struct platform_device *pdev)
                struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
 
                /* Just copy data for now */
-               *priv = *priv_match;
+               if (priv_match)
+                       *priv = *priv_match;
        }
 
        if (xhci_plat_type_is(hcd, XHCI_PLAT_TYPE_MARVELL_ARMADA)) {
index f1c21c4..3915657 100644 (file)
@@ -2193,10 +2193,6 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td,
                }
        /* Fast path - was this the last TRB in the TD for this URB? */
        } else if (event_trb == td->last_trb) {
-               if (td->urb_length_set && trb_comp_code == COMP_SHORT_TX)
-                       return finish_td(xhci, td, event_trb, event, ep,
-                                        status, false);
-
                if (EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) {
                        td->urb->actual_length =
                                td->urb->transfer_buffer_length -
@@ -2248,12 +2244,6 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td,
                        td->urb->actual_length +=
                                TRB_LEN(le32_to_cpu(cur_trb->generic.field[2])) -
                                EVENT_TRB_LEN(le32_to_cpu(event->transfer_len));
-
-               if (trb_comp_code == COMP_SHORT_TX) {
-                       xhci_dbg(xhci, "mid bulk/intr SP, wait for last TRB event\n");
-                       td->urb_length_set = true;
-                       return 0;
-               }
        }
 
        return finish_td(xhci, td, event_trb, event, ep, status, false);
index 26a44c0..0c8087d 100644 (file)
@@ -1554,7 +1554,9 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
                xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
                                "HW died, freeing TD.");
                urb_priv = urb->hcpriv;
-               for (i = urb_priv->td_cnt; i < urb_priv->length; i++) {
+               for (i = urb_priv->td_cnt;
+                    i < urb_priv->length && xhci->devs[urb->dev->slot_id];
+                    i++) {
                        td = urb_priv->td[i];
                        if (!list_empty(&td->td_list))
                                list_del_init(&td->td_list);
index 9be7348..cc65138 100644 (file)
@@ -1631,6 +1631,7 @@ struct xhci_hcd {
 #define XHCI_BROKEN_STREAMS    (1 << 19)
 #define XHCI_PME_STUCK_QUIRK   (1 << 20)
 #define XHCI_MTK_HOST          (1 << 21)
+#define XHCI_SSIC_PORT_UNUSED  (1 << 22)
        unsigned int            num_active_eps;
        unsigned int            limit_active_eps;
        /* There are two roothubs to keep track of bus suspend info for */
index 795a45b..58487a4 100644 (file)
@@ -662,7 +662,7 @@ static int musb_tx_dma_set_mode_mentor(struct dma_controller *dma,
                csr &= ~(MUSB_TXCSR_AUTOSET | MUSB_TXCSR_DMAMODE);
                csr |= MUSB_TXCSR_DMAENAB; /* against programmer's guide */
        }
-       channel->desired_mode = mode;
+       channel->desired_mode = *mode;
        musb_writew(epio, MUSB_TXCSR, csr);
 
        return 0;
@@ -2003,10 +2003,8 @@ void musb_host_rx(struct musb *musb, u8 epnum)
                                qh->offset,
                                urb->transfer_buffer_length);
 
-                       done = musb_rx_dma_in_inventra_cppi41(c, hw_ep, qh,
-                                                             urb, xfer_len,
-                                                             iso_err);
-                       if (done)
+                       if (musb_rx_dma_in_inventra_cppi41(c, hw_ep, qh, urb,
+                                                          xfer_len, iso_err))
                                goto finish;
                        else
                                dev_err(musb->controller, "error: rx_dma failed\n");
index b2685e7..3eaa4ba 100644 (file)
@@ -348,7 +348,9 @@ static int ux500_suspend(struct device *dev)
        struct ux500_glue       *glue = dev_get_drvdata(dev);
        struct musb             *musb = glue_to_musb(glue);
 
-       usb_phy_set_suspend(musb->xceiv, 1);
+       if (musb)
+               usb_phy_set_suspend(musb->xceiv, 1);
+
        clk_disable_unprepare(glue->clk);
 
        return 0;
@@ -366,7 +368,8 @@ static int ux500_resume(struct device *dev)
                return ret;
        }
 
-       usb_phy_set_suspend(musb->xceiv, 0);
+       if (musb)
+               usb_phy_set_suspend(musb->xceiv, 0);
 
        return 0;
 }
index 0d19a6d..72b387d 100644 (file)
@@ -757,14 +757,8 @@ static int msm_otg_set_host(struct usb_otg *otg, struct usb_bus *host)
        otg->host = host;
        dev_dbg(otg->usb_phy->dev, "host driver registered w/ tranceiver\n");
 
-       /*
-        * Kick the state machine work, if peripheral is not supported
-        * or peripheral is already registered with us.
-        */
-       if (motg->pdata->mode == USB_DR_MODE_HOST || otg->gadget) {
-               pm_runtime_get_sync(otg->usb_phy->dev);
-               schedule_work(&motg->sm_work);
-       }
+       pm_runtime_get_sync(otg->usb_phy->dev);
+       schedule_work(&motg->sm_work);
 
        return 0;
 }
@@ -827,14 +821,8 @@ static int msm_otg_set_peripheral(struct usb_otg *otg,
        dev_dbg(otg->usb_phy->dev,
                "peripheral driver registered w/ tranceiver\n");
 
-       /*
-        * Kick the state machine work, if host is not supported
-        * or host is already registered with us.
-        */
-       if (motg->pdata->mode == USB_DR_MODE_PERIPHERAL || otg->host) {
-               pm_runtime_get_sync(otg->usb_phy->dev);
-               schedule_work(&motg->sm_work);
-       }
+       pm_runtime_get_sync(otg->usb_phy->dev);
+       schedule_work(&motg->sm_work);
 
        return 0;
 }
@@ -1599,6 +1587,8 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
                                                &motg->id.nb);
                if (ret < 0) {
                        dev_err(&pdev->dev, "register ID notifier failed\n");
+                       extcon_unregister_notifier(motg->vbus.extcon,
+                                                  EXTCON_USB, &motg->vbus.nb);
                        return ret;
                }
 
@@ -1660,15 +1650,6 @@ static int msm_otg_probe(struct platform_device *pdev)
        if (!motg)
                return -ENOMEM;
 
-       pdata = dev_get_platdata(&pdev->dev);
-       if (!pdata) {
-               if (!np)
-                       return -ENXIO;
-               ret = msm_otg_read_dt(pdev, motg);
-               if (ret)
-                       return ret;
-       }
-
        motg->phy.otg = devm_kzalloc(&pdev->dev, sizeof(struct usb_otg),
                                     GFP_KERNEL);
        if (!motg->phy.otg)
@@ -1710,6 +1691,15 @@ static int msm_otg_probe(struct platform_device *pdev)
        if (!motg->regs)
                return -ENOMEM;
 
+       pdata = dev_get_platdata(&pdev->dev);
+       if (!pdata) {
+               if (!np)
+                       return -ENXIO;
+               ret = msm_otg_read_dt(pdev, motg);
+               if (ret)
+                       return ret;
+       }
+
        /*
         * NOTE: The PHYs can be multiplexed between the chipidea controller
         * and the dwc3 controller, using a single bit. It is important that
@@ -1717,8 +1707,10 @@ static int msm_otg_probe(struct platform_device *pdev)
         */
        if (motg->phy_number) {
                phy_select = devm_ioremap_nocache(&pdev->dev, USB2_PHY_SEL, 4);
-               if (!phy_select)
-                       return -ENOMEM;
+               if (!phy_select) {
+                       ret = -ENOMEM;
+                       goto unregister_extcon;
+               }
                /* Enable second PHY with the OTG port */
                writel(0x1, phy_select);
        }
@@ -1728,7 +1720,8 @@ static int msm_otg_probe(struct platform_device *pdev)
        motg->irq = platform_get_irq(pdev, 0);
        if (motg->irq < 0) {
                dev_err(&pdev->dev, "platform_get_irq failed\n");
-               return motg->irq;
+               ret = motg->irq;
+               goto unregister_extcon;
        }
 
        regs[0].supply = "vddcx";
@@ -1737,7 +1730,7 @@ static int msm_otg_probe(struct platform_device *pdev)
 
        ret = devm_regulator_bulk_get(motg->phy.dev, ARRAY_SIZE(regs), regs);
        if (ret)
-               return ret;
+               goto unregister_extcon;
 
        motg->vddcx = regs[0].consumer;
        motg->v3p3  = regs[1].consumer;
@@ -1834,6 +1827,12 @@ disable_clks:
        clk_disable_unprepare(motg->clk);
        if (!IS_ERR(motg->core_clk))
                clk_disable_unprepare(motg->core_clk);
+unregister_extcon:
+       extcon_unregister_notifier(motg->id.extcon,
+                                  EXTCON_USB_HOST, &motg->id.nb);
+       extcon_unregister_notifier(motg->vbus.extcon,
+                                  EXTCON_USB, &motg->vbus.nb);
+
        return ret;
 }
 
index c2936dc..00bfea0 100644 (file)
@@ -220,7 +220,7 @@ static int mxs_phy_hw_init(struct mxs_phy *mxs_phy)
 /* Return true if the vbus is there */
 static bool mxs_phy_get_vbus_status(struct mxs_phy *mxs_phy)
 {
-       unsigned int vbus_value;
+       unsigned int vbus_value = 0;
 
        if (!mxs_phy->regmap_anatop)
                return false;
index f612dda..56ecb8b 100644 (file)
@@ -475,22 +475,6 @@ config USB_SERIAL_MOS7840
          To compile this driver as a module, choose M here: the
          module will be called mos7840.  If unsure, choose N.
 
-config USB_SERIAL_MXUPORT11
-       tristate "USB Moxa UPORT 11x0 Serial Driver"
-       ---help---
-         Say Y here if you want to use a MOXA UPort 11x0 Serial hub.
-
-         This driver supports:
-
-         - UPort 1110  : 1 port RS-232 USB to Serial Hub.
-         - UPort 1130  : 1 port RS-422/485 USB to Serial Hub.
-         - UPort 1130I : 1 port RS-422/485 USB to Serial Hub with Isolation.
-         - UPort 1150  : 1 port RS-232/422/485 USB to Serial Hub.
-         - UPort 1150I : 1 port RS-232/422/485 USB to Serial Hub with Isolation.
-
-         To compile this driver as a module, choose M here: the
-         module will be called mxu11x0.
-
 config USB_SERIAL_MXUPORT
        tristate "USB Moxa UPORT Serial Driver"
        ---help---
index f3fa5e5..349d9df 100644 (file)
@@ -38,7 +38,6 @@ obj-$(CONFIG_USB_SERIAL_METRO)                        += metro-usb.o
 obj-$(CONFIG_USB_SERIAL_MOS7720)               += mos7720.o
 obj-$(CONFIG_USB_SERIAL_MOS7840)               += mos7840.o
 obj-$(CONFIG_USB_SERIAL_MXUPORT)               += mxuport.o
-obj-$(CONFIG_USB_SERIAL_MXUPORT11)             += mxu11x0.o
 obj-$(CONFIG_USB_SERIAL_NAVMAN)                        += navman.o
 obj-$(CONFIG_USB_SERIAL_OMNINET)               += omninet.o
 obj-$(CONFIG_USB_SERIAL_OPTICON)               += opticon.o
index 9b90ad7..73a366d 100644 (file)
@@ -99,6 +99,7 @@ static const struct usb_device_id id_table[] = {
        { USB_DEVICE(0x10C4, 0x81AC) }, /* MSD Dash Hawk */
        { USB_DEVICE(0x10C4, 0x81AD) }, /* INSYS USB Modem */
        { USB_DEVICE(0x10C4, 0x81C8) }, /* Lipowsky Industrie Elektronik GmbH, Baby-JTAG */
+       { USB_DEVICE(0x10C4, 0x81D7) }, /* IAI Corp. RCB-CV-USB USB to RS485 Adaptor */
        { USB_DEVICE(0x10C4, 0x81E2) }, /* Lipowsky Industrie Elektronik GmbH, Baby-LIN */
        { USB_DEVICE(0x10C4, 0x81E7) }, /* Aerocomm Radio */
        { USB_DEVICE(0x10C4, 0x81E8) }, /* Zephyr Bioharness */
@@ -162,6 +163,9 @@ static const struct usb_device_id id_table[] = {
        { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */
        { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
        { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */
+       { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */
+       { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */
+       { USB_DEVICE(0x19CF, 0x3000) }, /* Parrot NMEA GPS Flight Recorder */
        { USB_DEVICE(0x1ADB, 0x0001) }, /* Schweitzer Engineering C662 Cable */
        { USB_DEVICE(0x1B1C, 0x1C00) }, /* Corsair USB Dongle */
        { USB_DEVICE(0x1BA4, 0x0002) }, /* Silicon Labs 358x factory default */
index a5a0376..8c660ae 100644 (file)
@@ -824,6 +824,7 @@ static const struct usb_device_id id_table_combined[] = {
        { USB_DEVICE(FTDI_VID, FTDI_TURTELIZER_PID),
                .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
        { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID_USB60F) },
+       { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID_SCU18) },
        { USB_DEVICE(FTDI_VID, FTDI_REU_TINY_PID) },
 
        /* Papouch devices based on FTDI chip */
index 67c6d44..a84df25 100644 (file)
  */
 #define RATOC_VENDOR_ID                0x0584
 #define RATOC_PRODUCT_ID_USB60F        0xb020
+#define RATOC_PRODUCT_ID_SCU18 0xb03a
 
 /*
  * Infineon Technologies
diff --git a/drivers/usb/serial/mxu11x0.c b/drivers/usb/serial/mxu11x0.c
deleted file mode 100644 (file)
index e3c3f57..0000000
+++ /dev/null
@@ -1,986 +0,0 @@
-/*
- * USB Moxa UPORT 11x0 Serial Driver
- *
- * Copyright (C) 2007 MOXA Technologies Co., Ltd.
- * Copyright (C) 2015 Mathieu Othacehe <m.othacehe@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- *
- * Supports the following Moxa USB to serial converters:
- *  UPort 1110,  1 port RS-232 USB to Serial Hub.
- *  UPort 1130,  1 port RS-422/485 USB to Serial Hub.
- *  UPort 1130I, 1 port RS-422/485 USB to Serial Hub with isolation
- *    protection.
- *  UPort 1150,  1 port RS-232/422/485 USB to Serial Hub.
- *  UPort 1150I, 1 port RS-232/422/485 USB to Serial Hub with isolation
- *  protection.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/firmware.h>
-#include <linux/jiffies.h>
-#include <linux/serial.h>
-#include <linux/serial_reg.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
-#include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/tty_flip.h>
-#include <linux/uaccess.h>
-#include <linux/usb.h>
-#include <linux/usb/serial.h>
-
-/* Vendor and product ids */
-#define MXU1_VENDOR_ID                         0x110a
-#define MXU1_1110_PRODUCT_ID                   0x1110
-#define MXU1_1130_PRODUCT_ID                   0x1130
-#define MXU1_1150_PRODUCT_ID                   0x1150
-#define MXU1_1151_PRODUCT_ID                   0x1151
-#define MXU1_1131_PRODUCT_ID                   0x1131
-
-/* Commands */
-#define MXU1_GET_VERSION                       0x01
-#define MXU1_GET_PORT_STATUS                   0x02
-#define MXU1_GET_PORT_DEV_INFO                 0x03
-#define MXU1_GET_CONFIG                                0x04
-#define MXU1_SET_CONFIG                                0x05
-#define MXU1_OPEN_PORT                         0x06
-#define MXU1_CLOSE_PORT                                0x07
-#define MXU1_START_PORT                                0x08
-#define MXU1_STOP_PORT                         0x09
-#define MXU1_TEST_PORT                         0x0A
-#define MXU1_PURGE_PORT                                0x0B
-#define MXU1_RESET_EXT_DEVICE                  0x0C
-#define MXU1_GET_OUTQUEUE                      0x0D
-#define MXU1_WRITE_DATA                                0x80
-#define MXU1_READ_DATA                         0x81
-#define MXU1_REQ_TYPE_CLASS                    0x82
-
-/* Module identifiers */
-#define MXU1_I2C_PORT                          0x01
-#define MXU1_IEEE1284_PORT                     0x02
-#define MXU1_UART1_PORT                                0x03
-#define MXU1_UART2_PORT                                0x04
-#define MXU1_RAM_PORT                          0x05
-
-/* Modem status */
-#define MXU1_MSR_DELTA_CTS                     0x01
-#define MXU1_MSR_DELTA_DSR                     0x02
-#define MXU1_MSR_DELTA_RI                      0x04
-#define MXU1_MSR_DELTA_CD                      0x08
-#define MXU1_MSR_CTS                           0x10
-#define MXU1_MSR_DSR                           0x20
-#define MXU1_MSR_RI                            0x40
-#define MXU1_MSR_CD                            0x80
-#define MXU1_MSR_DELTA_MASK                    0x0F
-#define MXU1_MSR_MASK                          0xF0
-
-/* Line status */
-#define MXU1_LSR_OVERRUN_ERROR                 0x01
-#define MXU1_LSR_PARITY_ERROR                  0x02
-#define MXU1_LSR_FRAMING_ERROR                 0x04
-#define MXU1_LSR_BREAK                         0x08
-#define MXU1_LSR_ERROR                         0x0F
-#define MXU1_LSR_RX_FULL                       0x10
-#define MXU1_LSR_TX_EMPTY                      0x20
-
-/* Modem control */
-#define MXU1_MCR_LOOP                          0x04
-#define MXU1_MCR_DTR                           0x10
-#define MXU1_MCR_RTS                           0x20
-
-/* Mask settings */
-#define MXU1_UART_ENABLE_RTS_IN                        0x0001
-#define MXU1_UART_DISABLE_RTS                  0x0002
-#define MXU1_UART_ENABLE_PARITY_CHECKING       0x0008
-#define MXU1_UART_ENABLE_DSR_OUT               0x0010
-#define MXU1_UART_ENABLE_CTS_OUT               0x0020
-#define MXU1_UART_ENABLE_X_OUT                 0x0040
-#define MXU1_UART_ENABLE_XA_OUT                        0x0080
-#define MXU1_UART_ENABLE_X_IN                  0x0100
-#define MXU1_UART_ENABLE_DTR_IN                        0x0800
-#define MXU1_UART_DISABLE_DTR                  0x1000
-#define MXU1_UART_ENABLE_MS_INTS               0x2000
-#define MXU1_UART_ENABLE_AUTO_START_DMA                0x4000
-#define MXU1_UART_SEND_BREAK_SIGNAL            0x8000
-
-/* Parity */
-#define MXU1_UART_NO_PARITY                    0x00
-#define MXU1_UART_ODD_PARITY                   0x01
-#define MXU1_UART_EVEN_PARITY                  0x02
-#define MXU1_UART_MARK_PARITY                  0x03
-#define MXU1_UART_SPACE_PARITY                 0x04
-
-/* Stop bits */
-#define MXU1_UART_1_STOP_BITS                  0x00
-#define MXU1_UART_1_5_STOP_BITS                        0x01
-#define MXU1_UART_2_STOP_BITS                  0x02
-
-/* Bits per character */
-#define MXU1_UART_5_DATA_BITS                  0x00
-#define MXU1_UART_6_DATA_BITS                  0x01
-#define MXU1_UART_7_DATA_BITS                  0x02
-#define MXU1_UART_8_DATA_BITS                  0x03
-
-/* Operation modes */
-#define MXU1_UART_232                          0x00
-#define MXU1_UART_485_RECEIVER_DISABLED                0x01
-#define MXU1_UART_485_RECEIVER_ENABLED         0x02
-
-/* Pipe transfer mode and timeout */
-#define MXU1_PIPE_MODE_CONTINUOUS              0x01
-#define MXU1_PIPE_MODE_MASK                    0x03
-#define MXU1_PIPE_TIMEOUT_MASK                 0x7C
-#define MXU1_PIPE_TIMEOUT_ENABLE               0x80
-
-/* Config struct */
-struct mxu1_uart_config {
-       __be16  wBaudRate;
-       __be16  wFlags;
-       u8      bDataBits;
-       u8      bParity;
-       u8      bStopBits;
-       char    cXon;
-       char    cXoff;
-       u8      bUartMode;
-} __packed;
-
-/* Purge modes */
-#define MXU1_PURGE_OUTPUT                      0x00
-#define MXU1_PURGE_INPUT                       0x80
-
-/* Read/Write data */
-#define MXU1_RW_DATA_ADDR_SFR                  0x10
-#define MXU1_RW_DATA_ADDR_IDATA                        0x20
-#define MXU1_RW_DATA_ADDR_XDATA                        0x30
-#define MXU1_RW_DATA_ADDR_CODE                 0x40
-#define MXU1_RW_DATA_ADDR_GPIO                 0x50
-#define MXU1_RW_DATA_ADDR_I2C                  0x60
-#define MXU1_RW_DATA_ADDR_FLASH                        0x70
-#define MXU1_RW_DATA_ADDR_DSP                  0x80
-
-#define MXU1_RW_DATA_UNSPECIFIED               0x00
-#define MXU1_RW_DATA_BYTE                      0x01
-#define MXU1_RW_DATA_WORD                      0x02
-#define MXU1_RW_DATA_DOUBLE_WORD               0x04
-
-struct mxu1_write_data_bytes {
-       u8      bAddrType;
-       u8      bDataType;
-       u8      bDataCounter;
-       __be16  wBaseAddrHi;
-       __be16  wBaseAddrLo;
-       u8      bData[0];
-} __packed;
-
-/* Interrupt codes */
-#define MXU1_CODE_HARDWARE_ERROR               0xFF
-#define MXU1_CODE_DATA_ERROR                   0x03
-#define MXU1_CODE_MODEM_STATUS                 0x04
-
-static inline int mxu1_get_func_from_code(unsigned char code)
-{
-       return code & 0x0f;
-}
-
-/* Download firmware max packet size */
-#define MXU1_DOWNLOAD_MAX_PACKET_SIZE          64
-
-/* Firmware image header */
-struct mxu1_firmware_header {
-       __le16 wLength;
-       u8 bCheckSum;
-} __packed;
-
-#define MXU1_UART_BASE_ADDR        0xFFA0
-#define MXU1_UART_OFFSET_MCR       0x0004
-
-#define MXU1_BAUD_BASE              923077
-
-#define MXU1_TRANSFER_TIMEOUT      2
-#define MXU1_DOWNLOAD_TIMEOUT       1000
-#define MXU1_DEFAULT_CLOSING_WAIT   4000 /* in .01 secs */
-
-struct mxu1_port {
-       u8 msr;
-       u8 mcr;
-       u8 uart_mode;
-       spinlock_t spinlock; /* Protects msr */
-       struct mutex mutex; /* Protects mcr */
-       bool send_break;
-};
-
-struct mxu1_device {
-       u16 mxd_model;
-};
-
-static const struct usb_device_id mxu1_idtable[] = {
-       { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1110_PRODUCT_ID) },
-       { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1130_PRODUCT_ID) },
-       { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1150_PRODUCT_ID) },
-       { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1151_PRODUCT_ID) },
-       { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1131_PRODUCT_ID) },
-       { }
-};
-
-MODULE_DEVICE_TABLE(usb, mxu1_idtable);
-
-/* Write the given buffer out to the control pipe.  */
-static int mxu1_send_ctrl_data_urb(struct usb_serial *serial,
-                                  u8 request,
-                                  u16 value, u16 index,
-                                  void *data, size_t size)
-{
-       int status;
-
-       status = usb_control_msg(serial->dev,
-                                usb_sndctrlpipe(serial->dev, 0),
-                                request,
-                                (USB_DIR_OUT | USB_TYPE_VENDOR |
-                                 USB_RECIP_DEVICE), value, index,
-                                data, size,
-                                USB_CTRL_SET_TIMEOUT);
-       if (status < 0) {
-               dev_err(&serial->interface->dev,
-                       "%s - usb_control_msg failed: %d\n",
-                       __func__, status);
-               return status;
-       }
-
-       if (status != size) {
-               dev_err(&serial->interface->dev,
-                       "%s - short write (%d / %zd)\n",
-                       __func__, status, size);
-               return -EIO;
-       }
-
-       return 0;
-}
-
-/* Send a vendor request without any data */
-static int mxu1_send_ctrl_urb(struct usb_serial *serial,
-                             u8 request, u16 value, u16 index)
-{
-       return mxu1_send_ctrl_data_urb(serial, request, value, index,
-                                      NULL, 0);
-}
-
-static int mxu1_download_firmware(struct usb_serial *serial,
-                                 const struct firmware *fw_p)
-{
-       int status = 0;
-       int buffer_size;
-       int pos;
-       int len;
-       int done;
-       u8 cs = 0;
-       u8 *buffer;
-       struct usb_device *dev = serial->dev;
-       struct mxu1_firmware_header *header;
-       unsigned int pipe;
-
-       pipe = usb_sndbulkpipe(dev, serial->port[0]->bulk_out_endpointAddress);
-
-       buffer_size = fw_p->size + sizeof(*header);
-       buffer = kmalloc(buffer_size, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
-       memcpy(buffer, fw_p->data, fw_p->size);
-       memset(buffer + fw_p->size, 0xff, buffer_size - fw_p->size);
-
-       for (pos = sizeof(*header); pos < buffer_size; pos++)
-               cs = (u8)(cs + buffer[pos]);
-
-       header = (struct mxu1_firmware_header *)buffer;
-       header->wLength = cpu_to_le16(buffer_size - sizeof(*header));
-       header->bCheckSum = cs;
-
-       dev_dbg(&dev->dev, "%s - downloading firmware\n", __func__);
-
-       for (pos = 0; pos < buffer_size; pos += done) {
-               len = min(buffer_size - pos, MXU1_DOWNLOAD_MAX_PACKET_SIZE);
-
-               status = usb_bulk_msg(dev, pipe, buffer + pos, len, &done,
-                               MXU1_DOWNLOAD_TIMEOUT);
-               if (status)
-                       break;
-       }
-
-       kfree(buffer);
-
-       if (status) {
-               dev_err(&dev->dev, "failed to download firmware: %d\n", status);
-               return status;
-       }
-
-       msleep_interruptible(100);
-       usb_reset_device(dev);
-
-       dev_dbg(&dev->dev, "%s - download successful\n", __func__);
-
-       return 0;
-}
-
-static int mxu1_port_probe(struct usb_serial_port *port)
-{
-       struct mxu1_port *mxport;
-       struct mxu1_device *mxdev;
-
-       if (!port->interrupt_in_urb) {
-               dev_err(&port->dev, "no interrupt urb\n");
-               return -ENODEV;
-       }
-
-       mxport = kzalloc(sizeof(struct mxu1_port), GFP_KERNEL);
-       if (!mxport)
-               return -ENOMEM;
-
-       spin_lock_init(&mxport->spinlock);
-       mutex_init(&mxport->mutex);
-
-       mxdev = usb_get_serial_data(port->serial);
-
-       switch (mxdev->mxd_model) {
-       case MXU1_1110_PRODUCT_ID:
-       case MXU1_1150_PRODUCT_ID:
-       case MXU1_1151_PRODUCT_ID:
-               mxport->uart_mode = MXU1_UART_232;
-               break;
-       case MXU1_1130_PRODUCT_ID:
-       case MXU1_1131_PRODUCT_ID:
-               mxport->uart_mode = MXU1_UART_485_RECEIVER_DISABLED;
-               break;
-       }
-
-       usb_set_serial_port_data(port, mxport);
-
-       port->port.closing_wait =
-                       msecs_to_jiffies(MXU1_DEFAULT_CLOSING_WAIT * 10);
-       port->port.drain_delay = 1;
-
-       return 0;
-}
-
-static int mxu1_startup(struct usb_serial *serial)
-{
-       struct mxu1_device *mxdev;
-       struct usb_device *dev = serial->dev;
-       struct usb_host_interface *cur_altsetting;
-       char fw_name[32];
-       const struct firmware *fw_p = NULL;
-       int err;
-
-       dev_dbg(&serial->interface->dev, "%s - product 0x%04X, num configurations %d, configuration value %d\n",
-               __func__, le16_to_cpu(dev->descriptor.idProduct),
-               dev->descriptor.bNumConfigurations,
-               dev->actconfig->desc.bConfigurationValue);
-
-       /* create device structure */
-       mxdev = kzalloc(sizeof(struct mxu1_device), GFP_KERNEL);
-       if (!mxdev)
-               return -ENOMEM;
-
-       usb_set_serial_data(serial, mxdev);
-
-       mxdev->mxd_model = le16_to_cpu(dev->descriptor.idProduct);
-
-       cur_altsetting = serial->interface->cur_altsetting;
-
-       /* if we have only 1 configuration, download firmware */
-       if (cur_altsetting->desc.bNumEndpoints == 1) {
-
-               snprintf(fw_name,
-                        sizeof(fw_name),
-                        "moxa/moxa-%04x.fw",
-                        mxdev->mxd_model);
-
-               err = request_firmware(&fw_p, fw_name, &serial->interface->dev);
-               if (err) {
-                       dev_err(&serial->interface->dev, "failed to request firmware: %d\n",
-                               err);
-                       goto err_free_mxdev;
-               }
-
-               err = mxu1_download_firmware(serial, fw_p);
-               if (err)
-                       goto err_release_firmware;
-
-               /* device is being reset */
-               err = -ENODEV;
-               goto err_release_firmware;
-       }
-
-       return 0;
-
-err_release_firmware:
-       release_firmware(fw_p);
-err_free_mxdev:
-       kfree(mxdev);
-
-       return err;
-}
-
-static int mxu1_write_byte(struct usb_serial_port *port, u32 addr,
-                          u8 mask, u8 byte)
-{
-       int status;
-       size_t size;
-       struct mxu1_write_data_bytes *data;
-
-       dev_dbg(&port->dev, "%s - addr 0x%08X, mask 0x%02X, byte 0x%02X\n",
-               __func__, addr, mask, byte);
-
-       size = sizeof(struct mxu1_write_data_bytes) + 2;
-       data = kzalloc(size, GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
-       data->bAddrType = MXU1_RW_DATA_ADDR_XDATA;
-       data->bDataType = MXU1_RW_DATA_BYTE;
-       data->bDataCounter = 1;
-       data->wBaseAddrHi = cpu_to_be16(addr >> 16);
-       data->wBaseAddrLo = cpu_to_be16(addr);
-       data->bData[0] = mask;
-       data->bData[1] = byte;
-
-       status = mxu1_send_ctrl_data_urb(port->serial, MXU1_WRITE_DATA, 0,
-                                        MXU1_RAM_PORT, data, size);
-       if (status < 0)
-               dev_err(&port->dev, "%s - failed: %d\n", __func__, status);
-
-       kfree(data);
-
-       return status;
-}
-
-static int mxu1_set_mcr(struct usb_serial_port *port, unsigned int mcr)
-{
-       int status;
-
-       status = mxu1_write_byte(port,
-                                MXU1_UART_BASE_ADDR + MXU1_UART_OFFSET_MCR,
-                                MXU1_MCR_RTS | MXU1_MCR_DTR | MXU1_MCR_LOOP,
-                                mcr);
-       return status;
-}
-
-static void mxu1_set_termios(struct tty_struct *tty,
-                            struct usb_serial_port *port,
-                            struct ktermios *old_termios)
-{
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-       struct mxu1_uart_config *config;
-       tcflag_t cflag, iflag;
-       speed_t baud;
-       int status;
-       unsigned int mcr;
-
-       cflag = tty->termios.c_cflag;
-       iflag = tty->termios.c_iflag;
-
-       if (old_termios &&
-           !tty_termios_hw_change(&tty->termios, old_termios) &&
-           tty->termios.c_iflag == old_termios->c_iflag) {
-               dev_dbg(&port->dev, "%s - nothing to change\n", __func__);
-               return;
-       }
-
-       dev_dbg(&port->dev,
-               "%s - cflag 0x%08x, iflag 0x%08x\n", __func__, cflag, iflag);
-
-       if (old_termios) {
-               dev_dbg(&port->dev, "%s - old cflag 0x%08x, old iflag 0x%08x\n",
-                       __func__,
-                       old_termios->c_cflag,
-                       old_termios->c_iflag);
-       }
-
-       config = kzalloc(sizeof(*config), GFP_KERNEL);
-       if (!config)
-               return;
-
-       /* these flags must be set */
-       config->wFlags |= MXU1_UART_ENABLE_MS_INTS;
-       config->wFlags |= MXU1_UART_ENABLE_AUTO_START_DMA;
-       if (mxport->send_break)
-               config->wFlags |= MXU1_UART_SEND_BREAK_SIGNAL;
-       config->bUartMode = mxport->uart_mode;
-
-       switch (C_CSIZE(tty)) {
-       case CS5:
-               config->bDataBits = MXU1_UART_5_DATA_BITS;
-               break;
-       case CS6:
-               config->bDataBits = MXU1_UART_6_DATA_BITS;
-               break;
-       case CS7:
-               config->bDataBits = MXU1_UART_7_DATA_BITS;
-               break;
-       default:
-       case CS8:
-               config->bDataBits = MXU1_UART_8_DATA_BITS;
-               break;
-       }
-
-       if (C_PARENB(tty)) {
-               config->wFlags |= MXU1_UART_ENABLE_PARITY_CHECKING;
-               if (C_CMSPAR(tty)) {
-                       if (C_PARODD(tty))
-                               config->bParity = MXU1_UART_MARK_PARITY;
-                       else
-                               config->bParity = MXU1_UART_SPACE_PARITY;
-               } else {
-                       if (C_PARODD(tty))
-                               config->bParity = MXU1_UART_ODD_PARITY;
-                       else
-                               config->bParity = MXU1_UART_EVEN_PARITY;
-               }
-       } else {
-               config->bParity = MXU1_UART_NO_PARITY;
-       }
-
-       if (C_CSTOPB(tty))
-               config->bStopBits = MXU1_UART_2_STOP_BITS;
-       else
-               config->bStopBits = MXU1_UART_1_STOP_BITS;
-
-       if (C_CRTSCTS(tty)) {
-               /* RTS flow control must be off to drop RTS for baud rate B0 */
-               if (C_BAUD(tty) != B0)
-                       config->wFlags |= MXU1_UART_ENABLE_RTS_IN;
-               config->wFlags |= MXU1_UART_ENABLE_CTS_OUT;
-       }
-
-       if (I_IXOFF(tty) || I_IXON(tty)) {
-               config->cXon  = START_CHAR(tty);
-               config->cXoff = STOP_CHAR(tty);
-
-               if (I_IXOFF(tty))
-                       config->wFlags |= MXU1_UART_ENABLE_X_IN;
-
-               if (I_IXON(tty))
-                       config->wFlags |= MXU1_UART_ENABLE_X_OUT;
-       }
-
-       baud = tty_get_baud_rate(tty);
-       if (!baud)
-               baud = 9600;
-       config->wBaudRate = MXU1_BAUD_BASE / baud;
-
-       dev_dbg(&port->dev, "%s - BaudRate=%d, wBaudRate=%d, wFlags=0x%04X, bDataBits=%d, bParity=%d, bStopBits=%d, cXon=%d, cXoff=%d, bUartMode=%d\n",
-               __func__, baud, config->wBaudRate, config->wFlags,
-               config->bDataBits, config->bParity, config->bStopBits,
-               config->cXon, config->cXoff, config->bUartMode);
-
-       cpu_to_be16s(&config->wBaudRate);
-       cpu_to_be16s(&config->wFlags);
-
-       status = mxu1_send_ctrl_data_urb(port->serial, MXU1_SET_CONFIG, 0,
-                                        MXU1_UART1_PORT, config,
-                                        sizeof(*config));
-       if (status)
-               dev_err(&port->dev, "cannot set config: %d\n", status);
-
-       mutex_lock(&mxport->mutex);
-       mcr = mxport->mcr;
-
-       if (C_BAUD(tty) == B0)
-               mcr &= ~(MXU1_MCR_DTR | MXU1_MCR_RTS);
-       else if (old_termios && (old_termios->c_cflag & CBAUD) == B0)
-               mcr |= MXU1_MCR_DTR | MXU1_MCR_RTS;
-
-       status = mxu1_set_mcr(port, mcr);
-       if (status)
-               dev_err(&port->dev, "cannot set modem control: %d\n", status);
-       else
-               mxport->mcr = mcr;
-
-       mutex_unlock(&mxport->mutex);
-
-       kfree(config);
-}
-
-static int mxu1_get_serial_info(struct usb_serial_port *port,
-                               struct serial_struct __user *ret_arg)
-{
-       struct serial_struct ret_serial;
-       unsigned cwait;
-
-       if (!ret_arg)
-               return -EFAULT;
-
-       cwait = port->port.closing_wait;
-       if (cwait != ASYNC_CLOSING_WAIT_NONE)
-               cwait = jiffies_to_msecs(cwait) / 10;
-
-       memset(&ret_serial, 0, sizeof(ret_serial));
-
-       ret_serial.type = PORT_16550A;
-       ret_serial.line = port->minor;
-       ret_serial.port = 0;
-       ret_serial.xmit_fifo_size = port->bulk_out_size;
-       ret_serial.baud_base = MXU1_BAUD_BASE;
-       ret_serial.close_delay = 5*HZ;
-       ret_serial.closing_wait = cwait;
-
-       if (copy_to_user(ret_arg, &ret_serial, sizeof(*ret_arg)))
-               return -EFAULT;
-
-       return 0;
-}
-
-
-static int mxu1_set_serial_info(struct usb_serial_port *port,
-                               struct serial_struct __user *new_arg)
-{
-       struct serial_struct new_serial;
-       unsigned cwait;
-
-       if (copy_from_user(&new_serial, new_arg, sizeof(new_serial)))
-               return -EFAULT;
-
-       cwait = new_serial.closing_wait;
-       if (cwait != ASYNC_CLOSING_WAIT_NONE)
-               cwait = msecs_to_jiffies(10 * new_serial.closing_wait);
-
-       port->port.closing_wait = cwait;
-
-       return 0;
-}
-
-static int mxu1_ioctl(struct tty_struct *tty,
-                     unsigned int cmd, unsigned long arg)
-{
-       struct usb_serial_port *port = tty->driver_data;
-
-       switch (cmd) {
-       case TIOCGSERIAL:
-               return mxu1_get_serial_info(port,
-                                           (struct serial_struct __user *)arg);
-       case TIOCSSERIAL:
-               return mxu1_set_serial_info(port,
-                                           (struct serial_struct __user *)arg);
-       }
-
-       return -ENOIOCTLCMD;
-}
-
-static int mxu1_tiocmget(struct tty_struct *tty)
-{
-       struct usb_serial_port *port = tty->driver_data;
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-       unsigned int result;
-       unsigned int msr;
-       unsigned int mcr;
-       unsigned long flags;
-
-       mutex_lock(&mxport->mutex);
-       spin_lock_irqsave(&mxport->spinlock, flags);
-
-       msr = mxport->msr;
-       mcr = mxport->mcr;
-
-       spin_unlock_irqrestore(&mxport->spinlock, flags);
-       mutex_unlock(&mxport->mutex);
-
-       result = ((mcr & MXU1_MCR_DTR)  ? TIOCM_DTR     : 0) |
-                ((mcr & MXU1_MCR_RTS)  ? TIOCM_RTS     : 0) |
-                ((mcr & MXU1_MCR_LOOP) ? TIOCM_LOOP    : 0) |
-                ((msr & MXU1_MSR_CTS)  ? TIOCM_CTS     : 0) |
-                ((msr & MXU1_MSR_CD)   ? TIOCM_CAR     : 0) |
-                ((msr & MXU1_MSR_RI)   ? TIOCM_RI      : 0) |
-                ((msr & MXU1_MSR_DSR)  ? TIOCM_DSR     : 0);
-
-       dev_dbg(&port->dev, "%s - 0x%04X\n", __func__, result);
-
-       return result;
-}
-
-static int mxu1_tiocmset(struct tty_struct *tty,
-                        unsigned int set, unsigned int clear)
-{
-       struct usb_serial_port *port = tty->driver_data;
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-       int err;
-       unsigned int mcr;
-
-       mutex_lock(&mxport->mutex);
-       mcr = mxport->mcr;
-
-       if (set & TIOCM_RTS)
-               mcr |= MXU1_MCR_RTS;
-       if (set & TIOCM_DTR)
-               mcr |= MXU1_MCR_DTR;
-       if (set & TIOCM_LOOP)
-               mcr |= MXU1_MCR_LOOP;
-
-       if (clear & TIOCM_RTS)
-               mcr &= ~MXU1_MCR_RTS;
-       if (clear & TIOCM_DTR)
-               mcr &= ~MXU1_MCR_DTR;
-       if (clear & TIOCM_LOOP)
-               mcr &= ~MXU1_MCR_LOOP;
-
-       err = mxu1_set_mcr(port, mcr);
-       if (!err)
-               mxport->mcr = mcr;
-
-       mutex_unlock(&mxport->mutex);
-
-       return err;
-}
-
-static void mxu1_break(struct tty_struct *tty, int break_state)
-{
-       struct usb_serial_port *port = tty->driver_data;
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-
-       if (break_state == -1)
-               mxport->send_break = true;
-       else
-               mxport->send_break = false;
-
-       mxu1_set_termios(tty, port, NULL);
-}
-
-static int mxu1_open(struct tty_struct *tty, struct usb_serial_port *port)
-{
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-       struct usb_serial *serial = port->serial;
-       int status;
-       u16 open_settings;
-
-       open_settings = (MXU1_PIPE_MODE_CONTINUOUS |
-                        MXU1_PIPE_TIMEOUT_ENABLE |
-                        (MXU1_TRANSFER_TIMEOUT << 2));
-
-       mxport->msr = 0;
-
-       status = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
-       if (status) {
-               dev_err(&port->dev, "failed to submit interrupt urb: %d\n",
-                       status);
-               return status;
-       }
-
-       if (tty)
-               mxu1_set_termios(tty, port, NULL);
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_OPEN_PORT,
-                                   open_settings, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot send open command: %d\n", status);
-               goto unlink_int_urb;
-       }
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_START_PORT,
-                                   0, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot send start command: %d\n", status);
-               goto unlink_int_urb;
-       }
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_PURGE_PORT,
-                                   MXU1_PURGE_INPUT, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot clear input buffers: %d\n",
-                       status);
-
-               goto unlink_int_urb;
-       }
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_PURGE_PORT,
-                                   MXU1_PURGE_OUTPUT, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot clear output buffers: %d\n",
-                       status);
-
-               goto unlink_int_urb;
-       }
-
-       /*
-        * reset the data toggle on the bulk endpoints to work around bug in
-        * host controllers where things get out of sync some times
-        */
-       usb_clear_halt(serial->dev, port->write_urb->pipe);
-       usb_clear_halt(serial->dev, port->read_urb->pipe);
-
-       if (tty)
-               mxu1_set_termios(tty, port, NULL);
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_OPEN_PORT,
-                                   open_settings, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot send open command: %d\n", status);
-               goto unlink_int_urb;
-       }
-
-       status = mxu1_send_ctrl_urb(serial, MXU1_START_PORT,
-                                   0, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "cannot send start command: %d\n", status);
-               goto unlink_int_urb;
-       }
-
-       status = usb_serial_generic_open(tty, port);
-       if (status)
-               goto unlink_int_urb;
-
-       return 0;
-
-unlink_int_urb:
-       usb_kill_urb(port->interrupt_in_urb);
-
-       return status;
-}
-
-static void mxu1_close(struct usb_serial_port *port)
-{
-       int status;
-
-       usb_serial_generic_close(port);
-       usb_kill_urb(port->interrupt_in_urb);
-
-       status = mxu1_send_ctrl_urb(port->serial, MXU1_CLOSE_PORT,
-                                   0, MXU1_UART1_PORT);
-       if (status) {
-               dev_err(&port->dev, "failed to send close port command: %d\n",
-                       status);
-       }
-}
-
-static void mxu1_handle_new_msr(struct usb_serial_port *port, u8 msr)
-{
-       struct mxu1_port *mxport = usb_get_serial_port_data(port);
-       struct async_icount *icount;
-       unsigned long flags;
-
-       dev_dbg(&port->dev, "%s - msr 0x%02X\n", __func__, msr);
-
-       spin_lock_irqsave(&mxport->spinlock, flags);
-       mxport->msr = msr & MXU1_MSR_MASK;
-       spin_unlock_irqrestore(&mxport->spinlock, flags);
-
-       if (msr & MXU1_MSR_DELTA_MASK) {
-               icount = &port->icount;
-               if (msr & MXU1_MSR_DELTA_CTS)
-                       icount->cts++;
-               if (msr & MXU1_MSR_DELTA_DSR)
-                       icount->dsr++;
-               if (msr & MXU1_MSR_DELTA_CD)
-                       icount->dcd++;
-               if (msr & MXU1_MSR_DELTA_RI)
-                       icount->rng++;
-
-               wake_up_interruptible(&port->port.delta_msr_wait);
-       }
-}
-
-static void mxu1_interrupt_callback(struct urb *urb)
-{
-       struct usb_serial_port *port = urb->context;
-       unsigned char *data = urb->transfer_buffer;
-       int length = urb->actual_length;
-       int function;
-       int status;
-       u8 msr;
-
-       switch (urb->status) {
-       case 0:
-               break;
-       case -ECONNRESET:
-       case -ENOENT:
-       case -ESHUTDOWN:
-               dev_dbg(&port->dev, "%s - urb shutting down: %d\n",
-                       __func__, urb->status);
-               return;
-       default:
-               dev_dbg(&port->dev, "%s - nonzero urb status: %d\n",
-                       __func__, urb->status);
-               goto exit;
-       }
-
-       if (length != 2) {
-               dev_dbg(&port->dev, "%s - bad packet size: %d\n",
-                       __func__, length);
-               goto exit;
-       }
-
-       if (data[0] == MXU1_CODE_HARDWARE_ERROR) {
-               dev_err(&port->dev, "hardware error: %d\n", data[1]);
-               goto exit;
-       }
-
-       function = mxu1_get_func_from_code(data[0]);
-
-       dev_dbg(&port->dev, "%s - function %d, data 0x%02X\n",
-                __func__, function, data[1]);
-
-       switch (function) {
-       case MXU1_CODE_DATA_ERROR:
-               dev_dbg(&port->dev, "%s - DATA ERROR, data 0x%02X\n",
-                        __func__, data[1]);
-               break;
-
-       case MXU1_CODE_MODEM_STATUS:
-               msr = data[1];
-               mxu1_handle_new_msr(port, msr);
-               break;
-
-       default:
-               dev_err(&port->dev, "unknown interrupt code: 0x%02X\n",
-                       data[1]);
-               break;
-       }
-
-exit:
-       status = usb_submit_urb(urb, GFP_ATOMIC);
-       if (status) {
-               dev_err(&port->dev, "resubmit interrupt urb failed: %d\n",
-                       status);
-       }
-}
-
-static struct usb_serial_driver mxu11x0_device = {
-       .driver = {
-               .owner          = THIS_MODULE,
-               .name           = "mxu11x0",
-       },
-       .description            = "MOXA UPort 11x0",
-       .id_table               = mxu1_idtable,
-       .num_ports              = 1,
-       .port_probe             = mxu1_port_probe,
-       .attach                 = mxu1_startup,
-       .open                   = mxu1_open,
-       .close                  = mxu1_close,
-       .ioctl                  = mxu1_ioctl,
-       .set_termios            = mxu1_set_termios,
-       .tiocmget               = mxu1_tiocmget,
-       .tiocmset               = mxu1_tiocmset,
-       .tiocmiwait             = usb_serial_generic_tiocmiwait,
-       .get_icount             = usb_serial_generic_get_icount,
-       .break_ctl              = mxu1_break,
-       .read_int_callback      = mxu1_interrupt_callback,
-};
-
-static struct usb_serial_driver *const serial_drivers[] = {
-       &mxu11x0_device, NULL
-};
-
-module_usb_serial_driver(serial_drivers, mxu1_idtable);
-
-MODULE_AUTHOR("Mathieu Othacehe <m.othacehe@gmail.com>");
-MODULE_DESCRIPTION("MOXA UPort 11x0 USB to Serial Hub Driver");
-MODULE_LICENSE("GPL");
-MODULE_FIRMWARE("moxa/moxa-1110.fw");
-MODULE_FIRMWARE("moxa/moxa-1130.fw");
-MODULE_FIRMWARE("moxa/moxa-1131.fw");
-MODULE_FIRMWARE("moxa/moxa-1150.fw");
-MODULE_FIRMWARE("moxa/moxa-1151.fw");
index f228060..348e198 100644 (file)
@@ -268,6 +268,9 @@ static void option_instat_callback(struct urb *urb);
 #define TELIT_PRODUCT_CC864_SINGLE             0x1006
 #define TELIT_PRODUCT_DE910_DUAL               0x1010
 #define TELIT_PRODUCT_UE910_V2                 0x1012
+#define TELIT_PRODUCT_LE922_USBCFG0            0x1042
+#define TELIT_PRODUCT_LE922_USBCFG3            0x1043
+#define TELIT_PRODUCT_LE922_USBCFG5            0x1045
 #define TELIT_PRODUCT_LE920                    0x1200
 #define TELIT_PRODUCT_LE910                    0x1201
 
@@ -313,6 +316,7 @@ static void option_instat_callback(struct urb *urb);
 #define TOSHIBA_PRODUCT_G450                   0x0d45
 
 #define ALINK_VENDOR_ID                                0x1e0e
+#define SIMCOM_PRODUCT_SIM7100E                        0x9001 /* Yes, ALINK_VENDOR_ID */
 #define ALINK_PRODUCT_PH300                    0x9100
 #define ALINK_PRODUCT_3GU                      0x9200
 
@@ -605,6 +609,10 @@ static const struct option_blacklist_info zte_1255_blacklist = {
        .reserved = BIT(3) | BIT(4),
 };
 
+static const struct option_blacklist_info simcom_sim7100e_blacklist = {
+       .reserved = BIT(5) | BIT(6),
+};
+
 static const struct option_blacklist_info telit_le910_blacklist = {
        .sendsetup = BIT(0),
        .reserved = BIT(1) | BIT(2),
@@ -615,6 +623,16 @@ static const struct option_blacklist_info telit_le920_blacklist = {
        .reserved = BIT(1) | BIT(5),
 };
 
+static const struct option_blacklist_info telit_le922_blacklist_usbcfg0 = {
+       .sendsetup = BIT(2),
+       .reserved = BIT(0) | BIT(1) | BIT(3),
+};
+
+static const struct option_blacklist_info telit_le922_blacklist_usbcfg3 = {
+       .sendsetup = BIT(0),
+       .reserved = BIT(1) | BIT(2) | BIT(3),
+};
+
 static const struct usb_device_id option_ids[] = {
        { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
        { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
@@ -1110,9 +1128,13 @@ static const struct usb_device_id option_ids[] = {
        { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC650) },
        { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC680) },
        { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6000)}, /* ZTE AC8700 */
+       { USB_DEVICE_AND_INTERFACE_INFO(QUALCOMM_VENDOR_ID, 0x6001, 0xff, 0xff, 0xff), /* 4G LTE usb-modem U901 */
+         .driver_info = (kernel_ulong_t)&net_intf3_blacklist },
        { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */
        { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x0023)}, /* ONYX 3G device */
        { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x9000)}, /* SIMCom SIM5218 */
+       { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x9003), /* Quectel UC20 */
+         .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
        { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) },
        { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) },
        { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003),
@@ -1160,6 +1182,12 @@ static const struct usb_device_id option_ids[] = {
        { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_CC864_SINGLE) },
        { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_DE910_DUAL) },
        { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UE910_V2) },
+       { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0),
+               .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
+       { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG3),
+               .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 },
+       { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG5, 0xff),
+               .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
        { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
                .driver_info = (kernel_ulong_t)&telit_le910_blacklist },
        { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920),
@@ -1629,6 +1657,8 @@ static const struct usb_device_id option_ids[] = {
        { USB_DEVICE(ALINK_VENDOR_ID, 0x9000) },
        { USB_DEVICE(ALINK_VENDOR_ID, ALINK_PRODUCT_PH300) },
        { USB_DEVICE_AND_INTERFACE_INFO(ALINK_VENDOR_ID, ALINK_PRODUCT_3GU, 0xff, 0xff, 0xff) },
+       { USB_DEVICE(ALINK_VENDOR_ID, SIMCOM_PRODUCT_SIM7100E),
+         .driver_info = (kernel_ulong_t)&simcom_sim7100e_blacklist },
        { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X060S_X200),
          .driver_info = (kernel_ulong_t)&alcatel_x200_blacklist
        },
@@ -1679,7 +1709,7 @@ static const struct usb_device_id option_ids[] = {
        { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_EU3_P) },
        { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PH8),
                .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
-       { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_AHXX) },
+       { USB_DEVICE_INTERFACE_CLASS(CINTERION_VENDOR_ID, CINTERION_PRODUCT_AHXX, 0xff) },
        { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_PLXX),
                .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
        { USB_DEVICE(CINTERION_VENDOR_ID, CINTERION_PRODUCT_HC28_MDM) }, 
index 9919d2a..1bc6089 100644 (file)
@@ -157,14 +157,17 @@ static const struct usb_device_id id_table[] = {
        {DEVICE_SWI(0x1199, 0x9056)},   /* Sierra Wireless Modem */
        {DEVICE_SWI(0x1199, 0x9060)},   /* Sierra Wireless Modem */
        {DEVICE_SWI(0x1199, 0x9061)},   /* Sierra Wireless Modem */
-       {DEVICE_SWI(0x1199, 0x9070)},   /* Sierra Wireless MC74xx/EM74xx */
-       {DEVICE_SWI(0x1199, 0x9071)},   /* Sierra Wireless MC74xx/EM74xx */
+       {DEVICE_SWI(0x1199, 0x9070)},   /* Sierra Wireless MC74xx */
+       {DEVICE_SWI(0x1199, 0x9071)},   /* Sierra Wireless MC74xx */
+       {DEVICE_SWI(0x1199, 0x9078)},   /* Sierra Wireless EM74xx */
+       {DEVICE_SWI(0x1199, 0x9079)},   /* Sierra Wireless EM74xx */
        {DEVICE_SWI(0x413c, 0x81a2)},   /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
        {DEVICE_SWI(0x413c, 0x81a3)},   /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
        {DEVICE_SWI(0x413c, 0x81a4)},   /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
        {DEVICE_SWI(0x413c, 0x81a8)},   /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */
        {DEVICE_SWI(0x413c, 0x81a9)},   /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */
        {DEVICE_SWI(0x413c, 0x81b1)},   /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */
+       {DEVICE_SWI(0x413c, 0x81b3)},   /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */
 
        /* Huawei devices */
        {DEVICE_HWI(0x03f0, 0x581d)},   /* HP lt4112 LTE/HSPA+ Gobi 4G Modem (Huawei me906e) */
index 60afb39..337a0be 100644 (file)
@@ -544,6 +544,11 @@ static int treo_attach(struct usb_serial *serial)
                (serial->num_interrupt_in == 0))
                return 0;
 
+       if (serial->num_bulk_in < 2 || serial->num_interrupt_in < 2) {
+               dev_err(&serial->interface->dev, "missing endpoints\n");
+               return -ENODEV;
+       }
+
        /*
        * It appears that Treos and Kyoceras want to use the
        * 1st bulk in endpoint to communicate with the 2nd bulk out endpoint,
@@ -597,8 +602,10 @@ static int clie_5_attach(struct usb_serial *serial)
         */
 
        /* some sanity check */
-       if (serial->num_ports < 2)
-               return -1;
+       if (serial->num_bulk_out < 2) {
+               dev_err(&serial->interface->dev, "missing bulk out endpoints\n");
+               return -ENODEV;
+       }
 
        /* port 0 now uses the modified endpoint Address */
        port = serial->port[0];
index 2760a7b..8c80a48 100644 (file)
@@ -446,7 +446,8 @@ static long vfio_pci_ioctl(void *device_data,
                info.num_regions = VFIO_PCI_NUM_REGIONS;
                info.num_irqs = VFIO_PCI_NUM_IRQS;
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
                struct pci_dev *pdev = vdev->pdev;
@@ -520,7 +521,8 @@ static long vfio_pci_ioctl(void *device_data,
                        return -EINVAL;
                }
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
                struct vfio_irq_info info;
@@ -555,7 +557,8 @@ static long vfio_pci_ioctl(void *device_data,
                else
                        info.flags |= VFIO_IRQ_INFO_NORESIZE;
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
                struct vfio_irq_set hdr;
index 418cdd9..e65b142 100644 (file)
@@ -219,7 +219,8 @@ static long vfio_platform_ioctl(void *device_data,
                info.num_regions = vdev->num_regions;
                info.num_irqs = vdev->num_irqs;
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
                struct vfio_region_info info;
@@ -240,7 +241,8 @@ static long vfio_platform_ioctl(void *device_data,
                info.size = vdev->regions[info.index].size;
                info.flags = vdev->regions[info.index].flags;
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
                struct vfio_irq_info info;
@@ -259,7 +261,8 @@ static long vfio_platform_ioctl(void *device_data,
                info.flags = vdev->irqs[info.index].flags;
                info.count = vdev->irqs[info.index].count;
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
                struct vfio_irq_set hdr;
index 82f25cc..ecca316 100644 (file)
@@ -123,8 +123,8 @@ struct iommu_group *vfio_iommu_group_get(struct device *dev)
        /*
         * With noiommu enabled, an IOMMU group will be created for a device
         * that doesn't already have one and doesn't have an iommu_ops on their
-        * bus.  We use iommu_present() again in the main code to detect these
-        * fake groups.
+        * bus.  We set iommudata simply to be able to identify these groups
+        * as special use and for reclamation later.
         */
        if (group || !noiommu || iommu_present(dev->bus))
                return group;
@@ -134,6 +134,7 @@ struct iommu_group *vfio_iommu_group_get(struct device *dev)
                return NULL;
 
        iommu_group_set_name(group, "vfio-noiommu");
+       iommu_group_set_iommudata(group, &noiommu, NULL);
        ret = iommu_group_add_device(group, dev);
        iommu_group_put(group);
        if (ret)
@@ -158,7 +159,7 @@ EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 {
 #ifdef CONFIG_VFIO_NOIOMMU
-       if (!iommu_present(dev->bus))
+       if (iommu_group_get_iommudata(group) == &noiommu)
                iommu_group_remove_device(dev);
 #endif
 
@@ -190,16 +191,10 @@ static long vfio_noiommu_ioctl(void *iommu_data,
        return -ENOTTY;
 }
 
-static int vfio_iommu_present(struct device *dev, void *unused)
-{
-       return iommu_present(dev->bus) ? 1 : 0;
-}
-
 static int vfio_noiommu_attach_group(void *iommu_data,
                                     struct iommu_group *iommu_group)
 {
-       return iommu_group_for_each_dev(iommu_group, NULL,
-                                       vfio_iommu_present) ? -EINVAL : 0;
+       return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 }
 
 static void vfio_noiommu_detach_group(void *iommu_data,
@@ -323,8 +318,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group)
 /**
  * Group objects - create, release, get, put, search
  */
-static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
-                                           bool iommu_present)
+static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 {
        struct vfio_group *group, *tmp;
        struct device *dev;
@@ -342,7 +336,9 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
        atomic_set(&group->container_users, 0);
        atomic_set(&group->opened, 0);
        group->iommu_group = iommu_group;
-       group->noiommu = !iommu_present;
+#ifdef CONFIG_VFIO_NOIOMMU
+       group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
+#endif
 
        group->nb.notifier_call = vfio_iommu_group_notifier;
 
@@ -767,7 +763,7 @@ int vfio_add_group_dev(struct device *dev,
 
        group = vfio_group_get_from_iommu(iommu_group);
        if (!group) {
-               group = vfio_create_group(iommu_group, iommu_present(dev->bus));
+               group = vfio_create_group(iommu_group);
                if (IS_ERR(group)) {
                        iommu_group_put(iommu_group);
                        return PTR_ERR(group);
index 6f1ea3d..75b24e9 100644 (file)
@@ -999,7 +999,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
                info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
-               return copy_to_user((void __user *)arg, &info, minsz);
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
 
        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
                struct vfio_iommu_type1_dma_map map;
@@ -1032,7 +1033,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
                if (ret)
                        return ret;
 
-               return copy_to_user((void __user *)arg, &unmap, minsz);
+               return copy_to_user((void __user *)arg, &unmap, minsz) ?
+                       -EFAULT : 0;
        }
 
        return -ENOTTY;
index ad2146a..236553e 100644 (file)
@@ -1156,6 +1156,8 @@ int vhost_init_used(struct vhost_virtqueue *vq)
 {
        __virtio16 last_used_idx;
        int r;
+       bool is_le = vq->is_le;
+
        if (!vq->private_data) {
                vq->is_le = virtio_legacy_is_little_endian();
                return 0;
@@ -1165,15 +1167,20 @@ int vhost_init_used(struct vhost_virtqueue *vq)
 
        r = vhost_update_used_flags(vq);
        if (r)
-               return r;
+               goto err;
        vq->signalled_used_valid = false;
-       if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
-               return -EFAULT;
+       if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) {
+               r = -EFAULT;
+               goto err;
+       }
        r = __get_user(last_used_idx, &vq->used->idx);
        if (r)
-               return r;
+               goto err;
        vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
        return 0;
+err:
+       vq->is_le = is_le;
+       return r;
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
index 92f3949..6e92917 100644 (file)
@@ -709,6 +709,7 @@ static int con2fb_acquire_newinfo(struct vc_data *vc, struct fb_info *info,
        }
 
        if (!err) {
+               ops->cur_blink_jiffies = HZ / 5;
                info->fbcon_par = ops;
 
                if (vc)
@@ -956,6 +957,7 @@ static const char *fbcon_startup(void)
        ops->currcon = -1;
        ops->graphics = 1;
        ops->cur_rotate = -1;
+       ops->cur_blink_jiffies = HZ / 5;
        info->fbcon_par = ops;
        p->con_rotate = initial_rotation;
        set_blitting_type(vc, info);
index a305cae..fb75b7e 100644 (file)
@@ -1040,8 +1040,8 @@ static int acornfb_probe(struct platform_device *dev)
                 * for the framebuffer if we are not using
                 * VRAM.
                 */
-               base = dma_alloc_writecombine(current_par.dev, size, &handle,
-                                             GFP_KERNEL);
+               base = dma_alloc_wc(current_par.dev, size, &handle,
+                                   GFP_KERNEL);
                if (base == NULL) {
                        printk(KERN_ERR "acornfb: unable to allocate screen "
                               "memory\n");
index 7a8afcd..a8a22da 100644 (file)
@@ -154,8 +154,8 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
 {
        dma_addr_t dma;
 
-       fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize,
-                                                   &dma, GFP_KERNEL);
+       fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, framesize, &dma,
+                                         GFP_KERNEL);
        if (!fb->fb.screen_base) {
                pr_err("CLCD: unable to map framebuffer\n");
                return -ENOMEM;
@@ -169,14 +169,12 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize)
 
 int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma,
-                                    fb->fb.screen_base,
-                                    fb->fb.fix.smem_start,
-                                    fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 void versatile_clcd_remove_dma(struct clcd_fb *fb)
 {
-       dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len,
-                             fb->fb.screen_base, fb->fb.fix.smem_start);
+       dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
+                   fb->fb.fix.smem_start);
 }
index 9362424..fe274b5 100644 (file)
@@ -774,8 +774,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb)
 
 static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
 {
-       return dma_mmap_writecombine(&fb->dev->dev, vma, fb->fb.screen_base,
-                       fb->fb.fix.smem_start, fb->fb.fix.smem_len);
+       return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
+                          fb->fb.fix.smem_start, fb->fb.fix.smem_len);
 }
 
 static void clcdfb_of_dma_remove(struct clcd_fb *fb)
index 19eb42b..56c60e6 100644 (file)
@@ -414,8 +414,8 @@ static inline void atmel_lcdfb_free_video_memory(struct atmel_lcdfb_info *sinfo)
 {
        struct fb_info *info = sinfo->info;
 
-       dma_free_writecombine(info->device, info->fix.smem_len,
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(info->device, info->fix.smem_len, info->screen_base,
+                   info->fix.smem_start);
 }
 
 /**
@@ -435,8 +435,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo)
                    * ((var->bits_per_pixel + 7) / 8));
        info->fix.smem_len = max(smem_len, sinfo->smem_len);
 
-       info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len,
-                                       (dma_addr_t *)&info->fix.smem_start, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(info->device, info->fix.smem_len,
+                                        (dma_addr_t *)&info->fix.smem_start,
+                                        GFP_KERNEL);
 
        if (!info->screen_base) {
                return -ENOMEM;
index 0081725..6b2a06d 100644 (file)
@@ -152,7 +152,7 @@ static void lcdc_write(unsigned int val, unsigned int addr)
 
 struct da8xx_fb_par {
        struct device           *dev;
-       resource_size_t p_palette_base;
+       dma_addr_t              p_palette_base;
        unsigned char *v_palette_base;
        dma_addr_t              vram_phys;
        unsigned long           vram_size;
@@ -1428,7 +1428,7 @@ static int fb_probe(struct platform_device *device)
 
        par->vram_virt = dma_alloc_coherent(NULL,
                                            par->vram_size,
-                                           (resource_size_t *) &par->vram_phys,
+                                           &par->vram_phys,
                                            GFP_KERNEL | GFP_DMA);
        if (!par->vram_virt) {
                dev_err(&device->dev,
@@ -1448,7 +1448,7 @@ static int fb_probe(struct platform_device *device)
 
        /* allocate palette buffer */
        par->v_palette_base = dma_zalloc_coherent(NULL, PALETTE_SIZE,
-                                                 (resource_size_t *)&par->p_palette_base,
+                                                 &par->p_palette_base,
                                                  GFP_KERNEL | GFP_DMA);
        if (!par->v_palette_base) {
                dev_err(&device->dev,
index 5b10810..75f0db2 100644 (file)
@@ -316,9 +316,8 @@ static int ep93xxfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        unsigned int offset = vma->vm_pgoff << PAGE_SHIFT;
 
        if (offset < info->fix.smem_len) {
-               return dma_mmap_writecombine(info->dev, vma, info->screen_base,
-                                            info->fix.smem_start,
-                                            info->fix.smem_len);
+               return dma_mmap_wc(info->dev, vma, info->screen_base,
+                                  info->fix.smem_start, info->fix.smem_len);
        }
 
        return -EINVAL;
@@ -428,8 +427,7 @@ static int ep93xxfb_alloc_videomem(struct fb_info *info)
        /* Maximum 16bpp -> used memory is maximum x*y*2 bytes */
        fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2;
 
-       virt_addr = dma_alloc_writecombine(info->dev, fb_size,
-                                          &phys_addr, GFP_KERNEL);
+       virt_addr = dma_alloc_wc(info->dev, fb_size, &phys_addr, GFP_KERNEL);
        if (!virt_addr)
                return -ENOMEM;
 
index 95873f2..de2f3e7 100644 (file)
@@ -829,8 +829,7 @@ static int s6e8ax0_probe(struct mipi_dsim_lcd_device *dsim_dev)
        return 0;
 }
 
-#ifdef CONFIG_PM
-static int s6e8ax0_suspend(struct mipi_dsim_lcd_device *dsim_dev)
+static int __maybe_unused s6e8ax0_suspend(struct mipi_dsim_lcd_device *dsim_dev)
 {
        struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
 
@@ -843,7 +842,7 @@ static int s6e8ax0_suspend(struct mipi_dsim_lcd_device *dsim_dev)
        return 0;
 }
 
-static int s6e8ax0_resume(struct mipi_dsim_lcd_device *dsim_dev)
+static int __maybe_unused s6e8ax0_resume(struct mipi_dsim_lcd_device *dsim_dev)
 {
        struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
 
@@ -855,10 +854,6 @@ static int s6e8ax0_resume(struct mipi_dsim_lcd_device *dsim_dev)
 
        return 0;
 }
-#else
-#define s6e8ax0_suspend                NULL
-#define s6e8ax0_resume         NULL
-#endif
 
 static struct mipi_dsim_lcd_driver s6e8ax0_dsim_ddi_driver = {
        .name = "s6e8ax0",
@@ -867,8 +862,8 @@ static struct mipi_dsim_lcd_driver s6e8ax0_dsim_ddi_driver = {
        .power_on = s6e8ax0_power_on,
        .set_sequence = s6e8ax0_set_sequence,
        .probe = s6e8ax0_probe,
-       .suspend = s6e8ax0_suspend,
-       .resume = s6e8ax0_resume,
+       .suspend = IS_ENABLED(CONFIG_PM) ? s6e8ax0_suspend : NULL,
+       .resume = IS_ENABLED(CONFIG_PM) ? s6e8ax0_resume : NULL,
 };
 
 static int s6e8ax0_init(void)
index b63d55f..1a242b1 100644 (file)
@@ -1185,8 +1185,8 @@ static int gbefb_probe(struct platform_device *p_dev)
        } else {
                /* try to allocate memory with the classical allocator
                 * this has high chance to fail on low memory machines */
-               gbe_mem = dma_alloc_writecombine(NULL, gbe_mem_size,
-                                                &gbe_dma_addr, GFP_KERNEL);
+               gbe_mem = dma_alloc_wc(NULL, gbe_mem_size, &gbe_dma_addr,
+                                      GFP_KERNEL);
                if (!gbe_mem) {
                        printk(KERN_ERR "gbefb: couldn't allocate framebuffer memory\n");
                        ret = -ENOMEM;
@@ -1238,7 +1238,7 @@ static int gbefb_probe(struct platform_device *p_dev)
 out_gbe_unmap:
        arch_phys_wc_del(par->wc_cookie);
        if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
 out_tiles_free:
        dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                          (void *)gbe_tiles.cpu, gbe_tiles.dma);
@@ -1259,7 +1259,7 @@ static int gbefb_remove(struct platform_device* p_dev)
        gbe_turn_off();
        arch_phys_wc_del(par->wc_cookie);
        if (gbe_dma_addr)
-               dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
+               dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys);
        dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t),
                          (void *)gbe_tiles.cpu, gbe_tiles.dma);
        release_mem_region(GBE_BASE, sizeof(struct sgi_gbe));
index cee8860..76b6a77 100644 (file)
@@ -902,6 +902,21 @@ static int imxfb_probe(struct platform_device *pdev)
                goto failed_getclock;
        }
 
+       /*
+        * The LCDC controller does not have an enable bit. The
+        * controller starts directly when the clocks are enabled.
+        * If the clocks are enabled when the controller is not yet
+        * programmed with proper register values (enabled at the
+        * bootloader, for example) then it just goes into some undefined
+        * state.
+        * To avoid this issue, let's enable and disable LCDC IPG clock
+        * so that we force some kind of 'reset' to the LCDC block.
+        */
+       ret = clk_prepare_enable(fbi->clk_ipg);
+       if (ret)
+               goto failed_getclock;
+       clk_disable_unprepare(fbi->clk_ipg);
+
        fbi->clk_ahb = devm_clk_get(&pdev->dev, "ahb");
        if (IS_ERR(fbi->clk_ahb)) {
                ret = PTR_ERR(fbi->clk_ahb);
@@ -922,8 +937,8 @@ static int imxfb_probe(struct platform_device *pdev)
        }
 
        fbi->map_size = PAGE_ALIGN(info->fix.smem_len);
-       info->screen_base = dma_alloc_writecombine(&pdev->dev, fbi->map_size,
-                                                  &fbi->map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(&pdev->dev, fbi->map_size,
+                                        &fbi->map_dma, GFP_KERNEL);
 
        if (!info->screen_base) {
                dev_err(&pdev->dev, "Failed to allocate video RAM: %d\n", ret);
@@ -990,8 +1005,8 @@ failed_cmap:
        if (pdata && pdata->exit)
                pdata->exit(fbi->pdev);
 failed_platform_init:
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
 failed_map:
        iounmap(fbi->regs);
 failed_ioremap:
@@ -1026,8 +1041,8 @@ static int imxfb_remove(struct platform_device *pdev)
        kfree(info->pseudo_palette);
        framebuffer_release(info);
 
-       dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base,
-                             fbi->map_dma);
+       dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base,
+                   fbi->map_dma);
 
        iounmap(fbi->regs);
        release_mem_region(res->start, resource_size(res));
index de54a47..b6f83d5 100644 (file)
@@ -503,8 +503,7 @@ static int mmphw_probe(struct platform_device *pdev)
        ctrl->reg_base = devm_ioremap_nocache(ctrl->dev,
                        res->start, resource_size(res));
        if (ctrl->reg_base == NULL) {
-               dev_err(ctrl->dev, "%s: res %x - %x map failed\n", __func__,
-                       res->start, res->end);
+               dev_err(ctrl->dev, "%s: res %pR map failed\n", __func__, res);
                ret = -ENOMEM;
                goto failed;
        }
index 7947634..f91b1db 100644 (file)
@@ -1336,9 +1336,8 @@ static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len,
        int retval = 0;
        dma_addr_t addr;
 
-       fbi->screen_base = dma_alloc_writecombine(fbi->device,
-                                                 mem_len,
-                                                 &addr, GFP_DMA | GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(fbi->device, mem_len, &addr,
+                                       GFP_DMA | GFP_KERNEL);
 
        if (!fbi->screen_base) {
                dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n",
@@ -1378,8 +1377,8 @@ err0:
  */
 static int mx3fb_unmap_video_memory(struct fb_info *fbi)
 {
-       dma_free_writecombine(fbi->device, fbi->fix.smem_len,
-                             fbi->screen_base, fbi->fix.smem_start);
+       dma_free_wc(fbi->device, fbi->fix.smem_len, fbi->screen_base,
+                   fbi->fix.smem_start);
 
        fbi->screen_base = NULL;
        mutex_lock(&fbi->mm_lock);
index 389fa2c..6680eda 100644 (file)
@@ -396,8 +396,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
        dev_dbg(fbi->dev, "nuc900fb_map_video_memory(fbi=%p) map_size %lu\n",
                fbi, map_size);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                       &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
 
        if (!info->screen_base)
                return -ENOMEM;
@@ -411,8 +411,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info)
 static inline void nuc900fb_unmap_video_memory(struct fb_info *info)
 {
        struct nuc900fb_info *fbi = info->par;
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 }
 
 static irqreturn_t nuc900fb_irqhandler(int irq, void *dev_id)
index c9293ae..a970edc 100644 (file)
@@ -123,11 +123,11 @@ static int ocfb_setupfb(struct ocfb_dev *fbdev)
 
        /* Horizontal timings */
        ocfb_writereg(fbdev, OCFB_HTIM, (var->hsync_len - 1) << 24 |
-                     (var->right_margin - 1) << 16 | (var->xres - 1));
+                     (var->left_margin - 1) << 16 | (var->xres - 1));
 
        /* Vertical timings */
        ocfb_writereg(fbdev, OCFB_VTIM, (var->vsync_len - 1) << 24 |
-                     (var->lower_margin - 1) << 16 | (var->yres - 1));
+                     (var->upper_margin - 1) << 16 | (var->yres - 1));
 
        /* Total length of frame */
        hlen = var->left_margin + var->right_margin + var->hsync_len +
index 6efa259..e3d9b9e 100644 (file)
@@ -612,8 +612,8 @@ static void lcdc_dma_handler(u16 status, void *data)
 
 static int alloc_palette_ram(void)
 {
-       lcdc.palette_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-               MAX_PALETTE_SIZE, &lcdc.palette_phys, GFP_KERNEL);
+       lcdc.palette_virt = dma_alloc_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
+                                        &lcdc.palette_phys, GFP_KERNEL);
        if (lcdc.palette_virt == NULL) {
                dev_err(lcdc.fbdev->dev, "failed to alloc palette memory\n");
                return -ENOMEM;
@@ -625,8 +625,8 @@ static int alloc_palette_ram(void)
 
 static void free_palette_ram(void)
 {
-       dma_free_writecombine(lcdc.fbdev->dev, MAX_PALETTE_SIZE,
-                       lcdc.palette_virt, lcdc.palette_phys);
+       dma_free_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, lcdc.palette_virt,
+                   lcdc.palette_phys);
 }
 
 static int alloc_fbmem(struct omapfb_mem_region *region)
@@ -642,8 +642,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
        if (region->size > frame_size)
                frame_size = region->size;
        lcdc.vram_size = frame_size;
-       lcdc.vram_virt = dma_alloc_writecombine(lcdc.fbdev->dev,
-                       lcdc.vram_size, &lcdc.vram_phys, GFP_KERNEL);
+       lcdc.vram_virt = dma_alloc_wc(lcdc.fbdev->dev, lcdc.vram_size,
+                                     &lcdc.vram_phys, GFP_KERNEL);
        if (lcdc.vram_virt == NULL) {
                dev_err(lcdc.fbdev->dev, "unable to allocate FB DMA memory\n");
                return -ENOMEM;
@@ -660,8 +660,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region)
 
 static void free_fbmem(void)
 {
-       dma_free_writecombine(lcdc.fbdev->dev, lcdc.vram_size,
-                             lcdc.vram_virt, lcdc.vram_phys);
+       dma_free_wc(lcdc.fbdev->dev, lcdc.vram_size, lcdc.vram_virt,
+                   lcdc.vram_phys);
 }
 
 static int setup_fbmem(struct omapfb_mem_desc *req_md)
index efb57c0..def3a50 100644 (file)
@@ -680,8 +680,8 @@ static int pxa168fb_probe(struct platform_device *pdev)
         */
        info->fix.smem_len = PAGE_ALIGN(DEFAULT_FB_SIZE);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, info->fix.smem_len,
-                                               &fbi->fb_start_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, info->fix.smem_len,
+                                        &fbi->fb_start_dma, GFP_KERNEL);
        if (info->screen_base == NULL) {
                ret = -ENOMEM;
                goto failed_free_info;
@@ -804,8 +804,8 @@ static int pxa168fb_remove(struct platform_device *pdev)
 
        irq = platform_get_irq(pdev, 0);
 
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                               info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 
        clk_disable(fbi->clk);
 
index 33b2bb3..2c0487f 100644 (file)
@@ -2446,8 +2446,8 @@ static int pxafb_remove(struct platform_device *dev)
 
        free_pages_exact(fbi->video_mem, fbi->video_mem_size);
 
-       dma_free_writecombine(&dev->dev, fbi->dma_buff_size,
-                       fbi->dma_buff, fbi->dma_buff_phys);
+       dma_free_wc(&dev->dev, fbi->dma_buff_size, fbi->dma_buff,
+                   fbi->dma_buff_phys);
 
        iounmap(fbi->mmio_base);
 
index f72dd12..5f4f696 100644 (file)
@@ -1105,8 +1105,7 @@ static int s3c_fb_alloc_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
 
        dev_dbg(sfb->dev, "want %u bytes for window\n", size);
 
-       fbi->screen_base = dma_alloc_writecombine(sfb->dev, size,
-                                                 &map_dma, GFP_KERNEL);
+       fbi->screen_base = dma_alloc_wc(sfb->dev, size, &map_dma, GFP_KERNEL);
        if (!fbi->screen_base)
                return -ENOMEM;
 
@@ -1131,8 +1130,8 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win)
        struct fb_info *fbi = win->fbinfo;
 
        if (fbi->screen_base)
-               dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
-                             fbi->screen_base, fbi->fix.smem_start);
+               dma_free_wc(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len),
+                           fbi->screen_base, fbi->fix.smem_start);
 }
 
 /**
index d6704ad..0dd86be 100644 (file)
@@ -645,8 +645,8 @@ static int s3c2410fb_map_video_memory(struct fb_info *info)
 
        dprintk("map_video_memory(fbi=%p) map_size %u\n", fbi, map_size);
 
-       info->screen_base = dma_alloc_writecombine(fbi->dev, map_size,
-                                                  &map_dma, GFP_KERNEL);
+       info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma,
+                                        GFP_KERNEL);
 
        if (info->screen_base) {
                /* prevent initial garbage on screen */
@@ -667,8 +667,8 @@ static inline void s3c2410fb_unmap_video_memory(struct fb_info *info)
 {
        struct s3c2410fb_info *fbi = info->par;
 
-       dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
-                             info->screen_base, info->fix.smem_start);
+       dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+                   info->screen_base, info->fix.smem_start);
 }
 
 static inline void modify_gpio(void __iomem *reg,
index dcf774c..fc2aaa5 100644 (file)
@@ -567,8 +567,8 @@ static int sa1100fb_mmap(struct fb_info *info,
 
        if (off < info->fix.smem_len) {
                vma->vm_pgoff += 1; /* skip over the palette */
-               return dma_mmap_writecombine(fbi->dev, vma, fbi->map_cpu,
-                                            fbi->map_dma, fbi->map_size);
+               return dma_mmap_wc(fbi->dev, vma, fbi->map_cpu, fbi->map_dma,
+                                  fbi->map_size);
        }
 
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
@@ -1099,8 +1099,8 @@ static int sa1100fb_map_video_memory(struct sa1100fb_info *fbi)
         * of the framebuffer.
         */
        fbi->map_size = PAGE_ALIGN(fbi->fb.fix.smem_len + PAGE_SIZE);
-       fbi->map_cpu = dma_alloc_writecombine(fbi->dev, fbi->map_size,
-                                             &fbi->map_dma, GFP_KERNEL);
+       fbi->map_cpu = dma_alloc_wc(fbi->dev, fbi->map_size, &fbi->map_dma,
+                                   GFP_KERNEL);
 
        if (fbi->map_cpu) {
                fbi->fb.screen_base = fbi->map_cpu + PAGE_SIZE;
index 36205c2..f6bed86 100644 (file)
@@ -545,6 +545,7 @@ err_enable_device:
 static void virtio_pci_remove(struct pci_dev *pci_dev)
 {
        struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+       struct device *dev = get_device(&vp_dev->vdev.dev);
 
        unregister_virtio_device(&vp_dev->vdev);
 
@@ -554,6 +555,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
                virtio_pci_modern_remove(vp_dev);
 
        pci_disable_device(pci_dev);
+       put_device(dev);
 }
 
 static struct pci_driver virtio_pci_driver = {
index c0c11fa..7760fc1 100644 (file)
@@ -679,7 +679,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 
        pci_read_config_dword(pci_dev,
                              notify + offsetof(struct virtio_pci_notify_cap,
-                                               cap.length),
+                                               cap.offset),
                              &notify_offset);
 
        /* We don't know how many VQs we'll map, ahead of the time.
index 4f0e7be..80825a7 100644 (file)
@@ -145,7 +145,8 @@ config MENF21BMC_WATCHDOG
 config TANGOX_WATCHDOG
        tristate "Sigma Designs SMP86xx/SMP87xx watchdog"
        select WATCHDOG_CORE
-       depends on ARCH_TANGOX || COMPILE_TEST
+       depends on ARCH_TANGO || COMPILE_TEST
+       depends on HAS_IOMEM
        help
          Support for the watchdog in Sigma Designs SMP86xx (tango3)
          and SMP87xx (tango4) family chips.
@@ -618,6 +619,7 @@ config DIGICOLOR_WATCHDOG
 config LPC18XX_WATCHDOG
        tristate "LPC18xx/43xx Watchdog"
        depends on ARCH_LPC18XX || COMPILE_TEST
+       depends on HAS_IOMEM
        select WATCHDOG_CORE
        help
          Say Y here if to include support for the watchdog timer
@@ -1374,6 +1376,7 @@ config BCM_KONA_WDT_DEBUG
 config BCM7038_WDT
        tristate "BCM7038 Watchdog"
        select WATCHDOG_CORE
+       depends on HAS_IOMEM
        help
         Watchdog driver for the built-in hardware in Broadcom 7038 SoCs.
 
@@ -1383,6 +1386,7 @@ config IMGPDC_WDT
        tristate "Imagination Technologies PDC Watchdog Timer"
        depends on HAS_IOMEM
        depends on METAG || MIPS || COMPILE_TEST
+       select WATCHDOG_CORE
        help
          Driver for Imagination Technologies PowerDown Controller
          Watchdog Timer.
@@ -1565,6 +1569,17 @@ config WATCHDOG_RIO
          machines.  The watchdog timeout period is normally one minute but
          can be changed with a boot-time parameter.
 
+config WATCHDOG_SUN4V
+       tristate "Sun4v Watchdog support"
+       select WATCHDOG_CORE
+       depends on SPARC64
+       help
+         Say Y here to support the hypervisor watchdog capability embedded
+         in the SPARC sun4v architecture.
+
+         To compile this driver as a module, choose M here. The module will
+         be called sun4v_wdt.
+
 # XTENSA Architecture
 
 # Xen Architecture
index f566753..f6a6a38 100644 (file)
@@ -179,6 +179,7 @@ obj-$(CONFIG_SH_WDT) += shwdt.o
 
 obj-$(CONFIG_WATCHDOG_RIO)             += riowd.o
 obj-$(CONFIG_WATCHDOG_CP1XXX)          += cpwd.o
+obj-$(CONFIG_WATCHDOG_SUN4V)           += sun4v_wdt.o
 
 # XTENSA Architecture
 
index f36ca4b..ac5840d 100644 (file)
@@ -292,4 +292,4 @@ MODULE_PARM_DESC(nodelay,
                 "Force selection of a timeout setting without initial delay "
                 "(max6373/74 only, default=0)");
 
-MODULE_LICENSE("GPL");
+MODULE_LICENSE("GPL v2");
index 1a11aed..68952d9 100644 (file)
@@ -608,7 +608,7 @@ static int usb_pcwd_probe(struct usb_interface *interface,
        struct usb_host_interface *iface_desc;
        struct usb_endpoint_descriptor *endpoint;
        struct usb_pcwd_private *usb_pcwd = NULL;
-       int pipe, maxp;
+       int pipe;
        int retval = -ENOMEM;
        int got_fw_rev;
        unsigned char fw_rev_major, fw_rev_minor;
@@ -641,7 +641,6 @@ static int usb_pcwd_probe(struct usb_interface *interface,
 
        /* get a handle to the interrupt data pipe */
        pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress);
-       maxp = usb_maxpacket(udev, pipe, usb_pipeout(pipe));
 
        /* allocate memory for our device and initialize it */
        usb_pcwd = kzalloc(sizeof(struct usb_pcwd_private), GFP_KERNEL);
index 01d8162..e7a715e 100644 (file)
@@ -139,12 +139,11 @@ static int wdt_config(struct watchdog_device *wdd, bool ping)
 
        writel_relaxed(UNLOCK, wdt->base + WDTLOCK);
        writel_relaxed(wdt->load_val, wdt->base + WDTLOAD);
+       writel_relaxed(INT_MASK, wdt->base + WDTINTCLR);
 
-       if (!ping) {
-               writel_relaxed(INT_MASK, wdt->base + WDTINTCLR);
+       if (!ping)
                writel_relaxed(INT_ENABLE | RESET_ENABLE, wdt->base +
                                WDTCONTROL);
-       }
 
        writel_relaxed(LOCK, wdt->base + WDTLOCK);
 
diff --git a/drivers/watchdog/sun4v_wdt.c b/drivers/watchdog/sun4v_wdt.c
new file mode 100644 (file)
index 0000000..1467fe5
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ *     sun4v watchdog timer
+ *     (c) Copyright 2016 Oracle Corporation
+ *
+ *     Implement a simple watchdog driver using the built-in sun4v hypervisor
+ *     watchdog support. If time expires, the hypervisor stops or bounces
+ *     the guest domain.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/watchdog.h>
+#include <asm/hypervisor.h>
+#include <asm/mdesc.h>
+
+#define WDT_TIMEOUT                    60
+#define WDT_MAX_TIMEOUT                        31536000
+#define WDT_MIN_TIMEOUT                        1
+#define WDT_DEFAULT_RESOLUTION_MS      1000    /* 1 second */
+
+static unsigned int timeout;
+module_param(timeout, uint, 0);
+MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds (default="
+       __MODULE_STRING(WDT_TIMEOUT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, S_IRUGO);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+       __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+static int sun4v_wdt_stop(struct watchdog_device *wdd)
+{
+       sun4v_mach_set_watchdog(0, NULL);
+
+       return 0;
+}
+
+static int sun4v_wdt_ping(struct watchdog_device *wdd)
+{
+       int hverr;
+
+       /*
+        * HV watchdog timer will round up the timeout
+        * passed in to the nearest multiple of the
+        * watchdog resolution in milliseconds.
+        */
+       hverr = sun4v_mach_set_watchdog(wdd->timeout * 1000, NULL);
+       if (hverr == HV_EINVAL)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int sun4v_wdt_set_timeout(struct watchdog_device *wdd,
+                                unsigned int timeout)
+{
+       wdd->timeout = timeout;
+
+       return 0;
+}
+
+static const struct watchdog_info sun4v_wdt_ident = {
+       .options =      WDIOF_SETTIMEOUT |
+                       WDIOF_MAGICCLOSE |
+                       WDIOF_KEEPALIVEPING,
+       .identity =     "sun4v hypervisor watchdog",
+       .firmware_version = 0,
+};
+
+static struct watchdog_ops sun4v_wdt_ops = {
+       .owner =        THIS_MODULE,
+       .start =        sun4v_wdt_ping,
+       .stop =         sun4v_wdt_stop,
+       .ping =         sun4v_wdt_ping,
+       .set_timeout =  sun4v_wdt_set_timeout,
+};
+
+static struct watchdog_device wdd = {
+       .info = &sun4v_wdt_ident,
+       .ops = &sun4v_wdt_ops,
+       .min_timeout = WDT_MIN_TIMEOUT,
+       .max_timeout = WDT_MAX_TIMEOUT,
+       .timeout = WDT_TIMEOUT,
+};
+
+static int __init sun4v_wdt_init(void)
+{
+       struct mdesc_handle *handle;
+       u64 node;
+       const u64 *value;
+       int err = 0;
+       unsigned long major = 1, minor = 1;
+
+       /*
+        * There are 2 properties that can be set from the control
+        * domain for the watchdog.
+        * watchdog-resolution
+        * watchdog-max-timeout
+        *
+        * We can expect a handle to be returned otherwise something
+        * serious is wrong. Correct to return -ENODEV here.
+        */
+
+       handle = mdesc_grab();
+       if (!handle)
+               return -ENODEV;
+
+       node = mdesc_node_by_name(handle, MDESC_NODE_NULL, "platform");
+       err = -ENODEV;
+       if (node == MDESC_NODE_NULL)
+               goto out_release;
+
+       /*
+        * This is a safe way to validate if we are on the right
+        * platform.
+        */
+       if (sun4v_hvapi_register(HV_GRP_CORE, major, &minor))
+               goto out_hv_unreg;
+
+       /* Allow value of watchdog-resolution up to 1s (default) */
+       value = mdesc_get_property(handle, node, "watchdog-resolution", NULL);
+       err = -EINVAL;
+       if (value) {
+               if (*value == 0 ||
+                   *value > WDT_DEFAULT_RESOLUTION_MS)
+                       goto out_hv_unreg;
+       }
+
+       value = mdesc_get_property(handle, node, "watchdog-max-timeout", NULL);
+       if (value) {
+               /*
+                * If the property value (in ms) is smaller than
+                * min_timeout, return -EINVAL.
+                */
+               if (*value < wdd.min_timeout * 1000)
+                       goto out_hv_unreg;
+
+               /*
+                * If the property value is smaller than
+                * default max_timeout  then set watchdog max_timeout to
+                * the value of the property in seconds.
+                */
+               if (*value < wdd.max_timeout * 1000)
+                       wdd.max_timeout = *value  / 1000;
+       }
+
+       watchdog_init_timeout(&wdd, timeout, NULL);
+
+       watchdog_set_nowayout(&wdd, nowayout);
+
+       err = watchdog_register_device(&wdd);
+       if (err)
+               goto out_hv_unreg;
+
+       pr_info("initialized (timeout=%ds, nowayout=%d)\n",
+                wdd.timeout, nowayout);
+
+       mdesc_release(handle);
+
+       return 0;
+
+out_hv_unreg:
+       sun4v_hvapi_unregister(HV_GRP_CORE);
+
+out_release:
+       mdesc_release(handle);
+       return err;
+}
+
+static void __exit sun4v_wdt_exit(void)
+{
+       sun4v_hvapi_unregister(HV_GRP_CORE);
+       watchdog_unregister_device(&wdd);
+}
+
+module_init(sun4v_wdt_init);
+module_exit(sun4v_wdt_exit);
+
+MODULE_AUTHOR("Wim Coekaerts <wim.coekaerts@oracle.com>");
+MODULE_DESCRIPTION("sun4v watchdog driver");
+MODULE_LICENSE("GPL");
index 12eab50..dc4305b 100644 (file)
@@ -257,7 +257,7 @@ static struct resource *additional_memory_resource(phys_addr_t size)
                return NULL;
 
        res->name = "System RAM";
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
        ret = allocate_resource(&iomem_resource, res,
                                size, 0, -1,
index 945fc43..4ac2ca8 100644 (file)
@@ -242,7 +242,7 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize)
        return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
 }
 
-static struct cleancache_ops tmem_cleancache_ops = {
+static const struct cleancache_ops tmem_cleancache_ops = {
        .put_page = tmem_cleancache_put_page,
        .get_page = tmem_cleancache_get_page,
        .invalidate_page = tmem_cleancache_flush_page,
index 73dafdc..fb02214 100644 (file)
@@ -227,8 +227,9 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
        /*
         * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able
         * to access the BARs where the MSI-X entries reside.
+        * But VF devices are unique in which the PF needs to be checked.
         */
-       pci_read_config_word(dev, PCI_COMMAND, &cmd);
+       pci_read_config_word(pci_physfn(dev), PCI_COMMAND, &cmd);
        if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY))
                return -ENXIO;
 
@@ -332,6 +333,9 @@ void xen_pcibk_do_op(struct work_struct *data)
        struct xen_pcibk_dev_data *dev_data = NULL;
        struct xen_pci_op *op = &pdev->op;
        int test_intx = 0;
+#ifdef CONFIG_PCI_MSI
+       unsigned int nr = 0;
+#endif
 
        *op = pdev->sh_info->op;
        barrier();
@@ -360,6 +364,7 @@ void xen_pcibk_do_op(struct work_struct *data)
                        op->err = xen_pcibk_disable_msi(pdev, dev, op);
                        break;
                case XEN_PCI_OP_enable_msix:
+                       nr = op->value;
                        op->err = xen_pcibk_enable_msix(pdev, dev, op);
                        break;
                case XEN_PCI_OP_disable_msix:
@@ -382,7 +387,7 @@ void xen_pcibk_do_op(struct work_struct *data)
        if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) {
                unsigned int i;
 
-               for (i = 0; i < op->value; i++)
+               for (i = 0; i < nr; i++)
                        pdev->sh_info->op.msix_entries[i].vector =
                                op->msix_entries[i].vector;
        }
index ad4eb10..c46ee18 100644 (file)
@@ -848,6 +848,24 @@ static int scsiback_map(struct vscsibk_info *info)
        return scsiback_init_sring(info, ring_ref, evtchn);
 }
 
+/*
+  Check for a translation entry being present
+*/
+static struct v2p_entry *scsiback_chk_translation_entry(
+       struct vscsibk_info *info, struct ids_tuple *v)
+{
+       struct list_head *head = &(info->v2p_entry_lists);
+       struct v2p_entry *entry;
+
+       list_for_each_entry(entry, head, l)
+               if ((entry->v.chn == v->chn) &&
+                   (entry->v.tgt == v->tgt) &&
+                   (entry->v.lun == v->lun))
+                       return entry;
+
+       return NULL;
+}
+
 /*
   Add a new translation entry
 */
@@ -855,9 +873,7 @@ static int scsiback_add_translation_entry(struct vscsibk_info *info,
                                          char *phy, struct ids_tuple *v)
 {
        int err = 0;
-       struct v2p_entry *entry;
        struct v2p_entry *new;
-       struct list_head *head = &(info->v2p_entry_lists);
        unsigned long flags;
        char *lunp;
        unsigned long long unpacked_lun;
@@ -917,15 +933,10 @@ static int scsiback_add_translation_entry(struct vscsibk_info *info,
        spin_lock_irqsave(&info->v2p_lock, flags);
 
        /* Check double assignment to identical virtual ID */
-       list_for_each_entry(entry, head, l) {
-               if ((entry->v.chn == v->chn) &&
-                   (entry->v.tgt == v->tgt) &&
-                   (entry->v.lun == v->lun)) {
-                       pr_warn("Virtual ID is already used. Assignment was not performed.\n");
-                       err = -EEXIST;
-                       goto out;
-               }
-
+       if (scsiback_chk_translation_entry(info, v)) {
+               pr_warn("Virtual ID is already used. Assignment was not performed.\n");
+               err = -EEXIST;
+               goto out;
        }
 
        /* Create a new translation entry and add to the list */
@@ -933,18 +944,18 @@ static int scsiback_add_translation_entry(struct vscsibk_info *info,
        new->v = *v;
        new->tpg = tpg;
        new->lun = unpacked_lun;
-       list_add_tail(&new->l, head);
+       list_add_tail(&new->l, &info->v2p_entry_lists);
 
 out:
        spin_unlock_irqrestore(&info->v2p_lock, flags);
 
 out_free:
-       mutex_lock(&tpg->tv_tpg_mutex);
-       tpg->tv_tpg_fe_count--;
-       mutex_unlock(&tpg->tv_tpg_mutex);
-
-       if (err)
+       if (err) {
+               mutex_lock(&tpg->tv_tpg_mutex);
+               tpg->tv_tpg_fe_count--;
+               mutex_unlock(&tpg->tv_tpg_mutex);
                kfree(new);
+       }
 
        return err;
 }
@@ -956,39 +967,40 @@ static void __scsiback_del_translation_entry(struct v2p_entry *entry)
 }
 
 /*
-  Delete the translation entry specfied
+  Delete the translation entry specified
 */
 static int scsiback_del_translation_entry(struct vscsibk_info *info,
                                          struct ids_tuple *v)
 {
        struct v2p_entry *entry;
-       struct list_head *head = &(info->v2p_entry_lists);
        unsigned long flags;
+       int ret = 0;
 
        spin_lock_irqsave(&info->v2p_lock, flags);
        /* Find out the translation entry specified */
-       list_for_each_entry(entry, head, l) {
-               if ((entry->v.chn == v->chn) &&
-                   (entry->v.tgt == v->tgt) &&
-                   (entry->v.lun == v->lun)) {
-                       goto found;
-               }
-       }
-
-       spin_unlock_irqrestore(&info->v2p_lock, flags);
-       return 1;
-
-found:
-       /* Delete the translation entry specfied */
-       __scsiback_del_translation_entry(entry);
+       entry = scsiback_chk_translation_entry(info, v);
+       if (entry)
+               __scsiback_del_translation_entry(entry);
+       else
+               ret = -ENOENT;
 
        spin_unlock_irqrestore(&info->v2p_lock, flags);
-       return 0;
+       return ret;
 }
 
 static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state,
                                char *phy, struct ids_tuple *vir, int try)
 {
+       struct v2p_entry *entry;
+       unsigned long flags;
+
+       if (try) {
+               spin_lock_irqsave(&info->v2p_lock, flags);
+               entry = scsiback_chk_translation_entry(info, vir);
+               spin_unlock_irqrestore(&info->v2p_lock, flags);
+               if (entry)
+                       return;
+       }
        if (!scsiback_add_translation_entry(info, phy, vir)) {
                if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
                                  "%d", XenbusStateInitialised)) {
index 9433e46..912b64e 100644 (file)
@@ -188,6 +188,8 @@ static int queue_reply(struct list_head *queue, const void *data, size_t len)
 
        if (len == 0)
                return 0;
+       if (len > XENSTORE_PAYLOAD_MAX)
+               return -EINVAL;
 
        rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
        if (rb == NULL)
index 0548c53..22fc7c8 100644 (file)
@@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
        pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
                 page->index, to);
        BUG_ON(to > PAGE_CACHE_SIZE);
-       kmap(page);
-       data = page_address(page);
        bsize = AFFS_SB(sb)->s_data_blksize;
        tmp = page->index << PAGE_CACHE_SHIFT;
        bidx = tmp / bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
                        return PTR_ERR(bh);
                tmp = min(bsize - boff, to - pos);
                BUG_ON(pos + tmp > to || tmp > bsize);
+               data = kmap_atomic(page);
                memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+               kunmap_atomic(data);
                affs_brelse(bh);
                bidx++;
                pos += tmp;
                boff = 0;
        }
        flush_dcache_page(page);
-       kunmap(page);
        return 0;
 }
 
index 051ea48..7d914c6 100644 (file)
@@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 
        if ((current->flags & PF_RANDOMIZE) &&
                !(current->personality & ADDR_NO_RANDOMIZE)) {
-               random_variable = (unsigned long) get_random_int();
+               random_variable = get_random_long();
                random_variable &= STACK_RND_MASK;
                random_variable <<= PAGE_SHIFT;
        }
index 7b9cd49..826b164 100644 (file)
@@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                bdev->bd_disk = disk;
                bdev->bd_queue = disk->queue;
                bdev->bd_contains = bdev;
-               bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+               if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+                       bdev->bd_inode->i_flags = S_DAX;
+               else
+                       bdev->bd_inode->i_flags = 0;
+
                if (!partno) {
                        ret = -ENXIO;
                        bdev->bd_part = disk_get_part(disk, partno);
@@ -1693,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
        return try_to_free_buffers(page);
 }
 
+static int blkdev_writepages(struct address_space *mapping,
+                            struct writeback_control *wbc)
+{
+       if (dax_mapping(mapping)) {
+               struct block_device *bdev = I_BDEV(mapping->host);
+
+               return dax_writeback_mapping_range(mapping, bdev, wbc);
+       }
+       return generic_writepages(mapping, wbc);
+}
+
 static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .readpages      = blkdev_readpages,
        .writepage      = blkdev_writepage,
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
-       .writepages     = generic_writepages,
+       .writepages     = blkdev_writepages,
        .releasepage    = blkdev_releasepage,
        .direct_IO      = blkdev_direct_IO,
        .is_dirty_writeback = buffer_check_dirty_writeback,
@@ -1730,43 +1745,25 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
 
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, unsigned int flags)
-{
-       return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static void blkdev_vm_open(struct vm_area_struct *vma)
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+               struct vm_fault *vmf)
 {
-       struct inode *bd_inode = bdev_file_inode(vma->vm_file);
-       struct block_device *bdev = I_BDEV(bd_inode);
-
-       inode_lock(bd_inode);
-       bdev->bd_map_count++;
-       inode_unlock(bd_inode);
+       return dax_pfn_mkwrite(vma, vmf);
 }
 
-static void blkdev_vm_close(struct vm_area_struct *vma)
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, unsigned int flags)
 {
-       struct inode *bd_inode = bdev_file_inode(vma->vm_file);
-       struct block_device *bdev = I_BDEV(bd_inode);
-
-       inode_lock(bd_inode);
-       bdev->bd_map_count--;
-       inode_unlock(bd_inode);
+       return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
 }
 
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
-       .open           = blkdev_vm_open,
-       .close          = blkdev_vm_close,
        .fault          = blkdev_dax_fault,
        .pmd_fault      = blkdev_dax_pmd_fault,
-       .pfn_mkwrite    = blkdev_dax_fault,
+       .pfn_mkwrite    = blkdev_dax_pfn_mkwrite,
 };
 
 static const struct vm_operations_struct blkdev_default_vm_ops = {
-       .open           = blkdev_vm_open,
-       .close          = blkdev_vm_close,
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
 };
@@ -1774,18 +1771,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = {
 static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct inode *bd_inode = bdev_file_inode(file);
-       struct block_device *bdev = I_BDEV(bd_inode);
 
        file_accessed(file);
-       inode_lock(bd_inode);
-       bdev->bd_map_count++;
        if (IS_DAX(bd_inode)) {
                vma->vm_ops = &blkdev_dax_vm_ops;
                vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        } else {
                vma->vm_ops = &blkdev_default_vm_ops;
        }
-       inode_unlock(bd_inode);
 
        return 0;
 }
index 88d9af3..5fb60ea 100644 (file)
@@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
                list_add_tail(&work->ordered_list, &wq->ordered_list);
                spin_unlock_irqrestore(&wq->list_lock, flags);
        }
-       queue_work(wq->normal_wq, &work->normal_work);
        trace_btrfs_work_queued(work);
+       queue_work(wq->normal_wq, &work->normal_work);
 }
 
 void btrfs_queue_work(struct btrfs_workqueue *wq,
index b90cd37..f6dac40 100644 (file)
@@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        read_extent_buffer(eb, dest + bytes_left,
                                           name_off, name_len);
                if (eb != eb_in) {
-                       btrfs_tree_read_unlock_blocking(eb);
+                       if (!path->skip_locking)
+                               btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                }
                ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                eb = path->nodes[0];
                /* make sure we can use eb after releasing the path */
                if (eb != eb_in) {
-                       atomic_inc(&eb->refs);
-                       btrfs_tree_read_lock(eb);
-                       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       if (!path->skip_locking)
+                               btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                       path->nodes[0] = NULL;
+                       path->locks[0] = 0;
                }
                btrfs_release_path(path);
                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
index c473c42..3346cd8 100644 (file)
@@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        faili = nr_pages - 1;
        cb->nr_pages = nr_pages;
 
-       /* In the parent-locked case, we only locked the range we are
-        * interested in.  In all other cases, we can opportunistically
-        * cache decompressed data that goes beyond the requested range. */
-       if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
-               add_ra_bio_pages(inode, em_start + em_len, cb);
+       add_ra_bio_pages(inode, em_start + em_len, cb);
 
        /* include any pages we added in add_ra-bio_pages */
        uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
index 0be47e4..b57daa8 100644 (file)
@@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
  *
  */
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-                                   struct list_head *ins_list)
+                                   struct list_head *ins_list, bool *emitted)
 {
        struct btrfs_dir_item *di;
        struct btrfs_delayed_item *curr, *next;
@@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 
                if (over)
                        return 1;
+               *emitted = true;
        }
        return 0;
 }
index f70119f..0167853 100644 (file)
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 int btrfs_should_delete_dir_index(struct list_head *del_list,
                                  u64 index);
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-                                   struct list_head *ins_list);
+                                   struct list_head *ins_list, bool *emitted);
 
 /* for init */
 int __init btrfs_delayed_inode_init(void);
index dd08e29..5699bbc 100644 (file)
@@ -182,6 +182,7 @@ static struct btrfs_lockdep_keyset {
        { .id = BTRFS_TREE_RELOC_OBJECTID,      .name_stem = "treloc"   },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc"   },
        { .id = BTRFS_UUID_TREE_OBJECTID,       .name_stem = "uuid"     },
+       { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
        { .id = 0,                              .name_stem = "tree"     },
 };
 
@@ -930,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
        if (bio_flags & EXTENT_BIO_TREE_LOG)
                return 0;
 #ifdef CONFIG_X86
-       if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
+       if (static_cpu_has(X86_FEATURE_XMM4_2))
                return 0;
 #endif
        return 1;
@@ -1787,7 +1788,6 @@ static int cleaner_kthread(void *arg)
        int again;
        struct btrfs_trans_handle *trans;
 
-       set_freezable();
        do {
                again = 0;
 
index 2e7c97a..392592d 100644 (file)
@@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree,
        struct block_device *bdev;
        int ret;
        int nr = 0;
-       int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
        size_t pg_offset = 0;
        size_t iosize;
        size_t disk_io_size;
        size_t blocksize = inode->i_sb->s_blocksize;
-       unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+       unsigned long this_bio_flag = 0;
 
        set_page_extent_mapped(page);
 
@@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree,
                        kunmap_atomic(userpage);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       if (!parent_locked)
-                               unlock_extent_cached(tree, cur,
-                                                    cur + iosize - 1,
-                                                    &cached, GFP_NOFS);
+                       unlock_extent_cached(tree, cur,
+                                            cur + iosize - 1,
+                                            &cached, GFP_NOFS);
                        break;
                }
                em = __get_extent_map(inode, page, pg_offset, cur,
                                      end - cur + 1, get_extent, em_cached);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
-                       if (!parent_locked)
-                               unlock_extent(tree, cur, end);
+                       unlock_extent(tree, cur, end);
                        break;
                }
                extent_offset = cur - em->start;
@@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree,
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       if (parent_locked)
-                               free_extent_state(cached);
-                       else
-                               unlock_extent_cached(tree, cur,
-                                                    cur + iosize - 1,
-                                                    &cached, GFP_NOFS);
+                       unlock_extent_cached(tree, cur,
+                                            cur + iosize - 1,
+                                            &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
-                       if (!parent_locked)
-                               unlock_extent(tree, cur, cur + iosize - 1);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                 */
                if (block_start == EXTENT_MAP_INLINE) {
                        SetPageError(page);
-                       if (!parent_locked)
-                               unlock_extent(tree, cur, cur + iosize - 1);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
                        *bio_flags = this_bio_flag;
                } else {
                        SetPageError(page);
-                       if (!parent_locked)
-                               unlock_extent(tree, cur, cur + iosize - 1);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                }
                cur = cur + iosize;
                pg_offset += iosize;
@@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
 
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-                                get_extent_t *get_extent, int mirror_num)
-{
-       struct bio *bio = NULL;
-       unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
-       int ret;
-
-       ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-                           &bio_flags, READ, NULL);
-       if (bio)
-               ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
-       return ret;
-}
-
 static noinline void update_nr_written(struct page *page,
                                      struct writeback_control *wbc,
                                      unsigned long nr_written)
index 0377413..880d529 100644 (file)
@@ -29,7 +29,6 @@
  */
 #define EXTENT_BIO_COMPRESSED 1
 #define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
@@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-                                get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
index 393e36b..53dbeaf 100644 (file)
@@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
 
 static unsigned long *alloc_bitmap(u32 bitmap_size)
 {
+       void *mem;
+
+       /*
+        * The allocation size varies, observed numbers were < 4K up to 16K.
+        * Using vmalloc unconditionally would be too heavy, we'll try
+        * contiguous allocations first.
+        */
+       if  (bitmap_size <= PAGE_SIZE)
+               return kzalloc(bitmap_size, GFP_NOFS);
+
+       mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
+       if (mem)
+               return mem;
+
        return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
                         PAGE_KERNEL);
 }
@@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 
        ret = 0;
 out:
-       vfree(bitmap);
+       kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
        return ret;
@@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 
        ret = 0;
 out:
-       vfree(bitmap);
+       kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
        return ret;
index e28f3d4..d96f5cf 100644 (file)
@@ -5717,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        char *name_ptr;
        int name_len;
        int is_curr = 0;        /* ctx->pos points to the current index? */
+       bool emitted;
 
        /* FIXME, use a real flag for deciding about the key type */
        if (root->fs_info->tree_root == root)
@@ -5745,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        if (ret < 0)
                goto err;
 
+       emitted = false;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
@@ -5824,6 +5826,7 @@ skip:
 
                        if (over)
                                goto nopos;
+                       emitted = true;
                        di_len = btrfs_dir_name_len(leaf, di) +
                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
                        di_cur += di_len;
@@ -5836,11 +5839,20 @@ next:
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                if (is_curr)
                        ctx->pos++;
-               ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+               ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
                if (ret)
                        goto nopos;
        }
 
+       /*
+        * If we haven't emitted any dir entry, we must not touch ctx->pos as
+        * it was was set to the termination value in previous call. We assume
+        * that "." and ".." were emitted if we reach this point and set the
+        * termination value as well for an empty directory.
+        */
+       if (ctx->pos > 2 && !emitted)
+               goto nopos;
+
        /* Reached end of directory/root. Bump pos past the last item. */
        ctx->pos++;
 
@@ -7116,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
        if (ret)
                return ERR_PTR(ret);
 
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return em;
-       }
-
+       /*
+        * Create the ordered extent before the extent map. This is to avoid
+        * races with the fast fsync path that would lead to it logging file
+        * extent items that point to disk extents that were not yet written to.
+        * The fast fsync path collects ordered extents into a local list and
+        * then collects all the new extent maps, so we must create the ordered
+        * extent first and make sure the fast fsync path collects any new
+        * ordered extents after collecting new extent maps as well.
+        * The fsync path simply can not rely on inode_dio_wait() because it
+        * causes deadlock with AIO.
+        */
        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
                                           ins.offset, ins.offset, 0);
        if (ret) {
                btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               free_extent_map(em);
                return ERR_PTR(ret);
        }
 
+       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+                             ins.offset, ins.offset, ins.offset, 0);
+       if (IS_ERR(em)) {
+               struct btrfs_ordered_extent *oe;
+
+               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+               oe = btrfs_lookup_ordered_extent(inode, start);
+               ASSERT(oe);
+               if (WARN_ON(!oe))
+                       return em;
+               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+               btrfs_remove_ordered_extent(inode, oe);
+               /* Once for our lookup and once for the ordered extents tree. */
+               btrfs_put_ordered_extent(oe);
+               btrfs_put_ordered_extent(oe);
+       }
        return em;
 }
 
@@ -7954,6 +7986,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
 
        kfree(dip);
 
+       dio_bio->bi_error = bio->bi_error;
        dio_end_io(dio_bio, bio->bi_error);
 
        if (io_bio->end_io)
@@ -8008,6 +8041,7 @@ static void btrfs_endio_direct_write(struct bio *bio)
 
        kfree(dip);
 
+       dio_bio->bi_error = bio->bi_error;
        dio_end_io(dio_bio, bio->bi_error);
        bio_put(bio);
 }
index 952172c..48aee98 100644 (file)
@@ -2794,24 +2794,29 @@ out:
 static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 {
        struct page *page;
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
        page = grab_cache_page(inode->i_mapping, index);
        if (!page)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        if (!PageUptodate(page)) {
-               if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
-                                                0))
-                       return NULL;
+               int ret;
+
+               ret = btrfs_readpage(NULL, page);
+               if (ret)
+                       return ERR_PTR(ret);
                lock_page(page);
                if (!PageUptodate(page)) {
                        unlock_page(page);
                        page_cache_release(page);
-                       return NULL;
+                       return ERR_PTR(-EIO);
+               }
+               if (page->mapping != inode->i_mapping) {
+                       unlock_page(page);
+                       page_cache_release(page);
+                       return ERR_PTR(-EAGAIN);
                }
        }
-       unlock_page(page);
 
        return page;
 }
@@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
        pgoff_t index = off >> PAGE_CACHE_SHIFT;
 
        for (i = 0; i < num_pages; i++) {
+again:
                pages[i] = extent_same_get_page(inode, index + i);
-               if (!pages[i])
-                       return -ENOMEM;
+               if (IS_ERR(pages[i])) {
+                       int err = PTR_ERR(pages[i]);
+
+                       if (err == -EAGAIN)
+                               goto again;
+                       pages[i] = NULL;
+                       return err;
+               }
        }
        return 0;
 }
 
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+                            bool retry_range_locking)
 {
-       /* do any pending delalloc/csum calc on src, one way or
-          another, and lock file content */
+       /*
+        * Do any pending delalloc/csum calculations on inode, one way or
+        * another, and lock file content.
+        * The locking order is:
+        *
+        *   1) pages
+        *   2) range in the inode's io tree
+        */
        while (1) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
                unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
+               if (!retry_range_locking)
+                       return -EAGAIN;
                btrfs_wait_ordered_range(inode, off, len);
        }
+       return 0;
 }
 
 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
        unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 }
 
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
-                                    struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+                                   struct inode *inode2, u64 loff2, u64 len,
+                                   bool retry_range_locking)
 {
+       int ret;
+
        if (inode1 < inode2) {
                swap(inode1, inode2);
                swap(loff1, loff2);
        }
-       lock_extent_range(inode1, loff1, len);
-       lock_extent_range(inode2, loff2, len);
+       ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+       if (ret)
+               return ret;
+       ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+       if (ret)
+               unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+                             loff1 + len - 1);
+       return ret;
 }
 
 struct cmp_pages {
@@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
 
        for (i = 0; i < cmp->num_pages; i++) {
                pg = cmp->src_pages[i];
-               if (pg)
+               if (pg) {
+                       unlock_page(pg);
                        page_cache_release(pg);
+               }
                pg = cmp->dst_pages[i];
-               if (pg)
+               if (pg) {
+                       unlock_page(pg);
                        page_cache_release(pg);
+               }
        }
        kfree(cmp->src_pages);
        kfree(cmp->dst_pages);
@@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 
                src_page = cmp->src_pages[i];
                dst_page = cmp->dst_pages[i];
+               ASSERT(PageLocked(src_page));
+               ASSERT(PageLocked(dst_page));
 
                addr = kmap_atomic(src_page);
                dst_addr = kmap_atomic(dst_page);
@@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                goto out_unlock;
        }
 
+again:
        ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
        if (ret)
                goto out_unlock;
 
        if (same_inode)
-               lock_extent_range(src, same_lock_start, same_lock_len);
+               ret = lock_extent_range(src, same_lock_start, same_lock_len,
+                                       false);
        else
-               btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+               ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+                                              false);
+       /*
+        * If one of the inodes has dirty pages in the respective range or
+        * ordered extents, we need to flush dellaloc and wait for all ordered
+        * extents in the range. We must unlock the pages and the ranges in the
+        * io trees to avoid deadlocks when flushing delalloc (requires locking
+        * pages) and when waiting for ordered extents to complete (they require
+        * range locking).
+        */
+       if (ret == -EAGAIN) {
+               /*
+                * Ranges in the io trees already unlocked. Now unlock all
+                * pages before waiting for all IO to complete.
+                */
+               btrfs_cmp_data_free(&cmp);
+               if (same_inode) {
+                       btrfs_wait_ordered_range(src, same_lock_start,
+                                                same_lock_len);
+               } else {
+                       btrfs_wait_ordered_range(src, loff, len);
+                       btrfs_wait_ordered_range(dst, dst_loff, len);
+               }
+               goto again;
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               btrfs_cmp_data_free(&cmp);
+               return ret;
+       }
 
        /* pass original length for comparison so we stay within i_size */
        ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
                u64 lock_start = min_t(u64, off, destoff);
                u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
 
-               lock_extent_range(src, lock_start, lock_len);
+               ret = lock_extent_range(src, lock_start, lock_len, true);
        } else {
-               btrfs_double_extent_lock(src, off, inode, destoff, len);
+               ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+                                              true);
+       }
+       ASSERT(ret == 0);
+       if (WARN_ON(ret)) {
+               /* ranges in the io trees already unlocked */
+               goto out_unlock;
        }
 
        ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
index fd1c4d9..2bd0011 100644 (file)
@@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid)
            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
            root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
            root_objectid == BTRFS_UUID_TREE_OBJECTID ||
-           root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+           root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
+           root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
                return 1;
        return 0;
 }
index 7cf8509..2c849b0 100644 (file)
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
 
                err = btrfs_insert_fs_root(root->fs_info, root);
+               /*
+                * The root might have been inserted already, as before we look
+                * for orphan roots, log replay might have happened, which
+                * triggers a transaction commit and qgroup accounting, which
+                * in turn reads and inserts fs roots while doing backref
+                * walking.
+                */
+               if (err == -EEXIST)
+                       err = 0;
                if (err) {
-                       BUG_ON(err == -EEXIST);
                        btrfs_free_fs_root(root);
                        break;
                }
index e0ac859..539e7b5 100644 (file)
@@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
 BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
 
 static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(raid56),
        BTRFS_FEAT_ATTR_PTR(skinny_metadata),
        BTRFS_FEAT_ATTR_PTR(no_holes),
+       BTRFS_FEAT_ATTR_PTR(free_space_tree),
        NULL
 };
 
@@ -780,6 +782,39 @@ failure:
        return error;
 }
 
+
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+               u64 bit, enum btrfs_feature_set set)
+{
+       struct btrfs_fs_devices *fs_devs;
+       struct kobject *fsid_kobj;
+       u64 features;
+       int ret;
+
+       if (!fs_info)
+               return;
+
+       features = get_features(fs_info, set);
+       ASSERT(bit & supported_feature_masks[set]);
+
+       fs_devs = fs_info->fs_devices;
+       fsid_kobj = &fs_devs->fsid_kobj;
+
+       if (!fsid_kobj->state_initialized)
+               return;
+
+       /*
+        * FIXME: this is too heavy to update just one value, ideally we'd like
+        * to use sysfs_update_group but some refactoring is needed first.
+        */
+       sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+       ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
+
 static int btrfs_init_debugfs(void)
 {
 #ifdef CONFIG_DEBUG_FS
index 9c09522..d7da1a4 100644 (file)
@@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = {                            \
 #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
        BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
 #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
-       BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+       BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
 #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
        BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
 
@@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
                                struct kobject *parent);
 int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
 void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+               u64 bit, enum btrfs_feature_set set);
+
 #endif /* _BTRFS_SYSFS_H_ */
index b1d920b..0e1e61a 100644 (file)
@@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void)
 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 {
        struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
-                                               GFP_NOFS);
+                                               GFP_KERNEL);
 
        if (!fs_info)
                return fs_info;
        fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
-                                     GFP_NOFS);
+                                     GFP_KERNEL);
        if (!fs_info->fs_devices) {
                kfree(fs_info);
                return NULL;
        }
        fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
-                                     GFP_NOFS);
+                                     GFP_KERNEL);
        if (!fs_info->super_copy) {
                kfree(fs_info->fs_devices);
                kfree(fs_info);
@@ -180,11 +180,11 @@ btrfs_alloc_dummy_block_group(unsigned long length)
 {
        struct btrfs_block_group_cache *cache;
 
-       cache = kzalloc(sizeof(*cache), GFP_NOFS);
+       cache = kzalloc(sizeof(*cache), GFP_KERNEL);
        if (!cache)
                return NULL;
        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-                                       GFP_NOFS);
+                                       GFP_KERNEL);
        if (!cache->free_space_ctl) {
                kfree(cache);
                return NULL;
index e29fa29..669b582 100644 (file)
@@ -94,7 +94,7 @@ static int test_find_delalloc(void)
         * test.
         */
        for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
-               page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+               page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
                if (!page) {
                        test_msg("Failed to allocate test page\n");
                        ret = -ENOMEM;
@@ -113,7 +113,7 @@ static int test_find_delalloc(void)
         * |--- delalloc ---|
         * |---  search  ---|
         */
-       set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+       set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
        start = 0;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -144,7 +144,7 @@ static int test_find_delalloc(void)
                test_msg("Couldn't find the locked page\n");
                goto out_bits;
        }
-       set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+       set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -199,7 +199,7 @@ static int test_find_delalloc(void)
         *
         * We are re-using our test_start from above since it works out well.
         */
-       set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+       set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -262,7 +262,7 @@ static int test_find_delalloc(void)
        }
        ret = 0;
 out_bits:
-       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
 out:
        if (locked_page)
                page_cache_release(locked_page);
@@ -360,7 +360,7 @@ static int test_eb_bitmaps(void)
 
        test_msg("Running extent buffer bitmap tests\n");
 
-       bitmap = kmalloc(len, GFP_NOFS);
+       bitmap = kmalloc(len, GFP_KERNEL);
        if (!bitmap) {
                test_msg("Couldn't allocate test bitmap\n");
                return -ENOMEM;
index 5de55fd..e2d3da0 100644 (file)
@@ -974,7 +974,7 @@ static int test_extent_accounting(void)
                               (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
                               EXTENT_DELALLOC | EXTENT_DIRTY |
                               EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
-                              NULL, GFP_NOFS);
+                              NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1045,7 +1045,7 @@ static int test_extent_accounting(void)
                               BTRFS_MAX_EXTENT_SIZE+8191,
                               EXTENT_DIRTY | EXTENT_DELALLOC |
                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                              NULL, GFP_NOFS);
+                              NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1079,7 +1079,7 @@ static int test_extent_accounting(void)
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                               EXTENT_DIRTY | EXTENT_DELALLOC |
                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                              NULL, GFP_NOFS);
+                              NULL, GFP_KERNEL);
        if (ret) {
                test_msg("clear_extent_bit returned %d\n", ret);
                goto out;
@@ -1096,7 +1096,7 @@ out:
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
                                 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-                                NULL, GFP_NOFS);
+                                NULL, GFP_KERNEL);
        iput(inode);
        btrfs_free_dummy_root(root);
        return ret;
index 323e12c..978c3a8 100644 (file)
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     struct btrfs_path *path,
                                     struct list_head *logged_list,
-                                    struct btrfs_log_ctx *ctx)
+                                    struct btrfs_log_ctx *ctx,
+                                    const u64 start,
+                                    const u64 end)
 {
        struct extent_map *em, *n;
        struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        }
 
        list_sort(NULL, &extents, extent_cmp);
-
+       /*
+        * Collect any new ordered extents within the range. This is to
+        * prevent logging file extent items without waiting for the disk
+        * location they point to being written. We do this only to deal
+        * with races against concurrent lockless direct IO writes.
+        */
+       btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
                        goto out_unlock;
                }
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-                                               &logged_list, ctx);
+                                               &logged_list, ctx, start, end);
                if (ret) {
                        err = ret;
                        goto out_unlock;
index c222137..19adeb0 100644 (file)
@@ -1756,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
        u32 pool;
        int ret, flags;
 
+       /* does not support pool namespace yet */
+       if (ci->i_pool_ns_len)
+               return -EIO;
+
        if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
                                NOPOOLPERM))
                return 0;
index cdbf8cf..6fe0ad2 100644 (file)
@@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
                             void *inline_data, int inline_len,
                             struct ceph_buffer *xattr_buf,
                             struct ceph_mds_session *session,
-                            struct ceph_cap *cap, int issued)
+                            struct ceph_cap *cap, int issued,
+                            u32 pool_ns_len)
        __releases(ci->i_ceph_lock)
        __releases(mdsc->snap_rwsem)
 {
@@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
                /* file layout may have changed */
                ci->i_layout = grant->layout;
+               ci->i_pool_ns_len = pool_ns_len;
+
                /* size/truncate_seq? */
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(grant->truncate_seq),
@@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        u32  inline_len = 0;
        void *snaptrace;
        size_t snaptrace_len;
+       u32 pool_ns_len = 0;
        void *p, *end;
 
        dout("handle_caps from mds%d\n", mds);
@@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                p += inline_len;
        }
 
+       if (le16_to_cpu(msg->hdr.version) >= 8) {
+               u64 flush_tid;
+               u32 caller_uid, caller_gid;
+               u32 osd_epoch_barrier;
+               /* version >= 5 */
+               ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+               /* version >= 6 */
+               ceph_decode_64_safe(&p, end, flush_tid, bad);
+               /* version >= 7 */
+               ceph_decode_32_safe(&p, end, caller_uid, bad);
+               ceph_decode_32_safe(&p, end, caller_gid, bad);
+               /* version >= 8 */
+               ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+       }
+
        /* lookup ino */
        inode = ceph_find_inode(sb, vino);
        ci = ceph_inode(inode);
@@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                                  &cap, &issued);
                handle_cap_grant(mdsc, inode, h,
                                 inline_version, inline_data, inline_len,
-                                msg->middle, session, cap, issued);
+                                msg->middle, session, cap, issued,
+                                pool_ns_len);
                if (realm)
                        ceph_put_snap_realm(mdsc, realm);
                goto done_unlocked;
@@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                issued |= __ceph_caps_dirty(ci);
                handle_cap_grant(mdsc, inode, h,
                                 inline_version, inline_data, inline_len,
-                                msg->middle, session, cap, issued);
+                                msg->middle, session, cap, issued,
+                                pool_ns_len);
                goto done_unlocked;
 
        case CEPH_CAP_OP_FLUSH_ACK:
index 86a9c38..eb9028e 100644 (file)
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
 
        req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
                        false, GFP_NOFS);
-       if (IS_ERR(req)) {
-               ret = PTR_ERR(req);
+       if (!req) {
+               ret = -ENOMEM;
                req = orig_req;
                goto out;
        }
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
        ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
                                snapc, CEPH_NOSNAP, &aio_req->mtime);
 
-       ceph_put_snap_context(snapc);
        ceph_osdc_put_request(orig_req);
 
        req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ out:
                ceph_aio_complete_req(req, NULL);
        }
 
+       ceph_put_snap_context(snapc);
        kfree(aio_work);
 }
 
index fb4ba2e..5849b88 100644 (file)
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_symlink = NULL;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+       ci->i_pool_ns_len = 0;
 
        ci->i_fragtree = RB_ROOT;
        mutex_init(&ci->i_fragtree_mutex);
@@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
                        ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
                ci->i_layout = info->layout;
+               ci->i_pool_ns_len = iinfo->pool_ns_len;
 
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
index e7b130a..911d64d 100644 (file)
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
        } else
                info->inline_version = CEPH_INLINE_NONE;
 
+       if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+               ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+               ceph_decode_need(p, end, info->pool_ns_len, bad);
+               *p += info->pool_ns_len;
+       } else {
+               info->pool_ns_len = 0;
+       }
+
        return 0;
 bad:
        return err;
@@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
 
+       /* deny access to directories with pool_ns layouts */
+       if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+           ceph_inode(req->r_inode)->i_pool_ns_len)
+               return -EIO;
+       if (req->r_locked_dir &&
+           ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+               return -EIO;
+
        /* issue */
        mutex_lock(&mdsc->mutex);
        __register_request(mdsc, req, dir);
index ccf11ef..37712cc 100644 (file)
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
        u64 inline_version;
        u32 inline_len;
        char *inline_data;
+       u32 pool_ns_len;
 };
 
 /*
index 75b7d12..9c458eb 100644 (file)
@@ -287,6 +287,7 @@ struct ceph_inode_info {
 
        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
+       size_t i_pool_ns_len;
        char *i_symlink;
 
        /* for dirs */
index 7dc886c..e956cba 100644 (file)
@@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
         * string to the length of the original string to allow for worst case.
         */
        md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
-       mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+       mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
        if (mountdata == NULL) {
                rc = -ENOMEM;
                goto compose_mount_options_err;
index afa09fc..e682b36 100644 (file)
@@ -714,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 
        ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
        if (!ses->auth_key.response) {
-               rc = ENOMEM;
+               rc = -ENOMEM;
                ses->auth_key.len = 0;
                goto setup_ntlmv2_rsp_ret;
        }
index c48ca13..2eea403 100644 (file)
@@ -1013,7 +1013,6 @@ const struct file_operations cifs_file_strict_ops = {
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .clone_file_range = cifs_clone_file_range,
-       .clone_file_range = cifs_clone_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
 };
index 68c4547..83aac8b 100644 (file)
  * so that it will fit. We use hash_64 to convert the value to 31 bits, and
  * then add 1, to ensure that we don't end up with a 0 as the value.
  */
-#if BITS_PER_LONG == 64
 static inline ino_t
 cifs_uniqueid_to_ino_t(u64 fileid)
 {
+       if ((sizeof(ino_t)) < (sizeof(u64)))
+               return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
        return (ino_t)fileid;
+
 }
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
-       return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
 
 extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
index 90b4f9f..76fcb50 100644 (file)
@@ -1396,11 +1396,10 @@ openRetry:
  * current bigbuf.
  */
 static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
 {
        unsigned int rfclen = get_rfc1002_length(server->smallbuf);
        int remaining = rfclen + 4 - server->total_read;
-       struct cifs_readdata *rdata = mid->callback_data;
 
        while (remaining > 0) {
                int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
                remaining -= length;
        }
 
-       dequeue_mid(mid, rdata->result);
        return 0;
 }
 
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+       int length;
+       struct cifs_readdata *rdata = mid->callback_data;
+
+       length = discard_remaining_data(server);
+       dequeue_mid(mid, rdata->result);
+       return length;
+}
+
 int
 cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 {
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
                return length;
        server->total_read += length;
 
+       if (server->ops->is_status_pending &&
+           server->ops->is_status_pending(buf, server, 0)) {
+               discard_remaining_data(server);
+               return -1;
+       }
+
        /* Was the SMB read successful? */
        rdata->result = server->ops->map_error(buf, false);
        if (rdata->result != 0) {
index 4fbd92d..a763cd3 100644 (file)
@@ -2999,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
        if (ses_init_buf) {
                ses_init_buf->trailer.session_req.called_len = 32;
 
-               if (server->server_RFC1001_name &&
-                   server->server_RFC1001_name[0] != 0)
+               if (server->server_RFC1001_name[0] != 0)
                        rfc1002mangle(ses_init_buf->trailer.
                                      session_req.called_name,
                                      server->server_RFC1001_name,
index 10f8d5c..42e1f44 100644 (file)
@@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
 {
        char *data_offset;
        struct create_context *cc;
-       unsigned int next = 0;
+       unsigned int next;
+       unsigned int remaining;
        char *name;
 
        data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+       remaining = le32_to_cpu(rsp->CreateContextsLength);
        cc = (struct create_context *)data_offset;
-       do {
-               cc = (struct create_context *)((char *)cc + next);
+       while (remaining >= sizeof(struct create_context)) {
                name = le16_to_cpu(cc->NameOffset) + (char *)cc;
-               if (le16_to_cpu(cc->NameLength) != 4 ||
-                   strncmp(name, "RqLs", 4)) {
-                       next = le32_to_cpu(cc->Next);
-                       continue;
-               }
-               return server->ops->parse_lease_buf(cc, epoch);
-       } while (next != 0);
+               if (le16_to_cpu(cc->NameLength) == 4 &&
+                   strncmp(name, "RqLs", 4) == 0)
+                       return server->ops->parse_lease_buf(cc, epoch);
+
+               next = le32_to_cpu(cc->Next);
+               if (!next)
+                       break;
+               remaining -= next;
+               cc = (struct create_context *)((char *)cc + next);
+       }
 
        return 0;
 }
index a5b8eb6..6402eaf 100644 (file)
@@ -1261,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
 COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
 COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
 COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
index 4fd6b0c..bbb2ad7 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -58,16 +58,35 @@ static void dax_unmap_atomic(struct block_device *bdev,
        blk_queue_exit(bdev->bd_queue);
 }
 
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+       struct page *page = alloc_pages(GFP_KERNEL, 0);
+       struct blk_dax_ctl dax = {
+               .size = PAGE_SIZE,
+               .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+       };
+       long rc;
+
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+
+       rc = dax_map_atomic(bdev, &dax);
+       if (rc < 0)
+               return ERR_PTR(rc);
+       memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+       dax_unmap_atomic(bdev, &dax);
+       return page;
+}
+
 /*
- * dax_clear_blocks() is called from within transaction context from XFS,
+ * dax_clear_sectors() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
 {
-       struct block_device *bdev = inode->i_sb->s_bdev;
        struct blk_dax_ctl dax = {
-               .sector = block << (inode->i_blkbits - 9),
+               .sector = _sector,
                .size = _size,
        };
 
@@ -89,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
        wmb_pmem();
        return 0;
 }
-EXPORT_SYMBOL_GPL(dax_clear_blocks);
+EXPORT_SYMBOL_GPL(dax_clear_sectors);
 
 /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
 static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
@@ -338,7 +357,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
        void *entry;
 
        WARN_ON_ONCE(pmd_entry && !dirty);
-       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+       if (dirty)
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
        spin_lock_irq(&mapping->tree_lock);
 
@@ -464,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev,
  * end]. This is required by data integrity operations to ensure file data is
  * on persistent storage prior to completion of the operation.
  */
-int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
-               loff_t end)
+int dax_writeback_mapping_range(struct address_space *mapping,
+               struct block_device *bdev, struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
-       struct block_device *bdev = inode->i_sb->s_bdev;
        pgoff_t start_index, end_index, pmd_index;
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
@@ -479,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
                return -EIO;
 
-       start_index = start >> PAGE_CACHE_SHIFT;
-       end_index = end >> PAGE_CACHE_SHIFT;
+       if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+               return 0;
+
+       start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+       end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
        pmd_index = DAX_PMD_INDEX(start_index);
 
        rcu_read_lock();
@@ -1034,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
+       int error;
 
        /*
         * We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1043,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         * saves us from having to make a call to get_block() here to look
         * up the sector.
         */
-       dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+       error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+                       true);
+
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM;
+       if (error)
+               return VM_FAULT_SIGBUS;
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
index 92d5140..2398f9f 100644 (file)
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
        return dentry->d_name.name != dentry->d_iname;
 }
 
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
 static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
        unsigned flags;
 
        dentry->d_inode = inode;
-       smp_wmb();
        flags = READ_ONCE(dentry->d_flags);
        flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        flags |= type_flags;
        WRITE_ONCE(dentry->d_flags, flags);
 }
 
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
 static inline void __d_clear_type_and_inode(struct dentry *dentry)
 {
        unsigned flags = READ_ONCE(dentry->d_flags);
 
        flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        WRITE_ONCE(dentry->d_flags, flags);
-       smp_wmb();
        dentry->d_inode = NULL;
 }
 
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_inode->i_lock)
 {
        struct inode *inode = dentry->d_inode;
+
+       raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
-       dentry_rcuwalk_invalidate(dentry);
+       raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
@@ -1758,8 +1747,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
        spin_lock(&dentry->d_lock);
        if (inode)
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+       raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
-       dentry_rcuwalk_invalidate(dentry);
+       raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        fsnotify_d_instantiate(dentry, inode);
 }
index 1f107fd..655f21f 100644 (file)
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
        mutex_unlock(&allocated_ptys_lock);
 }
 
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+       struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+       atomic_inc(&sb->s_active);
+       ihold(ptmx_inode);
+}
+
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+       struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+       iput(ptmx_inode);
+       deactivate_super(sb);
+}
+
 /**
  * devpts_pty_new -- create a new inode in /dev/pts/
  * @ptmx_inode: inode of the master
index 1b2f7ff..d6a9012 100644 (file)
@@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
                dio->io_error = -EIO;
 
        if (dio->is_async && dio->rw == READ && dio->should_dirty) {
-               bio_check_pages_dirty(bio);     /* transfers ownership */
                err = bio->bi_error;
+               bio_check_pages_dirty(bio);     /* transfers ownership */
        } else {
                bio_for_each_segment_all(bvec, bio, i) {
                        struct page *page = bvec->bv_page;
index c424e48..d48e0d2 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/efi.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/mount.h>
 
 #include "internal.h"
 
@@ -103,9 +104,78 @@ out_free:
        return size;
 }
 
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+       struct inode *inode = file->f_mapping->host;
+       unsigned int i_flags;
+       unsigned int flags = 0;
+
+       i_flags = inode->i_flags;
+       if (i_flags & S_IMMUTABLE)
+               flags |= FS_IMMUTABLE_FL;
+
+       if (copy_to_user(arg, &flags, sizeof(flags)))
+               return -EFAULT;
+       return 0;
+}
+
+static int
+efivarfs_ioc_setxflags(struct file *file, void __user *arg)
+{
+       struct inode *inode = file->f_mapping->host;
+       unsigned int flags;
+       unsigned int i_flags = 0;
+       int error;
+
+       if (!inode_owner_or_capable(inode))
+               return -EACCES;
+
+       if (copy_from_user(&flags, arg, sizeof(flags)))
+               return -EFAULT;
+
+       if (flags & ~FS_IMMUTABLE_FL)
+               return -EOPNOTSUPP;
+
+       if (!capable(CAP_LINUX_IMMUTABLE))
+               return -EPERM;
+
+       if (flags & FS_IMMUTABLE_FL)
+               i_flags |= S_IMMUTABLE;
+
+
+       error = mnt_want_write_file(file);
+       if (error)
+               return error;
+
+       inode_lock(inode);
+       inode_set_flags(inode, i_flags, S_IMMUTABLE);
+       inode_unlock(inode);
+
+       mnt_drop_write_file(file);
+
+       return 0;
+}
+
+long
+efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+       void __user *arg = (void __user *)p;
+
+       switch (cmd) {
+       case FS_IOC_GETFLAGS:
+               return efivarfs_ioc_getxflags(file, arg);
+       case FS_IOC_SETFLAGS:
+               return efivarfs_ioc_setxflags(file, arg);
+       }
+
+       return -ENOTTY;
+}
+
 const struct file_operations efivarfs_file_operations = {
        .open   = simple_open,
        .read   = efivarfs_file_read,
        .write  = efivarfs_file_write,
        .llseek = no_llseek,
+       .unlocked_ioctl = efivarfs_file_ioctl,
 };
index 3381b9d..e2ab6d0 100644 (file)
@@ -15,7 +15,8 @@
 #include "internal.h"
 
 struct inode *efivarfs_get_inode(struct super_block *sb,
-                               const struct inode *dir, int mode, dev_t dev)
+                               const struct inode *dir, int mode,
+                               dev_t dev, bool is_removable)
 {
        struct inode *inode = new_inode(sb);
 
@@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
                switch (mode & S_IFMT) {
                case S_IFREG:
                        inode->i_fop = &efivarfs_file_operations;
@@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
 static int efivarfs_create(struct inode *dir, struct dentry *dentry,
                          umode_t mode, bool excl)
 {
-       struct inode *inode;
+       struct inode *inode = NULL;
        struct efivar_entry *var;
        int namelen, i = 0, err = 0;
+       bool is_removable = false;
 
        if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
                return -EINVAL;
 
-       inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
-       if (!inode)
-               return -ENOMEM;
-
        var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
-       if (!var) {
-               err = -ENOMEM;
-               goto out;
-       }
+       if (!var)
+               return -ENOMEM;
 
        /* length of the variable name itself: remove GUID and separator */
        namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
@@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
        efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
                        &var->var.VendorGuid);
 
+       if (efivar_variable_is_removable(var->var.VendorGuid,
+                                        dentry->d_name.name, namelen))
+               is_removable = true;
+
+       inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable);
+       if (!inode) {
+               err = -ENOMEM;
+               goto out;
+       }
+
        for (i = 0; i < namelen; i++)
                var->var.VariableName[i] = dentry->d_name.name[i];
 
@@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
 out:
        if (err) {
                kfree(var);
-               iput(inode);
+               if (inode)
+                       iput(inode);
        }
        return err;
 }
index b5ff16a..b450518 100644 (file)
@@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations;
 extern const struct inode_operations efivarfs_dir_inode_operations;
 extern bool efivarfs_valid_name(const char *str, int len);
 extern struct inode *efivarfs_get_inode(struct super_block *sb,
-                       const struct inode *dir, int mode, dev_t dev);
+                       const struct inode *dir, int mode, dev_t dev,
+                       bool is_removable);
 
 extern struct list_head efivarfs_list;
 
index b8a564f..dd029d1 100644 (file)
@@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
        struct dentry *dentry, *root = sb->s_root;
        unsigned long size = 0;
        char *name;
-       int len, i;
+       int len;
        int err = -ENOMEM;
+       bool is_removable = false;
 
        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
@@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
        memcpy(entry->var.VariableName, name16, name_size);
        memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
 
-       len = ucs2_strlen(entry->var.VariableName);
+       len = ucs2_utf8size(entry->var.VariableName);
 
        /* name, plus '-', plus GUID, plus NUL*/
        name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
        if (!name)
                goto fail;
 
-       for (i = 0; i < len; i++)
-               name[i] = entry->var.VariableName[i] & 0xFF;
+       ucs2_as_utf8(name, entry->var.VariableName, len);
+
+       if (efivar_variable_is_removable(entry->var.VendorGuid, name, len))
+               is_removable = true;
 
        name[len] = '-';
 
@@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 
        name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
 
-       inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+       inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
+                                  is_removable);
        if (!inode)
                goto fail_name;
 
@@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_d_op              = &efivarfs_d_ops;
        sb->s_time_gran         = 1;
 
-       inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+       inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
        if (!inode)
                return -ENOMEM;
        inode->i_op = &efivarfs_dir_inode_operations;
index ae1dbcf..cde6074 100644 (file)
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
 
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+                               EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
 
@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
-               ewake = 1;
+               if ((epi->event.events & EPOLLEXCLUSIVE) &&
+                                       !((unsigned long)key & POLLFREE)) {
+                       switch ((unsigned long)key & EPOLLINOUT_BITS) {
+                       case POLLIN:
+                               if (epi->event.events & POLLIN)
+                                       ewake = 1;
+                               break;
+                       case POLLOUT:
+                               if (epi->event.events & POLLOUT)
+                                       ewake = 1;
+                               break;
+                       case 0:
+                               ewake = 1;
+                               break;
+                       }
+               }
                wake_up_locked(&ep->wq);
        }
        if (waitqueue_active(&ep->poll_wait))
@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently supported nested exclusive wakeups.
         */
-       if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
-               (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
-               goto error_tgt_fput;
+       if (epds.events & EPOLLEXCLUSIVE) {
+               if (op == EPOLL_CTL_MOD)
+                       goto error_tgt_fput;
+               if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+                               (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+                       goto error_tgt_fput;
+       }
 
        /*
         * At this point it is safe to assume that the "private_data" contains
@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
-                       epds.events |= POLLERR | POLLHUP;
-                       error = ep_modify(ep, epi, &epds);
+                       if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+                               epds.events |= POLLERR | POLLHUP;
+                               error = ep_modify(ep, epi, &epds);
+                       }
                } else
                        error = -ENOENT;
                break;
index 2c88d68..c1400b1 100644 (file)
@@ -80,23 +80,6 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
        return ret;
 }
 
-static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct inode *inode = file_inode(vma->vm_file);
-       struct ext2_inode_info *ei = EXT2_I(inode);
-       int ret;
-
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
-       down_read(&ei->dax_sem);
-
-       ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
-
-       up_read(&ei->dax_sem);
-       sb_end_pagefault(inode->i_sb);
-       return ret;
-}
-
 static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
                struct vm_fault *vmf)
 {
@@ -124,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 static const struct vm_operations_struct ext2_dax_vm_ops = {
        .fault          = ext2_dax_fault,
        .pmd_fault      = ext2_dax_pmd_fault,
-       .page_mkwrite   = ext2_dax_mkwrite,
+       .page_mkwrite   = ext2_dax_fault,
        .pfn_mkwrite    = ext2_dax_pfn_mkwrite,
 };
 
index 338eefd..6bd58e6 100644 (file)
@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
                 * so that it's not found by another thread before it's
                 * initialised
                 */
-               err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
-                                               1 << inode->i_blkbits);
+               err = dax_clear_sectors(inode->i_sb->s_bdev,
+                               le32_to_cpu(chain[depth-1].key) <<
+                               (inode->i_blkbits - 9),
+                               1 << inode->i_blkbits);
                if (err) {
                        mutex_unlock(&ei->truncate_mutex);
                        goto cleanup;
@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+#ifdef CONFIG_FS_DAX
+       if (dax_mapping(mapping)) {
+               return dax_writeback_mapping_range(mapping,
+                                                  mapping->host->i_sb->s_bdev,
+                                                  wbc);
+       }
+#endif
+
        return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
@@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
                inode->i_flags |= S_NOATIME;
        if (flags & EXT2_DIRSYNC_FL)
                inode->i_flags |= S_DIRSYNC;
-       if (test_opt(inode->i_sb, DAX))
+       if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
                inode->i_flags |= S_DAX;
 }
 
index ec0668a..fe1f50f 100644 (file)
@@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
        /* If checksum is bad mark all blocks used to prevent allocation
         * essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
-               ext4_error(sb, "Checksum bad for group %u", block_group);
                grp = ext4_get_group_info(sb, block_group);
                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-
                err = ext4_init_block_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
-               if (err)
+               if (err) {
+                       ext4_error(sb, "Failed to init block bitmap for group "
+                                  "%u: %d", block_group, err);
                        goto out;
+               }
                goto verify;
        }
        ext4_unlock_group(sb, block_group);
index c802120..38f7562 100644 (file)
@@ -467,3 +467,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
                return size;
        return 0;
 }
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       struct inode *dir = d_inode(dentry->d_parent);
+       struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info;
+       int dir_has_key, cached_with_key;
+
+       if (!ext4_encrypted_inode(dir))
+               return 0;
+
+       if (ci && ci->ci_keyring_key &&
+           (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+                                         (1 << KEY_FLAG_REVOKED) |
+                                         (1 << KEY_FLAG_DEAD))))
+               ci = NULL;
+
+       /* this should eventually be an flag in d_flags */
+       cached_with_key = dentry->d_fsdata != NULL;
+       dir_has_key = (ci != NULL);
+
+       /*
+        * If the dentry was cached without the key, and it is a
+        * negative dentry, it might be a valid name.  We can't check
+        * if the key has since been made available due to locking
+        * reasons, so we fail the validation so ext4_lookup() can do
+        * this check.
+        *
+        * We also fail the validation if the dentry was created with
+        * the key present, but we no longer have the key, or vice versa.
+        */
+       if ((!cached_with_key && d_is_negative(dentry)) ||
+           (!cached_with_key && dir_has_key) ||
+           (cached_with_key && !dir_has_key)) {
+#if 0                          /* Revalidation debug */
+               char buf[80];
+               char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+               if (IS_ERR(cp))
+                       cp = (char *) "???";
+               pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+                      cached_with_key, d_is_negative(dentry),
+                      dir_has_key);
+#endif
+               return 0;
+       }
+       return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+       .d_revalidate = ext4_d_revalidate,
+};
index 1d1bca7..33f5e2a 100644 (file)
@@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
        int dir_has_error = 0;
        struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
 
+       if (ext4_encrypted_inode(inode)) {
+               err = ext4_get_encryption_info(inode);
+               if (err && err != -ENOKEY)
+                       return err;
+       }
+
        if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(file, ctx);
                if (err != ERR_BAD_DX_DIR) {
@@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
                                        index, 1);
                        file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
                        bh = ext4_bread(NULL, inode, map.m_lblk, 0);
-                       if (IS_ERR(bh))
-                               return PTR_ERR(bh);
+                       if (IS_ERR(bh)) {
+                               err = PTR_ERR(bh);
+                               bh = NULL;
+                               goto errout;
+                       }
                }
 
                if (!bh) {
index 0662b28..157b458 100644 (file)
@@ -2302,6 +2302,7 @@ struct page *ext4_encrypt(struct inode *inode,
 int ext4_decrypt(struct page *page);
 int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
                           ext4_fsblk_t pblk, ext4_lblk_t len);
+extern const struct dentry_operations ext4_encrypted_d_ops;
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 int ext4_init_crypto(void);
index 0ffabaf..3753ceb 100644 (file)
@@ -3928,7 +3928,7 @@ static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map,
                           struct ext4_ext_path **ppath, int flags,
-                          unsigned int allocated, ext4_fsblk_t newblock)
+                          unsigned int allocated)
 {
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
@@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
                                allocated = convert_initialized_extent(
                                                handle, inode, map, &path,
-                                               flags, allocated, newblock);
+                                               flags, allocated);
                                goto out2;
                        } else if (!ext4_ext_is_unwritten(ex))
                                goto out;
index 1126436..4cd318f 100644 (file)
@@ -262,23 +262,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
        return result;
 }
 
-static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       int err;
-       struct inode *inode = file_inode(vma->vm_file);
-
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
-       down_read(&EXT4_I(inode)->i_mmap_sem);
-       err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
-       up_read(&EXT4_I(inode)->i_mmap_sem);
-       sb_end_pagefault(inode->i_sb);
-
-       return err;
-}
-
 /*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
  * handler we check for races agaist truncate. Note that since we cycle through
  * i_mmap_sem, we are sure that also any hole punching that began before we
  * were called is finished by now and so if it included part of the file we
@@ -311,7 +296,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .pmd_fault      = ext4_dax_pmd_fault,
-       .page_mkwrite   = ext4_dax_mkwrite,
+       .page_mkwrite   = ext4_dax_fault,
        .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
 };
 #else
@@ -350,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct vfsmount *mnt = filp->f_path.mnt;
+       struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
        struct path path;
        char buf[64], *cp;
        int ret;
@@ -393,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                if (ext4_encryption_info(inode) == NULL)
                        return -ENOKEY;
        }
+       if (ext4_encrypted_inode(dir) &&
+           !ext4_is_child_context_consistent_with_parent(dir, inode)) {
+               ext4_warning(inode->i_sb,
+                            "Inconsistent encryption contexts: %lu/%lu\n",
+                            (unsigned long) dir->i_ino,
+                            (unsigned long) inode->i_ino);
+               return -EPERM;
+       }
        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
index 3fcfd50..acc0ad5 100644 (file)
@@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb,
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
-               ext4_error(sb, "Checksum bad for group %u", block_group);
                grp = ext4_get_group_info(sb, block_group);
                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
-               if (err)
+               if (err) {
+                       ext4_error(sb, "Failed to init inode bitmap for group "
+                                  "%u: %d", block_group, err);
                        goto out;
+               }
                return bh;
        }
        ext4_unlock_group(sb, block_group);
index 83bc8bf..aee960b 100644 (file)
@@ -686,6 +686,34 @@ out_sem:
        return retval;
 }
 
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+       unsigned long old_state;
+       unsigned long new_state;
+
+       flags &= EXT4_MAP_FLAGS;
+
+       /* Dummy buffer_head? Set non-atomically. */
+       if (!bh->b_page) {
+               bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+               return;
+       }
+       /*
+        * Someone else may be modifying b_state. Be careful! This is ugly but
+        * once we get rid of using bh as a container for mapping information
+        * to pass to / from get_block functions, this can go away.
+        */
+       do {
+               old_state = READ_ONCE(bh->b_state);
+               new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+       } while (unlikely(
+                cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
+
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -722,7 +750,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
                ext4_io_end_t *io_end = ext4_inode_aio(inode);
 
                map_bh(bh, inode->i_sb, map.m_pblk);
-               bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+               ext4_update_bh_state(bh, map.m_flags);
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -1685,7 +1713,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                return ret;
 
        map_bh(bh, inode->i_sb, map.m_pblk);
-       bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+       ext4_update_bh_state(bh, map.m_flags);
 
        if (buffer_unwritten(bh)) {
                /* A delayed write to unwritten bh should be marked
@@ -2450,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping,
 
        trace_ext4_writepages(inode, wbc);
 
+       if (dax_mapping(mapping))
+               return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+                                                  wbc);
+
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
@@ -3253,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
         * case, we allocate an io_end structure to hook to the iocb.
         */
        iocb->private = NULL;
-       ext4_inode_aio_set(inode, NULL);
-       if (!is_sync_kiocb(iocb)) {
-               io_end = ext4_init_io_end(inode, GFP_NOFS);
-               if (!io_end) {
-                       ret = -ENOMEM;
-                       goto retake_lock;
-               }
-               /*
-                * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
-                */
-               iocb->private = ext4_get_io_end(io_end);
-               /*
-                * we save the io structure for current async direct
-                * IO, so that later ext4_map_blocks() could flag the
-                * io structure whether there is a unwritten extents
-                * needs to be converted when IO is completed.
-                */
-               ext4_inode_aio_set(inode, io_end);
-       }
-
        if (overwrite) {
                get_block_func = ext4_get_block_overwrite;
        } else {
+               ext4_inode_aio_set(inode, NULL);
+               if (!is_sync_kiocb(iocb)) {
+                       io_end = ext4_init_io_end(inode, GFP_NOFS);
+                       if (!io_end) {
+                               ret = -ENOMEM;
+                               goto retake_lock;
+                       }
+                       /*
+                        * Grab reference for DIO. Will be dropped in
+                        * ext4_end_io_dio()
+                        */
+                       iocb->private = ext4_get_io_end(io_end);
+                       /*
+                        * we save the io structure for current async direct
+                        * IO, so that later ext4_map_blocks() could flag the
+                        * io structure whether there is a unwritten extents
+                        * needs to be converted when IO is completed.
+                        */
+                       ext4_inode_aio_set(inode, io_end);
+               }
                get_block_func = ext4_get_block_write;
                dio_flags = DIO_LOCKING;
        }
@@ -4127,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode)
                new_fl |= S_NOATIME;
        if (flags & EXT4_DIRSYNC_FL)
                new_fl |= S_DIRSYNC;
-       if (test_opt(inode->i_sb, DAX))
+       if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
                new_fl |= S_DAX;
        inode_set_flags(inode, new_fl,
                        S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
index 0f6c369..eae5917 100644 (file)
@@ -208,7 +208,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        handle_t *handle = NULL;
-       int err = EPERM, migrate = 0;
+       int err = -EPERM, migrate = 0;
        struct ext4_iloc iloc;
        unsigned int oldflags, mask, i;
        unsigned int jflag;
@@ -583,6 +583,11 @@ group_extend_out:
                                 "Online defrag not supported with bigalloc");
                        err = -EOPNOTSUPP;
                        goto mext_out;
+               } else if (IS_DAX(inode)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "Online defrag not supported with DAX");
+                       err = -EOPNOTSUPP;
+                       goto mext_out;
                }
 
                err = mnt_want_write_file(filp);
index 61eaf74..4424b7b 100644 (file)
@@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        if (group == 0)
                seq_puts(seq, "#group: free  frags first ["
                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
-                             " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
+                             " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
 
        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
                sizeof(struct ext4_group_info);
index fb6f117..4098acc 100644 (file)
@@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        ext4_lblk_t orig_blk_offset, donor_blk_offset;
        unsigned long blocksize = orig_inode->i_sb->s_blocksize;
        unsigned int tmp_data_size, data_size, replaced_size;
-       int err2, jblocks, retries = 0;
+       int i, err2, jblocks, retries = 0;
        int replaced_count = 0;
        int from = data_offset_in_page << orig_inode->i_blkbits;
        int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
        struct super_block *sb = orig_inode->i_sb;
+       struct buffer_head *bh = NULL;
 
        /*
         * It needs twice the amount of ordinary journal buffers because
@@ -380,8 +381,17 @@ data_copy:
        }
        /* Perform all necessary steps similar write_begin()/write_end()
         * but keeping in mind that i_size will not change */
-       *err = __block_write_begin(pagep[0], from, replaced_size,
-                                  ext4_get_block);
+       if (!page_has_buffers(pagep[0]))
+               create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+       bh = page_buffers(pagep[0]);
+       for (i = 0; i < data_offset_in_page; i++)
+               bh = bh->b_this_page;
+       for (i = 0; i < block_len_in_page; i++) {
+               *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+               if (*err < 0)
+                       break;
+               bh = bh->b_this_page;
+       }
        if (!*err)
                *err = block_commit_write(pagep[0], from, from + replaced_size);
 
index 06574dd..48e4b89 100644 (file)
@@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
        struct ext4_dir_entry_2 *de;
        struct buffer_head *bh;
 
+       if (ext4_encrypted_inode(dir)) {
+               int res = ext4_get_encryption_info(dir);
+
+               /*
+                * This should be a properly defined flag for
+                * dentry->d_flags when we uplift this to the VFS.
+                * d_fsdata is set to (void *) 1 if if the dentry is
+                * created while the directory was encrypted and we
+                * don't have access to the key.
+                */
+              dentry->d_fsdata = NULL;
+              if (ext4_encryption_info(dir))
+                      dentry->d_fsdata = (void *) 1;
+              d_set_d_op(dentry, &ext4_encrypted_d_ops);
+              if (res && res != -ENOKEY)
+                      return ERR_PTR(res);
+       }
+
        if (dentry->d_name.len > EXT4_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
 
@@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
                        return ERR_PTR(-EFSCORRUPTED);
                }
                if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
-                   (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-                    S_ISLNK(inode->i_mode)) &&
+                   (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
                    !ext4_is_child_context_consistent_with_parent(dir,
                                                                  inode)) {
+                       int nokey = ext4_encrypted_inode(inode) &&
+                               !ext4_encryption_info(inode);
+
                        iput(inode);
+                       if (nokey)
+                               return ERR_PTR(-ENOKEY);
                        ext4_warning(inode->i_sb,
                                     "Inconsistent encryption contexts: %lu/%lu\n",
                                     (unsigned long) dir->i_ino,
index ad62d7a..34038e3 100644 (file)
@@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
        if (flex_gd == NULL)
                goto out3;
 
-       if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+       if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
                goto out2;
        flex_gd->count = flexbg_size;
 
index 6915c95..5c46ed9 100644 (file)
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -424,6 +427,8 @@ skip_switch:
 
        iput(inode);
        kfree(isw);
+
+       atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
 
        /* needs to grab bh-unsafe locks, bounce to work item */
        INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-       schedule_work(&isw->work);
+       queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -469,7 +474,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
-       if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+       if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+           inode->i_state & (I_WB_SWITCH | I_FREEING) ||
            inode_to_wb(inode) == isw->new_wb) {
                spin_unlock(&inode->i_lock);
                goto out_free;
@@ -480,6 +486,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        ihold(inode);
        isw->inode = inode;
 
+       atomic_inc(&isw_nr_in_flight);
+
        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the mapping's
@@ -840,6 +848,33 @@ restart:
                wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+       if (atomic_read(&isw_nr_in_flight)) {
+               synchronize_rcu();
+               flush_workqueue(isw_wq);
+       }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+       isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+       if (!isw_wq)
+               return -ENOMEM;
+       return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
index 506765a..bb8d67e 100644 (file)
@@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
        struct inode *inode = d_inode(dentry);
        dnode_secno dno;
        int r;
-       int rep = 0;
        int err;
 
        hpfs_lock(dir->i_sb);
        hpfs_adjust_length(name, &len);
-again:
+
        err = -ENOENT;
        de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
        if (!de)
@@ -401,33 +400,9 @@ again:
                hpfs_error(dir->i_sb, "there was error when removing dirent");
                err = -EFSERROR;
                break;
-       case 2:         /* no space for deleting, try to truncate file */
-
+       case 2:         /* no space for deleting */
                err = -ENOSPC;
-               if (rep++)
-                       break;
-
-               dentry_unhash(dentry);
-               if (!d_unhashed(dentry)) {
-                       hpfs_unlock(dir->i_sb);
-                       return -ENOSPC;
-               }
-               if (generic_permission(inode, MAY_WRITE) ||
-                   !S_ISREG(inode->i_mode) ||
-                   get_write_access(inode)) {
-                       d_rehash(dentry);
-               } else {
-                       struct iattr newattrs;
-                       /*pr_info("truncating file before delete.\n");*/
-                       newattrs.ia_size = 0;
-                       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-                       err = notify_change(dentry, &newattrs, NULL);
-                       put_write_access(inode);
-                       if (!err)
-                               goto again;
-               }
-               hpfs_unlock(dir->i_sb);
-               return -ENOSPC;
+               break;
        default:
                drop_nlink(inode);
                err = 0;
index 9f62db3..69b8b52 100644 (file)
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_rdev = 0;
        inode->dirtied_when = 0;
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+       inode->i_wb_frn_winner = 0;
+       inode->i_wb_frn_avg_time = 0;
+       inode->i_wb_frn_history = 0;
+#endif
+
        if (security_inode_alloc(inode))
                goto out;
        spin_lock_init(&inode->i_lock);
index 3ea3655..8918ac9 100644 (file)
@@ -2,10 +2,6 @@
        JFFS2 LOCKING DOCUMENTATION
        ---------------------------
 
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
 This document attempts to describe the existing locking rules for
 JFFS2. It is not expected to remain perfectly up to date, but ought to
 be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
           any f->sem held.
        2. Never attempt to lock two file mutexes in one thread.
           No ordering rules have been made for doing so.
+       3. Never lock a page cache page with f->sem held.
 
 
        erase_completion_lock spinlock
index 0ae91ad..b288c8a 100644 (file)
@@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
 
 
 static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
-                                   struct jffs2_inode_cache *ic)
+                                   struct jffs2_inode_cache *ic,
+                                   int *dir_hardlinks)
 {
        struct jffs2_full_dirent *fd;
 
@@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
                        dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
                                  fd->name, fd->ino, ic->ino);
                        jffs2_mark_node_obsolete(c, fd->raw);
+                       /* Clear the ic/raw union so it doesn't cause problems later. */
+                       fd->ic = NULL;
                        continue;
                }
 
+               /* From this point, fd->raw is no longer used so we can set fd->ic */
+               fd->ic = child_ic;
+               child_ic->pino_nlink++;
+               /* If we appear (at this stage) to have hard-linked directories,
+                * set a flag to trigger a scan later */
                if (fd->type == DT_DIR) {
-                       if (child_ic->pino_nlink) {
-                               JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
-                                           fd->name, fd->ino, ic->ino);
-                               /* TODO: What do we do about it? */
-                       } else {
-                               child_ic->pino_nlink = ic->ino;
-                       }
-               } else
-                       child_ic->pino_nlink++;
+                       child_ic->flags |= INO_FLAGS_IS_DIR;
+                       if (child_ic->pino_nlink > 1)
+                               *dir_hardlinks = 1;
+               }
 
                dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
                /* Can't free scan_dents so far. We might need them in pass 2 */
@@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
 */
 static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 {
-       int ret;
-       int i;
+       int ret, i, dir_hardlinks = 0;
        struct jffs2_inode_cache *ic;
        struct jffs2_full_dirent *fd;
        struct jffs2_full_dirent *dead_fds = NULL;
@@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
        /* Now scan the directory tree, increasing nlink according to every dirent found. */
        for_each_inode(i, c, ic) {
                if (ic->scan_dents) {
-                       jffs2_build_inode_pass1(c, ic);
+                       jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
                        cond_resched();
                }
        }
@@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
        }
 
        dbg_fsbuild("pass 2a complete\n");
+
+       if (dir_hardlinks) {
+               /* If we detected directory hardlinks earlier, *hopefully*
+                * they are gone now because some of the links were from
+                * dead directories which still had some old dirents lying
+                * around and not yet garbage-collected, but which have
+                * been discarded above. So clear the pino_nlink field
+                * in each directory, so that the final scan below can
+                * print appropriate warnings. */
+               for_each_inode(i, c, ic) {
+                       if (ic->flags & INO_FLAGS_IS_DIR)
+                               ic->pino_nlink = 0;
+               }
+       }
        dbg_fsbuild("freeing temporary data structures\n");
 
        /* Finally, we can scan again and free the dirent structs */
@@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
                while(ic->scan_dents) {
                        fd = ic->scan_dents;
                        ic->scan_dents = fd->next;
+                       /* We do use the pino_nlink field to count nlink of
+                        * directories during fs build, so set it to the
+                        * parent ino# now. Now that there's hopefully only
+                        * one. */
+                       if (fd->type == DT_DIR) {
+                               if (!fd->ic) {
+                                       /* We'll have complained about it and marked the coresponding
+                                          raw node obsolete already. Just skip it. */
+                                       continue;
+                               }
+
+                               /* We *have* to have set this in jffs2_build_inode_pass1() */
+                               BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+                               /* We clear ic->pino_nlink âˆ€ directories' ic *only* if dir_hardlinks
+                                * is set. Otherwise, we know this should never trigger anyway, so
+                                * we don't do the check. And ic->pino_nlink still contains the nlink
+                                * value (which is 1). */
+                               if (dir_hardlinks && fd->ic->pino_nlink) {
+                                       JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+                                                   fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+                                       /* Should we unlink it from its previous parent? */
+                               }
+
+                               /* For directories, ic->pino_nlink holds that parent inode # */
+                               fd->ic->pino_nlink = ic->ino;
+                       }
                        jffs2_free_full_dirent(fd);
                }
                ic->scan_dents = NULL;
@@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
 
                        /* Reduce nlink of the child. If it's now zero, stick it on the
                           dead_fds list to be cleaned up later. Else just free the fd */
-
-                       if (fd->type == DT_DIR)
-                               child_ic->pino_nlink = 0;
-                       else
-                               child_ic->pino_nlink--;
+                       child_ic->pino_nlink--;
 
                        if (!child_ic->pino_nlink) {
                                dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
index d211b8e..30c4c9e 100644 (file)
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
                pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
                          __func__, ret);
-               /* Might as well let the VFS know */
-               d_instantiate(new_dentry, d_inode(old_dentry));
-               ihold(d_inode(old_dentry));
+               /*
+                * We can't keep the target in dcache after that.
+                * For one thing, we can't afford dentry aliases for directories.
+                * For another, if there was a victim, we _can't_ set new inode
+                * for that sucker and we have to trigger mount eviction - the
+                * caller won't do it on its own since we are returning an error.
+                */
+               d_invalidate(new_dentry);
                new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
                return ret;
        }
index c5ac594..cad86ba 100644 (file)
@@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
        struct page *pg;
        struct inode *inode = mapping->host;
        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
-       struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
-       struct jffs2_raw_inode ri;
-       uint32_t alloc_len = 0;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        uint32_t pageofs = index << PAGE_CACHE_SHIFT;
        int ret = 0;
 
-       jffs2_dbg(1, "%s()\n", __func__);
-
-       if (pageofs > inode->i_size) {
-               ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
-                                         ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
-               if (ret)
-                       return ret;
-       }
-
-       mutex_lock(&f->sem);
        pg = grab_cache_page_write_begin(mapping, index, flags);
-       if (!pg) {
-               if (alloc_len)
-                       jffs2_complete_reservation(c);
-               mutex_unlock(&f->sem);
+       if (!pg)
                return -ENOMEM;
-       }
        *pagep = pg;
 
-       if (alloc_len) {
+       jffs2_dbg(1, "%s()\n", __func__);
+
+       if (pageofs > inode->i_size) {
                /* Make new hole frag from old EOF to new page */
+               struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+               struct jffs2_raw_inode ri;
                struct jffs2_full_dnode *fn;
+               uint32_t alloc_len;
 
                jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
                          (unsigned int)inode->i_size, pageofs);
 
+               ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+                                         ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+               if (ret)
+                       goto out_page;
+
+               mutex_lock(&f->sem);
                memset(&ri, 0, sizeof(ri));
 
                ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
                if (IS_ERR(fn)) {
                        ret = PTR_ERR(fn);
                        jffs2_complete_reservation(c);
+                       mutex_unlock(&f->sem);
                        goto out_page;
                }
                ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
                        jffs2_mark_node_obsolete(c, fn->raw);
                        jffs2_free_full_dnode(fn);
                        jffs2_complete_reservation(c);
+                       mutex_unlock(&f->sem);
                        goto out_page;
                }
                jffs2_complete_reservation(c);
                inode->i_size = pageofs;
+               mutex_unlock(&f->sem);
        }
 
        /*
@@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
         * case of a short-copy.
         */
        if (!PageUptodate(pg)) {
+               mutex_lock(&f->sem);
                ret = jffs2_do_readpage_nolock(inode, pg);
+               mutex_unlock(&f->sem);
                if (ret)
                        goto out_page;
        }
-       mutex_unlock(&f->sem);
        jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
        return ret;
 
 out_page:
        unlock_page(pg);
        page_cache_release(pg);
-       mutex_unlock(&f->sem);
        return ret;
 }
 
index 5a2dec2..95d5880 100644 (file)
@@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
                BUG_ON(start > orig_start);
        }
 
-       /* First, use readpage() to read the appropriate page into the page cache */
-       /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
-        *    triggered garbage collection in the first place?
-        * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
-        *    page OK. We'll actually write it out again in commit_write, which is a little
-        *    suboptimal, but at least we're correct.
-        */
+       /* The rules state that we must obtain the page lock *before* f->sem, so
+        * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+        * actually going to *change* so we're safe; we only allow reading.
+        *
+        * It is important to note that jffs2_write_begin() will ensure that its
+        * page is marked Uptodate before allocating space. That means that if we
+        * end up here trying to GC the *same* page that jffs2_write_begin() is
+        * trying to write out, read_cache_page() will not deadlock. */
+       mutex_unlock(&f->sem);
        pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+       mutex_lock(&f->sem);
 
        if (IS_ERR(pg_ptr)) {
                pr_warn("read_cache_page() returned error: %ld\n",
index fa35ff7..0637271 100644 (file)
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
 #define INO_STATE_CLEARING     6       /* In clear_inode() */
 
 #define INO_FLAGS_XATTR_CHECKED        0x01    /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR       0x02    /* is a directory */
 
 #define RAWNODE_CLASS_INODE_CACHE      0
 #define RAWNODE_CLASS_XATTR_DATUM      1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
 
 struct jffs2_full_dirent
 {
-       struct jffs2_raw_node_ref *raw;
+       union {
+               struct jffs2_raw_node_ref *raw;
+               struct jffs2_inode_cache *ic; /* Just during part of build */
+       };
        struct jffs2_full_dirent *next;
        uint32_t version;
        uint32_t ino; /* == zero for unlink */
index f624d13..9c590e0 100644 (file)
@@ -1712,6 +1712,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
                return 0;
        if (!follow)
                return 0;
+       /* make sure that d_is_symlink above matches inode */
+       if (nd->flags & LOOKUP_RCU) {
+               if (read_seqcount_retry(&link->dentry->d_seq, seq))
+                       return -ECHILD;
+       }
        return pick_link(nd, link, inode, seq);
 }
 
@@ -1743,11 +1748,11 @@ static int walk_component(struct nameidata *nd, int flags)
                if (err < 0)
                        return err;
 
-               inode = d_backing_inode(path.dentry);
                seq = 0;        /* we are already out of RCU mode */
                err = -ENOENT;
                if (d_is_negative(path.dentry))
                        goto out_path_put;
+               inode = d_backing_inode(path.dentry);
        }
 
        if (flags & WALK_PUT)
@@ -3192,12 +3197,12 @@ retry_lookup:
                return error;
 
        BUG_ON(nd->flags & LOOKUP_RCU);
-       inode = d_backing_inode(path.dentry);
        seq = 0;        /* out of RCU mode, so the value doesn't matter */
        if (unlikely(d_is_negative(path.dentry))) {
                path_to_nameidata(&path, nd);
                return -ENOENT;
        }
+       inode = d_backing_inode(path.dentry);
 finish_lookup:
        if (nd->depth)
                put_link(nd);
@@ -3206,11 +3211,6 @@ finish_lookup:
        if (unlikely(error))
                return error;
 
-       if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
-               path_to_nameidata(&path, nd);
-               return -ELOOP;
-       }
-
        if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
                path_to_nameidata(&path, nd);
        } else {
@@ -3229,6 +3229,10 @@ finish_open:
                return error;
        }
        audit_inode(nd->name, nd->path.dentry, 0);
+       if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+               error = -ELOOP;
+               goto out;
+       }
        error = -EISDIR;
        if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
                goto out;
@@ -3273,6 +3277,10 @@ opened:
                        goto exit_fput;
        }
 out:
+       if (unlikely(error > 0)) {
+               WARN_ON(1);
+               error = -EINVAL;
+       }
        if (got_write)
                mnt_drop_write(nd->path.mnt);
        path_put(&save_parent);
index 26c2de2..b7f8eae 100644 (file)
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
                                d_rehash(newdent);
                } else {
                        spin_lock(&dentry->d_lock);
-                       NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+                       NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
                        spin_unlock(&dentry->d_lock);
                }
        } else {
index c59a59c..35ab51c 100644 (file)
@@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 
                for (i = 0; i < nr_pages; i++)
                        put_page(arg->layoutupdate_pages[i]);
+               vfree(arg->start_p);
                kfree(arg->layoutupdate_pages);
        } else {
                put_page(arg->layoutupdate_page);
@@ -559,10 +560,15 @@ retry:
 
        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
                void *p = start_p, *end = p + arg->layoutupdate_len;
+               struct page *page = NULL;
                int i = 0;
 
-               for ( ; p < end; p += PAGE_SIZE)
-                       arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+               arg->start_p = start_p;
+               for ( ; p < end; p += PAGE_SIZE) {
+                       page = vmalloc_to_page(p);
+                       arg->layoutupdate_pages[i++] = page;
+                       get_page(page);
+               }
        }
 
        dprintk("%s found %zu ranges\n", __func__, count);
index 5bcd92d..0cb1abd 100644 (file)
@@ -1215,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
                                        hdr->pgio_mirror_idx + 1,
                                        &hdr->pgio_mirror_idx))
                        goto out_eagain;
-               set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
                        &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
index 29898a9..eb37046 100644 (file)
@@ -412,7 +412,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                                         OP_ILLEGAL, GFP_NOIO);
                if (!fail_return) {
                        if (ff_layout_has_available_ds(lseg))
-                               set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                        &lseg->pls_layout->plh_flags);
                        else
                                pnfs_error_mark_layout_for_return(ino, lseg);
index bd25dc7..dff8346 100644 (file)
 
 #define NFSDBG_FACILITY NFSDBG_PROC
 
-static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
-                               fmode_t fmode)
-{
-       struct nfs_open_context *open;
-       struct nfs_lock_context *lock;
-       int ret;
-
-       open = get_nfs_open_context(nfs_file_open_context(file));
-       lock = nfs_get_lock_context(open);
-       if (IS_ERR(lock)) {
-               put_nfs_open_context(open);
-               return PTR_ERR(lock);
-       }
-
-       ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
-
-       nfs_put_lock_context(lock);
-       put_nfs_open_context(open);
-       return ret;
-}
-
 static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
-                                loff_t offset, loff_t len)
+               struct nfs_lock_context *lock, loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(filep);
        struct nfs_server *server = NFS_SERVER(inode);
@@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        msg->rpc_argp = &args;
        msg->rpc_resp = &res;
 
-       status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+       status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+                       lock, FMODE_WRITE);
        if (status)
                return status;
 
@@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 {
        struct nfs_server *server = NFS_SERVER(file_inode(filep));
        struct nfs4_exception exception = { };
+       struct nfs_lock_context *lock;
        int err;
 
+       lock = nfs_get_lock_context(nfs_file_open_context(filep));
+       if (IS_ERR(lock))
+               return PTR_ERR(lock);
+
+       exception.inode = file_inode(filep);
+       exception.state = lock->open_context->state;
+
        do {
-               err = _nfs42_proc_fallocate(msg, filep, offset, len);
-               if (err == -ENOTSUPP)
-                       return -EOPNOTSUPP;
+               err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+               if (err == -ENOTSUPP) {
+                       err = -EOPNOTSUPP;
+                       break;
+               }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
 
+       nfs_put_lock_context(lock);
        return err;
 }
 
@@ -135,7 +126,8 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        return err;
 }
 
-static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep,
+               struct nfs_lock_context *lock, loff_t offset, int whence)
 {
        struct inode *inode = file_inode(filep);
        struct nfs42_seek_args args = {
@@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
        if (!nfs_server_capable(inode, NFS_CAP_SEEK))
                return -ENOTSUPP;
 
-       status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+       status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+                       lock, FMODE_READ);
        if (status)
                return status;
 
@@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 {
        struct nfs_server *server = NFS_SERVER(file_inode(filep));
        struct nfs4_exception exception = { };
+       struct nfs_lock_context *lock;
        loff_t err;
 
+       lock = nfs_get_lock_context(nfs_file_open_context(filep));
+       if (IS_ERR(lock))
+               return PTR_ERR(lock);
+
+       exception.inode = file_inode(filep);
+       exception.state = lock->open_context->state;
+
        do {
-               err = _nfs42_proc_llseek(filep, offset, whence);
+               err = _nfs42_proc_llseek(filep, lock, offset, whence);
                if (err >= 0)
                        break;
-               if (err == -ENOTSUPP)
-                       return -EOPNOTSUPP;
+               if (err == -ENOTSUPP) {
+                       err = -EOPNOTSUPP;
+                       break;
+               }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
 
+       nfs_put_lock_context(lock);
        return err;
 }
 
@@ -298,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
 }
 
 static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
-                            struct file *dst_f, loff_t src_offset,
-                            loff_t dst_offset, loff_t count)
+               struct file *dst_f, struct nfs_lock_context *src_lock,
+               struct nfs_lock_context *dst_lock, loff_t src_offset,
+               loff_t dst_offset, loff_t count)
 {
        struct inode *src_inode = file_inode(src_f);
        struct inode *dst_inode = file_inode(dst_f);
@@ -320,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
        msg->rpc_argp = &args;
        msg->rpc_resp = &res;
 
-       status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+       status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+                       src_lock, FMODE_READ);
        if (status)
                return status;
 
-       status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+       status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+                       dst_lock, FMODE_WRITE);
        if (status)
                return status;
 
@@ -349,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
        };
        struct inode *inode = file_inode(src_f);
        struct nfs_server *server = NFS_SERVER(file_inode(src_f));
-       struct nfs4_exception exception = { };
-       int err;
+       struct nfs_lock_context *src_lock;
+       struct nfs_lock_context *dst_lock;
+       struct nfs4_exception src_exception = { };
+       struct nfs4_exception dst_exception = { };
+       int err, err2;
 
        if (!nfs_server_capable(inode, NFS_CAP_CLONE))
                return -EOPNOTSUPP;
 
+       src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+       if (IS_ERR(src_lock))
+               return PTR_ERR(src_lock);
+
+       src_exception.inode = file_inode(src_f);
+       src_exception.state = src_lock->open_context->state;
+
+       dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+       if (IS_ERR(dst_lock)) {
+               err = PTR_ERR(dst_lock);
+               goto out_put_src_lock;
+       }
+
+       dst_exception.inode = file_inode(dst_f);
+       dst_exception.state = dst_lock->open_context->state;
+
        do {
-               err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
-                                       dst_offset, count);
+               err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+                                       src_offset, dst_offset, count);
                if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
                        NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
-                       return -EOPNOTSUPP;
+                       err = -EOPNOTSUPP;
+                       break;
                }
-               err = nfs4_handle_exception(server, err, &exception);
-       } while (exception.retry);
 
-       return err;
+               err2 = nfs4_handle_exception(server, err, &src_exception);
+               err = nfs4_handle_exception(server, err, &dst_exception);
+               if (!err)
+                       err = err2;
+       } while (src_exception.retry || dst_exception.retry);
 
+       nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+       nfs_put_lock_context(src_lock);
+       return err;
 }
index 4bfc33a..1488159 100644 (file)
@@ -2466,9 +2466,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
                dentry = d_add_unique(dentry, igrab(state->inode));
                if (dentry == NULL) {
                        dentry = opendata->dentry;
-               } else if (dentry != ctx->dentry) {
+               } else {
                        dput(ctx->dentry);
-                       ctx->dentry = dget(dentry);
+                       ctx->dentry = dentry;
                }
                nfs_set_verifier(dentry,
                                nfs_save_change_attribute(d_inode(opendata->dir)));
index a3592cc..2fa483e 100644 (file)
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
  */
 static LIST_HEAD(pnfs_modules_tbl);
 
-static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
-                      enum pnfs_iomode iomode, bool sync);
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
 
 /* Return the registered pnfs layout driver module matching given id */
 static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        struct inode *inode = lo->plh_inode;
 
+       pnfs_layoutreturn_before_put_layout_hdr(lo);
+
        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
        }
 }
 
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+static int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+               struct list_head *lseg_list)
+{
+       struct pnfs_layout_range range = {
+               .iomode = IOMODE_ANY,
+               .offset = 0,
+               .length = NFS4_MAX_UINT64,
+       };
+
+       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+       return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+}
+
 static int
 pnfs_iomode_to_fail_bit(u32 iomode)
 {
@@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
-                       struct pnfs_layout_segment *lseg)
-{
-       struct pnfs_layout_segment *s;
-
-       if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-               return false;
-
-       list_for_each_entry(s, &lo->plh_segs, pls_list)
-               if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
-                       return false;
-
-       return true;
-}
-
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
-       if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-               return false;
-       lo->plh_return_iomode = 0;
-       pnfs_get_layout_hdr(lo);
-       clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
-       return true;
-}
-
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
-               struct pnfs_layout_hdr *lo, struct inode *inode)
-{
-       lo = lseg->pls_layout;
-       inode = lo->plh_inode;
-
-       spin_lock(&inode->i_lock);
-       if (pnfs_layout_need_return(lo, lseg)) {
-               nfs4_stateid stateid;
-               enum pnfs_iomode iomode;
-               bool send;
-
-               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
-               iomode = lo->plh_return_iomode;
-               send = pnfs_prepare_layoutreturn(lo);
-               spin_unlock(&inode->i_lock);
-               if (send) {
-                       /* Send an async layoutreturn so we dont deadlock */
-                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
-               }
-       } else
-               spin_unlock(&inode->i_lock);
-}
-
 void
 pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 
-       /* Handle the case where refcount != 1 */
-       if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
-               return;
-
        lo = lseg->pls_layout;
        inode = lo->plh_inode;
-       /* Do we need a layoutreturn? */
-       if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-               pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
 
        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -613,9 +575,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
-               lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-               pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
                pnfs_get_layout_hdr(lo);
+               pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
@@ -676,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 {
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
-       struct pnfs_layout_range range = {
-               .iomode = IOMODE_ANY,
-               .offset = 0,
-               .length = NFS4_MAX_UINT64,
-       };
        LIST_HEAD(lseg_list);
        int ret = 0;
 
@@ -695,11 +651,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 
                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
-               lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-               if (is_bulk_recall)
-                       set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-               if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+               if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+                       if (is_bulk_recall)
+                               set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
+               }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
@@ -937,6 +893,17 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+       if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+               return false;
+       lo->plh_return_iomode = 0;
+       pnfs_get_layout_hdr(lo);
+       clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+       return true;
+}
+
 static int
 pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
@@ -971,6 +938,48 @@ out:
        return status;
 }
 
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+       struct pnfs_layout_segment *s;
+
+       if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+               return false;
+
+       /* Defer layoutreturn until all lsegs are done */
+       list_for_each_entry(s, &lo->plh_segs, pls_list) {
+               if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+                       return false;
+       }
+
+       return true;
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct inode *inode= lo->plh_inode;
+
+       if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+               return;
+       spin_lock(&inode->i_lock);
+       if (pnfs_layout_need_return(lo)) {
+               nfs4_stateid stateid;
+               enum pnfs_iomode iomode;
+               bool send;
+
+               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+               iomode = lo->plh_return_iomode;
+               send = pnfs_prepare_layoutreturn(lo);
+               spin_unlock(&inode->i_lock);
+               if (send) {
+                       /* Send an async layoutreturn so we dont deadlock */
+                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+               }
+       } else
+               spin_unlock(&inode->i_lock);
+}
+
 /*
  * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
  * when the layout segment list is empty.
@@ -1091,7 +1100,7 @@ bool pnfs_roc(struct inode *ino)
 
        nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        /* always send layoutreturn if being marked so */
-       if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+       if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                   &lo->plh_flags))
                layoutreturn = pnfs_prepare_layoutreturn(lo);
 
@@ -1744,8 +1753,19 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
        if (lo->plh_return_iomode != 0)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
+       set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
 }
 
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ */
 int
 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
@@ -1768,12 +1788,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                lseg, lseg->pls_range.iomode,
                                lseg->pls_range.offset,
                                lseg->pls_range.length);
+                       if (mark_lseg_invalid(lseg, tmp_list))
+                               continue;
+                       remaining++;
                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
                        pnfs_set_plh_return_iomode(lo, return_range->iomode);
-                       if (!mark_lseg_invalid(lseg, tmp_list))
-                               remaining++;
-                       set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-                                       &lo->plh_flags);
                }
        return remaining;
 }
index 9f4e2a4..1ac1db5 100644 (file)
@@ -94,8 +94,8 @@ enum {
        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
        NFS_LAYOUT_BULK_RECALL,         /* bulk recall affecting layout */
-       NFS_LAYOUT_RETURN,              /* Return this layout ASAP */
-       NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
+       NFS_LAYOUT_RETURN,              /* layoutreturn in progress */
+       NFS_LAYOUT_RETURN_REQUESTED,    /* Return this layout ASAP */
        NFS_LAYOUT_INVALID_STID,        /* layout stateid id is invalid */
        NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget */
 };
index cfcbf11..7115c5d 100644 (file)
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#define FSNOTIFY_REAPER_DELAY  (1)     /* 1 jiffy */
+
 struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
        atomic_dec(&group->num_marks);
 }
 
-static void
-fsnotify_mark_free_rcu(struct rcu_head *rcu)
-{
-       struct fsnotify_mark    *mark;
-
-       mark = container_of(rcu, struct fsnotify_mark, g_rcu);
-       fsnotify_put_mark(mark);
-}
-
 /*
- * Free fsnotify mark. The freeing is actually happening from a call_srcu
- * callback. Caller must have a reference to the mark or be protected by
- * fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
  */
 void fsnotify_free_mark(struct fsnotify_mark *mark)
 {
@@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
        spin_unlock(&mark->lock);
 
-       call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+       spin_lock(&destroy_lock);
+       list_add(&mark->g_list, &destroy_list);
+       spin_unlock(&destroy_lock);
+       queue_delayed_work(system_unbound_wq, &reaper_work,
+                               FSNOTIFY_REAPER_DELAY);
 
        /*
         * Some groups like to know that marks are being freed.  This is a
@@ -388,7 +390,12 @@ err:
 
        spin_unlock(&mark->lock);
 
-       call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+       spin_lock(&destroy_lock);
+       list_add(&mark->g_list, &destroy_list);
+       spin_unlock(&destroy_lock);
+       queue_delayed_work(system_unbound_wq, &reaper_work,
+                               FSNOTIFY_REAPER_DELAY);
+
        return ret;
 }
 
@@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
        atomic_set(&mark->refcnt, 1);
        mark->free_mark = free_mark;
 }
+
+static void fsnotify_mark_destroy(struct work_struct *work)
+{
+       struct fsnotify_mark *mark, *next;
+       struct list_head private_destroy_list;
+
+       spin_lock(&destroy_lock);
+       /* exchange the list head */
+       list_replace_init(&destroy_list, &private_destroy_list);
+       spin_unlock(&destroy_lock);
+
+       synchronize_srcu(&fsnotify_mark_srcu);
+
+       list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+               list_del_init(&mark->g_list);
+               fsnotify_put_mark(mark);
+       }
+}
index 794fd15..cda0361 100644 (file)
@@ -956,6 +956,7 @@ clean_orphan:
                tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
                                update_isize, end);
                if (tmp_ret < 0) {
+                       ocfs2_inode_unlock(inode, 1);
                        ret = tmp_ret;
                        mlog_errno(ret);
                        brelse(di_bh);
index a3cc6d2..a76b9ea 100644 (file)
@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-       kfree(o2hb_db_livenodes);
-       kfree(o2hb_db_liveregions);
-       kfree(o2hb_db_quorumregions);
-       kfree(o2hb_db_failedregions);
        debugfs_remove(o2hb_debug_failedregions);
        debugfs_remove(o2hb_debug_quorumregions);
        debugfs_remove(o2hb_debug_liveregions);
        debugfs_remove(o2hb_debug_livenodes);
        debugfs_remove(o2hb_debug_dir);
+       kfree(o2hb_db_livenodes);
+       kfree(o2hb_db_liveregions);
+       kfree(o2hb_db_quorumregions);
+       kfree(o2hb_db_failedregions);
 }
 
 static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item)
 
        kfree(reg->hr_slots);
 
-       kfree(reg->hr_db_regnum);
-       kfree(reg->hr_db_livenodes);
        debugfs_remove(reg->hr_debug_livenodes);
        debugfs_remove(reg->hr_debug_regnum);
        debugfs_remove(reg->hr_debug_elapsed_time);
        debugfs_remove(reg->hr_debug_pinned);
        debugfs_remove(reg->hr_debug_dir);
+       kfree(reg->hr_db_livenodes);
+       kfree(reg->hr_db_regnum);
+       kfree(reg->hr_debug_elapsed_time);
+       kfree(reg->hr_debug_pinned);
 
        spin_lock(&o2hb_live_lock);
        list_del(&reg->hr_all_item);
index c5bdf02..b94a425 100644 (file)
@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                                                break;
                                        }
                                }
+                               dlm_lockres_clear_refmap_bit(dlm, res,
+                                               dead_node);
                                spin_unlock(&res->spinlock);
                                continue;
                        }
index 9581d19..77ebc2b 100644 (file)
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
+               if (ret == -ENOMEM)
+                       ret = VM_FAULT_OOM;
+               else
+                       ret = VM_FAULT_SIGBUS;
                goto out;
        }
 
index ed95272..52f6de5 100644 (file)
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
         * sole user of this dentry.  Too tricky...  Just unhash for
         * now.
         */
-       d_drop(dentry);
+       if (!err)
+               d_drop(dentry);
        inode_unlock(dir);
 
        return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
        if (!overwrite && new_is_dir && !old_opaque && new_opaque)
                ovl_remove_opaque(newdentry);
 
+       /*
+        * Old dentry now lives in different location. Dentries in
+        * lowerstack are stale. We cannot drop them here because
+        * access to them is lockless. This could be only pure upper
+        * or opaque directory - numlower is zero. Or upper non-dir
+        * entry - its pureness is tracked by flag opaque.
+        */
        if (old_opaque != new_opaque) {
                ovl_dentry_set_opaque(old, new_opaque);
                if (!overwrite)
index 49e2045..a4ff5d0 100644 (file)
@@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 
                inode_lock(upperdentry->d_inode);
                err = notify_change(upperdentry, attr, NULL);
+               if (!err)
+                       ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
                inode_unlock(upperdentry->d_inode);
        }
        ovl_drop_write(dentry);
index 8d826bd..619ad4b 100644 (file)
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
        if (oe->__upperdentry) {
                type = __OVL_PATH_UPPER;
 
-               if (oe->numlower) {
-                       if (S_ISDIR(dentry->d_inode->i_mode))
-                               type |= __OVL_PATH_MERGE;
-               } else if (!oe->opaque) {
+               /*
+                * Non-dir dentry can hold lower dentry from previous
+                * location. Its purity depends only on opaque flag.
+                */
+               if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+                       type |= __OVL_PATH_MERGE;
+               else if (!oe->opaque)
                        type |= __OVL_PATH_PURE;
-               }
        } else {
                if (oe->numlower > 1)
                        type |= __OVL_PATH_MERGE;
@@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
 
 static const struct dentry_operations ovl_reval_dentry_operations = {
        .d_release = ovl_dentry_release,
+       .d_select_inode = ovl_d_select_inode,
        .d_revalidate = ovl_dentry_revalidate,
        .d_weak_revalidate = ovl_dentry_weak_revalidate,
 };
index 6367e1e..c524fdd 100644 (file)
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
 static struct mountpoint *mp;
 static struct hlist_head *list;
 
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+       return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
 static int propagate_one(struct mount *m)
 {
        struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
        /* skip if mountpoint isn't covered by it */
        if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
                return 0;
-       if (m->mnt_group_id == last_dest->mnt_group_id) {
+       if (peers(m, last_dest)) {
                type = CL_MAKE_SHARED;
        } else {
                struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
                                        last_source = last_source->mnt_master;
                                        last_dest = last_source->mnt_parent;
                                }
-                               if (n->mnt_group_id != last_dest->mnt_group_id) {
+                               if (!peers(n, last_dest)) {
                                        last_source = last_source->mnt_master;
                                        last_dest = last_source->mnt_parent;
                                }
index 85d16c6..fa95ab2 100644 (file)
@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
                                sizeof(struct proc_maps_private));
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-                               struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+                   struct vm_area_struct *vma, int is_pid)
 {
-       struct inode *inode = priv->inode;
-       struct task_struct *task;
-       pid_t ret = 0;
+       int stack = 0;
+
+       if (is_pid) {
+               stack = vma->vm_start <= vma->vm_mm->start_stack &&
+                       vma->vm_end >= vma->vm_mm->start_stack;
+       } else {
+               struct inode *inode = priv->inode;
+               struct task_struct *task;
 
-       rcu_read_lock();
-       task = pid_task(proc_pid(inode), PIDTYPE_PID);
-       if (task) {
-               task = task_of_stack(task, vma, is_pid);
+               rcu_read_lock();
+               task = pid_task(proc_pid(inode), PIDTYPE_PID);
                if (task)
-                       ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+                       stack = vma_is_stack_for_task(vma, task);
+               rcu_read_unlock();
        }
-       rcu_read_unlock();
-
-       return ret;
+       return stack;
 }
 
 static void
@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
        name = arch_vma_name(vma);
        if (!name) {
-               pid_t tid;
-
                if (!mm) {
                        name = "[vdso]";
                        goto done;
@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                        goto done;
                }
 
-               tid = pid_of_stack(priv, vma, is_pid);
-               if (tid != 0) {
-                       /*
-                        * Thread stack in /proc/PID/task/TID/maps or
-                        * the main process stack.
-                        */
-                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                           vma->vm_end >= mm->start_stack)) {
-                               name = "[stack]";
-                       } else {
-                               /* Thread stack in /proc/PID/maps */
-                               seq_pad(m, ' ');
-                               seq_printf(m, "[stack:%d]", tid);
-                       }
-               }
+               if (is_stack(priv, vma, is_pid))
+                       name = "[stack]";
        }
 
 done:
@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
                unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
+       pte_t huge_pte = huge_ptep_get(pte);
        struct numa_maps *md;
        struct page *page;
 
-       if (!pte_present(*pte))
+       if (!pte_present(huge_pte))
                return 0;
 
-       page = pte_page(*pte);
+       page = pte_page(huge_pte);
        if (!page)
                return 0;
 
        md = walk->private;
-       gather_stats(page, md, pte_dirty(*pte), 1);
+       gather_stats(page, md, pte_dirty(huge_pte), 1);
        return 0;
 }
 
@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
                seq_file_path(m, file, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_puts(m, " heap");
-       } else {
-               pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
-               if (tid != 0) {
-                       /*
-                        * Thread stack in /proc/PID/task/TID/maps or
-                        * the main process stack.
-                        */
-                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                           vma->vm_end >= mm->start_stack))
-                               seq_puts(m, " stack");
-                       else
-                               seq_printf(m, " stack:%d", tid);
-               }
+       } else if (is_stack(proc_priv, vma, is_pid)) {
+               seq_puts(m, " stack");
        }
 
        if (is_vm_hugetlb_page(vma))
index e0d64c9..faacb0c 100644 (file)
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
        return size;
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-                               struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+                   struct vm_area_struct *vma, int is_pid)
 {
-       struct inode *inode = priv->inode;
-       struct task_struct *task;
-       pid_t ret = 0;
-
-       rcu_read_lock();
-       task = pid_task(proc_pid(inode), PIDTYPE_PID);
-       if (task) {
-               task = task_of_stack(task, vma, is_pid);
+       struct mm_struct *mm = vma->vm_mm;
+       int stack = 0;
+
+       if (is_pid) {
+               stack = vma->vm_start <= mm->start_stack &&
+                       vma->vm_end >= mm->start_stack;
+       } else {
+               struct inode *inode = priv->inode;
+               struct task_struct *task;
+
+               rcu_read_lock();
+               task = pid_task(proc_pid(inode), PIDTYPE_PID);
                if (task)
-                       ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+                       stack = vma_is_stack_for_task(vma, task);
+               rcu_read_unlock();
        }
-       rcu_read_unlock();
-
-       return ret;
+       return stack;
 }
 
 /*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
        if (file) {
                seq_pad(m, ' ');
                seq_file_path(m, file, "");
-       } else if (mm) {
-               pid_t tid = pid_of_stack(priv, vma, is_pid);
-
-               if (tid != 0) {
-                       seq_pad(m, ' ');
-                       /*
-                        * Thread stack in /proc/PID/task/TID/maps or
-                        * the main process stack.
-                        */
-                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
-                           vma->vm_end >= mm->start_stack))
-                               seq_printf(m, "[stack]");
-                       else
-                               seq_printf(m, "[stack:%d]", tid);
-               }
+       } else if (mm && is_stack(priv, vma, is_pid)) {
+               seq_pad(m, ' ');
+               seq_printf(m, "[stack]");
        }
 
        seq_putc(m, '\n');
index 324ec27..dadf24e 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/splice.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
+#include <linux/fs.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
        switch (whence) {
        case SEEK_SET: case SEEK_CUR:
                return generic_file_llseek_size(file, offset, whence,
-                                               ~0ULL, 0);
+                                               OFFSET_MAX, 0);
        default:
                return -EINVAL;
        }
@@ -1532,10 +1533,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
 
        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
-           (file_out->f_flags & O_APPEND) ||
-           !file_in->f_op->clone_file_range)
+           (file_out->f_flags & O_APPEND))
                return -EBADF;
 
+       if (!file_in->f_op->clone_file_range)
+               return -EOPNOTSUPP;
+
        ret = clone_verify_area(file_in, pos_in, len, false);
        if (ret)
                return ret;
index 1182af8..74914b1 100644 (file)
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
                sb->s_flags &= ~MS_ACTIVE;
 
                fsnotify_unmount_inodes(sb);
+               cgroup_writeback_umount();
 
                evict_inodes(sb);
 
index b94fa6c..053818d 100644 (file)
@@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
        if (isalarm(ctx))
                remaining = alarm_expires_remaining(&ctx->t.alarm);
        else
-               remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+               remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
 
        return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
index 5031170..66cdb44 100644 (file)
@@ -286,6 +286,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
        if (unlikely(ACCESS_ONCE(ctx->released)))
                goto out;
 
+       /*
+        * We don't do userfault handling for the final child pid update.
+        */
+       if (current->flags & PF_EXITING)
+               goto out;
+
        /*
         * Check that we can return VM_FAULT_RETRY.
         *
index 07d0e47..4861322 100644 (file)
@@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
        bool trusted = capable(CAP_SYS_ADMIN);
        struct simple_xattr *xattr;
        ssize_t remaining_size = size;
-       int err;
+       int err = 0;
 
 #ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl) {
@@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
 
                err = xattr_list_one(&buffer, &remaining_size, xattr->name);
                if (err)
-                       return err;
+                       break;
        }
        spin_unlock(&xattrs->lock);
 
-       return size - remaining_size;
+       return err ? err : size - remaining_size;
 }
 
 /*
index 379c089..a9ebabf 100644 (file)
@@ -55,7 +55,7 @@ xfs_count_page_state(
        } while ((bh = bh->b_this_page) != head);
 }
 
-STATIC struct block_device *
+struct block_device *
 xfs_find_bdev_for_inode(
        struct inode            *inode)
 {
@@ -1208,6 +1208,10 @@ xfs_vm_writepages(
        struct writeback_control *wbc)
 {
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+       if (dax_mapping(mapping))
+               return dax_writeback_mapping_range(mapping,
+                               xfs_find_bdev_for_inode(mapping->host), wbc);
+
        return generic_writepages(mapping, wbc);
 }
 
index f6ffc9a..a4343c6 100644 (file)
@@ -62,5 +62,6 @@ int   xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
                                 struct buffer_head *map_bh, int create);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 
 #endif /* __XFS_AOPS_H__ */
index 45ec9e4..6c87601 100644 (file)
@@ -75,7 +75,8 @@ xfs_zero_extent(
        ssize_t         size = XFS_FSB_TO_B(mp, count_fsb);
 
        if (IS_DAX(VFS_I(ip)))
-               return dax_clear_blocks(VFS_I(ip), block, size);
+               return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
+                               sector, size);
 
        /*
         * let the block layer decide on the fastest method of
index da37beb..be55688 100644 (file)
@@ -1109,27 +1109,10 @@ xlog_verify_head(
        bool                    tmp_wrapped;
 
        /*
-        * Search backwards through the log looking for the log record header
-        * block. This wraps all the way back around to the head so something is
-        * seriously wrong if we can't find it.
-        */
-       found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
-                                     rhead, wrapped);
-       if (found < 0)
-               return found;
-       if (!found) {
-               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-               return -EIO;
-       }
-
-       *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
-
-       /*
-        * Now that we have a tail block, check the head of the log for torn
-        * writes. Search again until we hit the tail or the maximum number of
-        * log record I/Os that could have been in flight at one time. Use a
-        * temporary buffer so we don't trash the rhead/bp pointer from the
-        * call above.
+        * Check the head of the log for torn writes. Search backwards from the
+        * head until we hit the tail or the maximum number of log record I/Os
+        * that could have been in flight at one time. Use a temporary buffer so
+        * we don't trash the rhead/bp pointers from the caller.
         */
        tmp_bp = xlog_get_bp(log, 1);
        if (!tmp_bp)
@@ -1215,6 +1198,115 @@ xlog_verify_head(
        return error;
 }
 
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+       struct xlog             *log,
+       xfs_daddr_t             *head_blk,
+       xfs_daddr_t             *tail_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       struct xfs_buf          *bp,
+       bool                    *clean)
+{
+       struct xlog_op_header   *op_head;
+       xfs_daddr_t             umount_data_blk;
+       xfs_daddr_t             after_umount_blk;
+       int                     hblks;
+       int                     error;
+       char                    *offset;
+
+       *clean = false;
+
+       /*
+        * Look for unmount record. If we find it, then we know there was a
+        * clean unmount. Since 'i' could be the last block in the physical
+        * log, we convert to a log block before comparing to the head_blk.
+        *
+        * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+        * below. We won't want to clear the unmount record if there is one, so
+        * we pass the lsn of the unmount record rather than the block after it.
+        */
+       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+               int     h_size = be32_to_cpu(rhead->h_size);
+               int     h_version = be32_to_cpu(rhead->h_version);
+
+               if ((h_version & XLOG_VERSION_2) &&
+                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
+                               hblks++;
+               } else {
+                       hblks = 1;
+               }
+       } else {
+               hblks = 1;
+       }
+       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
+       if (*head_blk == after_umount_blk &&
+           be32_to_cpu(rhead->h_num_logops) == 1) {
+               umount_data_blk = rhead_blk + hblks;
+               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
+               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               if (error)
+                       return error;
+
+               op_head = (struct xlog_op_header *)offset;
+               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+                       /*
+                        * Set tail and last sync so that newly written log
+                        * records will point recovery to after the current
+                        * unmount record.
+                        */
+                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+                                       log->l_curr_cycle, after_umount_blk);
+                       *tail_blk = after_umount_blk;
+
+                       *clean = true;
+               }
+       }
+
+       return 0;
+}
+
+static void
+xlog_set_state(
+       struct xlog             *log,
+       xfs_daddr_t             head_blk,
+       struct xlog_rec_header  *rhead,
+       xfs_daddr_t             rhead_blk,
+       bool                    bump_cycle)
+{
+       /*
+        * Reset log values according to the state of the log when we
+        * crashed.  In the case where head_blk == 0, we bump curr_cycle
+        * one because the next write starts a new cycle rather than
+        * continuing the cycle of the last good log record.  At this
+        * point we have guaranteed that all partial log records have been
+        * accounted for.  Therefore, we know that the last good log record
+        * written was complete and ended exactly on the end boundary
+        * of the physical log.
+        */
+       log->l_prev_block = rhead_blk;
+       log->l_curr_block = (int)head_blk;
+       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+       if (bump_cycle)
+               log->l_curr_cycle++;
+       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+                                       BBTOB(log->l_curr_block));
+}
+
 /*
  * Find the sync block number or the tail of the log.
  *
@@ -1238,22 +1330,20 @@ xlog_find_tail(
        xfs_daddr_t             *tail_blk)
 {
        xlog_rec_header_t       *rhead;
-       xlog_op_header_t        *op_head;
        char                    *offset = NULL;
        xfs_buf_t               *bp;
        int                     error;
-       xfs_daddr_t             umount_data_blk;
-       xfs_daddr_t             after_umount_blk;
        xfs_daddr_t             rhead_blk;
        xfs_lsn_t               tail_lsn;
-       int                     hblks;
        bool                    wrapped = false;
+       bool                    clean = false;
 
        /*
         * Find previous log record
         */
        if ((error = xlog_find_head(log, head_blk)))
                return error;
+       ASSERT(*head_blk < INT_MAX);
 
        bp = xlog_get_bp(log, 1);
        if (!bp)
@@ -1271,99 +1361,74 @@ xlog_find_tail(
        }
 
        /*
-        * Trim the head block back to skip over torn records. We can have
-        * multiple log I/Os in flight at any time, so we assume CRC failures
-        * back through the previous several records are torn writes and skip
-        * them.
+        * Search backwards through the log looking for the log record header
+        * block. This wraps all the way back around to the head so something is
+        * seriously wrong if we can't find it.
         */
-       ASSERT(*head_blk < INT_MAX);
-       error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
-                                &rhead, &wrapped);
-       if (error)
-               goto done;
+       error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+                                     &rhead_blk, &rhead, &wrapped);
+       if (error < 0)
+               return error;
+       if (!error) {
+               xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               return -EIO;
+       }
+       *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 
        /*
-        * Reset log values according to the state of the log when we
-        * crashed.  In the case where head_blk == 0, we bump curr_cycle
-        * one because the next write starts a new cycle rather than
-        * continuing the cycle of the last good log record.  At this
-        * point we have guaranteed that all partial log records have been
-        * accounted for.  Therefore, we know that the last good log record
-        * written was complete and ended exactly on the end boundary
-        * of the physical log.
+        * Set the log state based on the current head record.
         */
-       log->l_prev_block = rhead_blk;
-       log->l_curr_block = (int)*head_blk;
-       log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
-       if (wrapped)
-               log->l_curr_cycle++;
-       atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-       atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
-       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
-                                       BBTOB(log->l_curr_block));
+       xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+       tail_lsn = atomic64_read(&log->l_tail_lsn);
 
        /*
-        * Look for unmount record.  If we find it, then we know there
-        * was a clean unmount.  Since 'i' could be the last block in
-        * the physical log, we convert to a log block before comparing
-        * to the head_blk.
+        * Look for an unmount record at the head of the log. This sets the log
+        * state to determine whether recovery is necessary.
+        */
+       error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+                                      rhead_blk, bp, &clean);
+       if (error)
+               goto done;
+
+       /*
+        * Verify the log head if the log is not clean (e.g., we have anything
+        * but an unmount record at the head). This uses CRC verification to
+        * detect and trim torn writes. If discovered, CRC failures are
+        * considered torn writes and the log head is trimmed accordingly.
         *
-        * Save the current tail lsn to use to pass to
-        * xlog_clear_stale_blocks() below.  We won't want to clear the
-        * unmount record if there is one, so we pass the lsn of the
-        * unmount record rather than the block after it.
+        * Note that we can only run CRC verification when the log is dirty
+        * because there's no guarantee that the log data behind an unmount
+        * record is compatible with the current architecture.
         */
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               int     h_size = be32_to_cpu(rhead->h_size);
-               int     h_version = be32_to_cpu(rhead->h_version);
+       if (!clean) {
+               xfs_daddr_t     orig_head = *head_blk;
 
-               if ((h_version & XLOG_VERSION_2) &&
-                   (h_size > XLOG_HEADER_CYCLE_SIZE)) {
-                       hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
-                       if (h_size % XLOG_HEADER_CYCLE_SIZE)
-                               hblks++;
-               } else {
-                       hblks = 1;
-               }
-       } else {
-               hblks = 1;
-       }
-       after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
-       after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
-       tail_lsn = atomic64_read(&log->l_tail_lsn);
-       if (*head_blk == after_umount_blk &&
-           be32_to_cpu(rhead->h_num_logops) == 1) {
-               umount_data_blk = rhead_blk + hblks;
-               umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
-               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               error = xlog_verify_head(log, head_blk, tail_blk, bp,
+                                        &rhead_blk, &rhead, &wrapped);
                if (error)
                        goto done;
 
-               op_head = (xlog_op_header_t *)offset;
-               if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
-                       /*
-                        * Set tail and last sync so that newly written
-                        * log records will point recovery to after the
-                        * current unmount record.
-                        */
-                       xlog_assign_atomic_lsn(&log->l_tail_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
-                                       log->l_curr_cycle, after_umount_blk);
-                       *tail_blk = after_umount_blk;
-
-                       /*
-                        * Note that the unmount was clean. If the unmount
-                        * was not clean, we need to know this to rebuild the
-                        * superblock counters from the perag headers if we
-                        * have a filesystem using non-persistent counters.
-                        */
-                       log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+               /* update in-core state again if the head changed */
+               if (*head_blk != orig_head) {
+                       xlog_set_state(log, *head_blk, rhead, rhead_blk,
+                                      wrapped);
+                       tail_lsn = atomic64_read(&log->l_tail_lsn);
+                       error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+                                                      rhead, rhead_blk, bp,
+                                                      &clean);
+                       if (error)
+                               goto done;
                }
        }
 
+       /*
+        * Note that the unmount was clean. If the unmount was not clean, we
+        * need to know this to rebuild the superblock counters from the perag
+        * headers if we have a filesystem using non-persistent counters.
+        */
+       if (clean)
+               log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
        /*
         * Make sure that there are no blocks in front of the head
         * with the same cycle number as the head.  This can happen
@@ -4491,7 +4556,7 @@ xlog_recover_process(
         * know precisely what failed.
         */
        if (pass == XLOG_RECOVER_CRCPASS) {
-               if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+               if (rhead->h_crc && crc != rhead->h_crc)
                        return -EFSBADCRC;
                return 0;
        }
@@ -4502,7 +4567,7 @@ xlog_recover_process(
         * zero CRC check prevents warnings from being emitted when upgrading
         * the kernel from one that does not add CRCs by default.
         */
-       if (crc != le32_to_cpu(rhead->h_crc)) {
+       if (crc != rhead->h_crc) {
                if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
                        xfs_alert(log->l_mp,
                "log record CRC mismatch: found 0x%x, expected 0x%x.",
index 717a298..dad8af3 100644 (file)
@@ -133,6 +133,5 @@ extern int acpi_get_psd_map(struct cpudata **);
 /* Methods to interact with the PCC mailbox controller. */
 extern struct mbox_chan *
        pcc_mbox_request_channel(struct mbox_client *, unsigned int);
-extern int mbox_send_message(struct mbox_chan *chan, void *mssg);
 
 #endif /* _CPPC_ACPI_H*/
index 0419485..0f1c6f3 100644 (file)
@@ -75,7 +75,7 @@ typedef u64 __nocast cputime64_t;
  */
 static inline cputime_t timespec_to_cputime(const struct timespec *val)
 {
-       u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
+       u64 ret = (u64)val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
        return (__force cputime_t) ret;
 }
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
@@ -91,7 +91,8 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
  */
 static inline cputime_t timeval_to_cputime(const struct timeval *val)
 {
-       u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
+       u64 ret = (u64)val->tv_sec * NSEC_PER_SEC +
+                       val->tv_usec * NSEC_PER_USEC;
        return (__force cputime_t) ret;
 }
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
index 0b3c0d3..c370b26 100644 (file)
@@ -239,6 +239,14 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
 #endif
 
+#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
+static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+                                          unsigned long address, pmd_t *pmdp)
+{
+
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t pte_a, pte_t pte_b)
 {
index 39e1cb2..35a52a8 100644 (file)
@@ -119,11 +119,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 }
 #endif
 
-/*
- * Initializier
- */
-#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
-
 /*
  * Remapping spinlock architecture specific functions to the corresponding
  * queued spinlock functions.
index 85f888e..034acd0 100644 (file)
@@ -32,6 +32,11 @@ typedef struct qspinlock {
        atomic_t        val;
 } arch_spinlock_t;
 
+/*
+ * Initializier
+ */
+#define        __ARCH_SPIN_LOCK_UNLOCKED       { ATOMIC_INIT(0) }
+
 /*
  * Bitfields in the atomic value:
  *
index c4bd0e2..772c784 100644 (file)
        .rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {           \
                VMLINUX_SYMBOL(__start_rodata) = .;                     \
                *(.rodata) *(.rodata.*)                                 \
+               *(.data..ro_after_init) /* Read only after init */      \
                *(__vermagic)           /* Kernel version magic */      \
                . = ALIGN(8);                                           \
                VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .;         \
index 89d008d..fe5efad 100644 (file)
@@ -42,6 +42,10 @@ int drm_atomic_helper_commit(struct drm_device *dev,
                             struct drm_atomic_state *state,
                             bool async);
 
+bool drm_atomic_helper_framebuffer_changed(struct drm_device *dev,
+                                          struct drm_atomic_state *old_state,
+                                          struct drm_crtc *crtc);
+
 void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev,
                                        struct drm_atomic_state *old_state);
 
index 7bfb063..461a055 100644 (file)
 
 void drm_clflush_pages(struct page *pages[], unsigned long num_pages);
 
+static inline bool drm_arch_can_wc_memory(void)
+{
+#if defined(CONFIG_PPC) && !defined(CONFIG_NOT_COHERENT_CACHE)
+       return false;
+#else
+       return true;
+#endif
+}
+
 #endif
index c65a212..c5b4b81 100644 (file)
@@ -1166,6 +1166,7 @@ struct drm_connector {
        struct drm_mode_object base;
 
        char *name;
+       int connector_id;
        int connector_type;
        int connector_type_id;
        bool interlace_allowed;
@@ -2047,6 +2048,7 @@ struct drm_mode_config {
        struct list_head fb_list;
 
        int num_connector;
+       struct ida connector_ida;
        struct list_head connector_list;
        int num_encoder;
        struct list_head encoder_list;
@@ -2200,7 +2202,11 @@ int drm_connector_register(struct drm_connector *connector);
 void drm_connector_unregister(struct drm_connector *connector);
 
 extern void drm_connector_cleanup(struct drm_connector *connector);
-extern unsigned int drm_connector_index(struct drm_connector *connector);
+static inline unsigned drm_connector_index(struct drm_connector *connector)
+{
+       return connector->connector_id;
+}
+
 /* helper to unplug all connectors from sysfs for device */
 extern void drm_connector_unplug_all(struct drm_device *dev);
 
index 24ab178..fdb4705 100644 (file)
@@ -44,8 +44,6 @@ struct drm_dp_vcpi {
 /**
  * struct drm_dp_mst_port - MST port
  * @kref: reference count for this port.
- * @guid_valid: for DP 1.2 devices if we have validated the GUID.
- * @guid: guid for DP 1.2 device on this port.
  * @port_num: port number
  * @input: if this port is an input port.
  * @mcs: message capability status - DP 1.2 spec.
@@ -70,10 +68,6 @@ struct drm_dp_vcpi {
 struct drm_dp_mst_port {
        struct kref kref;
 
-       /* if dpcd 1.2 device is on this port - its GUID info */
-       bool guid_valid;
-       u8 guid[16];
-
        u8 port_num;
        bool input;
        bool mcs;
@@ -110,10 +104,12 @@ struct drm_dp_mst_port {
  * @tx_slots: transmission slots for this device.
  * @last_seqno: last sequence number used to talk to this.
  * @link_address_sent: if a link address message has been sent to this device yet.
+ * @guid: guid for DP 1.2 branch device. port under this branch can be
+ * identified by port #.
  *
  * This structure represents an MST branch device, there is one
- * primary branch device at the root, along with any others connected
- * to downstream ports
+ * primary branch device at the root, along with any other branches connected
+ * to downstream port of parent branches.
  */
 struct drm_dp_mst_branch {
        struct kref kref;
@@ -132,6 +128,9 @@ struct drm_dp_mst_branch {
        struct drm_dp_sideband_msg_tx *tx_slots[2];
        int last_seqno;
        bool link_address_sent;
+
+       /* global unique identifier to identify branch devices */
+       u8 guid[16];
 };
 
 
@@ -406,11 +405,9 @@ struct drm_dp_payload {
  * @conn_base_id: DRM connector ID this mgr is connected to.
  * @down_rep_recv: msg receiver state for down replies.
  * @up_req_recv: msg receiver state for up requests.
- * @lock: protects mst state, primary, guid, dpcd.
+ * @lock: protects mst state, primary, dpcd.
  * @mst_state: if this manager is enabled for an MST capable port.
  * @mst_primary: pointer to the primary branch device.
- * @guid_valid: GUID valid for the primary branch device.
- * @guid: GUID for primary port.
  * @dpcd: cache of DPCD for primary port.
  * @pbn_div: PBN to slots divisor.
  *
@@ -432,13 +429,11 @@ struct drm_dp_mst_topology_mgr {
        struct drm_dp_sideband_msg_rx up_req_recv;
 
        /* pointer to info about the initial MST device */
-       struct mutex lock; /* protects mst_state + primary + guid + dpcd */
+       struct mutex lock; /* protects mst_state + primary + dpcd */
 
        bool mst_state;
        struct drm_dp_mst_branch *mst_primary;
-       /* primary MST device GUID */
-       bool guid_valid;
-       u8 guid[16];
+
        u8 dpcd[DP_RECEIVER_CAP_SIZE];
        u8 sink_count;
        int pbn_div;
index d639049..553210c 100644 (file)
@@ -73,18 +73,28 @@ static inline u32 dfixed_div(fixed20_12 A, fixed20_12 B)
 #define DRM_FIXED_ONE          (1ULL << DRM_FIXED_POINT)
 #define DRM_FIXED_DECIMAL_MASK (DRM_FIXED_ONE - 1)
 #define DRM_FIXED_DIGITS_MASK  (~DRM_FIXED_DECIMAL_MASK)
+#define DRM_FIXED_EPSILON      1LL
+#define DRM_FIXED_ALMOST_ONE   (DRM_FIXED_ONE - DRM_FIXED_EPSILON)
 
 static inline s64 drm_int2fixp(int a)
 {
        return ((s64)a) << DRM_FIXED_POINT;
 }
 
-static inline int drm_fixp2int(int64_t a)
+static inline int drm_fixp2int(s64 a)
 {
        return ((s64)a) >> DRM_FIXED_POINT;
 }
 
-static inline unsigned drm_fixp_msbset(int64_t a)
+static inline int drm_fixp2int_ceil(s64 a)
+{
+       if (a > 0)
+               return drm_fixp2int(a + DRM_FIXED_ALMOST_ONE);
+       else
+               return drm_fixp2int(a - DRM_FIXED_ALMOST_ONE);
+}
+
+static inline unsigned drm_fixp_msbset(s64 a)
 {
        unsigned shift, sign = (a >> 63) & 1;
 
@@ -136,6 +146,45 @@ static inline s64 drm_fixp_div(s64 a, s64 b)
        return result;
 }
 
+static inline s64 drm_fixp_from_fraction(s64 a, s64 b)
+{
+       s64 res;
+       bool a_neg = a < 0;
+       bool b_neg = b < 0;
+       u64 a_abs = a_neg ? -a : a;
+       u64 b_abs = b_neg ? -b : b;
+       u64 rem;
+
+       /* determine integer part */
+       u64 res_abs  = div64_u64_rem(a_abs, b_abs, &rem);
+
+       /* determine fractional part */
+       {
+               u32 i = DRM_FIXED_POINT;
+
+               do {
+                       rem <<= 1;
+                       res_abs <<= 1;
+                       if (rem >= b_abs) {
+                               res_abs |= 1;
+                               rem -= b_abs;
+                       }
+               } while (--i != 0);
+       }
+
+       /* round up LSB */
+       {
+               u64 summand = (rem << 1) >= b_abs;
+
+               res_abs += summand;
+       }
+
+       res = (s64) res_abs;
+       if (a_neg ^ b_neg)
+               res = -res;
+       return res;
+}
+
 static inline s64 drm_fixp_exp(s64 x)
 {
        s64 tolerance = div64_s64(DRM_FIXED_ONE, 1000000);
index 6f45aea..0a05b0d 100644 (file)
 /* 104 */
 /* 105 */
 #define TEGRA210_CLK_D_AUDIO 106
-/* 107 ( affects abp -> ape) */
+#define TEGRA210_CLK_APB2APE 107
 /* 108 */
 /* 109 */
 /* 110 */
index d2992bf..c1a2f34 100644 (file)
@@ -487,8 +487,8 @@ enum ata_tf_protocols {
 };
 
 enum ata_ioctls {
-       ATA_IOC_GET_IO32        = 0x309,
-       ATA_IOC_SET_IO32        = 0x324,
+       ATA_IOC_GET_IO32        = 0x309, /* HDIO_GET_32BIT */
+       ATA_IOC_SET_IO32        = 0x324, /* HDIO_SET_32BIT */
 };
 
 /* core structures */
index 301de78..6c502cb 100644 (file)
@@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 }
 #endif
 
+/**
+ * fetch_or - perform *ptr |= mask and return old value of *ptr
+ * @ptr: pointer to value
+ * @mask: mask to OR on the value
+ *
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#ifndef fetch_or
+#define fetch_or(ptr, mask)                                            \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (mask));          \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+#endif
+
+
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
 #endif
index 5349e68..88bc64f 100644 (file)
@@ -310,6 +310,38 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
        bio->bi_flags &= ~(1U << bit);
 }
 
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       *bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+       struct bvec_iter iter = bio->bi_iter;
+       int idx;
+
+       if (unlikely(!bio_multiple_segments(bio))) {
+               *bv = bio_iovec(bio);
+               return;
+       }
+
+       bio_advance_iter(bio, &iter, iter.bi_size);
+
+       if (!iter.bi_bvec_done)
+               idx = iter.bi_idx - 1;
+       else    /* in the middle of bvec */
+               idx = iter.bi_idx;
+
+       *bv = bio->bi_io_vec[idx];
+
+       /*
+        * iter.bi_bvec_done records actual length of the last bvec
+        * if this bio ends in the middle of one io vector
+        */
+       if (iter.bi_bvec_done)
+               bv->bv_len = iter.bi_bvec_done;
+}
+
 enum bip_flags {
        BIP_BLOCK_INTEGRITY     = 1 << 0, /* block layer owns integrity data */
        BIP_MAPPED_INTEGRITY    = 1 << 1, /* ref tag has been remapped */
index 29189ae..413c84f 100644 (file)
@@ -682,9 +682,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
 /*
  * q->prep_rq_fn return values
  */
-#define BLKPREP_OK             0       /* serve it */
-#define BLKPREP_KILL           1       /* fatal error, kill */
-#define BLKPREP_DEFER          2       /* leave on queue */
+enum {
+       BLKPREP_OK,             /* serve it */
+       BLKPREP_KILL,           /* fatal error, kill, return -EIO */
+       BLKPREP_DEFER,          /* leave on queue */
+       BLKPREP_INVALID,        /* invalid command, kill, return -EREMOTEIO */
+};
 
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
@@ -892,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
-       if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+       if (unlikely(rq->cmd_type != REQ_TYPE_FS))
                return q->limits.max_hw_sectors;
 
        if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
@@ -1369,6 +1372,13 @@ static inline void put_dev_sector(Sector p)
        page_cache_release(p.v);
 }
 
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+                               struct bio_vec *bprv, unsigned int offset)
+{
+       return offset ||
+               ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
 /*
  * Check if adding a bio_vec after bprv with offset would create a gap in
  * the SG list. Most drivers don't care about this, but some do.
@@ -1378,18 +1388,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 {
        if (!queue_virt_boundary(q))
                return false;
-       return offset ||
-               ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+       return __bvec_gap_to_prev(q, bprv, offset);
 }
 
 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
                         struct bio *next)
 {
-       if (!bio_has_data(prev))
-               return false;
+       if (bio_has_data(prev) && queue_virt_boundary(q)) {
+               struct bio_vec pb, nb;
+
+               bio_get_last_bvec(prev, &pb);
+               bio_get_first_bvec(next, &nb);
 
-       return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
-                               next->bi_io_vec[0].bv_offset);
+               return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+       }
+
+       return false;
 }
 
 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
index 17e7e82..1be04f8 100644 (file)
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
 #ifndef __read_mostly
 #define __read_mostly
 #endif
 
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
 #ifndef ____cacheline_aligned
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
 #endif
index f89b31d..15151f3 100644 (file)
 #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
+#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
+#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
+#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
+#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
+#define CEPH_FEATURE_NEW_OSDOP_ENCODING   (1ULL<<56) /* New, v7 encoding */
+#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
+#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
+#define CEPH_FEATURE_CRUSH_TUNABLES5   (1ULL<<58) /* chooseleaf stable mode */
+// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
+#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING   (1ULL<<58) /* New, v7 encoding */
+#define CEPH_FEATURE_FS_FILE_LAYOUT_V2       (1ULL<<58) /* file_layout_t */
 
 /*
  * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -108,7 +121,9 @@ static inline u64 ceph_sanitize_features(u64 features)
         CEPH_FEATURE_CRUSH_TUNABLES3 |         \
         CEPH_FEATURE_OSD_PRIMARY_AFFINITY |    \
         CEPH_FEATURE_MSGR_KEEPALIVE2 |         \
-        CEPH_FEATURE_CRUSH_V4)
+        CEPH_FEATURE_CRUSH_V4 |                \
+        CEPH_FEATURE_CRUSH_TUNABLES5 |         \
+        CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
        (CEPH_FEATURE_NOSRCADDR |        \
index 7f540f7..789471d 100644 (file)
@@ -127,6 +127,12 @@ struct cgroup_subsys_state {
         */
        u64 serial_nr;
 
+       /*
+        * Incremented by online self and children.  Used to guarantee that
+        * parents are not offlined before their children.
+        */
+       atomic_t online_cnt;
+
        /* percpu_ref killing and RCU release */
        struct rcu_head rcu_head;
        struct work_struct destroy_work;
index bda5ec0..fccf7f4 100644 (file)
@@ -37,7 +37,7 @@ struct cleancache_ops {
        void (*invalidate_fs)(int);
 };
 
-extern int cleancache_register_ops(struct cleancache_ops *ops);
+extern int cleancache_register_ops(const struct cleancache_ops *ops);
 extern void __cleancache_init_fs(struct super_block *);
 extern void __cleancache_init_shared_fs(struct super_block *);
 extern int  __cleancache_get_page(struct page *);
@@ -48,14 +48,14 @@ extern void __cleancache_invalidate_fs(struct super_block *);
 
 #ifdef CONFIG_CLEANCACHE
 #define cleancache_enabled (1)
-static inline bool cleancache_fs_enabled(struct page *page)
-{
-       return page->mapping->host->i_sb->cleancache_poolid >= 0;
-}
 static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
 {
        return mapping->host->i_sb->cleancache_poolid >= 0;
 }
+static inline bool cleancache_fs_enabled(struct page *page)
+{
+       return cleancache_fs_enabled_mapping(page->mapping);
+}
 #else
 #define cleancache_enabled (0)
 #define cleancache_fs_enabled(_page) (0)
@@ -89,11 +89,9 @@ static inline void cleancache_init_shared_fs(struct super_block *sb)
 
 static inline int cleancache_get_page(struct page *page)
 {
-       int ret = -1;
-
        if (cleancache_enabled && cleancache_fs_enabled(page))
-               ret = __cleancache_get_page(page);
-       return ret;
+               return __cleancache_get_page(page);
+       return -1;
 }
 
 static inline void cleancache_put_page(struct page *page)
index bdcf358..0d442e3 100644 (file)
@@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev,
 extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);
 
 static inline void
-clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec)
+clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec)
 {
-       return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec);
+       return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec);
 }
 
 extern void clockevents_suspend(void);
index 6013021..a307bf6 100644 (file)
@@ -118,6 +118,23 @@ struct clocksource {
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
 
+static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
+{
+       /*  freq = cyc/from
+        *  mult/2^shift  = ns/cyc
+        *  mult = ns/cyc * 2^shift
+        *  mult = from/freq * 2^shift
+        *  mult = from * 2^shift / freq
+        *  mult = (from<<shift) / freq
+        */
+       u64 tmp = ((u64)from) << shift_constant;
+
+       tmp += freq/2; /* round for do_div */
+       do_div(tmp, freq);
+
+       return (u32)tmp;
+}
+
 /**
  * clocksource_khz2mult - calculates mult from khz and shift
  * @khz:               Clocksource frequency in KHz
@@ -128,19 +145,7 @@ struct clocksource {
  */
 static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
 {
-       /*  khz = cyc/(Million ns)
-        *  mult/2^shift  = ns/cyc
-        *  mult = ns/cyc * 2^shift
-        *  mult = 1Million/khz * 2^shift
-        *  mult = 1000000 * 2^shift / khz
-        *  mult = (1000000<<shift) / khz
-        */
-       u64 tmp = ((u64)1000000) << shift_constant;
-
-       tmp += khz/2; /* round for do_div */
-       do_div(tmp, khz);
-
-       return (u32)tmp;
+       return clocksource_freq2mult(khz, shift_constant, NSEC_PER_MSEC);
 }
 
 /**
@@ -154,19 +159,7 @@ static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
  */
 static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
 {
-       /*  hz = cyc/(Billion ns)
-        *  mult/2^shift  = ns/cyc
-        *  mult = ns/cyc * 2^shift
-        *  mult = 1Billion/hz * 2^shift
-        *  mult = 1000000000 * 2^shift / hz
-        *  mult = (1000000000<<shift) / hz
-        */
-       u64 tmp = ((u64)1000000000) << shift_constant;
-
-       tmp += hz/2; /* round for do_div */
-       do_div(tmp, hz);
-
-       return (u32)tmp;
+       return clocksource_freq2mult(hz, shift_constant, NSEC_PER_SEC);
 }
 
 /**
index 00b042c..b5ff988 100644 (file)
 # define __pmem                __attribute__((noderef, address_space(5)))
 #ifdef CONFIG_SPARSE_RCU_POINTER
 # define __rcu         __attribute__((noderef, address_space(4)))
-#else
+#else /* CONFIG_SPARSE_RCU_POINTER */
 # define __rcu
-#endif
+#endif /* CONFIG_SPARSE_RCU_POINTER */
+# define __private     __attribute__((noderef))
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
-#else
+# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member))
+#else /* __CHECKER__ */
 # define __user
 # define __kernel
 # define __safe
@@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # define __percpu
 # define __rcu
 # define __pmem
-#endif
+# define __private
+# define ACCESS_PRIVATE(p, member) ((p)->member)
+#endif /* __CHECKER__ */
 
 /* Indirect macros required for expanded argument pasting, eg. __LINE__. */
 #define ___PASTE(a,b) a##b
@@ -144,7 +148,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  */
 #define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
 #define __trace_if(cond) \
-       if (__builtin_constant_p((cond)) ? !!(cond) :                   \
+       if (__builtin_constant_p(!!(cond)) ? !!(cond) :                 \
        ({                                                              \
                int ______r;                                            \
                static struct ftrace_branch_data                        \
@@ -263,8 +267,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * In contrast to ACCESS_ONCE these two macros will also work on aggregate
  * data types like structs or unions. If the size of the accessed data
  * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
- * READ_ONCE() and WRITE_ONCE()  will fall back to memcpy and print a
- * compile-time warning.
+ * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at
+ * least two memcpy()s: one for the __builtin_memcpy() and then one for
+ * the macro doing the copy of variable - '__u' allocated on the stack.
  *
  * Their two major use cases are: (1) Mediating communication between
  * process-level code and irq/NMI handlers, all running on the same CPU,
index d2ca8c3..f9b1fab 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
+#include <linux/cpuhotplug.h>
 
 struct device;
 struct device_node;
@@ -27,6 +28,9 @@ struct cpu {
        struct device dev;
 };
 
+extern void boot_cpu_init(void);
+extern void boot_cpu_state_init(void);
+
 extern int register_cpu(struct cpu *cpu, int num);
 extern struct device *get_cpu_device(unsigned cpu);
 extern bool cpu_is_hotpluggable(unsigned cpu);
@@ -74,7 +78,7 @@ enum {
        /* migration should happen before other stuff but after perf */
        CPU_PRI_PERF            = 20,
        CPU_PRI_MIGRATION       = 10,
-       CPU_PRI_SMPBOOT         = 9,
+
        /* bring up workqueues before normal notifiers and down after */
        CPU_PRI_WORKQUEUE_UP    = 5,
        CPU_PRI_WORKQUEUE_DOWN  = -5,
@@ -97,9 +101,7 @@ enum {
                                        * Called on the new cpu, just before
                                        * enabling interrupts. Must not sleep,
                                        * must not fail */
-#define CPU_DYING_IDLE         0x000B /* CPU (unsigned)v dying, reached
-                                       * idle loop. */
-#define CPU_BROKEN             0x000C /* CPU (unsigned)v did not die properly,
+#define CPU_BROKEN             0x000B /* CPU (unsigned)v did not die properly,
                                        * perhaps due to preemption. */
 
 /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
@@ -118,6 +120,7 @@ enum {
 
 
 #ifdef CONFIG_SMP
+extern bool cpuhp_tasks_frozen;
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
 #define cpu_notifier(fn, pri) {                                        \
@@ -167,7 +170,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb)
 }
 #endif
 
-void smpboot_thread_init(void);
 int cpu_up(unsigned int cpu);
 void notify_cpu_starting(unsigned int cpu);
 extern void cpu_maps_update_begin(void);
@@ -177,6 +179,7 @@ extern void cpu_maps_update_done(void);
 #define cpu_notifier_register_done     cpu_maps_update_done
 
 #else  /* CONFIG_SMP */
+#define cpuhp_tasks_frozen     0
 
 #define cpu_notifier(fn, pri)  do { (void)(fn); } while (0)
 #define __cpu_notifier(fn, pri)        do { (void)(fn); } while (0)
@@ -215,10 +218,6 @@ static inline void cpu_notifier_register_done(void)
 {
 }
 
-static inline void smpboot_thread_init(void)
-{
-}
-
 #endif /* CONFIG_SMP */
 extern struct bus_type cpu_subsys;
 
@@ -265,11 +264,6 @@ static inline int disable_nonboot_cpus(void) { return 0; }
 static inline void enable_nonboot_cpus(void) {}
 #endif /* !CONFIG_PM_SLEEP_SMP */
 
-enum cpuhp_state {
-       CPUHP_OFFLINE,
-       CPUHP_ONLINE,
-};
-
 void cpu_startup_entry(enum cpuhp_state state);
 
 void cpu_idle_poll_ctrl(bool enable);
@@ -280,14 +274,15 @@ void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
 void arch_cpu_idle_dead(void);
 
-DECLARE_PER_CPU(bool, cpu_dead_idle);
-
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
+void cpuhp_report_idle_dead(void);
+#else
+static inline void cpuhp_report_idle_dead(void) { }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
new file mode 100644 (file)
index 0000000..5d68e15
--- /dev/null
@@ -0,0 +1,93 @@
+#ifndef __CPUHOTPLUG_H
+#define __CPUHOTPLUG_H
+
+enum cpuhp_state {
+       CPUHP_OFFLINE,
+       CPUHP_CREATE_THREADS,
+       CPUHP_NOTIFY_PREPARE,
+       CPUHP_BRINGUP_CPU,
+       CPUHP_AP_IDLE_DEAD,
+       CPUHP_AP_OFFLINE,
+       CPUHP_AP_NOTIFY_STARTING,
+       CPUHP_AP_ONLINE,
+       CPUHP_TEARDOWN_CPU,
+       CPUHP_AP_ONLINE_IDLE,
+       CPUHP_AP_SMPBOOT_THREADS,
+       CPUHP_AP_NOTIFY_ONLINE,
+       CPUHP_AP_ONLINE_DYN,
+       CPUHP_AP_ONLINE_DYN_END         = CPUHP_AP_ONLINE_DYN + 30,
+       CPUHP_ONLINE,
+};
+
+int __cpuhp_setup_state(enum cpuhp_state state,        const char *name, bool invoke,
+                       int (*startup)(unsigned int cpu),
+                       int (*teardown)(unsigned int cpu));
+
+/**
+ * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
+ * @state:     The state for which the calls are installed
+ * @name:      Name of the callback (will be used in debug output)
+ * @startup:   startup callback function
+ * @teardown:  teardown callback function
+ *
+ * Installs the callback functions and invokes the startup callback on
+ * the present cpus which have already reached the @state.
+ */
+static inline int cpuhp_setup_state(enum cpuhp_state state,
+                                   const char *name,
+                                   int (*startup)(unsigned int cpu),
+                                   int (*teardown)(unsigned int cpu))
+{
+       return __cpuhp_setup_state(state, name, true, startup, teardown);
+}
+
+/**
+ * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
+ *                            callbacks
+ * @state:     The state for which the calls are installed
+ * @name:      Name of the callback.
+ * @startup:   startup callback function
+ * @teardown:  teardown callback function
+ *
+ * Same as @cpuhp_setup_state except that no calls are executed are invoked
+ * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
+ */
+static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
+                                           const char *name,
+                                           int (*startup)(unsigned int cpu),
+                                           int (*teardown)(unsigned int cpu))
+{
+       return __cpuhp_setup_state(state, name, false, startup, teardown);
+}
+
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
+
+/**
+ * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
+ * @state:     The state for which the calls are removed
+ *
+ * Removes the callback functions and invokes the teardown callback on
+ * the present cpus which have already reached the @state.
+ */
+static inline void cpuhp_remove_state(enum cpuhp_state state)
+{
+       __cpuhp_remove_state(state, true);
+}
+
+/**
+ * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
+ *                             teardown
+ * @state:     The state for which the calls are removed
+ */
+static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
+{
+       __cpuhp_remove_state(state, false);
+}
+
+#ifdef CONFIG_SMP
+void cpuhp_online_idle(enum cpuhp_state state);
+#else
+static inline void cpuhp_online_idle(enum cpuhp_state state) { }
+#endif
+
+#endif
index 85a868c..fea160e 100644 (file)
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
        task_unlock(current);
 }
 
+extern void cpuset_post_attach_flush(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
        return false;
 }
 
+static inline void cpuset_post_attach_flush(void)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
index 48b4930..be8f12b 100644 (file)
@@ -59,7 +59,8 @@ enum {
        CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
        CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
        CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
-       CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
+       CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
+       CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
 };
 
 /*
@@ -205,6 +206,11 @@ struct crush_map {
         * mappings line up a bit better with previous mappings. */
        __u8 chooseleaf_vary_r;
 
+       /* if true, it makes chooseleaf firstn to return stable results (if
+        * no local retry) so that data migrations would be optimal when some
+        * device fails. */
+       __u8 chooseleaf_stable;
+
 #ifndef __KERNEL__
        /*
         * version 0 (original) of straw_calc has various flaws.  version 1
index 8204c3d..636dd59 100644 (file)
@@ -7,13 +7,24 @@
 
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
                  get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
                dax_iodone_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
                dax_iodone_t);
+
+#ifdef CONFIG_FS_DAX
+struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+#else
+static inline struct page *read_dax_sector(struct block_device *bdev,
+               sector_t n)
+{
+       return ERR_PTR(-ENXIO);
+}
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
                                unsigned int flags, get_block_t, dax_iodone_t);
@@ -41,6 +52,8 @@ static inline bool dax_mapping(struct address_space *mapping)
 {
        return mapping->host && IS_DAX(mapping->host);
 }
-int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
-               loff_t end);
+
+struct writeback_control;
+int dax_writeback_mapping_range(struct address_space *mapping,
+               struct block_device *bdev, struct writeback_control *wbc);
 #endif
index 7781ce1..c4b5f4b 100644 (file)
@@ -409,9 +409,7 @@ static inline bool d_mountpoint(const struct dentry *dentry)
  */
 static inline unsigned __d_entry_type(const struct dentry *dentry)
 {
-       unsigned type = READ_ONCE(dentry->d_flags);
-       smp_rmb();
-       return type & DCACHE_ENTRY_TYPE;
+       return dentry->d_flags & DCACHE_ENTRY_TYPE;
 }
 
 static inline bool d_is_miss(const struct dentry *dentry)
index 251a209..e0ee0b3 100644 (file)
@@ -19,6 +19,8 @@
 
 int devpts_new_index(struct inode *ptmx_inode);
 void devpts_kill_index(struct inode *ptmx_inode, int idx);
+void devpts_add_ref(struct inode *ptmx_inode);
+void devpts_del_ref(struct inode *ptmx_inode);
 /* mknod in devpts */
 struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
                void *priv);
@@ -32,6 +34,8 @@ void devpts_pty_kill(struct inode *inode);
 /* Dummy stubs in the no-pty case */
 static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; }
 static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { }
+static inline void devpts_add_ref(struct inode *ptmx_inode) { }
+static inline void devpts_del_ref(struct inode *ptmx_inode) { }
 static inline struct inode *devpts_pty_new(struct inode *ptmx_inode,
                dev_t device, int index, void *priv)
 {
index 75857cd..5e45cf9 100644 (file)
@@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
        if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
                return;
 
-       if (!ops->free)
+       if (!ops->free || !cpu_addr)
                return;
 
        debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
@@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev)
 }
 #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */
 
-static inline void *dma_alloc_writecombine(struct device *dev, size_t size,
-                                          dma_addr_t *dma_addr, gfp_t gfp)
+static inline void *dma_alloc_wc(struct device *dev, size_t size,
+                                dma_addr_t *dma_addr, gfp_t gfp)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs);
 }
+#ifndef dma_alloc_writecombine
+#define dma_alloc_writecombine dma_alloc_wc
+#endif
 
-static inline void dma_free_writecombine(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_addr)
+static inline void dma_free_wc(struct device *dev, size_t size,
+                              void *cpu_addr, dma_addr_t dma_addr)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs);
 }
+#ifndef dma_free_writecombine
+#define dma_free_writecombine dma_free_wc
+#endif
 
-static inline int dma_mmap_writecombine(struct device *dev,
-                                       struct vm_area_struct *vma,
-                                       void *cpu_addr, dma_addr_t dma_addr,
-                                       size_t size)
+static inline int dma_mmap_wc(struct device *dev,
+                             struct vm_area_struct *vma,
+                             void *cpu_addr, dma_addr_t dma_addr,
+                             size_t size)
 {
        DEFINE_DMA_ATTRS(attrs);
        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
        return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs);
 }
+#ifndef dma_mmap_writecombine
+#define dma_mmap_writecombine dma_mmap_wc
+#endif
 
 #ifdef CONFIG_NEED_DMA_MAP_STATE
 #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME)        dma_addr_t ADDR_NAME
index 569b5a8..47be3ad 100644 (file)
@@ -1199,7 +1199,10 @@ int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
 struct efivar_entry *efivar_entry_find(efi_char16_t *name, efi_guid_t guid,
                                       struct list_head *head, bool remove);
 
-bool efivar_validate(efi_char16_t *var_name, u8 *data, unsigned long len);
+bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+                    unsigned long data_size);
+bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
+                                 size_t len);
 
 extern struct work_struct efivar_work;
 void efivar_run_worker(void);
index 1a20462..ae68100 100644 (file)
@@ -484,9 +484,6 @@ struct block_device {
        int                     bd_fsfreeze_count;
        /* Mutex for freeze */
        struct mutex            bd_fsfreeze_mutex;
-#ifdef CONFIG_FS_DAX
-       int                     bd_map_count;
-#endif
 };
 
 /*
@@ -2907,7 +2904,7 @@ extern void replace_mount_options(struct super_block *sb, char *options);
 
 static inline bool io_is_direct(struct file *filp)
 {
-       return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
+       return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
 }
 
 static inline int iocb_flags(struct file *file)
index 6b7e89f..533c440 100644 (file)
@@ -220,10 +220,7 @@ struct fsnotify_mark {
        /* List of marks by group->i_fsnotify_marks. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
-       union {
-               struct list_head g_list;
-               struct rcu_head g_rcu;
-       };
+       struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [obj_lock] */
index 0639dcc..6d9df3f 100644 (file)
@@ -165,7 +165,6 @@ struct ftrace_ops {
        ftrace_func_t                   saved_func;
        int __percpu                    *disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
-       int                             nr_trampolines;
        struct ftrace_ops_hash          local_hash;
        struct ftrace_ops_hash          *func_hash;
        struct ftrace_ops_hash          old_hash;
@@ -604,6 +603,7 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 extern void ftrace_module_init(struct module *mod);
+extern void ftrace_module_enable(struct module *mod);
 extern void ftrace_release_mod(struct module *mod);
 
 extern void ftrace_disable_daemon(void);
@@ -613,8 +613,9 @@ static inline int skip_trace(unsigned long ip) { return 0; }
 static inline int ftrace_force_update(void) { return 0; }
 static inline void ftrace_disable_daemon(void) { }
 static inline void ftrace_enable_daemon(void) { }
-static inline void ftrace_release_mod(struct module *mod) {}
-static inline void ftrace_module_init(struct module *mod) {}
+static inline void ftrace_module_init(struct module *mod) { }
+static inline void ftrace_module_enable(struct module *mod) { }
+static inline void ftrace_release_mod(struct module *mod) { }
 static inline __init int register_ftrace_command(struct ftrace_func_command *cmd)
 {
        return -EINVAL;
@@ -712,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
 #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
 #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
 
+static inline unsigned long get_lock_parent_ip(void)
+{
+       unsigned long addr = CALLER_ADDR0;
+
+       if (!in_lock_functions(addr))
+               return addr;
+       addr = CALLER_ADDR1;
+       if (!in_lock_functions(addr))
+               return addr;
+       return CALLER_ADDR2;
+}
+
 #ifdef CONFIG_IRQSOFF_TRACER
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
   extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
index 28ad5f6..af1f2b2 100644 (file)
@@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-#ifdef CONFIG_CMA
-
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
                              unsigned migratetype);
 extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
+#endif
 
+#ifdef CONFIG_CMA
 /* CMA stuff */
 extern void init_cma_reserved_pageblock(struct page *page);
-
 #endif
 
 #endif /* __LINUX_GFP_H */
index 76dd4f0..2ead22d 100644 (file)
@@ -87,7 +87,8 @@ enum hrtimer_restart {
  * @function:  timer expiry callback function
  * @base:      pointer to the timer base (per cpu and per clock)
  * @state:     state information (See bit values above)
- * @start_pid: timer statistics field to store the pid of the task which
+ * @is_rel:    Set if the timer was armed relative
+ * @start_pid:  timer statistics field to store the pid of the task which
  *             started the timer
  * @start_site:        timer statistics field to store the site where the timer
  *             was started
@@ -101,7 +102,8 @@ struct hrtimer {
        ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
-       unsigned long                   state;
+       u8                              state;
+       u8                              is_rel;
 #ifdef CONFIG_TIMER_STATS
        int                             start_pid;
        void                            *start_site;
@@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { }
 
 #endif
 
+static inline ktime_t
+__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now)
+{
+       ktime_t rem = ktime_sub(timer->node.expires, now);
+
+       /*
+        * Adjust relative timers for the extra we added in
+        * hrtimer_start_range_ns() to prevent short timeouts.
+        */
+       if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel)
+               rem.tv64 -= hrtimer_resolution;
+       return rem;
+}
+
+static inline ktime_t
+hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
+{
+       return __hrtimer_expires_remaining_adjusted(timer,
+                                                   timer->base->get_time());
+}
+
 extern void clock_was_set(void);
 #ifdef CONFIG_TIMERFD
 extern void timerfd_clock_was_set(void);
@@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer)
 }
 
 /* Query timers: */
-extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
+extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
+
+static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+{
+       return __hrtimer_get_remaining(timer, false);
+}
 
 extern u64 hrtimer_get_next_event(void);
 
index b449f37..aedb254 100644 (file)
@@ -142,6 +142,10 @@ void prepare_namespace(void);
 void __init load_default_modules(void);
 int __init init_rootfs(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
 extern void (*late_time_init)(void);
 
 extern bool initcall_debug;
index 821273c..2d9b650 100644 (file)
@@ -235,6 +235,9 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 /* low 64 bit */
 #define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
 
+/* PRS_REG */
+#define DMA_PRS_PPR    ((u32)1)
+
 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)                    \
 do {                                                                   \
        cycles_t start_time = get_cycles();                             \
index f28dff3..a5c539f 100644 (file)
@@ -133,8 +133,9 @@ struct iommu_dm_region {
 
 /**
  * struct iommu_ops - iommu ops and capabilities
- * @domain_init: init iommu domain
- * @domain_destroy: destroy iommu domain
+ * @capable: check capability
+ * @domain_alloc: allocate iommu domain
+ * @domain_free: free iommu domain
  * @attach_dev: attach device to an iommu domain
  * @detach_dev: detach device from an iommu domain
  * @map: map a physically contiguous memory region to an iommu domain
@@ -144,8 +145,15 @@ struct iommu_dm_region {
  * @iova_to_phys: translate iova to physical address
  * @add_device: add device to iommu grouping
  * @remove_device: remove device from iommu grouping
+ * @device_group: find iommu group for a particular device
  * @domain_get_attr: Query domain attributes
  * @domain_set_attr: Change domain attributes
+ * @get_dm_regions: Request list of direct mapping requirements for a device
+ * @put_dm_regions: Free list of direct mapping requirements for a device
+ * @domain_window_enable: Configure and enable a particular window for a domain
+ * @domain_window_disable: Disable a particular window for a domain
+ * @domain_set_windows: Set the number of windows for a domain
+ * @domain_get_windows: Return the number of windows for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @pgsize_bitmap: bitmap of supported page sizes
  * @priv: per-instance data private to the iommu driver
@@ -182,9 +190,9 @@ struct iommu_ops {
        int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
                                    phys_addr_t paddr, u64 size, int prot);
        void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr);
-       /* Set the numer of window per domain */
+       /* Set the number of windows per domain */
        int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count);
-       /* Get the numer of window per domain */
+       /* Get the number of windows per domain */
        u32 (*domain_get_windows)(struct iommu_domain *domain);
 
 #ifdef CONFIG_OF_IOMMU
index 24bea08..afb4559 100644 (file)
@@ -20,6 +20,7 @@ struct resource {
        resource_size_t end;
        const char *name;
        unsigned long flags;
+       unsigned long desc;
        struct resource *parent, *sibling, *child;
 };
 
@@ -49,12 +50,19 @@ struct resource {
 #define IORESOURCE_WINDOW      0x00200000      /* forwarded by bridge */
 #define IORESOURCE_MUXED       0x00400000      /* Resource is software muxed */
 
+#define IORESOURCE_EXT_TYPE_BITS 0x01000000    /* Resource extended types */
+#define IORESOURCE_SYSRAM      0x01000000      /* System RAM (modifier) */
+
 #define IORESOURCE_EXCLUSIVE   0x08000000      /* Userland may not map this resource */
+
 #define IORESOURCE_DISABLED    0x10000000
 #define IORESOURCE_UNSET       0x20000000      /* No address assigned yet */
 #define IORESOURCE_AUTO                0x40000000
 #define IORESOURCE_BUSY                0x80000000      /* Driver has marked this resource busy */
 
+/* I/O resource extended types */
+#define IORESOURCE_SYSTEM_RAM          (IORESOURCE_MEM|IORESOURCE_SYSRAM)
+
 /* PnP IRQ specific bits (IORESOURCE_BITS) */
 #define IORESOURCE_IRQ_HIGHEDGE                (1<<0)
 #define IORESOURCE_IRQ_LOWEDGE         (1<<1)
@@ -105,6 +113,22 @@ struct resource {
 /* PCI control bits.  Shares IORESOURCE_BITS with above PCI ROM.  */
 #define IORESOURCE_PCI_FIXED           (1<<4)  /* Do not move resource */
 
+/*
+ * I/O Resource Descriptors
+ *
+ * Descriptors are used by walk_iomem_res_desc() and region_intersects()
+ * for searching a specific resource range in the iomem table.  Assign
+ * a new descriptor when a resource range supports the search interfaces.
+ * Otherwise, resource.desc must be set to IORES_DESC_NONE (0).
+ */
+enum {
+       IORES_DESC_NONE                         = 0,
+       IORES_DESC_CRASH_KERNEL                 = 1,
+       IORES_DESC_ACPI_TABLES                  = 2,
+       IORES_DESC_ACPI_NV_STORAGE              = 3,
+       IORES_DESC_PERSISTENT_MEMORY            = 4,
+       IORES_DESC_PERSISTENT_MEMORY_LEGACY     = 5,
+};
 
 /* helpers to define resources */
 #define DEFINE_RES_NAMED(_start, _size, _name, _flags)                 \
@@ -113,6 +137,7 @@ struct resource {
                .end = (_start) + (_size) - 1,                          \
                .name = (_name),                                        \
                .flags = (_flags),                                      \
+               .desc = IORES_DESC_NONE,                                \
        }
 
 #define DEFINE_RES_IO_NAMED(_start, _size, _name)                      \
@@ -170,6 +195,10 @@ static inline unsigned long resource_type(const struct resource *res)
 {
        return res->flags & IORESOURCE_TYPE_BITS;
 }
+static inline unsigned long resource_ext_type(const struct resource *res)
+{
+       return res->flags & IORESOURCE_EXT_TYPE_BITS;
+}
 /* True iff r1 completely contains r2 */
 static inline bool resource_contains(struct resource *r1, struct resource *r2)
 {
@@ -239,8 +268,8 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
                    int (*func)(u64, u64, void *));
 extern int
-walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg,
-              int (*func)(u64, u64, void *));
+walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
+                   void *arg, int (*func)(u64, u64, void *));
 
 /* True if any part of r1 overlaps r2 */
 static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
index 3c1c967..c4de623 100644 (file)
@@ -133,17 +133,23 @@ struct irq_domain;
  *                     Use accessor functions to deal with it
  * @node:              node index useful for balancing
  * @handler_data:      per-IRQ data for the irq_chip methods
- * @affinity:          IRQ affinity on SMP
+ * @affinity:          IRQ affinity on SMP. If this is an IPI
+ *                     related irq, then this is the mask of the
+ *                     CPUs to which an IPI can be sent.
  * @msi_desc:          MSI descriptor
+ * @ipi_offset:                Offset of first IPI target cpu in @affinity. Optional.
  */
 struct irq_common_data {
-       unsigned int            state_use_accessors;
+       unsigned int            __private state_use_accessors;
 #ifdef CONFIG_NUMA
        unsigned int            node;
 #endif
        void                    *handler_data;
        struct msi_desc         *msi_desc;
        cpumask_var_t           affinity;
+#ifdef CONFIG_GENERIC_IRQ_IPI
+       unsigned int            ipi_offset;
+#endif
 };
 
 /**
@@ -208,7 +214,7 @@ enum {
        IRQD_FORWARDED_TO_VCPU          = (1 << 20),
 };
 
-#define __irqd_to_state(d)             ((d)->common->state_use_accessors)
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
 {
@@ -299,6 +305,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d)
        __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU;
 }
 
+#undef __irqd_to_state
+
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
 {
        return d->hwirq;
@@ -341,6 +349,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_get_irqchip_state:     return the internal state of an interrupt
  * @irq_set_irqchip_state:     set the internal state of a interrupt
  * @irq_set_vcpu_affinity:     optional to target a vCPU in a virtual machine
+ * @ipi_send_single:   send a single IPI to destination cpus
+ * @ipi_send_mask:     send an IPI to destination cpus in cpumask
  * @flags:             chip specific flags
  */
 struct irq_chip {
@@ -385,6 +395,9 @@ struct irq_chip {
 
        int             (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
 
+       void            (*ipi_send_single)(struct irq_data *data, unsigned int cpu);
+       void            (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest);
+
        unsigned long   flags;
 };
 
@@ -934,4 +947,12 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
                return readl(gc->reg_base + reg_offset);
 }
 
+/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
+#define INVALID_HWIRQ  (~0UL)
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu);
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu);
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
+int ipi_send_single(unsigned int virq, unsigned int cpu);
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
+
 #endif /* _LINUX_IRQ_H */
index ce824db..80f89e4 100644 (file)
@@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt);
 extern void gic_write_cpu_compare(cycle_t cnt, int cpu);
 extern void gic_start_count(void);
 extern void gic_stop_count(void);
-extern void gic_send_ipi(unsigned int intr);
-extern unsigned int plat_ipi_call_int_xlate(unsigned int);
-extern unsigned int plat_ipi_resched_int_xlate(unsigned int);
 extern int gic_get_c0_compare_int(void);
 extern int gic_get_c0_perfcount_int(void);
 extern int gic_get_c0_fdc_int(void);
index f64622a..ed48594 100644 (file)
@@ -70,9 +70,11 @@ struct irq_fwspec {
  */
 enum irq_domain_bus_token {
        DOMAIN_BUS_ANY          = 0,
+       DOMAIN_BUS_WIRED,
        DOMAIN_BUS_PCI_MSI,
        DOMAIN_BUS_PLATFORM_MSI,
        DOMAIN_BUS_NEXUS,
+       DOMAIN_BUS_IPI,
 };
 
 /**
@@ -171,6 +173,12 @@ enum {
        /* Core calls alloc/free recursive through the domain hierarchy. */
        IRQ_DOMAIN_FLAG_AUTO_RECURSIVE  = (1 << 1),
 
+       /* Irq domain is an IPI domain with virq per cpu */
+       IRQ_DOMAIN_FLAG_IPI_PER_CPU     = (1 << 2),
+
+       /* Irq domain is an IPI domain with single virq */
+       IRQ_DOMAIN_FLAG_IPI_SINGLE      = (1 << 3),
+
        /*
         * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
         * for implementation specific purposes and ignored by the
@@ -205,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
                                                   enum irq_domain_bus_token bus_token);
 extern void irq_set_default_host(struct irq_domain *host);
+extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+                                 irq_hw_number_t hwirq, int node);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
@@ -334,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
                        const u32 *intspec, unsigned int intsize,
                        irq_hw_number_t *out_hwirq, unsigned int *out_type);
 
+/* IPI functions */
+unsigned int irq_reserve_ipi(struct irq_domain *domain,
+                            const struct cpumask *dest);
+void irq_destroy_ipi(unsigned int irq);
+
 /* V2 interfaces to support hierarchy IRQ domains. */
 extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
                                                unsigned int virq);
@@ -399,6 +414,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
        return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY;
 }
+
+static inline bool irq_domain_is_ipi(struct irq_domain *domain)
+{
+       return domain->flags &
+               (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE);
+}
+
+static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
+{
+       return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU;
+}
+
+static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
+{
+       return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE;
+}
 #else  /* CONFIG_IRQ_DOMAIN_HIERARCHY */
 static inline void irq_domain_activate_irq(struct irq_data *data) { }
 static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
@@ -412,6 +443,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
 {
        return false;
 }
+
+static inline bool irq_domain_is_ipi(struct irq_domain *domain)
+{
+       return false;
+}
+
+static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain)
+{
+       return false;
+}
+
+static inline bool irq_domain_is_ipi_single(struct irq_domain *domain)
+{
+       return false;
+}
 #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
 #else /* CONFIG_IRQ_DOMAIN */
index 4b9f85c..0fdc798 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _LINUX_KASAN_H
 #define _LINUX_KASAN_H
 
+#include <linux/sched.h>
 #include <linux/types.h>
 
 struct kmem_cache;
@@ -13,7 +14,6 @@ struct vm_struct;
 
 #include <asm/kasan.h>
 #include <asm/pgtable.h>
-#include <linux/sched.h>
 
 extern unsigned char kasan_zero_page[PAGE_SIZE];
 extern pte_t kasan_zero_pte[PTRS_PER_PTE];
@@ -43,6 +43,8 @@ static inline void kasan_disable_current(void)
 
 void kasan_unpoison_shadow(const void *address, size_t size);
 
+void kasan_unpoison_task_stack(struct task_struct *task);
+
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
@@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm);
 
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 
+static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+
 static inline void kasan_enable_current(void) {}
 static inline void kasan_disable_current(void) {}
 
index 861f690..5276fe0 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
+#include <linux/swait.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
        int fpu_active;
        int guest_fpu_loaded, guest_xcr0_loaded;
        unsigned char fpu_counter;
-       wait_queue_head_t wq;
+       struct swait_queue_head wq;
        struct pid *pid;
        int sigset_active;
        sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 #endif
 
-static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
        return vcpu->arch.wqp;
index e23121f..59ccab2 100644 (file)
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 
 void clear_all_latency_tracing(struct task_struct *p);
 
+extern int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #else
 
 static inline void
index 851821b..2c4ebef 100644 (file)
@@ -526,6 +526,7 @@ enum ata_lpm_policy {
 enum ata_lpm_hints {
        ATA_LPM_EMPTY           = (1 << 0), /* port empty/probing */
        ATA_LPM_HIPM            = (1 << 1), /* may use HIPM */
+       ATA_LPM_WAKE_ONLY       = (1 << 2), /* only wake up link */
 };
 
 /* forward declarations */
@@ -719,7 +720,7 @@ struct ata_device {
        union {
                u16             id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
                u32             gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */
-       };
+       } ____cacheline_aligned;
 
        /* DEVSLP Timing Variables from Identify Device Data Log */
        u8                      devslp_timing[ATA_LOG_DEVSLP_SIZE];
index bed40df..141ffdd 100644 (file)
@@ -26,9 +26,8 @@ enum {
 
        /* need to set a limit somewhere, but yes, this is likely overkill */
        ND_IOCTL_MAX_BUFLEN = SZ_4M,
-       ND_CMD_MAX_ELEM = 4,
+       ND_CMD_MAX_ELEM = 5,
        ND_CMD_MAX_ENVELOPE = 16,
-       ND_CMD_ARS_STATUS_MAX = SZ_4K,
        ND_MAX_MAPPINGS = 32,
 
        /* region flag indicating to direct-map persistent memory by default */
index d675011..2190419 100644 (file)
@@ -135,6 +135,10 @@ enum {
        /* Memory types */
        NVM_ID_FMTYPE_SLC       = 0,
        NVM_ID_FMTYPE_MLC       = 1,
+
+       /* Device capabilities */
+       NVM_ID_DCAP_BBLKMGMT    = 0x1,
+       NVM_UD_DCAP_ECC         = 0x2,
 };
 
 struct nvm_id_lp_mlc {
index 30cf420..5356f4d 100644 (file)
@@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry);
 extern void list_del(struct list_head *entry);
 #endif
 
-#ifdef CONFIG_DEBUG_LIST
-/*
- * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one
- * of the pages it allocates is ever passed to list_add()
- */
-extern void list_force_poison(struct list_head *entry);
-#else
-/* fallback to the less strict LIST_POISON* definitions */
-#define list_force_poison list_del
-#endif
-
 /**
  * list_replace - replace old entry by new one
  * @old : the element to be replaced
index c57e424..d026b19 100644 (file)
@@ -66,7 +66,7 @@ struct lock_class {
        /*
         * class-hash:
         */
-       struct list_head                hash_entry;
+       struct hlist_node               hash_entry;
 
        /*
         * global list of all lock-classes:
@@ -199,7 +199,7 @@ struct lock_chain {
        u8                              irq_context;
        u8                              depth;
        u16                             base;
-       struct list_head                entry;
+       struct hlist_node               entry;
        u64                             chain_key;
 };
 
@@ -261,7 +261,6 @@ struct held_lock {
 /*
  * Initialization, self-test and debugging-output methods:
  */
-extern void lockdep_init(void);
 extern void lockdep_info(void);
 extern void lockdep_reset(void);
 extern void lockdep_reset_lock(struct lockdep_map *lock);
@@ -392,7 +391,6 @@ static inline void lockdep_on(void)
 # define lockdep_set_current_reclaim_state(g)  do { } while (0)
 # define lockdep_clear_current_reclaim_state() do { } while (0)
 # define lockdep_trace_alloc(g)                        do { } while (0)
-# define lockdep_init()                                do { } while (0)
 # define lockdep_info()                                do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
index 9ae48d4..792c898 100644 (file)
@@ -51,7 +51,7 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
        /* default hierarchy stats */
-       MEMCG_SOCK,
+       MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS,
        MEMCG_NR_STAT,
 };
 
index 430a929..a0e8cc8 100644 (file)
@@ -44,6 +44,8 @@
 
 #include <linux/timecounter.h>
 
+#define DEFAULT_UAR_PAGE_SHIFT  12
+
 #define MAX_MSIX_P_PORT                17
 #define MAX_MSIX               64
 #define MIN_MSIX_P_PORT                5
@@ -856,6 +858,7 @@ struct mlx4_dev {
        u64                     regid_promisc_array[MLX4_MAX_PORTS + 1];
        u64                     regid_allmulti_array[MLX4_MAX_PORTS + 1];
        struct mlx4_vf_dev     *dev_vfs;
+       u8  uar_page_shift;
 };
 
 struct mlx4_clock_params {
@@ -1528,4 +1531,14 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
                                   struct mlx4_clock_params *params);
 
+static inline int mlx4_to_hw_uar_index(struct mlx4_dev *dev, int index)
+{
+       return (index << (PAGE_SHIFT - dev->uar_page_shift));
+}
+
+static inline int mlx4_get_num_reserved_uar(struct mlx4_dev *dev)
+{
+       /* The first 128 UARs are used for EQ doorbells */
+       return (128 >> (PAGE_SHIFT - dev->uar_page_shift));
+}
 #endif /* MLX4_DEVICE_H */
index 231ab6b..58eef02 100644 (file)
@@ -207,15 +207,15 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
        u8         outer_dmac[0x1];
        u8         outer_smac[0x1];
        u8         outer_ether_type[0x1];
-       u8         reserved_0[0x1];
+       u8         reserved_at_3[0x1];
        u8         outer_first_prio[0x1];
        u8         outer_first_cfi[0x1];
        u8         outer_first_vid[0x1];
-       u8         reserved_1[0x1];
+       u8         reserved_at_7[0x1];
        u8         outer_second_prio[0x1];
        u8         outer_second_cfi[0x1];
        u8         outer_second_vid[0x1];
-       u8         reserved_2[0x1];
+       u8         reserved_at_b[0x1];
        u8         outer_sip[0x1];
        u8         outer_dip[0x1];
        u8         outer_frag[0x1];
@@ -230,21 +230,21 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
        u8         outer_gre_protocol[0x1];
        u8         outer_gre_key[0x1];
        u8         outer_vxlan_vni[0x1];
-       u8         reserved_3[0x5];
+       u8         reserved_at_1a[0x5];
        u8         source_eswitch_port[0x1];
 
        u8         inner_dmac[0x1];
        u8         inner_smac[0x1];
        u8         inner_ether_type[0x1];
-       u8         reserved_4[0x1];
+       u8         reserved_at_23[0x1];
        u8         inner_first_prio[0x1];
        u8         inner_first_cfi[0x1];
        u8         inner_first_vid[0x1];
-       u8         reserved_5[0x1];
+       u8         reserved_at_27[0x1];
        u8         inner_second_prio[0x1];
        u8         inner_second_cfi[0x1];
        u8         inner_second_vid[0x1];
-       u8         reserved_6[0x1];
+       u8         reserved_at_2b[0x1];
        u8         inner_sip[0x1];
        u8         inner_dip[0x1];
        u8         inner_frag[0x1];
@@ -256,37 +256,37 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
        u8         inner_tcp_sport[0x1];
        u8         inner_tcp_dport[0x1];
        u8         inner_tcp_flags[0x1];
-       u8         reserved_7[0x9];
+       u8         reserved_at_37[0x9];
 
-       u8         reserved_8[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_flow_table_prop_layout_bits {
        u8         ft_support[0x1];
-       u8         reserved_0[0x2];
+       u8         reserved_at_1[0x2];
        u8         flow_modify_en[0x1];
        u8         modify_root[0x1];
        u8         identified_miss_table_mode[0x1];
        u8         flow_table_modify[0x1];
-       u8         reserved_1[0x19];
+       u8         reserved_at_7[0x19];
 
-       u8         reserved_2[0x2];
+       u8         reserved_at_20[0x2];
        u8         log_max_ft_size[0x6];
-       u8         reserved_3[0x10];
+       u8         reserved_at_28[0x10];
        u8         max_ft_level[0x8];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_5[0x18];
+       u8         reserved_at_60[0x18];
        u8         log_max_ft_num[0x8];
 
-       u8         reserved_6[0x18];
+       u8         reserved_at_80[0x18];
        u8         log_max_destination[0x8];
 
-       u8         reserved_7[0x18];
+       u8         reserved_at_a0[0x18];
        u8         log_max_flow[0x8];
 
-       u8         reserved_8[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support;
 
@@ -298,13 +298,13 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits {
        u8         receive[0x1];
        u8         write[0x1];
        u8         read[0x1];
-       u8         reserved_0[0x1];
+       u8         reserved_at_4[0x1];
        u8         srq_receive[0x1];
-       u8         reserved_1[0x1a];
+       u8         reserved_at_6[0x1a];
 };
 
 struct mlx5_ifc_ipv4_layout_bits {
-       u8         reserved_0[0x60];
+       u8         reserved_at_0[0x60];
 
        u8         ipv4[0x20];
 };
@@ -316,7 +316,7 @@ struct mlx5_ifc_ipv6_layout_bits {
 union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits {
        struct mlx5_ifc_ipv6_layout_bits ipv6_layout;
        struct mlx5_ifc_ipv4_layout_bits ipv4_layout;
-       u8         reserved_0[0x80];
+       u8         reserved_at_0[0x80];
 };
 
 struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
@@ -336,15 +336,15 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
        u8         ip_dscp[0x6];
        u8         ip_ecn[0x2];
        u8         vlan_tag[0x1];
-       u8         reserved_0[0x1];
+       u8         reserved_at_91[0x1];
        u8         frag[0x1];
-       u8         reserved_1[0x4];
+       u8         reserved_at_93[0x4];
        u8         tcp_flags[0x9];
 
        u8         tcp_sport[0x10];
        u8         tcp_dport[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_c0[0x20];
 
        u8         udp_sport[0x10];
        u8         udp_dport[0x10];
@@ -355,9 +355,9 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 };
 
 struct mlx5_ifc_fte_match_set_misc_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         source_port[0x10];
 
        u8         outer_second_prio[0x3];
@@ -369,31 +369,31 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 
        u8         outer_second_vlan_tag[0x1];
        u8         inner_second_vlan_tag[0x1];
-       u8         reserved_2[0xe];
+       u8         reserved_at_62[0xe];
        u8         gre_protocol[0x10];
 
        u8         gre_key_h[0x18];
        u8         gre_key_l[0x8];
 
        u8         vxlan_vni[0x18];
-       u8         reserved_3[0x8];
+       u8         reserved_at_b8[0x8];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_c0[0x20];
 
-       u8         reserved_5[0xc];
+       u8         reserved_at_e0[0xc];
        u8         outer_ipv6_flow_label[0x14];
 
-       u8         reserved_6[0xc];
+       u8         reserved_at_100[0xc];
        u8         inner_ipv6_flow_label[0x14];
 
-       u8         reserved_7[0xe0];
+       u8         reserved_at_120[0xe0];
 };
 
 struct mlx5_ifc_cmd_pas_bits {
        u8         pa_h[0x20];
 
        u8         pa_l[0x14];
-       u8         reserved_0[0xc];
+       u8         reserved_at_34[0xc];
 };
 
 struct mlx5_ifc_uint64_bits {
@@ -418,31 +418,31 @@ enum {
 struct mlx5_ifc_ads_bits {
        u8         fl[0x1];
        u8         free_ar[0x1];
-       u8         reserved_0[0xe];
+       u8         reserved_at_2[0xe];
        u8         pkey_index[0x10];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_20[0x8];
        u8         grh[0x1];
        u8         mlid[0x7];
        u8         rlid[0x10];
 
        u8         ack_timeout[0x5];
-       u8         reserved_2[0x3];
+       u8         reserved_at_45[0x3];
        u8         src_addr_index[0x8];
-       u8         reserved_3[0x4];
+       u8         reserved_at_50[0x4];
        u8         stat_rate[0x4];
        u8         hop_limit[0x8];
 
-       u8         reserved_4[0x4];
+       u8         reserved_at_60[0x4];
        u8         tclass[0x8];
        u8         flow_label[0x14];
 
        u8         rgid_rip[16][0x8];
 
-       u8         reserved_5[0x4];
+       u8         reserved_at_100[0x4];
        u8         f_dscp[0x1];
        u8         f_ecn[0x1];
-       u8         reserved_6[0x1];
+       u8         reserved_at_106[0x1];
        u8         f_eth_prio[0x1];
        u8         ecn[0x2];
        u8         dscp[0x6];
@@ -458,25 +458,25 @@ struct mlx5_ifc_ads_bits {
 };
 
 struct mlx5_ifc_flow_table_nic_cap_bits {
-       u8         reserved_0[0x200];
+       u8         reserved_at_0[0x200];
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive;
 
-       u8         reserved_1[0x200];
+       u8         reserved_at_400[0x200];
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer;
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit;
 
-       u8         reserved_2[0x200];
+       u8         reserved_at_a00[0x200];
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
 
-       u8         reserved_3[0x7200];
+       u8         reserved_at_e00[0x7200];
 };
 
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
-       u8     reserved_0[0x200];
+       u8     reserved_at_0[0x200];
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb;
 
@@ -484,7 +484,7 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress;
 
-       u8      reserved_1[0x7800];
+       u8      reserved_at_800[0x7800];
 };
 
 struct mlx5_ifc_e_switch_cap_bits {
@@ -493,9 +493,9 @@ struct mlx5_ifc_e_switch_cap_bits {
        u8         vport_svlan_insert[0x1];
        u8         vport_cvlan_insert_if_not_exist[0x1];
        u8         vport_cvlan_insert_overwrite[0x1];
-       u8         reserved_0[0x1b];
+       u8         reserved_at_5[0x1b];
 
-       u8         reserved_1[0x7e0];
+       u8         reserved_at_20[0x7e0];
 };
 
 struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
@@ -504,51 +504,51 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
        u8         lro_cap[0x1];
        u8         lro_psh_flag[0x1];
        u8         lro_time_stamp[0x1];
-       u8         reserved_0[0x3];
+       u8         reserved_at_5[0x3];
        u8         self_lb_en_modifiable[0x1];
-       u8         reserved_1[0x2];
+       u8         reserved_at_9[0x2];
        u8         max_lso_cap[0x5];
-       u8         reserved_2[0x4];
+       u8         reserved_at_10[0x4];
        u8         rss_ind_tbl_cap[0x4];
-       u8         reserved_3[0x3];
+       u8         reserved_at_18[0x3];
        u8         tunnel_lso_const_out_ip_id[0x1];
-       u8         reserved_4[0x2];
+       u8         reserved_at_1c[0x2];
        u8         tunnel_statless_gre[0x1];
        u8         tunnel_stateless_vxlan[0x1];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_6[0x10];
+       u8         reserved_at_40[0x10];
        u8         lro_min_mss_size[0x10];
 
-       u8         reserved_7[0x120];
+       u8         reserved_at_60[0x120];
 
        u8         lro_timer_supported_periods[4][0x20];
 
-       u8         reserved_8[0x600];
+       u8         reserved_at_200[0x600];
 };
 
 struct mlx5_ifc_roce_cap_bits {
        u8         roce_apm[0x1];
-       u8         reserved_0[0x1f];
+       u8         reserved_at_1[0x1f];
 
-       u8         reserved_1[0x60];
+       u8         reserved_at_20[0x60];
 
-       u8         reserved_2[0xc];
+       u8         reserved_at_80[0xc];
        u8         l3_type[0x4];
-       u8         reserved_3[0x8];
+       u8         reserved_at_90[0x8];
        u8         roce_version[0x8];
 
-       u8         reserved_4[0x10];
+       u8         reserved_at_a0[0x10];
        u8         r_roce_dest_udp_port[0x10];
 
        u8         r_roce_max_src_udp_port[0x10];
        u8         r_roce_min_src_udp_port[0x10];
 
-       u8         reserved_5[0x10];
+       u8         reserved_at_e0[0x10];
        u8         roce_address_table_size[0x10];
 
-       u8         reserved_6[0x700];
+       u8         reserved_at_100[0x700];
 };
 
 enum {
@@ -576,35 +576,35 @@ enum {
 };
 
 struct mlx5_ifc_atomic_caps_bits {
-       u8         reserved_0[0x40];
+       u8         reserved_at_0[0x40];
 
        u8         atomic_req_8B_endianess_mode[0x2];
-       u8         reserved_1[0x4];
+       u8         reserved_at_42[0x4];
        u8         supported_atomic_req_8B_endianess_mode_1[0x1];
 
-       u8         reserved_2[0x19];
+       u8         reserved_at_47[0x19];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
-       u8         reserved_4[0x10];
+       u8         reserved_at_80[0x10];
        u8         atomic_operations[0x10];
 
-       u8         reserved_5[0x10];
+       u8         reserved_at_a0[0x10];
        u8         atomic_size_qp[0x10];
 
-       u8         reserved_6[0x10];
+       u8         reserved_at_c0[0x10];
        u8         atomic_size_dc[0x10];
 
-       u8         reserved_7[0x720];
+       u8         reserved_at_e0[0x720];
 };
 
 struct mlx5_ifc_odp_cap_bits {
-       u8         reserved_0[0x40];
+       u8         reserved_at_0[0x40];
 
        u8         sig[0x1];
-       u8         reserved_1[0x1f];
+       u8         reserved_at_41[0x1f];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps;
 
@@ -612,7 +612,7 @@ struct mlx5_ifc_odp_cap_bits {
 
        struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps;
 
-       u8         reserved_3[0x720];
+       u8         reserved_at_e0[0x720];
 };
 
 enum {
@@ -660,55 +660,55 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-       u8         reserved_0[0x80];
+       u8         reserved_at_0[0x80];
 
        u8         log_max_srq_sz[0x8];
        u8         log_max_qp_sz[0x8];
-       u8         reserved_1[0xb];
+       u8         reserved_at_90[0xb];
        u8         log_max_qp[0x5];
 
-       u8         reserved_2[0xb];
+       u8         reserved_at_a0[0xb];
        u8         log_max_srq[0x5];
-       u8         reserved_3[0x10];
+       u8         reserved_at_b0[0x10];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_c0[0x8];
        u8         log_max_cq_sz[0x8];
-       u8         reserved_5[0xb];
+       u8         reserved_at_d0[0xb];
        u8         log_max_cq[0x5];
 
        u8         log_max_eq_sz[0x8];
-       u8         reserved_6[0x2];
+       u8         reserved_at_e8[0x2];
        u8         log_max_mkey[0x6];
-       u8         reserved_7[0xc];
+       u8         reserved_at_f0[0xc];
        u8         log_max_eq[0x4];
 
        u8         max_indirection[0x8];
-       u8         reserved_8[0x1];
+       u8         reserved_at_108[0x1];
        u8         log_max_mrw_sz[0x7];
-       u8         reserved_9[0x2];
+       u8         reserved_at_110[0x2];
        u8         log_max_bsf_list_size[0x6];
-       u8         reserved_10[0x2];
+       u8         reserved_at_118[0x2];
        u8         log_max_klm_list_size[0x6];
 
-       u8         reserved_11[0xa];
+       u8         reserved_at_120[0xa];
        u8         log_max_ra_req_dc[0x6];
-       u8         reserved_12[0xa];
+       u8         reserved_at_130[0xa];
        u8         log_max_ra_res_dc[0x6];
 
-       u8         reserved_13[0xa];
+       u8         reserved_at_140[0xa];
        u8         log_max_ra_req_qp[0x6];
-       u8         reserved_14[0xa];
+       u8         reserved_at_150[0xa];
        u8         log_max_ra_res_qp[0x6];
 
        u8         pad_cap[0x1];
        u8         cc_query_allowed[0x1];
        u8         cc_modify_allowed[0x1];
-       u8         reserved_15[0xd];
+       u8         reserved_at_163[0xd];
        u8         gid_table_size[0x10];
 
        u8         out_of_seq_cnt[0x1];
        u8         vport_counters[0x1];
-       u8         reserved_16[0x4];
+       u8         reserved_at_182[0x4];
        u8         max_qp_cnt[0xa];
        u8         pkey_table_size[0x10];
 
@@ -716,158 +716,158 @@ struct mlx5_ifc_cmd_hca_cap_bits {
        u8         vhca_group_manager[0x1];
        u8         ib_virt[0x1];
        u8         eth_virt[0x1];
-       u8         reserved_17[0x1];
+       u8         reserved_at_1a4[0x1];
        u8         ets[0x1];
        u8         nic_flow_table[0x1];
        u8         eswitch_flow_table[0x1];
        u8         early_vf_enable;
-       u8         reserved_18[0x2];
+       u8         reserved_at_1a8[0x2];
        u8         local_ca_ack_delay[0x5];
-       u8         reserved_19[0x6];
+       u8         reserved_at_1af[0x6];
        u8         port_type[0x2];
        u8         num_ports[0x8];
 
-       u8         reserved_20[0x3];
+       u8         reserved_at_1bf[0x3];
        u8         log_max_msg[0x5];
-       u8         reserved_21[0x18];
+       u8         reserved_at_1c7[0x18];
 
        u8         stat_rate_support[0x10];
-       u8         reserved_22[0xc];
+       u8         reserved_at_1ef[0xc];
        u8         cqe_version[0x4];
 
        u8         compact_address_vector[0x1];
-       u8         reserved_23[0xe];
+       u8         reserved_at_200[0xe];
        u8         drain_sigerr[0x1];
        u8         cmdif_checksum[0x2];
        u8         sigerr_cqe[0x1];
-       u8         reserved_24[0x1];
+       u8         reserved_at_212[0x1];
        u8         wq_signature[0x1];
        u8         sctr_data_cqe[0x1];
-       u8         reserved_25[0x1];
+       u8         reserved_at_215[0x1];
        u8         sho[0x1];
        u8         tph[0x1];
        u8         rf[0x1];
        u8         dct[0x1];
-       u8         reserved_26[0x1];
+       u8         reserved_at_21a[0x1];
        u8         eth_net_offloads[0x1];
        u8         roce[0x1];
        u8         atomic[0x1];
-       u8         reserved_27[0x1];
+       u8         reserved_at_21e[0x1];
 
        u8         cq_oi[0x1];
        u8         cq_resize[0x1];
        u8         cq_moderation[0x1];
-       u8         reserved_28[0x3];
+       u8         reserved_at_222[0x3];
        u8         cq_eq_remap[0x1];
        u8         pg[0x1];
        u8         block_lb_mc[0x1];
-       u8         reserved_29[0x1];
+       u8         reserved_at_228[0x1];
        u8         scqe_break_moderation[0x1];
-       u8         reserved_30[0x1];
+       u8         reserved_at_22a[0x1];
        u8         cd[0x1];
-       u8         reserved_31[0x1];
+       u8         reserved_at_22c[0x1];
        u8         apm[0x1];
-       u8         reserved_32[0x7];
+       u8         reserved_at_22e[0x7];
        u8         qkv[0x1];
        u8         pkv[0x1];
-       u8         reserved_33[0x4];
+       u8         reserved_at_237[0x4];
        u8         xrc[0x1];
        u8         ud[0x1];
        u8         uc[0x1];
        u8         rc[0x1];
 
-       u8         reserved_34[0xa];
+       u8         reserved_at_23f[0xa];
        u8         uar_sz[0x6];
-       u8         reserved_35[0x8];
+       u8         reserved_at_24f[0x8];
        u8         log_pg_sz[0x8];
 
        u8         bf[0x1];
-       u8         reserved_36[0x1];
+       u8         reserved_at_260[0x1];
        u8         pad_tx_eth_packet[0x1];
-       u8         reserved_37[0x8];
+       u8         reserved_at_262[0x8];
        u8         log_bf_reg_size[0x5];
-       u8         reserved_38[0x10];
+       u8         reserved_at_26f[0x10];
 
-       u8         reserved_39[0x10];
+       u8         reserved_at_27f[0x10];
        u8         max_wqe_sz_sq[0x10];
 
-       u8         reserved_40[0x10];
+       u8         reserved_at_29f[0x10];
        u8         max_wqe_sz_rq[0x10];
 
-       u8         reserved_41[0x10];
+       u8         reserved_at_2bf[0x10];
        u8         max_wqe_sz_sq_dc[0x10];
 
-       u8         reserved_42[0x7];
+       u8         reserved_at_2df[0x7];
        u8         max_qp_mcg[0x19];
 
-       u8         reserved_43[0x18];
+       u8         reserved_at_2ff[0x18];
        u8         log_max_mcg[0x8];
 
-       u8         reserved_44[0x3];
+       u8         reserved_at_31f[0x3];
        u8         log_max_transport_domain[0x5];
-       u8         reserved_45[0x3];
+       u8         reserved_at_327[0x3];
        u8         log_max_pd[0x5];
-       u8         reserved_46[0xb];
+       u8         reserved_at_32f[0xb];
        u8         log_max_xrcd[0x5];
 
-       u8         reserved_47[0x20];
+       u8         reserved_at_33f[0x20];
 
-       u8         reserved_48[0x3];
+       u8         reserved_at_35f[0x3];
        u8         log_max_rq[0x5];
-       u8         reserved_49[0x3];
+       u8         reserved_at_367[0x3];
        u8         log_max_sq[0x5];
-       u8         reserved_50[0x3];
+       u8         reserved_at_36f[0x3];
        u8         log_max_tir[0x5];
-       u8         reserved_51[0x3];
+       u8         reserved_at_377[0x3];
        u8         log_max_tis[0x5];
 
        u8         basic_cyclic_rcv_wqe[0x1];
-       u8         reserved_52[0x2];
+       u8         reserved_at_380[0x2];
        u8         log_max_rmp[0x5];
-       u8         reserved_53[0x3];
+       u8         reserved_at_387[0x3];
        u8         log_max_rqt[0x5];
-       u8         reserved_54[0x3];
+       u8         reserved_at_38f[0x3];
        u8         log_max_rqt_size[0x5];
-       u8         reserved_55[0x3];
+       u8         reserved_at_397[0x3];
        u8         log_max_tis_per_sq[0x5];
 
-       u8         reserved_56[0x3];
+       u8         reserved_at_39f[0x3];
        u8         log_max_stride_sz_rq[0x5];
-       u8         reserved_57[0x3];
+       u8         reserved_at_3a7[0x3];
        u8         log_min_stride_sz_rq[0x5];
-       u8         reserved_58[0x3];
+       u8         reserved_at_3af[0x3];
        u8         log_max_stride_sz_sq[0x5];
-       u8         reserved_59[0x3];
+       u8         reserved_at_3b7[0x3];
        u8         log_min_stride_sz_sq[0x5];
 
-       u8         reserved_60[0x1b];
+       u8         reserved_at_3bf[0x1b];
        u8         log_max_wq_sz[0x5];
 
        u8         nic_vport_change_event[0x1];
-       u8         reserved_61[0xa];
+       u8         reserved_at_3e0[0xa];
        u8         log_max_vlan_list[0x5];
-       u8         reserved_62[0x3];
+       u8         reserved_at_3ef[0x3];
        u8         log_max_current_mc_list[0x5];
-       u8         reserved_63[0x3];
+       u8         reserved_at_3f7[0x3];
        u8         log_max_current_uc_list[0x5];
 
-       u8         reserved_64[0x80];
+       u8         reserved_at_3ff[0x80];
 
-       u8         reserved_65[0x3];
+       u8         reserved_at_47f[0x3];
        u8         log_max_l2_table[0x5];
-       u8         reserved_66[0x8];
+       u8         reserved_at_487[0x8];
        u8         log_uar_page_sz[0x10];
 
-       u8         reserved_67[0x20];
+       u8         reserved_at_49f[0x20];
        u8         device_frequency_mhz[0x20];
        u8         device_frequency_khz[0x20];
-       u8         reserved_68[0x5f];
+       u8         reserved_at_4ff[0x5f];
        u8         cqe_zip[0x1];
 
        u8         cqe_zip_timeout[0x10];
        u8         cqe_zip_max_num[0x10];
 
-       u8         reserved_69[0x220];
+       u8         reserved_at_57f[0x220];
 };
 
 enum mlx5_flow_destination_type {
@@ -880,7 +880,7 @@ struct mlx5_ifc_dest_format_struct_bits {
        u8         destination_type[0x8];
        u8         destination_id[0x18];
 
-       u8         reserved_0[0x20];
+       u8         reserved_at_20[0x20];
 };
 
 struct mlx5_ifc_fte_match_param_bits {
@@ -890,7 +890,7 @@ struct mlx5_ifc_fte_match_param_bits {
 
        struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers;
 
-       u8         reserved_0[0xa00];
+       u8         reserved_at_600[0xa00];
 };
 
 enum {
@@ -922,18 +922,18 @@ struct mlx5_ifc_wq_bits {
        u8         wq_signature[0x1];
        u8         end_padding_mode[0x2];
        u8         cd_slave[0x1];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         hds_skip_first_sge[0x1];
        u8         log2_hds_buf_size[0x3];
-       u8         reserved_1[0x7];
+       u8         reserved_at_24[0x7];
        u8         page_offset[0x5];
        u8         lwm[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         pd[0x18];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_60[0x8];
        u8         uar_page[0x18];
 
        u8         dbr_addr[0x40];
@@ -942,60 +942,60 @@ struct mlx5_ifc_wq_bits {
 
        u8         sw_counter[0x20];
 
-       u8         reserved_4[0xc];
+       u8         reserved_at_100[0xc];
        u8         log_wq_stride[0x4];
-       u8         reserved_5[0x3];
+       u8         reserved_at_110[0x3];
        u8         log_wq_pg_sz[0x5];
-       u8         reserved_6[0x3];
+       u8         reserved_at_118[0x3];
        u8         log_wq_sz[0x5];
 
-       u8         reserved_7[0x4e0];
+       u8         reserved_at_120[0x4e0];
 
        struct mlx5_ifc_cmd_pas_bits pas[0];
 };
 
 struct mlx5_ifc_rq_num_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         rq_num[0x18];
 };
 
 struct mlx5_ifc_mac_address_layout_bits {
-       u8         reserved_0[0x10];
+       u8         reserved_at_0[0x10];
        u8         mac_addr_47_32[0x10];
 
        u8         mac_addr_31_0[0x20];
 };
 
 struct mlx5_ifc_vlan_layout_bits {
-       u8         reserved_0[0x14];
+       u8         reserved_at_0[0x14];
        u8         vlan[0x0c];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_20[0x20];
 };
 
 struct mlx5_ifc_cong_control_r_roce_ecn_np_bits {
-       u8         reserved_0[0xa0];
+       u8         reserved_at_0[0xa0];
 
        u8         min_time_between_cnps[0x20];
 
-       u8         reserved_1[0x12];
+       u8         reserved_at_c0[0x12];
        u8         cnp_dscp[0x6];
-       u8         reserved_2[0x5];
+       u8         reserved_at_d8[0x5];
        u8         cnp_802p_prio[0x3];
 
-       u8         reserved_3[0x720];
+       u8         reserved_at_e0[0x720];
 };
 
 struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits {
-       u8         reserved_0[0x60];
+       u8         reserved_at_0[0x60];
 
-       u8         reserved_1[0x4];
+       u8         reserved_at_60[0x4];
        u8         clamp_tgt_rate[0x1];
-       u8         reserved_2[0x3];
+       u8         reserved_at_65[0x3];
        u8         clamp_tgt_rate_after_time_inc[0x1];
-       u8         reserved_3[0x17];
+       u8         reserved_at_69[0x17];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_80[0x20];
 
        u8         rpg_time_reset[0x20];
 
@@ -1015,7 +1015,7 @@ struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits {
 
        u8         rpg_min_rate[0x20];
 
-       u8         reserved_5[0xe0];
+       u8         reserved_at_1c0[0xe0];
 
        u8         rate_to_set_on_first_cnp[0x20];
 
@@ -1025,15 +1025,15 @@ struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits {
 
        u8         rate_reduce_monitor_period[0x20];
 
-       u8         reserved_6[0x20];
+       u8         reserved_at_320[0x20];
 
        u8         initial_alpha_value[0x20];
 
-       u8         reserved_7[0x4a0];
+       u8         reserved_at_360[0x4a0];
 };
 
 struct mlx5_ifc_cong_control_802_1qau_rp_bits {
-       u8         reserved_0[0x80];
+       u8         reserved_at_0[0x80];
 
        u8         rppp_max_rps[0x20];
 
@@ -1055,7 +1055,7 @@ struct mlx5_ifc_cong_control_802_1qau_rp_bits {
 
        u8         rpg_min_rate[0x20];
 
-       u8         reserved_1[0x640];
+       u8         reserved_at_1c0[0x640];
 };
 
 enum {
@@ -1205,7 +1205,7 @@ struct mlx5_ifc_phys_layer_cntrs_bits {
 
        u8         successful_recovery_events[0x20];
 
-       u8         reserved_0[0x180];
+       u8         reserved_at_640[0x180];
 };
 
 struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits {
@@ -1213,7 +1213,7 @@ struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits {
 
        u8         transmit_queue_low[0x20];
 
-       u8         reserved_0[0x780];
+       u8         reserved_at_40[0x780];
 };
 
 struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
@@ -1221,7 +1221,7 @@ struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
 
        u8         rx_octets_low[0x20];
 
-       u8         reserved_0[0xc0];
+       u8         reserved_at_40[0xc0];
 
        u8         rx_frames_high[0x20];
 
@@ -1231,7 +1231,7 @@ struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
 
        u8         tx_octets_low[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_180[0xc0];
 
        u8         tx_frames_high[0x20];
 
@@ -1257,7 +1257,7 @@ struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
 
        u8         rx_pause_transition_low[0x20];
 
-       u8         reserved_2[0x400];
+       u8         reserved_at_3c0[0x400];
 };
 
 struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits {
@@ -1265,7 +1265,7 @@ struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits {
 
        u8         port_transmit_wait_low[0x20];
 
-       u8         reserved_0[0x780];
+       u8         reserved_at_40[0x780];
 };
 
 struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits {
@@ -1333,7 +1333,7 @@ struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits {
 
        u8         dot3out_pause_frames_low[0x20];
 
-       u8         reserved_0[0x3c0];
+       u8         reserved_at_400[0x3c0];
 };
 
 struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits {
@@ -1421,7 +1421,7 @@ struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits {
 
        u8         ether_stats_pkts8192to10239octets_low[0x20];
 
-       u8         reserved_0[0x280];
+       u8         reserved_at_540[0x280];
 };
 
 struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits {
@@ -1477,7 +1477,7 @@ struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits {
 
        u8         if_out_broadcast_pkts_low[0x20];
 
-       u8         reserved_0[0x480];
+       u8         reserved_at_340[0x480];
 };
 
 struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
@@ -1557,54 +1557,54 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
 
        u8         a_pause_mac_ctrl_frames_transmitted_low[0x20];
 
-       u8         reserved_0[0x300];
+       u8         reserved_at_4c0[0x300];
 };
 
 struct mlx5_ifc_cmd_inter_comp_event_bits {
        u8         command_completion_vector[0x20];
 
-       u8         reserved_0[0xc0];
+       u8         reserved_at_20[0xc0];
 };
 
 struct mlx5_ifc_stall_vl_event_bits {
-       u8         reserved_0[0x18];
+       u8         reserved_at_0[0x18];
        u8         port_num[0x1];
-       u8         reserved_1[0x3];
+       u8         reserved_at_19[0x3];
        u8         vl[0x4];
 
-       u8         reserved_2[0xa0];
+       u8         reserved_at_20[0xa0];
 };
 
 struct mlx5_ifc_db_bf_congestion_event_bits {
        u8         event_subtype[0x8];
-       u8         reserved_0[0x8];
+       u8         reserved_at_8[0x8];
        u8         congestion_level[0x8];
-       u8         reserved_1[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_2[0xa0];
+       u8         reserved_at_20[0xa0];
 };
 
 struct mlx5_ifc_gpio_event_bits {
-       u8         reserved_0[0x60];
+       u8         reserved_at_0[0x60];
 
        u8         gpio_event_hi[0x20];
 
        u8         gpio_event_lo[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_a0[0x40];
 };
 
 struct mlx5_ifc_port_state_change_event_bits {
-       u8         reserved_0[0x40];
+       u8         reserved_at_0[0x40];
 
        u8         port_num[0x4];
-       u8         reserved_1[0x1c];
+       u8         reserved_at_44[0x1c];
 
-       u8         reserved_2[0x80];
+       u8         reserved_at_60[0x80];
 };
 
 struct mlx5_ifc_dropped_packet_logged_bits {
-       u8         reserved_0[0xe0];
+       u8         reserved_at_0[0xe0];
 };
 
 enum {
@@ -1613,15 +1613,15 @@ enum {
 };
 
 struct mlx5_ifc_cq_error_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         cqn[0x18];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         syndrome[0x8];
 
-       u8         reserved_3[0x80];
+       u8         reserved_at_60[0x80];
 };
 
 struct mlx5_ifc_rdma_page_fault_event_bits {
@@ -1629,14 +1629,14 @@ struct mlx5_ifc_rdma_page_fault_event_bits {
 
        u8         r_key[0x20];
 
-       u8         reserved_0[0x10];
+       u8         reserved_at_40[0x10];
        u8         packet_len[0x10];
 
        u8         rdma_op_len[0x20];
 
        u8         rdma_va[0x40];
 
-       u8         reserved_1[0x5];
+       u8         reserved_at_c0[0x5];
        u8         rdma[0x1];
        u8         write[0x1];
        u8         requestor[0x1];
@@ -1646,15 +1646,15 @@ struct mlx5_ifc_rdma_page_fault_event_bits {
 struct mlx5_ifc_wqe_associated_page_fault_event_bits {
        u8         bytes_committed[0x20];
 
-       u8         reserved_0[0x10];
+       u8         reserved_at_20[0x10];
        u8         wqe_index[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_40[0x10];
        u8         len[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_60[0x60];
 
-       u8         reserved_3[0x5];
+       u8         reserved_at_c0[0x5];
        u8         rdma[0x1];
        u8         write_read[0x1];
        u8         requestor[0x1];
@@ -1662,26 +1662,26 @@ struct mlx5_ifc_wqe_associated_page_fault_event_bits {
 };
 
 struct mlx5_ifc_qp_events_bits {
-       u8         reserved_0[0xa0];
+       u8         reserved_at_0[0xa0];
 
        u8         type[0x8];
-       u8         reserved_1[0x18];
+       u8         reserved_at_a8[0x18];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_c0[0x8];
        u8         qpn_rqn_sqn[0x18];
 };
 
 struct mlx5_ifc_dct_events_bits {
-       u8         reserved_0[0xc0];
+       u8         reserved_at_0[0xc0];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_c0[0x8];
        u8         dct_number[0x18];
 };
 
 struct mlx5_ifc_comp_event_bits {
-       u8         reserved_0[0xc0];
+       u8         reserved_at_0[0xc0];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_c0[0x8];
        u8         cq_number[0x18];
 };
 
@@ -1754,41 +1754,41 @@ enum {
 
 struct mlx5_ifc_qpc_bits {
        u8         state[0x4];
-       u8         reserved_0[0x4];
+       u8         reserved_at_4[0x4];
        u8         st[0x8];
-       u8         reserved_1[0x3];
+       u8         reserved_at_10[0x3];
        u8         pm_state[0x2];
-       u8         reserved_2[0x7];
+       u8         reserved_at_15[0x7];
        u8         end_padding_mode[0x2];
-       u8         reserved_3[0x2];
+       u8         reserved_at_1e[0x2];
 
        u8         wq_signature[0x1];
        u8         block_lb_mc[0x1];
        u8         atomic_like_write_en[0x1];
        u8         latency_sensitive[0x1];
-       u8         reserved_4[0x1];
+       u8         reserved_at_24[0x1];
        u8         drain_sigerr[0x1];
-       u8         reserved_5[0x2];
+       u8         reserved_at_26[0x2];
        u8         pd[0x18];
 
        u8         mtu[0x3];
        u8         log_msg_max[0x5];
-       u8         reserved_6[0x1];
+       u8         reserved_at_48[0x1];
        u8         log_rq_size[0x4];
        u8         log_rq_stride[0x3];
        u8         no_sq[0x1];
        u8         log_sq_size[0x4];
-       u8         reserved_7[0x6];
+       u8         reserved_at_55[0x6];
        u8         rlky[0x1];
-       u8         reserved_8[0x4];
+       u8         reserved_at_5c[0x4];
 
        u8         counter_set_id[0x8];
        u8         uar_page[0x18];
 
-       u8         reserved_9[0x8];
+       u8         reserved_at_80[0x8];
        u8         user_index[0x18];
 
-       u8         reserved_10[0x3];
+       u8         reserved_at_a0[0x3];
        u8         log_page_size[0x5];
        u8         remote_qpn[0x18];
 
@@ -1797,66 +1797,66 @@ struct mlx5_ifc_qpc_bits {
        struct mlx5_ifc_ads_bits secondary_address_path;
 
        u8         log_ack_req_freq[0x4];
-       u8         reserved_11[0x4];
+       u8         reserved_at_384[0x4];
        u8         log_sra_max[0x3];
-       u8         reserved_12[0x2];
+       u8         reserved_at_38b[0x2];
        u8         retry_count[0x3];
        u8         rnr_retry[0x3];
-       u8         reserved_13[0x1];
+       u8         reserved_at_393[0x1];
        u8         fre[0x1];
        u8         cur_rnr_retry[0x3];
        u8         cur_retry_count[0x3];
-       u8         reserved_14[0x5];
+       u8         reserved_at_39b[0x5];
 
-       u8         reserved_15[0x20];
+       u8         reserved_at_3a0[0x20];
 
-       u8         reserved_16[0x8];
+       u8         reserved_at_3c0[0x8];
        u8         next_send_psn[0x18];
 
-       u8         reserved_17[0x8];
+       u8         reserved_at_3e0[0x8];
        u8         cqn_snd[0x18];
 
-       u8         reserved_18[0x40];
+       u8         reserved_at_400[0x40];
 
-       u8         reserved_19[0x8];
+       u8         reserved_at_440[0x8];
        u8         last_acked_psn[0x18];
 
-       u8         reserved_20[0x8];
+       u8         reserved_at_460[0x8];
        u8         ssn[0x18];
 
-       u8         reserved_21[0x8];
+       u8         reserved_at_480[0x8];
        u8         log_rra_max[0x3];
-       u8         reserved_22[0x1];
+       u8         reserved_at_48b[0x1];
        u8         atomic_mode[0x4];
        u8         rre[0x1];
        u8         rwe[0x1];
        u8         rae[0x1];
-       u8         reserved_23[0x1];
+       u8         reserved_at_493[0x1];
        u8         page_offset[0x6];
-       u8         reserved_24[0x3];
+       u8         reserved_at_49a[0x3];
        u8         cd_slave_receive[0x1];
        u8         cd_slave_send[0x1];
        u8         cd_master[0x1];
 
-       u8         reserved_25[0x3];
+       u8         reserved_at_4a0[0x3];
        u8         min_rnr_nak[0x5];
        u8         next_rcv_psn[0x18];
 
-       u8         reserved_26[0x8];
+       u8         reserved_at_4c0[0x8];
        u8         xrcd[0x18];
 
-       u8         reserved_27[0x8];
+       u8         reserved_at_4e0[0x8];
        u8         cqn_rcv[0x18];
 
        u8         dbr_addr[0x40];
 
        u8         q_key[0x20];
 
-       u8         reserved_28[0x5];
+       u8         reserved_at_560[0x5];
        u8         rq_type[0x3];
        u8         srqn_rmpn[0x18];
 
-       u8         reserved_29[0x8];
+       u8         reserved_at_580[0x8];
        u8         rmsn[0x18];
 
        u8         hw_sq_wqebb_counter[0x10];
@@ -1866,33 +1866,33 @@ struct mlx5_ifc_qpc_bits {
 
        u8         sw_rq_counter[0x20];
 
-       u8         reserved_30[0x20];
+       u8         reserved_at_600[0x20];
 
-       u8         reserved_31[0xf];
+       u8         reserved_at_620[0xf];
        u8         cgs[0x1];
        u8         cs_req[0x8];
        u8         cs_res[0x8];
 
        u8         dc_access_key[0x40];
 
-       u8         reserved_32[0xc0];
+       u8         reserved_at_680[0xc0];
 };
 
 struct mlx5_ifc_roce_addr_layout_bits {
        u8         source_l3_address[16][0x8];
 
-       u8         reserved_0[0x3];
+       u8         reserved_at_80[0x3];
        u8         vlan_valid[0x1];
        u8         vlan_id[0xc];
        u8         source_mac_47_32[0x10];
 
        u8         source_mac_31_0[0x20];
 
-       u8         reserved_1[0x14];
+       u8         reserved_at_c0[0x14];
        u8         roce_l3_type[0x4];
        u8         roce_version[0x8];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_e0[0x20];
 };
 
 union mlx5_ifc_hca_cap_union_bits {
@@ -1904,7 +1904,7 @@ union mlx5_ifc_hca_cap_union_bits {
        struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
        struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap;
        struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
-       u8         reserved_0[0x8000];
+       u8         reserved_at_0[0x8000];
 };
 
 enum {
@@ -1914,24 +1914,24 @@ enum {
 };
 
 struct mlx5_ifc_flow_context_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
        u8         group_id[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         flow_tag[0x18];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_60[0x10];
        u8         action[0x10];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_80[0x8];
        u8         destination_list_size[0x18];
 
-       u8         reserved_4[0x160];
+       u8         reserved_at_a0[0x160];
 
        struct mlx5_ifc_fte_match_param_bits match_value;
 
-       u8         reserved_5[0x600];
+       u8         reserved_at_1200[0x600];
 
        struct mlx5_ifc_dest_format_struct_bits destination[0];
 };
@@ -1944,43 +1944,43 @@ enum {
 struct mlx5_ifc_xrc_srqc_bits {
        u8         state[0x4];
        u8         log_xrc_srq_size[0x4];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         wq_signature[0x1];
        u8         cont_srq[0x1];
-       u8         reserved_1[0x1];
+       u8         reserved_at_22[0x1];
        u8         rlky[0x1];
        u8         basic_cyclic_rcv_wqe[0x1];
        u8         log_rq_stride[0x3];
        u8         xrcd[0x18];
 
        u8         page_offset[0x6];
-       u8         reserved_2[0x2];
+       u8         reserved_at_46[0x2];
        u8         cqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         user_index_equal_xrc_srqn[0x1];
-       u8         reserved_4[0x1];
+       u8         reserved_at_81[0x1];
        u8         log_page_size[0x6];
        u8         user_index[0x18];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_a0[0x20];
 
-       u8         reserved_6[0x8];
+       u8         reserved_at_c0[0x8];
        u8         pd[0x18];
 
        u8         lwm[0x10];
        u8         wqe_cnt[0x10];
 
-       u8         reserved_7[0x40];
+       u8         reserved_at_100[0x40];
 
        u8         db_record_addr_h[0x20];
 
        u8         db_record_addr_l[0x1e];
-       u8         reserved_8[0x2];
+       u8         reserved_at_17e[0x2];
 
-       u8         reserved_9[0x80];
+       u8         reserved_at_180[0x80];
 };
 
 struct mlx5_ifc_traffic_counter_bits {
@@ -1990,16 +1990,16 @@ struct mlx5_ifc_traffic_counter_bits {
 };
 
 struct mlx5_ifc_tisc_bits {
-       u8         reserved_0[0xc];
+       u8         reserved_at_0[0xc];
        u8         prio[0x4];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x100];
+       u8         reserved_at_20[0x100];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_120[0x8];
        u8         transport_domain[0x18];
 
-       u8         reserved_4[0x3c0];
+       u8         reserved_at_140[0x3c0];
 };
 
 enum {
@@ -2024,31 +2024,31 @@ enum {
 };
 
 struct mlx5_ifc_tirc_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
        u8         disp_type[0x4];
-       u8         reserved_1[0x1c];
+       u8         reserved_at_24[0x1c];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
-       u8         reserved_3[0x4];
+       u8         reserved_at_80[0x4];
        u8         lro_timeout_period_usecs[0x10];
        u8         lro_enable_mask[0x4];
        u8         lro_max_ip_payload_size[0x8];
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_a0[0x40];
 
-       u8         reserved_5[0x8];
+       u8         reserved_at_e0[0x8];
        u8         inline_rqn[0x18];
 
        u8         rx_hash_symmetric[0x1];
-       u8         reserved_6[0x1];
+       u8         reserved_at_101[0x1];
        u8         tunneled_offload_en[0x1];
-       u8         reserved_7[0x5];
+       u8         reserved_at_103[0x5];
        u8         indirect_table[0x18];
 
        u8         rx_hash_fn[0x4];
-       u8         reserved_8[0x2];
+       u8         reserved_at_124[0x2];
        u8         self_lb_block[0x2];
        u8         transport_domain[0x18];
 
@@ -2058,7 +2058,7 @@ struct mlx5_ifc_tirc_bits {
 
        struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_inner;
 
-       u8         reserved_9[0x4c0];
+       u8         reserved_at_2c0[0x4c0];
 };
 
 enum {
@@ -2069,39 +2069,39 @@ enum {
 struct mlx5_ifc_srqc_bits {
        u8         state[0x4];
        u8         log_srq_size[0x4];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         wq_signature[0x1];
        u8         cont_srq[0x1];
-       u8         reserved_1[0x1];
+       u8         reserved_at_22[0x1];
        u8         rlky[0x1];
-       u8         reserved_2[0x1];
+       u8         reserved_at_24[0x1];
        u8         log_rq_stride[0x3];
        u8         xrcd[0x18];
 
        u8         page_offset[0x6];
-       u8         reserved_3[0x2];
+       u8         reserved_at_46[0x2];
        u8         cqn[0x18];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_60[0x20];
 
-       u8         reserved_5[0x2];
+       u8         reserved_at_80[0x2];
        u8         log_page_size[0x6];
-       u8         reserved_6[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_7[0x20];
+       u8         reserved_at_a0[0x20];
 
-       u8         reserved_8[0x8];
+       u8         reserved_at_c0[0x8];
        u8         pd[0x18];
 
        u8         lwm[0x10];
        u8         wqe_cnt[0x10];
 
-       u8         reserved_9[0x40];
+       u8         reserved_at_100[0x40];
 
        u8         dbr_addr[0x40];
 
-       u8         reserved_10[0x80];
+       u8         reserved_at_180[0x80];
 };
 
 enum {
@@ -2115,39 +2115,39 @@ struct mlx5_ifc_sqc_bits {
        u8         cd_master[0x1];
        u8         fre[0x1];
        u8         flush_in_error_en[0x1];
-       u8         reserved_0[0x4];
+       u8         reserved_at_4[0x4];
        u8         state[0x4];
-       u8         reserved_1[0x14];
+       u8         reserved_at_c[0x14];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_20[0x8];
        u8         user_index[0x18];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
-       u8         reserved_4[0xa0];
+       u8         reserved_at_60[0xa0];
 
        u8         tis_lst_sz[0x10];
-       u8         reserved_5[0x10];
+       u8         reserved_at_110[0x10];
 
-       u8         reserved_6[0x40];
+       u8         reserved_at_120[0x40];
 
-       u8         reserved_7[0x8];
+       u8         reserved_at_160[0x8];
        u8         tis_num_0[0x18];
 
        struct mlx5_ifc_wq_bits wq;
 };
 
 struct mlx5_ifc_rqtc_bits {
-       u8         reserved_0[0xa0];
+       u8         reserved_at_0[0xa0];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_a0[0x10];
        u8         rqt_max_size[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_c0[0x10];
        u8         rqt_actual_size[0x10];
 
-       u8         reserved_3[0x6a0];
+       u8         reserved_at_e0[0x6a0];
 
        struct mlx5_ifc_rq_num_bits rq_num[0];
 };
@@ -2165,27 +2165,27 @@ enum {
 
 struct mlx5_ifc_rqc_bits {
        u8         rlky[0x1];
-       u8         reserved_0[0x2];
+       u8         reserved_at_1[0x2];
        u8         vsd[0x1];
        u8         mem_rq_type[0x4];
        u8         state[0x4];
-       u8         reserved_1[0x1];
+       u8         reserved_at_c[0x1];
        u8         flush_in_error_en[0x1];
-       u8         reserved_2[0x12];
+       u8         reserved_at_e[0x12];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_20[0x8];
        u8         user_index[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
        u8         counter_set_id[0x8];
-       u8         reserved_5[0x18];
+       u8         reserved_at_68[0x18];
 
-       u8         reserved_6[0x8];
+       u8         reserved_at_80[0x8];
        u8         rmpn[0x18];
 
-       u8         reserved_7[0xe0];
+       u8         reserved_at_a0[0xe0];
 
        struct mlx5_ifc_wq_bits wq;
 };
@@ -2196,31 +2196,31 @@ enum {
 };
 
 struct mlx5_ifc_rmpc_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         state[0x4];
-       u8         reserved_1[0x14];
+       u8         reserved_at_c[0x14];
 
        u8         basic_cyclic_rcv_wqe[0x1];
-       u8         reserved_2[0x1f];
+       u8         reserved_at_21[0x1f];
 
-       u8         reserved_3[0x140];
+       u8         reserved_at_40[0x140];
 
        struct mlx5_ifc_wq_bits wq;
 };
 
 struct mlx5_ifc_nic_vport_context_bits {
-       u8         reserved_0[0x1f];
+       u8         reserved_at_0[0x1f];
        u8         roce_en[0x1];
 
        u8         arm_change_event[0x1];
-       u8         reserved_1[0x1a];
+       u8         reserved_at_21[0x1a];
        u8         event_on_mtu[0x1];
        u8         event_on_promisc_change[0x1];
        u8         event_on_vlan_change[0x1];
        u8         event_on_mc_address_change[0x1];
        u8         event_on_uc_address_change[0x1];
 
-       u8         reserved_2[0xf0];
+       u8         reserved_at_40[0xf0];
 
        u8         mtu[0x10];
 
@@ -2228,21 +2228,21 @@ struct mlx5_ifc_nic_vport_context_bits {
        u8         port_guid[0x40];
        u8         node_guid[0x40];
 
-       u8         reserved_3[0x140];
+       u8         reserved_at_200[0x140];
        u8         qkey_violation_counter[0x10];
-       u8         reserved_4[0x430];
+       u8         reserved_at_350[0x430];
 
        u8         promisc_uc[0x1];
        u8         promisc_mc[0x1];
        u8         promisc_all[0x1];
-       u8         reserved_5[0x2];
+       u8         reserved_at_783[0x2];
        u8         allowed_list_type[0x3];
-       u8         reserved_6[0xc];
+       u8         reserved_at_788[0xc];
        u8         allowed_list_size[0xc];
 
        struct mlx5_ifc_mac_address_layout_bits permanent_address;
 
-       u8         reserved_7[0x20];
+       u8         reserved_at_7e0[0x20];
 
        u8         current_uc_mac_address[0][0x40];
 };
@@ -2254,9 +2254,9 @@ enum {
 };
 
 struct mlx5_ifc_mkc_bits {
-       u8         reserved_0[0x1];
+       u8         reserved_at_0[0x1];
        u8         free[0x1];
-       u8         reserved_1[0xd];
+       u8         reserved_at_2[0xd];
        u8         small_fence_on_rdma_read_response[0x1];
        u8         umr_en[0x1];
        u8         a[0x1];
@@ -2265,19 +2265,19 @@ struct mlx5_ifc_mkc_bits {
        u8         lw[0x1];
        u8         lr[0x1];
        u8         access_mode[0x2];
-       u8         reserved_2[0x8];
+       u8         reserved_at_18[0x8];
 
        u8         qpn[0x18];
        u8         mkey_7_0[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_40[0x20];
 
        u8         length64[0x1];
        u8         bsf_en[0x1];
        u8         sync_umr[0x1];
-       u8         reserved_4[0x2];
+       u8         reserved_at_63[0x2];
        u8         expected_sigerr_count[0x1];
-       u8         reserved_5[0x1];
+       u8         reserved_at_66[0x1];
        u8         en_rinval[0x1];
        u8         pd[0x18];
 
@@ -2287,18 +2287,18 @@ struct mlx5_ifc_mkc_bits {
 
        u8         bsf_octword_size[0x20];
 
-       u8         reserved_6[0x80];
+       u8         reserved_at_120[0x80];
 
        u8         translations_octword_size[0x20];
 
-       u8         reserved_7[0x1b];
+       u8         reserved_at_1c0[0x1b];
        u8         log_page_size[0x5];
 
-       u8         reserved_8[0x20];
+       u8         reserved_at_1e0[0x20];
 };
 
 struct mlx5_ifc_pkey_bits {
-       u8         reserved_0[0x10];
+       u8         reserved_at_0[0x10];
        u8         pkey[0x10];
 };
 
@@ -2309,19 +2309,19 @@ struct mlx5_ifc_array128_auto_bits {
 struct mlx5_ifc_hca_vport_context_bits {
        u8         field_select[0x20];
 
-       u8         reserved_0[0xe0];
+       u8         reserved_at_20[0xe0];
 
        u8         sm_virt_aware[0x1];
        u8         has_smi[0x1];
        u8         has_raw[0x1];
        u8         grh_required[0x1];
-       u8         reserved_1[0xc];
+       u8         reserved_at_104[0xc];
        u8         port_physical_state[0x4];
        u8         vport_state_policy[0x4];
        u8         port_state[0x4];
        u8         vport_state[0x4];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_120[0x20];
 
        u8         system_image_guid[0x40];
 
@@ -2337,33 +2337,33 @@ struct mlx5_ifc_hca_vport_context_bits {
 
        u8         cap_mask2_field_select[0x20];
 
-       u8         reserved_3[0x80];
+       u8         reserved_at_280[0x80];
 
        u8         lid[0x10];
-       u8         reserved_4[0x4];
+       u8         reserved_at_310[0x4];
        u8         init_type_reply[0x4];
        u8         lmc[0x3];
        u8         subnet_timeout[0x5];
 
        u8         sm_lid[0x10];
        u8         sm_sl[0x4];
-       u8         reserved_5[0xc];
+       u8         reserved_at_334[0xc];
 
        u8         qkey_violation_counter[0x10];
        u8         pkey_violation_counter[0x10];
 
-       u8         reserved_6[0xca0];
+       u8         reserved_at_360[0xca0];
 };
 
 struct mlx5_ifc_esw_vport_context_bits {
-       u8         reserved_0[0x3];
+       u8         reserved_at_0[0x3];
        u8         vport_svlan_strip[0x1];
        u8         vport_cvlan_strip[0x1];
        u8         vport_svlan_insert[0x1];
        u8         vport_cvlan_insert[0x2];
-       u8         reserved_1[0x18];
+       u8         reserved_at_8[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_20[0x20];
 
        u8         svlan_cfi[0x1];
        u8         svlan_pcp[0x3];
@@ -2372,7 +2372,7 @@ struct mlx5_ifc_esw_vport_context_bits {
        u8         cvlan_pcp[0x3];
        u8         cvlan_id[0xc];
 
-       u8         reserved_3[0x7a0];
+       u8         reserved_at_60[0x7a0];
 };
 
 enum {
@@ -2387,41 +2387,41 @@ enum {
 
 struct mlx5_ifc_eqc_bits {
        u8         status[0x4];
-       u8         reserved_0[0x9];
+       u8         reserved_at_4[0x9];
        u8         ec[0x1];
        u8         oi[0x1];
-       u8         reserved_1[0x5];
+       u8         reserved_at_f[0x5];
        u8         st[0x4];
-       u8         reserved_2[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_4[0x14];
+       u8         reserved_at_40[0x14];
        u8         page_offset[0x6];
-       u8         reserved_5[0x6];
+       u8         reserved_at_5a[0x6];
 
-       u8         reserved_6[0x3];
+       u8         reserved_at_60[0x3];
        u8         log_eq_size[0x5];
        u8         uar_page[0x18];
 
-       u8         reserved_7[0x20];
+       u8         reserved_at_80[0x20];
 
-       u8         reserved_8[0x18];
+       u8         reserved_at_a0[0x18];
        u8         intr[0x8];
 
-       u8         reserved_9[0x3];
+       u8         reserved_at_c0[0x3];
        u8         log_page_size[0x5];
-       u8         reserved_10[0x18];
+       u8         reserved_at_c8[0x18];
 
-       u8         reserved_11[0x60];
+       u8         reserved_at_e0[0x60];
 
-       u8         reserved_12[0x8];
+       u8         reserved_at_140[0x8];
        u8         consumer_counter[0x18];
 
-       u8         reserved_13[0x8];
+       u8         reserved_at_160[0x8];
        u8         producer_counter[0x18];
 
-       u8         reserved_14[0x80];
+       u8         reserved_at_180[0x80];
 };
 
 enum {
@@ -2445,14 +2445,14 @@ enum {
 };
 
 struct mlx5_ifc_dctc_bits {
-       u8         reserved_0[0x4];
+       u8         reserved_at_0[0x4];
        u8         state[0x4];
-       u8         reserved_1[0x18];
+       u8         reserved_at_8[0x18];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_20[0x8];
        u8         user_index[0x18];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
        u8         counter_set_id[0x8];
@@ -2464,45 +2464,45 @@ struct mlx5_ifc_dctc_bits {
        u8         latency_sensitive[0x1];
        u8         rlky[0x1];
        u8         free_ar[0x1];
-       u8         reserved_4[0xd];
+       u8         reserved_at_73[0xd];
 
-       u8         reserved_5[0x8];
+       u8         reserved_at_80[0x8];
        u8         cs_res[0x8];
-       u8         reserved_6[0x3];
+       u8         reserved_at_90[0x3];
        u8         min_rnr_nak[0x5];
-       u8         reserved_7[0x8];
+       u8         reserved_at_98[0x8];
 
-       u8         reserved_8[0x8];
+       u8         reserved_at_a0[0x8];
        u8         srqn[0x18];
 
-       u8         reserved_9[0x8];
+       u8         reserved_at_c0[0x8];
        u8         pd[0x18];
 
        u8         tclass[0x8];
-       u8         reserved_10[0x4];
+       u8         reserved_at_e8[0x4];
        u8         flow_label[0x14];
 
        u8         dc_access_key[0x40];
 
-       u8         reserved_11[0x5];
+       u8         reserved_at_140[0x5];
        u8         mtu[0x3];
        u8         port[0x8];
        u8         pkey_index[0x10];
 
-       u8         reserved_12[0x8];
+       u8         reserved_at_160[0x8];
        u8         my_addr_index[0x8];
-       u8         reserved_13[0x8];
+       u8         reserved_at_170[0x8];
        u8         hop_limit[0x8];
 
        u8         dc_access_key_violation_count[0x20];
 
-       u8         reserved_14[0x14];
+       u8         reserved_at_1a0[0x14];
        u8         dei_cfi[0x1];
        u8         eth_prio[0x3];
        u8         ecn[0x2];
        u8         dscp[0x6];
 
-       u8         reserved_15[0x40];
+       u8         reserved_at_1c0[0x40];
 };
 
 enum {
@@ -2524,54 +2524,54 @@ enum {
 
 struct mlx5_ifc_cqc_bits {
        u8         status[0x4];
-       u8         reserved_0[0x4];
+       u8         reserved_at_4[0x4];
        u8         cqe_sz[0x3];
        u8         cc[0x1];
-       u8         reserved_1[0x1];
+       u8         reserved_at_c[0x1];
        u8         scqe_break_moderation_en[0x1];
        u8         oi[0x1];
-       u8         reserved_2[0x2];
+       u8         reserved_at_f[0x2];
        u8         cqe_zip_en[0x1];
        u8         mini_cqe_res_format[0x2];
        u8         st[0x4];
-       u8         reserved_3[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_5[0x14];
+       u8         reserved_at_40[0x14];
        u8         page_offset[0x6];
-       u8         reserved_6[0x6];
+       u8         reserved_at_5a[0x6];
 
-       u8         reserved_7[0x3];
+       u8         reserved_at_60[0x3];
        u8         log_cq_size[0x5];
        u8         uar_page[0x18];
 
-       u8         reserved_8[0x4];
+       u8         reserved_at_80[0x4];
        u8         cq_period[0xc];
        u8         cq_max_count[0x10];
 
-       u8         reserved_9[0x18];
+       u8         reserved_at_a0[0x18];
        u8         c_eqn[0x8];
 
-       u8         reserved_10[0x3];
+       u8         reserved_at_c0[0x3];
        u8         log_page_size[0x5];
-       u8         reserved_11[0x18];
+       u8         reserved_at_c8[0x18];
 
-       u8         reserved_12[0x20];
+       u8         reserved_at_e0[0x20];
 
-       u8         reserved_13[0x8];
+       u8         reserved_at_100[0x8];
        u8         last_notified_index[0x18];
 
-       u8         reserved_14[0x8];
+       u8         reserved_at_120[0x8];
        u8         last_solicit_index[0x18];
 
-       u8         reserved_15[0x8];
+       u8         reserved_at_140[0x8];
        u8         consumer_counter[0x18];
 
-       u8         reserved_16[0x8];
+       u8         reserved_at_160[0x8];
        u8         producer_counter[0x18];
 
-       u8         reserved_17[0x40];
+       u8         reserved_at_180[0x40];
 
        u8         dbr_addr[0x40];
 };
@@ -2580,16 +2580,16 @@ union mlx5_ifc_cong_control_roce_ecn_auto_bits {
        struct mlx5_ifc_cong_control_802_1qau_rp_bits cong_control_802_1qau_rp;
        struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits cong_control_r_roce_ecn_rp;
        struct mlx5_ifc_cong_control_r_roce_ecn_np_bits cong_control_r_roce_ecn_np;
-       u8         reserved_0[0x800];
+       u8         reserved_at_0[0x800];
 };
 
 struct mlx5_ifc_query_adapter_param_block_bits {
-       u8         reserved_0[0xc0];
+       u8         reserved_at_0[0xc0];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_c0[0x8];
        u8         ieee_vendor_id[0x18];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_e0[0x10];
        u8         vsd_vendor_id[0x10];
 
        u8         vsd[208][0x8];
@@ -2600,14 +2600,14 @@ struct mlx5_ifc_query_adapter_param_block_bits {
 union mlx5_ifc_modify_field_select_resize_field_select_auto_bits {
        struct mlx5_ifc_modify_field_select_bits modify_field_select;
        struct mlx5_ifc_resize_field_select_bits resize_field_select;
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 };
 
 union mlx5_ifc_field_select_802_1_r_roce_auto_bits {
        struct mlx5_ifc_field_select_802_1qau_rp_bits field_select_802_1qau_rp;
        struct mlx5_ifc_field_select_r_roce_rp_bits field_select_r_roce_rp;
        struct mlx5_ifc_field_select_r_roce_np_bits field_select_r_roce_np;
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 };
 
 union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
@@ -2619,7 +2619,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
        struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout;
        struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
        struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
-       u8         reserved_0[0x7c0];
+       u8         reserved_at_0[0x7c0];
 };
 
 union mlx5_ifc_event_auto_bits {
@@ -2635,23 +2635,23 @@ union mlx5_ifc_event_auto_bits {
        struct mlx5_ifc_db_bf_congestion_event_bits db_bf_congestion_event;
        struct mlx5_ifc_stall_vl_event_bits stall_vl_event;
        struct mlx5_ifc_cmd_inter_comp_event_bits cmd_inter_comp_event;
-       u8         reserved_0[0xe0];
+       u8         reserved_at_0[0xe0];
 };
 
 struct mlx5_ifc_health_buffer_bits {
-       u8         reserved_0[0x100];
+       u8         reserved_at_0[0x100];
 
        u8         assert_existptr[0x20];
 
        u8         assert_callra[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_140[0x40];
 
        u8         fw_version[0x20];
 
        u8         hw_id[0x20];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_1c0[0x20];
 
        u8         irisc_index[0x8];
        u8         synd[0x8];
@@ -2660,20 +2660,20 @@ struct mlx5_ifc_health_buffer_bits {
 
 struct mlx5_ifc_register_loopback_control_bits {
        u8         no_lb[0x1];
-       u8         reserved_0[0x7];
+       u8         reserved_at_1[0x7];
        u8         port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_teardown_hca_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 enum {
@@ -2683,108 +2683,108 @@ enum {
 
 struct mlx5_ifc_teardown_hca_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         profile[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_sqerr2rts_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_sqerr2rts_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_sqd2rts_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_sqd2rts_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_set_roce_address_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_roce_address_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         roce_address_index[0x10];
-       u8         reserved_2[0x10];
+       u8         reserved_at_50[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_roce_addr_layout_bits roce_address;
 };
 
 struct mlx5_ifc_set_mad_demux_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 enum {
@@ -2794,89 +2794,89 @@ enum {
 
 struct mlx5_ifc_set_mad_demux_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_3[0x6];
+       u8         reserved_at_60[0x6];
        u8         demux_mode[0x2];
-       u8         reserved_4[0x18];
+       u8         reserved_at_68[0x18];
 };
 
 struct mlx5_ifc_set_l2_table_entry_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_l2_table_entry_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_40[0x60];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_index[0x18];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_c0[0x20];
 
-       u8         reserved_5[0x13];
+       u8         reserved_at_e0[0x13];
        u8         vlan_valid[0x1];
        u8         vlan[0xc];
 
        struct mlx5_ifc_mac_address_layout_bits mac_address;
 
-       u8         reserved_6[0xc0];
+       u8         reserved_at_140[0xc0];
 };
 
 struct mlx5_ifc_set_issi_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_issi_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         current_issi[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_set_hca_cap_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_hca_cap_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        union mlx5_ifc_hca_cap_union_bits capability;
 };
@@ -2890,156 +2890,156 @@ enum {
 
 struct mlx5_ifc_set_fte_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_fte_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x18];
+       u8         reserved_at_c0[0x18];
        u8         modify_enable_mask[0x8];
 
-       u8         reserved_6[0x20];
+       u8         reserved_at_e0[0x20];
 
        u8         flow_index[0x20];
 
-       u8         reserved_7[0xe0];
+       u8         reserved_at_120[0xe0];
 
        struct mlx5_ifc_flow_context_bits flow_context;
 };
 
 struct mlx5_ifc_rts2rts_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_rts2rts_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_rtr2rts_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_rtr2rts_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_rst2init_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_rst2init_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_query_xrc_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry;
 
-       u8         reserved_2[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_query_xrc_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrc_srqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 enum {
@@ -3049,13 +3049,13 @@ enum {
 
 struct mlx5_ifc_query_vport_state_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_60[0x18];
        u8         admin_state[0x4];
        u8         state[0x4];
 };
@@ -3067,25 +3067,25 @@ enum {
 
 struct mlx5_ifc_query_vport_state_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_vport_counter_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_traffic_counter_bits received_errors;
 
@@ -3111,7 +3111,7 @@ struct mlx5_ifc_query_vport_counter_out_bits {
 
        struct mlx5_ifc_traffic_counter_bits transmitted_eth_multicast;
 
-       u8         reserved_2[0xa00];
+       u8         reserved_at_680[0xa00];
 };
 
 enum {
@@ -3120,328 +3120,328 @@ enum {
 
 struct mlx5_ifc_query_vport_counter_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x60];
+       u8         reserved_at_60[0x60];
 
        u8         clear[0x1];
-       u8         reserved_4[0x1f];
+       u8         reserved_at_c1[0x1f];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_e0[0x20];
 };
 
 struct mlx5_ifc_query_tis_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_tisc_bits tis_context;
 };
 
 struct mlx5_ifc_query_tis_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tisn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_tir_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_tirc_bits tir_context;
 };
 
 struct mlx5_ifc_query_tir_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tirn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_srqc_bits srq_context_entry;
 
-       u8         reserved_2[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_query_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         srqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_sq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_sqc_bits sq_context;
 };
 
 struct mlx5_ifc_query_sq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         sqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_special_contexts_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 
        u8         resd_lkey[0x20];
 };
 
 struct mlx5_ifc_query_special_contexts_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_query_rqt_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rqtc_bits rqt_context;
 };
 
 struct mlx5_ifc_query_rqt_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqtn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_rq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rqc_bits rq_context;
 };
 
 struct mlx5_ifc_query_rq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_roce_address_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_roce_addr_layout_bits roce_address;
 };
 
 struct mlx5_ifc_query_roce_address_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         roce_address_index[0x10];
-       u8         reserved_2[0x10];
+       u8         reserved_at_50[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_rmp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rmpc_bits rmp_context;
 };
 
 struct mlx5_ifc_query_rmp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rmpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_3[0x80];
+       u8         reserved_at_800[0x80];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_query_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_q_counter_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         rx_write_requests[0x20];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_a0[0x20];
 
        u8         rx_read_requests[0x20];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_e0[0x20];
 
        u8         rx_atomic_requests[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_120[0x20];
 
        u8         rx_dct_connect[0x20];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_160[0x20];
 
        u8         out_of_buffer[0x20];
 
-       u8         reserved_6[0x20];
+       u8         reserved_at_1a0[0x20];
 
        u8         out_of_sequence[0x20];
 
-       u8         reserved_7[0x620];
+       u8         reserved_at_1e0[0x620];
 };
 
 struct mlx5_ifc_query_q_counter_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x80];
+       u8         reserved_at_40[0x80];
 
        u8         clear[0x1];
-       u8         reserved_3[0x1f];
+       u8         reserved_at_c1[0x1f];
 
-       u8         reserved_4[0x18];
+       u8         reserved_at_e0[0x18];
        u8         counter_set_id[0x8];
 };
 
 struct mlx5_ifc_query_pages_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_40[0x10];
        u8         function_id[0x10];
 
        u8         num_pages[0x20];
@@ -3455,55 +3455,55 @@ enum {
 
 struct mlx5_ifc_query_pages_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         function_id[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_nic_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
 };
 
 struct mlx5_ifc_query_nic_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x5];
+       u8         reserved_at_60[0x5];
        u8         allowed_list_type[0x3];
-       u8         reserved_4[0x18];
+       u8         reserved_at_68[0x18];
 };
 
 struct mlx5_ifc_query_mkey_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
 
-       u8         reserved_2[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         bsf0_klm0_pas_mtt0_1[16][0x8];
 
@@ -3512,265 +3512,265 @@ struct mlx5_ifc_query_mkey_out_bits {
 
 struct mlx5_ifc_query_mkey_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         mkey_index[0x18];
 
        u8         pg_access[0x1];
-       u8         reserved_3[0x1f];
+       u8         reserved_at_61[0x1f];
 };
 
 struct mlx5_ifc_query_mad_demux_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         mad_dumux_parameters_block[0x20];
 };
 
 struct mlx5_ifc_query_mad_demux_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_query_l2_table_entry_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xa0];
+       u8         reserved_at_40[0xa0];
 
-       u8         reserved_2[0x13];
+       u8         reserved_at_e0[0x13];
        u8         vlan_valid[0x1];
        u8         vlan[0xc];
 
        struct mlx5_ifc_mac_address_layout_bits mac_address;
 
-       u8         reserved_3[0xc0];
+       u8         reserved_at_140[0xc0];
 };
 
 struct mlx5_ifc_query_l2_table_entry_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_40[0x60];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_index[0x18];
 
-       u8         reserved_4[0x140];
+       u8         reserved_at_c0[0x140];
 };
 
 struct mlx5_ifc_query_issi_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_40[0x10];
        u8         current_issi[0x10];
 
-       u8         reserved_2[0xa0];
+       u8         reserved_at_60[0xa0];
 
-       u8         supported_issi_reserved[76][0x8];
+       u8         reserved_at_100[76][0x8];
        u8         supported_issi_dw0[0x20];
 };
 
 struct mlx5_ifc_query_issi_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_query_hca_vport_pkey_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_pkey_bits pkey[0];
 };
 
 struct mlx5_ifc_query_hca_vport_pkey_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xb];
+       u8         reserved_at_41[0xb];
        u8         port_num[0x4];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         pkey_index[0x10];
 };
 
 struct mlx5_ifc_query_hca_vport_gid_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 
        u8         gids_num[0x10];
-       u8         reserved_2[0x10];
+       u8         reserved_at_70[0x10];
 
        struct mlx5_ifc_array128_auto_bits gid[0];
 };
 
 struct mlx5_ifc_query_hca_vport_gid_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xb];
+       u8         reserved_at_41[0xb];
        u8         port_num[0x4];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         gid_index[0x10];
 };
 
 struct mlx5_ifc_query_hca_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_hca_vport_context_bits hca_vport_context;
 };
 
 struct mlx5_ifc_query_hca_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xb];
+       u8         reserved_at_41[0xb];
        u8         port_num[0x4];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_hca_cap_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        union mlx5_ifc_hca_cap_union_bits capability;
 };
 
 struct mlx5_ifc_query_hca_cap_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_query_flow_table_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x80];
+       u8         reserved_at_40[0x80];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_c0[0x8];
        u8         level[0x8];
-       u8         reserved_3[0x8];
+       u8         reserved_at_d0[0x8];
        u8         log_size[0x8];
 
-       u8         reserved_4[0x120];
+       u8         reserved_at_e0[0x120];
 };
 
 struct mlx5_ifc_query_flow_table_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x140];
+       u8         reserved_at_c0[0x140];
 };
 
 struct mlx5_ifc_query_fte_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x1c0];
+       u8         reserved_at_40[0x1c0];
 
        struct mlx5_ifc_flow_context_bits flow_context;
 };
 
 struct mlx5_ifc_query_fte_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x40];
+       u8         reserved_at_c0[0x40];
 
        u8         flow_index[0x20];
 
-       u8         reserved_6[0xe0];
+       u8         reserved_at_120[0xe0];
 };
 
 enum {
@@ -3781,84 +3781,84 @@ enum {
 
 struct mlx5_ifc_query_flow_group_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0xa0];
+       u8         reserved_at_40[0xa0];
 
        u8         start_flow_index[0x20];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_100[0x20];
 
        u8         end_flow_index[0x20];
 
-       u8         reserved_3[0xa0];
+       u8         reserved_at_140[0xa0];
 
-       u8         reserved_4[0x18];
+       u8         reserved_at_1e0[0x18];
        u8         match_criteria_enable[0x8];
 
        struct mlx5_ifc_fte_match_param_bits match_criteria;
 
-       u8         reserved_5[0xe00];
+       u8         reserved_at_1200[0xe00];
 };
 
 struct mlx5_ifc_query_flow_group_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
        u8         group_id[0x20];
 
-       u8         reserved_5[0x120];
+       u8         reserved_at_e0[0x120];
 };
 
 struct mlx5_ifc_query_esw_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_esw_vport_context_bits esw_vport_context;
 };
 
 struct mlx5_ifc_query_esw_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_modify_esw_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_esw_vport_context_fields_select_bits {
-       u8         reserved[0x1c];
+       u8         reserved_at_0[0x1c];
        u8         vport_cvlan_insert[0x1];
        u8         vport_svlan_insert[0x1];
        u8         vport_cvlan_strip[0x1];
@@ -3867,13 +3867,13 @@ struct mlx5_ifc_esw_vport_context_fields_select_bits {
 
 struct mlx5_ifc_modify_esw_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
        struct mlx5_ifc_esw_vport_context_fields_select_bits field_select;
@@ -3883,124 +3883,124 @@ struct mlx5_ifc_modify_esw_vport_context_in_bits {
 
 struct mlx5_ifc_query_eq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_eqc_bits eq_context_entry;
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_280[0x40];
 
        u8         event_bitmask[0x40];
 
-       u8         reserved_3[0x580];
+       u8         reserved_at_300[0x580];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_query_eq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         eq_number[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_dct_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_dctc_bits dct_context_entry;
 
-       u8         reserved_2[0x180];
+       u8         reserved_at_280[0x180];
 };
 
 struct mlx5_ifc_query_dct_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         dctn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_cq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_cqc_bits cq_context;
 
-       u8         reserved_2[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_query_cq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_cong_status_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 
        u8         enable[0x1];
        u8         tag_enable[0x1];
-       u8         reserved_2[0x1e];
+       u8         reserved_at_62[0x1e];
 };
 
 struct mlx5_ifc_query_cong_status_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         priority[0x4];
        u8         cong_protocol[0x4];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_cong_statistics_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         cur_flows[0x20];
 
@@ -4014,7 +4014,7 @@ struct mlx5_ifc_query_cong_statistics_out_bits {
 
        u8         cnp_handled_low[0x20];
 
-       u8         reserved_2[0x100];
+       u8         reserved_at_140[0x100];
 
        u8         time_stamp_high[0x20];
 
@@ -4030,453 +4030,455 @@ struct mlx5_ifc_query_cong_statistics_out_bits {
 
        u8         cnps_sent_low[0x20];
 
-       u8         reserved_3[0x560];
+       u8         reserved_at_320[0x560];
 };
 
 struct mlx5_ifc_query_cong_statistics_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         clear[0x1];
-       u8         reserved_2[0x1f];
+       u8         reserved_at_41[0x1f];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_cong_params_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters;
 };
 
 struct mlx5_ifc_query_cong_params_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x1c];
+       u8         reserved_at_40[0x1c];
        u8         cong_protocol[0x4];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_query_adapter_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_query_adapter_param_block_bits query_adapter_struct;
 };
 
 struct mlx5_ifc_query_adapter_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_qp_2rst_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_qp_2rst_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_qp_2err_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_qp_2err_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_page_fault_resume_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_page_fault_resume_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         error[0x1];
-       u8         reserved_2[0x4];
+       u8         reserved_at_41[0x4];
        u8         rdma[0x1];
        u8         read_write[0x1];
        u8         req_res[0x1];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_nop_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_nop_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_vport_state_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_vport_state_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x18];
+       u8         reserved_at_60[0x18];
        u8         admin_state[0x4];
-       u8         reserved_4[0x4];
+       u8         reserved_at_7c[0x4];
 };
 
 struct mlx5_ifc_modify_tis_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_tis_bitmask_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
-       u8         reserved_1[0x1f];
+       u8         reserved_at_20[0x1f];
        u8         prio[0x1];
 };
 
 struct mlx5_ifc_modify_tis_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tisn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_modify_tis_bitmask_bits bitmask;
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_tisc_bits ctx;
 };
 
 struct mlx5_ifc_modify_tir_bitmask_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
-       u8         reserved_1[0x1b];
+       u8         reserved_at_20[0x1b];
        u8         self_lb_en[0x1];
-       u8         reserved_2[0x3];
+       u8         reserved_at_3c[0x1];
+       u8         hash[0x1];
+       u8         reserved_at_3e[0x1];
        u8         lro[0x1];
 };
 
 struct mlx5_ifc_modify_tir_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_tir_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tirn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_modify_tir_bitmask_bits bitmask;
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_tirc_bits ctx;
 };
 
 struct mlx5_ifc_modify_sq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_sq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         sq_state[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_44[0x4];
        u8         sqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         modify_bitmask[0x40];
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_sqc_bits ctx;
 };
 
 struct mlx5_ifc_modify_rqt_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_rqt_bitmask_bits {
-       u8         reserved[0x20];
+       u8         reserved_at_0[0x20];
 
-       u8         reserved1[0x1f];
+       u8         reserved_at_20[0x1f];
        u8         rqn_list[0x1];
 };
 
 struct mlx5_ifc_modify_rqt_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqtn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_rqt_bitmask_bits bitmask;
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_rqtc_bits ctx;
 };
 
 struct mlx5_ifc_modify_rq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_rq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         rq_state[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_44[0x4];
        u8         rqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         modify_bitmask[0x40];
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_rqc_bits ctx;
 };
 
 struct mlx5_ifc_modify_rmp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_rmp_bitmask_bits {
-       u8         reserved[0x20];
+       u8         reserved_at_0[0x20];
 
-       u8         reserved1[0x1f];
+       u8         reserved_at_20[0x1f];
        u8         lwm[0x1];
 };
 
 struct mlx5_ifc_modify_rmp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         rmp_state[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_44[0x4];
        u8         rmpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_rmp_bitmask_bits bitmask;
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_c0[0x40];
 
        struct mlx5_ifc_rmpc_bits ctx;
 };
 
 struct mlx5_ifc_modify_nic_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_nic_vport_field_select_bits {
-       u8         reserved_0[0x19];
+       u8         reserved_at_0[0x19];
        u8         mtu[0x1];
        u8         change_event[0x1];
        u8         promisc[0x1];
        u8         permanent_address[0x1];
        u8         addresses_list[0x1];
        u8         roce_en[0x1];
-       u8         reserved_1[0x1];
+       u8         reserved_at_1f[0x1];
 };
 
 struct mlx5_ifc_modify_nic_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xf];
+       u8         reserved_at_41[0xf];
        u8         vport_number[0x10];
 
        struct mlx5_ifc_modify_nic_vport_field_select_bits field_select;
 
-       u8         reserved_3[0x780];
+       u8         reserved_at_80[0x780];
 
        struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
 };
 
 struct mlx5_ifc_modify_hca_vport_context_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_hca_vport_context_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         other_vport[0x1];
-       u8         reserved_2[0xb];
+       u8         reserved_at_41[0xb];
        u8         port_num[0x4];
        u8         vport_number[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        struct mlx5_ifc_hca_vport_context_bits hca_vport_context;
 };
 
 struct mlx5_ifc_modify_cq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 enum {
@@ -4486,83 +4488,83 @@ enum {
 
 struct mlx5_ifc_modify_cq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
        union mlx5_ifc_modify_field_select_resize_field_select_auto_bits modify_field_select_resize_field_select;
 
        struct mlx5_ifc_cqc_bits cq_context;
 
-       u8         reserved_3[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_modify_cong_status_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_cong_status_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         priority[0x4];
        u8         cong_protocol[0x4];
 
        u8         enable[0x1];
        u8         tag_enable[0x1];
-       u8         reserved_3[0x1e];
+       u8         reserved_at_62[0x1e];
 };
 
 struct mlx5_ifc_modify_cong_params_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_cong_params_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x1c];
+       u8         reserved_at_40[0x1c];
        u8         cong_protocol[0x4];
 
        union mlx5_ifc_field_select_802_1_r_roce_auto_bits field_select;
 
-       u8         reserved_3[0x80];
+       u8         reserved_at_80[0x80];
 
        union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters;
 };
 
 struct mlx5_ifc_manage_pages_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
        u8         output_num_entries[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         pas[0][0x40];
 };
@@ -4575,12 +4577,12 @@ enum {
 
 struct mlx5_ifc_manage_pages_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         function_id[0x10];
 
        u8         input_num_entries[0x20];
@@ -4590,117 +4592,117 @@ struct mlx5_ifc_manage_pages_in_bits {
 
 struct mlx5_ifc_mad_ifc_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         response_mad_packet[256][0x8];
 };
 
 struct mlx5_ifc_mad_ifc_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         remote_lid[0x10];
-       u8         reserved_2[0x8];
+       u8         reserved_at_50[0x8];
        u8         port[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         mad[256][0x8];
 };
 
 struct mlx5_ifc_init_hca_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_init_hca_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_init2rtr_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_init2rtr_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_init2init_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_init2init_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_800[0x80];
 };
 
 struct mlx5_ifc_get_dropped_packet_log_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         packet_headers_log[128][0x8];
 
@@ -4709,1029 +4711,1029 @@ struct mlx5_ifc_get_dropped_packet_log_out_bits {
 
 struct mlx5_ifc_get_dropped_packet_log_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_gen_eqe_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         eq_number[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         eqe[64][0x8];
 };
 
 struct mlx5_ifc_gen_eq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_enable_hca_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 };
 
 struct mlx5_ifc_enable_hca_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         function_id[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_drain_dct_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_drain_dct_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         dctn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_disable_hca_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_40[0x20];
 };
 
 struct mlx5_ifc_disable_hca_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         function_id[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_detach_from_mcg_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_detach_from_mcg_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         multicast_gid[16][0x8];
 };
 
 struct mlx5_ifc_destroy_xrc_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_xrc_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrc_srqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_tis_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_tis_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tisn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_tir_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_tir_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         tirn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         srqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_sq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_sq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         sqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_rqt_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_rqt_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqtn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_rq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_rq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_rmp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_rmp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         rmpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_psv_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_psv_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         psvn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_mkey_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_mkey_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         mkey_index[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_flow_table_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_flow_table_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x140];
+       u8         reserved_at_c0[0x140];
 };
 
 struct mlx5_ifc_destroy_flow_group_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_flow_group_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
        u8         group_id[0x20];
 
-       u8         reserved_5[0x120];
+       u8         reserved_at_e0[0x120];
 };
 
 struct mlx5_ifc_destroy_eq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_eq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         eq_number[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_dct_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_dct_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         dctn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_destroy_cq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_destroy_cq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_delete_vxlan_udp_dport_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_delete_vxlan_udp_dport_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         vxlan_udp_port[0x10];
 };
 
 struct mlx5_ifc_delete_l2_table_entry_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_delete_l2_table_entry_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_40[0x60];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_index[0x18];
 
-       u8         reserved_4[0x140];
+       u8         reserved_at_c0[0x140];
 };
 
 struct mlx5_ifc_delete_fte_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_delete_fte_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x40];
+       u8         reserved_at_c0[0x40];
 
        u8         flow_index[0x20];
 
-       u8         reserved_6[0xe0];
+       u8         reserved_at_120[0xe0];
 };
 
 struct mlx5_ifc_dealloc_xrcd_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_dealloc_xrcd_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrcd[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_dealloc_uar_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_dealloc_uar_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         uar[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_dealloc_transport_domain_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_dealloc_transport_domain_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         transport_domain[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_dealloc_q_counter_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_dealloc_q_counter_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_40[0x18];
        u8         counter_set_id[0x8];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_dealloc_pd_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_dealloc_pd_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         pd[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_xrc_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrc_srqn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_xrc_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry;
 
-       u8         reserved_3[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_create_tis_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         tisn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_tis_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_tisc_bits ctx;
 };
 
 struct mlx5_ifc_create_tir_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         tirn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_tir_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_tirc_bits ctx;
 };
 
 struct mlx5_ifc_create_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         srqn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_srqc_bits srq_context_entry;
 
-       u8         reserved_3[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_create_sq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         sqn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_sq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_sqc_bits ctx;
 };
 
 struct mlx5_ifc_create_rqt_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqtn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_rqt_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rqtc_bits rqt_context;
 };
 
 struct mlx5_ifc_create_rq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         rqn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_rq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rqc_bits ctx;
 };
 
 struct mlx5_ifc_create_rmp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         rmpn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_rmp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0xc0];
+       u8         reserved_at_40[0xc0];
 
        struct mlx5_ifc_rmpc_bits ctx;
 };
 
 struct mlx5_ifc_create_qp_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_qp_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         opt_param_mask[0x20];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_a0[0x20];
 
        struct mlx5_ifc_qpc_bits qpc;
 
-       u8         reserved_4[0x80];
+       u8         reserved_at_800[0x80];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_create_psv_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_80[0x8];
        u8         psv0_index[0x18];
 
-       u8         reserved_3[0x8];
+       u8         reserved_at_a0[0x8];
        u8         psv1_index[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_c0[0x8];
        u8         psv2_index[0x18];
 
-       u8         reserved_5[0x8];
+       u8         reserved_at_e0[0x8];
        u8         psv3_index[0x18];
 };
 
 struct mlx5_ifc_create_psv_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         num_psv[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_44[0x4];
        u8         pd[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_mkey_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         mkey_index[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_mkey_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_40[0x20];
 
        u8         pg_access[0x1];
-       u8         reserved_3[0x1f];
+       u8         reserved_at_61[0x1f];
 
        struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
 
-       u8         reserved_4[0x80];
+       u8         reserved_at_280[0x80];
 
        u8         translations_octword_actual_size[0x20];
 
-       u8         reserved_5[0x560];
+       u8         reserved_at_320[0x560];
 
        u8         klm_pas_mtt[0][0x20];
 };
 
 struct mlx5_ifc_create_flow_table_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_flow_table_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_a0[0x20];
 
-       u8         reserved_5[0x4];
+       u8         reserved_at_c0[0x4];
        u8         table_miss_mode[0x4];
        u8         level[0x8];
-       u8         reserved_6[0x8];
+       u8         reserved_at_d0[0x8];
        u8         log_size[0x8];
 
-       u8         reserved_7[0x8];
+       u8         reserved_at_e0[0x8];
        u8         table_miss_id[0x18];
 
-       u8         reserved_8[0x100];
+       u8         reserved_at_100[0x100];
 };
 
 struct mlx5_ifc_create_flow_group_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         group_id[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 enum {
@@ -5742,134 +5744,134 @@ enum {
 
 struct mlx5_ifc_create_flow_group_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_c0[0x20];
 
        u8         start_flow_index[0x20];
 
-       u8         reserved_6[0x20];
+       u8         reserved_at_100[0x20];
 
        u8         end_flow_index[0x20];
 
-       u8         reserved_7[0xa0];
+       u8         reserved_at_140[0xa0];
 
-       u8         reserved_8[0x18];
+       u8         reserved_at_1e0[0x18];
        u8         match_criteria_enable[0x8];
 
        struct mlx5_ifc_fte_match_param_bits match_criteria;
 
-       u8         reserved_9[0xe00];
+       u8         reserved_at_1200[0xe00];
 };
 
 struct mlx5_ifc_create_eq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x18];
+       u8         reserved_at_40[0x18];
        u8         eq_number[0x8];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_eq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_eqc_bits eq_context_entry;
 
-       u8         reserved_3[0x40];
+       u8         reserved_at_280[0x40];
 
        u8         event_bitmask[0x40];
 
-       u8         reserved_4[0x580];
+       u8         reserved_at_300[0x580];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_create_dct_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         dctn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_dct_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_dctc_bits dct_context_entry;
 
-       u8         reserved_3[0x180];
+       u8         reserved_at_280[0x180];
 };
 
 struct mlx5_ifc_create_cq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         cqn[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_create_cq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        struct mlx5_ifc_cqc_bits cq_context;
 
-       u8         reserved_3[0x600];
+       u8         reserved_at_280[0x600];
 
        u8         pas[0][0x40];
 };
 
 struct mlx5_ifc_config_int_moderation_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x4];
+       u8         reserved_at_40[0x4];
        u8         min_delay[0xc];
        u8         int_vector[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 enum {
@@ -5879,49 +5881,49 @@ enum {
 
 struct mlx5_ifc_config_int_moderation_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x4];
+       u8         reserved_at_40[0x4];
        u8         min_delay[0xc];
        u8         int_vector[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_attach_to_mcg_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_attach_to_mcg_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         qpn[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 
        u8         multicast_gid[16][0x8];
 };
 
 struct mlx5_ifc_arm_xrc_srq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 enum {
@@ -5930,25 +5932,25 @@ enum {
 
 struct mlx5_ifc_arm_xrc_srq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrc_srqn[0x18];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         lwm[0x10];
 };
 
 struct mlx5_ifc_arm_rq_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 enum {
@@ -5957,179 +5959,179 @@ enum {
 
 struct mlx5_ifc_arm_rq_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         srq_number[0x18];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         lwm[0x10];
 };
 
 struct mlx5_ifc_arm_dct_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_arm_dct_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_40[0x8];
        u8         dct_number[0x18];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_xrcd_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         xrcd[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_xrcd_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_alloc_uar_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         uar[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_uar_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_alloc_transport_domain_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         transport_domain[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_transport_domain_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_alloc_q_counter_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x18];
+       u8         reserved_at_40[0x18];
        u8         counter_set_id[0x8];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_q_counter_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_alloc_pd_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x8];
+       u8         reserved_at_40[0x8];
        u8         pd[0x18];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_alloc_pd_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_add_vxlan_udp_dport_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         vxlan_udp_port[0x10];
 };
 
 struct mlx5_ifc_access_register_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         register_data[0][0x20];
 };
@@ -6141,12 +6143,12 @@ enum {
 
 struct mlx5_ifc_access_register_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x10];
+       u8         reserved_at_40[0x10];
        u8         register_id[0x10];
 
        u8         argument[0x20];
@@ -6159,24 +6161,24 @@ struct mlx5_ifc_sltp_reg_bits {
        u8         version[0x4];
        u8         local_port[0x8];
        u8         pnat[0x2];
-       u8         reserved_0[0x2];
+       u8         reserved_at_12[0x2];
        u8         lane[0x4];
-       u8         reserved_1[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_3[0x7];
+       u8         reserved_at_40[0x7];
        u8         polarity[0x1];
        u8         ob_tap0[0x8];
        u8         ob_tap1[0x8];
        u8         ob_tap2[0x8];
 
-       u8         reserved_4[0xc];
+       u8         reserved_at_60[0xc];
        u8         ob_preemp_mode[0x4];
        u8         ob_reg[0x8];
        u8         ob_bias[0x8];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_80[0x20];
 };
 
 struct mlx5_ifc_slrg_reg_bits {
@@ -6184,36 +6186,36 @@ struct mlx5_ifc_slrg_reg_bits {
        u8         version[0x4];
        u8         local_port[0x8];
        u8         pnat[0x2];
-       u8         reserved_0[0x2];
+       u8         reserved_at_12[0x2];
        u8         lane[0x4];
-       u8         reserved_1[0x8];
+       u8         reserved_at_18[0x8];
 
        u8         time_to_link_up[0x10];
-       u8         reserved_2[0xc];
+       u8         reserved_at_30[0xc];
        u8         grade_lane_speed[0x4];
 
        u8         grade_version[0x8];
        u8         grade[0x18];
 
-       u8         reserved_3[0x4];
+       u8         reserved_at_60[0x4];
        u8         height_grade_type[0x4];
        u8         height_grade[0x18];
 
        u8         height_dz[0x10];
        u8         height_dv[0x10];
 
-       u8         reserved_4[0x10];
+       u8         reserved_at_a0[0x10];
        u8         height_sigma[0x10];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_c0[0x20];
 
-       u8         reserved_6[0x4];
+       u8         reserved_at_e0[0x4];
        u8         phase_grade_type[0x4];
        u8         phase_grade[0x18];
 
-       u8         reserved_7[0x8];
+       u8         reserved_at_100[0x8];
        u8         phase_eo_pos[0x8];
-       u8         reserved_8[0x8];
+       u8         reserved_at_110[0x8];
        u8         phase_eo_neg[0x8];
 
        u8         ffe_set_tested[0x10];
@@ -6221,70 +6223,70 @@ struct mlx5_ifc_slrg_reg_bits {
 };
 
 struct mlx5_ifc_pvlc_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x1c];
+       u8         reserved_at_20[0x1c];
        u8         vl_hw_cap[0x4];
 
-       u8         reserved_3[0x1c];
+       u8         reserved_at_40[0x1c];
        u8         vl_admin[0x4];
 
-       u8         reserved_4[0x1c];
+       u8         reserved_at_60[0x1c];
        u8         vl_operational[0x4];
 };
 
 struct mlx5_ifc_pude_reg_bits {
        u8         swid[0x8];
        u8         local_port[0x8];
-       u8         reserved_0[0x4];
+       u8         reserved_at_10[0x4];
        u8         admin_status[0x4];
-       u8         reserved_1[0x4];
+       u8         reserved_at_18[0x4];
        u8         oper_status[0x4];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_ptys_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0xd];
+       u8         reserved_at_10[0xd];
        u8         proto_mask[0x3];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_20[0x40];
 
        u8         eth_proto_capability[0x20];
 
        u8         ib_link_width_capability[0x10];
        u8         ib_proto_capability[0x10];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_a0[0x20];
 
        u8         eth_proto_admin[0x20];
 
        u8         ib_link_width_admin[0x10];
        u8         ib_proto_admin[0x10];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_100[0x20];
 
        u8         eth_proto_oper[0x20];
 
        u8         ib_link_width_oper[0x10];
        u8         ib_proto_oper[0x10];
 
-       u8         reserved_5[0x20];
+       u8         reserved_at_160[0x20];
 
        u8         eth_proto_lp_advertise[0x20];
 
-       u8         reserved_6[0x60];
+       u8         reserved_at_1a0[0x60];
 };
 
 struct mlx5_ifc_ptas_reg_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
        u8         algorithm_options[0x10];
-       u8         reserved_1[0x4];
+       u8         reserved_at_30[0x4];
        u8         repetitions_mode[0x4];
        u8         num_of_repetitions[0x8];
 
@@ -6310,13 +6312,13 @@ struct mlx5_ifc_ptas_reg_bits {
        u8         ndeo_error_threshold[0x10];
 
        u8         mixer_offset_step_size[0x10];
-       u8         reserved_2[0x8];
+       u8         reserved_at_110[0x8];
        u8         mix90_phase_for_voltage_bath[0x8];
 
        u8         mixer_offset_start[0x10];
        u8         mixer_offset_end[0x10];
 
-       u8         reserved_3[0x15];
+       u8         reserved_at_140[0x15];
        u8         ber_test_time[0xb];
 };
 
@@ -6324,154 +6326,154 @@ struct mlx5_ifc_pspa_reg_bits {
        u8         swid[0x8];
        u8         local_port[0x8];
        u8         sub_port[0x8];
-       u8         reserved_0[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_20[0x20];
 };
 
 struct mlx5_ifc_pqdr_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x5];
+       u8         reserved_at_10[0x5];
        u8         prio[0x3];
-       u8         reserved_2[0x6];
+       u8         reserved_at_18[0x6];
        u8         mode[0x2];
 
-       u8         reserved_3[0x20];
+       u8         reserved_at_20[0x20];
 
-       u8         reserved_4[0x10];
+       u8         reserved_at_40[0x10];
        u8         min_threshold[0x10];
 
-       u8         reserved_5[0x10];
+       u8         reserved_at_60[0x10];
        u8         max_threshold[0x10];
 
-       u8         reserved_6[0x10];
+       u8         reserved_at_80[0x10];
        u8         mark_probability_denominator[0x10];
 
-       u8         reserved_7[0x60];
+       u8         reserved_at_a0[0x60];
 };
 
 struct mlx5_ifc_ppsc_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_20[0x60];
 
-       u8         reserved_3[0x1c];
+       u8         reserved_at_80[0x1c];
        u8         wrps_admin[0x4];
 
-       u8         reserved_4[0x1c];
+       u8         reserved_at_a0[0x1c];
        u8         wrps_status[0x4];
 
-       u8         reserved_5[0x8];
+       u8         reserved_at_c0[0x8];
        u8         up_threshold[0x8];
-       u8         reserved_6[0x8];
+       u8         reserved_at_d0[0x8];
        u8         down_threshold[0x8];
 
-       u8         reserved_7[0x20];
+       u8         reserved_at_e0[0x20];
 
-       u8         reserved_8[0x1c];
+       u8         reserved_at_100[0x1c];
        u8         srps_admin[0x4];
 
-       u8         reserved_9[0x1c];
+       u8         reserved_at_120[0x1c];
        u8         srps_status[0x4];
 
-       u8         reserved_10[0x40];
+       u8         reserved_at_140[0x40];
 };
 
 struct mlx5_ifc_pplr_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_20[0x8];
        u8         lb_cap[0x8];
-       u8         reserved_3[0x8];
+       u8         reserved_at_30[0x8];
        u8         lb_en[0x8];
 };
 
 struct mlx5_ifc_pplm_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_20[0x20];
 
        u8         port_profile_mode[0x8];
        u8         static_port_profile[0x8];
        u8         active_port_profile[0x8];
-       u8         reserved_3[0x8];
+       u8         reserved_at_58[0x8];
 
        u8         retransmission_active[0x8];
        u8         fec_mode_active[0x18];
 
-       u8         reserved_4[0x20];
+       u8         reserved_at_80[0x20];
 };
 
 struct mlx5_ifc_ppcnt_reg_bits {
        u8         swid[0x8];
        u8         local_port[0x8];
        u8         pnat[0x2];
-       u8         reserved_0[0x8];
+       u8         reserved_at_12[0x8];
        u8         grp[0x6];
 
        u8         clr[0x1];
-       u8         reserved_1[0x1c];
+       u8         reserved_at_21[0x1c];
        u8         prio_tc[0x3];
 
        union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
 struct mlx5_ifc_ppad_reg_bits {
-       u8         reserved_0[0x3];
+       u8         reserved_at_0[0x3];
        u8         single_mac[0x1];
-       u8         reserved_1[0x4];
+       u8         reserved_at_4[0x4];
        u8         local_port[0x8];
        u8         mac_47_32[0x10];
 
        u8         mac_31_0[0x20];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_pmtu_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         max_mtu[0x10];
-       u8         reserved_2[0x10];
+       u8         reserved_at_30[0x10];
 
        u8         admin_mtu[0x10];
-       u8         reserved_3[0x10];
+       u8         reserved_at_50[0x10];
 
        u8         oper_mtu[0x10];
-       u8         reserved_4[0x10];
+       u8         reserved_at_70[0x10];
 };
 
 struct mlx5_ifc_pmpr_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         module[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0x18];
+       u8         reserved_at_20[0x18];
        u8         attenuation_5g[0x8];
 
-       u8         reserved_3[0x18];
+       u8         reserved_at_40[0x18];
        u8         attenuation_7g[0x8];
 
-       u8         reserved_4[0x18];
+       u8         reserved_at_60[0x18];
        u8         attenuation_12g[0x8];
 };
 
 struct mlx5_ifc_pmpe_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         module[0x8];
-       u8         reserved_1[0xc];
+       u8         reserved_at_10[0xc];
        u8         module_status[0x4];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_pmpc_reg_bits {
@@ -6479,20 +6481,20 @@ struct mlx5_ifc_pmpc_reg_bits {
 };
 
 struct mlx5_ifc_pmlpn_reg_bits {
-       u8         reserved_0[0x4];
+       u8         reserved_at_0[0x4];
        u8         mlpn_status[0x4];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         e[0x1];
-       u8         reserved_2[0x1f];
+       u8         reserved_at_21[0x1f];
 };
 
 struct mlx5_ifc_pmlp_reg_bits {
        u8         rxtx[0x1];
-       u8         reserved_0[0x7];
+       u8         reserved_at_1[0x7];
        u8         local_port[0x8];
-       u8         reserved_1[0x8];
+       u8         reserved_at_10[0x8];
        u8         width[0x8];
 
        u8         lane0_module_mapping[0x20];
@@ -6503,36 +6505,36 @@ struct mlx5_ifc_pmlp_reg_bits {
 
        u8         lane3_module_mapping[0x20];
 
-       u8         reserved_2[0x160];
+       u8         reserved_at_a0[0x160];
 };
 
 struct mlx5_ifc_pmaos_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         module[0x8];
-       u8         reserved_1[0x4];
+       u8         reserved_at_10[0x4];
        u8         admin_status[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_18[0x4];
        u8         oper_status[0x4];
 
        u8         ase[0x1];
        u8         ee[0x1];
-       u8         reserved_3[0x1c];
+       u8         reserved_at_22[0x1c];
        u8         e[0x2];
 
-       u8         reserved_4[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_plpc_reg_bits {
-       u8         reserved_0[0x4];
+       u8         reserved_at_0[0x4];
        u8         profile_id[0xc];
-       u8         reserved_1[0x4];
+       u8         reserved_at_10[0x4];
        u8         proto_mask[0x4];
-       u8         reserved_2[0x8];
+       u8         reserved_at_18[0x8];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_20[0x10];
        u8         lane_speed[0x10];
 
-       u8         reserved_4[0x17];
+       u8         reserved_at_40[0x17];
        u8         lpbf[0x1];
        u8         fec_mode_policy[0x8];
 
@@ -6545,44 +6547,44 @@ struct mlx5_ifc_plpc_reg_bits {
        u8         retransmission_request_admin[0x8];
        u8         fec_mode_request_admin[0x18];
 
-       u8         reserved_5[0x80];
+       u8         reserved_at_c0[0x80];
 };
 
 struct mlx5_ifc_plib_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x8];
+       u8         reserved_at_10[0x8];
        u8         ib_port[0x8];
 
-       u8         reserved_2[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_plbf_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0xd];
+       u8         reserved_at_10[0xd];
        u8         lbf_mode[0x3];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_20[0x20];
 };
 
 struct mlx5_ifc_pipg_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         dic[0x1];
-       u8         reserved_2[0x19];
+       u8         reserved_at_21[0x19];
        u8         ipg[0x4];
-       u8         reserved_3[0x2];
+       u8         reserved_at_3e[0x2];
 };
 
 struct mlx5_ifc_pifr_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0xe0];
+       u8         reserved_at_20[0xe0];
 
        u8         port_filter[8][0x20];
 
@@ -6590,36 +6592,36 @@ struct mlx5_ifc_pifr_reg_bits {
 };
 
 struct mlx5_ifc_pfcc_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         ppan[0x4];
-       u8         reserved_2[0x4];
+       u8         reserved_at_24[0x4];
        u8         prio_mask_tx[0x8];
-       u8         reserved_3[0x8];
+       u8         reserved_at_30[0x8];
        u8         prio_mask_rx[0x8];
 
        u8         pptx[0x1];
        u8         aptx[0x1];
-       u8         reserved_4[0x6];
+       u8         reserved_at_42[0x6];
        u8         pfctx[0x8];
-       u8         reserved_5[0x10];
+       u8         reserved_at_50[0x10];
 
        u8         pprx[0x1];
        u8         aprx[0x1];
-       u8         reserved_6[0x6];
+       u8         reserved_at_62[0x6];
        u8         pfcrx[0x8];
-       u8         reserved_7[0x10];
+       u8         reserved_at_70[0x10];
 
-       u8         reserved_8[0x80];
+       u8         reserved_at_80[0x80];
 };
 
 struct mlx5_ifc_pelc_reg_bits {
        u8         op[0x4];
-       u8         reserved_0[0x4];
+       u8         reserved_at_4[0x4];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         op_admin[0x8];
        u8         op_capability[0x8];
@@ -6634,28 +6636,28 @@ struct mlx5_ifc_pelc_reg_bits {
 
        u8         active[0x40];
 
-       u8         reserved_2[0x80];
+       u8         reserved_at_140[0x80];
 };
 
 struct mlx5_ifc_peir_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_2[0xc];
+       u8         reserved_at_20[0xc];
        u8         error_count[0x4];
-       u8         reserved_3[0x10];
+       u8         reserved_at_30[0x10];
 
-       u8         reserved_4[0xc];
+       u8         reserved_at_40[0xc];
        u8         lane[0x4];
-       u8         reserved_5[0x8];
+       u8         reserved_at_50[0x8];
        u8         error_type[0x8];
 };
 
 struct mlx5_ifc_pcap_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_1[0x10];
+       u8         reserved_at_10[0x10];
 
        u8         port_capability_mask[4][0x20];
 };
@@ -6663,46 +6665,46 @@ struct mlx5_ifc_pcap_reg_bits {
 struct mlx5_ifc_paos_reg_bits {
        u8         swid[0x8];
        u8         local_port[0x8];
-       u8         reserved_0[0x4];
+       u8         reserved_at_10[0x4];
        u8         admin_status[0x4];
-       u8         reserved_1[0x4];
+       u8         reserved_at_18[0x4];
        u8         oper_status[0x4];
 
        u8         ase[0x1];
        u8         ee[0x1];
-       u8         reserved_2[0x1c];
+       u8         reserved_at_22[0x1c];
        u8         e[0x2];
 
-       u8         reserved_3[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_pamp_reg_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         opamp_group[0x8];
-       u8         reserved_1[0xc];
+       u8         reserved_at_10[0xc];
        u8         opamp_group_type[0x4];
 
        u8         start_index[0x10];
-       u8         reserved_2[0x4];
+       u8         reserved_at_30[0x4];
        u8         num_of_indices[0xc];
 
        u8         index_data[18][0x10];
 };
 
 struct mlx5_ifc_lane_2_module_mapping_bits {
-       u8         reserved_0[0x6];
+       u8         reserved_at_0[0x6];
        u8         rx_lane[0x2];
-       u8         reserved_1[0x6];
+       u8         reserved_at_8[0x6];
        u8         tx_lane[0x2];
-       u8         reserved_2[0x8];
+       u8         reserved_at_10[0x8];
        u8         module[0x8];
 };
 
 struct mlx5_ifc_bufferx_reg_bits {
-       u8         reserved_0[0x6];
+       u8         reserved_at_0[0x6];
        u8         lossy[0x1];
        u8         epsb[0x1];
-       u8         reserved_1[0xc];
+       u8         reserved_at_8[0xc];
        u8         size[0xc];
 
        u8         xoff_threshold[0x10];
@@ -6714,21 +6716,21 @@ struct mlx5_ifc_set_node_in_bits {
 };
 
 struct mlx5_ifc_register_power_settings_bits {
-       u8         reserved_0[0x18];
+       u8         reserved_at_0[0x18];
        u8         power_settings_level[0x8];
 
-       u8         reserved_1[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_register_host_endianness_bits {
        u8         he[0x1];
-       u8         reserved_0[0x1f];
+       u8         reserved_at_1[0x1f];
 
-       u8         reserved_1[0x60];
+       u8         reserved_at_20[0x60];
 };
 
 struct mlx5_ifc_umr_pointer_desc_argument_bits {
-       u8         reserved_0[0x20];
+       u8         reserved_at_0[0x20];
 
        u8         mkey[0x20];
 
@@ -6741,7 +6743,7 @@ struct mlx5_ifc_ud_adrs_vector_bits {
        u8         dc_key[0x40];
 
        u8         ext[0x1];
-       u8         reserved_0[0x7];
+       u8         reserved_at_41[0x7];
        u8         destination_qp_dct[0x18];
 
        u8         static_rate[0x4];
@@ -6750,7 +6752,7 @@ struct mlx5_ifc_ud_adrs_vector_bits {
        u8         mlid[0x7];
        u8         rlid_udp_sport[0x10];
 
-       u8         reserved_1[0x20];
+       u8         reserved_at_80[0x20];
 
        u8         rmac_47_16[0x20];
 
@@ -6758,9 +6760,9 @@ struct mlx5_ifc_ud_adrs_vector_bits {
        u8         tclass[0x8];
        u8         hop_limit[0x8];
 
-       u8         reserved_2[0x1];
+       u8         reserved_at_e0[0x1];
        u8         grh[0x1];
-       u8         reserved_3[0x2];
+       u8         reserved_at_e2[0x2];
        u8         src_addr_index[0x8];
        u8         flow_label[0x14];
 
@@ -6768,27 +6770,27 @@ struct mlx5_ifc_ud_adrs_vector_bits {
 };
 
 struct mlx5_ifc_pages_req_event_bits {
-       u8         reserved_0[0x10];
+       u8         reserved_at_0[0x10];
        u8         function_id[0x10];
 
        u8         num_pages[0x20];
 
-       u8         reserved_1[0xa0];
+       u8         reserved_at_40[0xa0];
 };
 
 struct mlx5_ifc_eqe_bits {
-       u8         reserved_0[0x8];
+       u8         reserved_at_0[0x8];
        u8         event_type[0x8];
-       u8         reserved_1[0x8];
+       u8         reserved_at_10[0x8];
        u8         event_sub_type[0x8];
 
-       u8         reserved_2[0xe0];
+       u8         reserved_at_20[0xe0];
 
        union mlx5_ifc_event_auto_bits event_data;
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_1e0[0x10];
        u8         signature[0x8];
-       u8         reserved_4[0x7];
+       u8         reserved_at_1f8[0x7];
        u8         owner[0x1];
 };
 
@@ -6798,14 +6800,14 @@ enum {
 
 struct mlx5_ifc_cmd_queue_entry_bits {
        u8         type[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         input_length[0x20];
 
        u8         input_mailbox_pointer_63_32[0x20];
 
        u8         input_mailbox_pointer_31_9[0x17];
-       u8         reserved_1[0x9];
+       u8         reserved_at_77[0x9];
 
        u8         command_input_inline_data[16][0x8];
 
@@ -6814,20 +6816,20 @@ struct mlx5_ifc_cmd_queue_entry_bits {
        u8         output_mailbox_pointer_63_32[0x20];
 
        u8         output_mailbox_pointer_31_9[0x17];
-       u8         reserved_2[0x9];
+       u8         reserved_at_1b7[0x9];
 
        u8         output_length[0x20];
 
        u8         token[0x8];
        u8         signature[0x8];
-       u8         reserved_3[0x8];
+       u8         reserved_at_1f0[0x8];
        u8         status[0x7];
        u8         ownership[0x1];
 };
 
 struct mlx5_ifc_cmd_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
@@ -6836,9 +6838,9 @@ struct mlx5_ifc_cmd_out_bits {
 
 struct mlx5_ifc_cmd_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
        u8         command[0][0x20];
@@ -6847,16 +6849,16 @@ struct mlx5_ifc_cmd_in_bits {
 struct mlx5_ifc_cmd_if_box_bits {
        u8         mailbox_data[512][0x8];
 
-       u8         reserved_0[0x180];
+       u8         reserved_at_1000[0x180];
 
        u8         next_pointer_63_32[0x20];
 
        u8         next_pointer_31_10[0x16];
-       u8         reserved_1[0xa];
+       u8         reserved_at_11b6[0xa];
 
        u8         block_number[0x20];
 
-       u8         reserved_2[0x8];
+       u8         reserved_at_11e0[0x8];
        u8         token[0x8];
        u8         ctrl_signature[0x8];
        u8         signature[0x8];
@@ -6866,7 +6868,7 @@ struct mlx5_ifc_mtt_bits {
        u8         ptag_63_32[0x20];
 
        u8         ptag_31_8[0x18];
-       u8         reserved_0[0x6];
+       u8         reserved_at_38[0x6];
        u8         wr_en[0x1];
        u8         rd_en[0x1];
 };
@@ -6904,38 +6906,38 @@ struct mlx5_ifc_initial_seg_bits {
        u8         cmd_interface_rev[0x10];
        u8         fw_rev_subminor[0x10];
 
-       u8         reserved_0[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         cmdq_phy_addr_63_32[0x20];
 
        u8         cmdq_phy_addr_31_12[0x14];
-       u8         reserved_1[0x2];
+       u8         reserved_at_b4[0x2];
        u8         nic_interface[0x2];
        u8         log_cmdq_size[0x4];
        u8         log_cmdq_stride[0x4];
 
        u8         command_doorbell_vector[0x20];
 
-       u8         reserved_2[0xf00];
+       u8         reserved_at_e0[0xf00];
 
        u8         initializing[0x1];
-       u8         reserved_3[0x4];
+       u8         reserved_at_fe1[0x4];
        u8         nic_interface_supported[0x3];
-       u8         reserved_4[0x18];
+       u8         reserved_at_fe8[0x18];
 
        struct mlx5_ifc_health_buffer_bits health_buffer;
 
        u8         no_dram_nic_offset[0x20];
 
-       u8         reserved_5[0x6e40];
+       u8         reserved_at_1220[0x6e40];
 
-       u8         reserved_6[0x1f];
+       u8         reserved_at_8060[0x1f];
        u8         clear_int[0x1];
 
        u8         health_syndrome[0x8];
        u8         health_counter[0x18];
 
-       u8         reserved_7[0x17fc0];
+       u8         reserved_at_80a0[0x17fc0];
 };
 
 union mlx5_ifc_ports_control_registers_document_bits {
@@ -6980,44 +6982,44 @@ union mlx5_ifc_ports_control_registers_document_bits {
        struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
        struct mlx5_ifc_slrg_reg_bits slrg_reg;
        struct mlx5_ifc_sltp_reg_bits sltp_reg;
-       u8         reserved_0[0x60e0];
+       u8         reserved_at_0[0x60e0];
 };
 
 union mlx5_ifc_debug_enhancements_document_bits {
        struct mlx5_ifc_health_buffer_bits health_buffer;
-       u8         reserved_0[0x200];
+       u8         reserved_at_0[0x200];
 };
 
 union mlx5_ifc_uplink_pci_interface_document_bits {
        struct mlx5_ifc_initial_seg_bits initial_seg;
-       u8         reserved_0[0x20060];
+       u8         reserved_at_0[0x20060];
 };
 
 struct mlx5_ifc_set_flow_table_root_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_set_flow_table_root_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x40];
+       u8         reserved_at_40[0x40];
 
        u8         table_type[0x8];
-       u8         reserved_3[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_4[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_5[0x140];
+       u8         reserved_at_c0[0x140];
 };
 
 enum {
@@ -7026,39 +7028,39 @@ enum {
 
 struct mlx5_ifc_modify_flow_table_out_bits {
        u8         status[0x8];
-       u8         reserved_0[0x18];
+       u8         reserved_at_8[0x18];
 
        u8         syndrome[0x20];
 
-       u8         reserved_1[0x40];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_flow_table_in_bits {
        u8         opcode[0x10];
-       u8         reserved_0[0x10];
+       u8         reserved_at_10[0x10];
 
-       u8         reserved_1[0x10];
+       u8         reserved_at_20[0x10];
        u8         op_mod[0x10];
 
-       u8         reserved_2[0x20];
+       u8         reserved_at_40[0x20];
 
-       u8         reserved_3[0x10];
+       u8         reserved_at_60[0x10];
        u8         modify_field_select[0x10];
 
        u8         table_type[0x8];
-       u8         reserved_4[0x18];
+       u8         reserved_at_88[0x18];
 
-       u8         reserved_5[0x8];
+       u8         reserved_at_a0[0x8];
        u8         table_id[0x18];
 
-       u8         reserved_6[0x4];
+       u8         reserved_at_c0[0x4];
        u8         table_miss_mode[0x4];
-       u8         reserved_7[0x18];
+       u8         reserved_at_c8[0x18];
 
-       u8         reserved_8[0x8];
+       u8         reserved_at_e0[0x8];
        u8         table_miss_id[0x18];
 
-       u8         reserved_9[0x100];
+       u8         reserved_at_100[0x100];
 };
 
 #endif /* MLX5_IFC_H */
index f1cd22f..3579d1e 100644 (file)
@@ -201,11 +201,13 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 
 #ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK       VM_GROWSUP
 #else
-#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+#define VM_STACK       VM_GROWSDOWN
 #endif
 
+#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+
 /*
  * Special vmas that are non-mergable, non-mlock()able.
  * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
@@ -385,7 +387,8 @@ enum {
        REGION_MIXED,
 };
 
-int region_intersects(resource_size_t offset, size_t size, const char *type);
+int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
+                     unsigned long desc);
 
 /* Support for virtually mapped pages */
 struct page *vmalloc_to_page(const void *addr);
@@ -1341,8 +1344,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
                !vma_growsup(vma->vm_next, addr);
 }
 
-extern struct task_struct *task_of_stack(struct task_struct *task,
-                               struct vm_area_struct *vma, bool in_group);
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
@@ -2137,6 +2139,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn, pgprot_t pgprot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
index d3ebb9d..944b2b3 100644 (file)
@@ -424,9 +424,9 @@ struct mm_struct {
        unsigned long total_vm;         /* Total pages mapped */
        unsigned long locked_vm;        /* Pages that have PG_mlocked set */
        unsigned long pinned_vm;        /* Refcount permanently increased */
-       unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED/GROWSDOWN */
-       unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE */
-       unsigned long stack_vm;         /* VM_GROWSUP/DOWN */
+       unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
+       unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
+       unsigned long stack_vm;         /* VM_STACK */
        unsigned long def_flags;
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long start_brk, brk, start_stack;
@@ -566,10 +566,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm)
 }
 #endif
 
-struct vm_special_mapping
-{
-       const char *name;
+struct vm_fault;
+
+struct vm_special_mapping {
+       const char *name;       /* The name, e.g. "[vdso]". */
+
+       /*
+        * If .fault is not provided, this points to a
+        * NULL-terminated array of pages that back the special mapping.
+        *
+        * This must not be NULL unless .fault is provided.
+        */
        struct page **pages;
+
+       /*
+        * If non-NULL, then this is called to resolve page faults
+        * on the special mapping.  If used, .pages is not checked.
+        */
+       int (*fault)(const struct vm_special_mapping *sm,
+                    struct vm_area_struct *vma,
+                    struct vm_fault *vmf);
 };
 
 enum tlb_flush_reason {
index 33bb1b1..7b6c2cf 100644 (file)
@@ -682,6 +682,12 @@ typedef struct pglist_data {
         */
        unsigned long first_deferred_pfn;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       spinlock_t split_queue_lock;
+       struct list_head split_queue;
+       unsigned long split_queue_len;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
index 4560d8f..2bb0c30 100644 (file)
@@ -324,6 +324,12 @@ struct module_layout {
 #define __module_layout_align
 #endif
 
+struct mod_kallsyms {
+       Elf_Sym *symtab;
+       unsigned int num_symtab;
+       char *strtab;
+};
+
 struct module {
        enum module_state state;
 
@@ -405,15 +411,10 @@ struct module {
 #endif
 
 #ifdef CONFIG_KALLSYMS
-       /*
-        * We keep the symbol and string tables for kallsyms.
-        * The core_* fields below are temporary, loader-only (they
-        * could really be discarded after module init).
-        */
-       Elf_Sym *symtab, *core_symtab;
-       unsigned int num_symtab, core_num_syms;
-       char *strtab, *core_strtab;
-
+       /* Protected by RCU and/or module_mutex: use rcu_dereference() */
+       struct mod_kallsyms *kallsyms;
+       struct mod_kallsyms core_kallsyms;
+       
        /* Section attributes */
        struct module_sect_attrs *sect_attrs;
 
index 5ac140d..5440b7b 100644 (file)
@@ -512,7 +512,6 @@ static inline void napi_enable(struct napi_struct *n)
        clear_bit(NAPI_STATE_NPSVC, &n->state);
 }
 
-#ifdef CONFIG_SMP
 /**
  *     napi_synchronize - wait until NAPI is not running
  *     @n: napi context
@@ -523,12 +522,12 @@ static inline void napi_enable(struct napi_struct *n)
  */
 static inline void napi_synchronize(const struct napi_struct *n)
 {
-       while (test_bit(NAPI_STATE_SCHED, &n->state))
-               msleep(1);
+       if (IS_ENABLED(CONFIG_SMP))
+               while (test_bit(NAPI_STATE_SCHED, &n->state))
+                       msleep(1);
+       else
+               barrier();
 }
-#else
-# define napi_synchronize(n)   barrier()
-#endif
 
 enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
@@ -3719,7 +3718,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);
 #define netdev_for_each_lower_dev(dev, ldev, iter) \
-       for (iter = &(dev)->adj_list.lower, \
+       for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))
index 48e0320..67300f8 100644 (file)
@@ -550,9 +550,7 @@ extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 
 static inline loff_t nfs_size_to_loff_t(__u64 size)
 {
-       if (size > (__u64) OFFSET_MAX - 1)
-               return OFFSET_MAX - 1;
-       return (loff_t) size;
+       return min_t(u64, size, OFFSET_MAX);
 }
 
 static inline ino_t
index 791098a..d320906 100644 (file)
@@ -275,6 +275,7 @@ struct nfs4_layoutcommit_args {
        size_t layoutupdate_len;
        struct page *layoutupdate_page;
        struct page **layoutupdate_pages;
+       __be32 *start_p;
 };
 
 struct nfs4_layoutcommit_res {
index d14a4c3..4149868 100644 (file)
@@ -47,6 +47,8 @@
  * runtime initialization.
  */
 
+struct notifier_block;
+
 typedef        int (*notifier_fn_t)(struct notifier_block *nb,
                        unsigned long action, void *data);
 
index dd10626..dc6e396 100644 (file)
@@ -929,7 +929,7 @@ static inline int of_get_available_child_count(const struct device_node *np)
        return num;
 }
 
-#ifdef CONFIG_OF
+#if defined(CONFIG_OF) && !defined(MODULE)
 #define _OF_DECLARE(table, name, compat, fn, fn_type)                  \
        static const struct of_device_id __of_table_##name              \
                __used __section(__##table##_of_table)                  \
index 27df4a6..2771625 100644 (file)
@@ -988,23 +988,6 @@ static inline int pci_is_managed(struct pci_dev *pdev)
        return pdev->is_managed;
 }
 
-static inline void pci_set_managed_irq(struct pci_dev *pdev, unsigned int irq)
-{
-       pdev->irq = irq;
-       pdev->irq_managed = 1;
-}
-
-static inline void pci_reset_managed_irq(struct pci_dev *pdev)
-{
-       pdev->irq = 0;
-       pdev->irq_managed = 0;
-}
-
-static inline bool pci_has_managed_irq(struct pci_dev *pdev)
-{
-       return pdev->irq_managed && pdev->irq > 0;
-}
-
 void pci_disable_device(struct pci_dev *dev);
 
 extern unsigned int pcibios_max_latency;
index f9828a4..79ec7bb 100644 (file)
@@ -397,6 +397,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+       PERF_EVENT_STATE_DEAD           = -4,
        PERF_EVENT_STATE_EXIT           = -3,
        PERF_EVENT_STATE_ERROR          = -2,
        PERF_EVENT_STATE_OFF            = -1,
@@ -467,6 +468,7 @@ struct perf_event {
        int                             group_flags;
        struct perf_event               *group_leader;
        struct pmu                      *pmu;
+       void                            *pmu_private;
 
        enum perf_event_active_state    state;
        unsigned int                    attach_state;
@@ -634,9 +636,6 @@ struct perf_event_context {
        int                             nr_cgroups;      /* cgroup evts */
        void                            *task_ctx_data; /* pmu specific data */
        struct rcu_head                 rcu_head;
-
-       struct delayed_work             orphans_remove;
-       bool                            orphans_remove_sched;
 };
 
 /*
@@ -729,7 +728,7 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
-extern struct perf_event *perf_event_get(unsigned int fd);
+extern struct file *perf_event_get(unsigned int fd);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -908,7 +907,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
        }
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
@@ -927,7 +926,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
 static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
 {
-       if (static_key_false(&perf_sched_events.key))
+       if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);
 
        if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -944,7 +943,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
        perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-       if (static_key_false(&perf_sched_events.key))
+       if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
 }
 
@@ -1044,7 +1043,7 @@ extern void perf_swevent_put_recursion_context(int rctx);
 extern u64 perf_swevent_set_period(struct perf_event *event);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
-extern int __perf_event_disable(void *info);
+extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
@@ -1070,7 +1069,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)     { }
 static inline void perf_event_free_task(struct task_struct *task)      { }
 static inline void perf_event_delayed_put(struct task_struct *task)    { }
-static inline struct perf_event *perf_event_get(unsigned int fd)       { return ERR_PTR(-EINVAL); }
+static inline struct file *perf_event_get(unsigned int fd)     { return ERR_PTR(-EINVAL); }
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
        return ERR_PTR(-EINVAL);
@@ -1111,12 +1110,6 @@ static inline void perf_event_task_tick(void)                            { }
 static inline int perf_event_release_kernel(struct perf_event *event)  { return 0; }
 #endif
 
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
-extern bool perf_event_can_stop_tick(void);
-#else
-static inline bool perf_event_can_stop_tick(void)                      { return true; }
-#endif
-
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 extern void perf_restore_debug_store(void);
 #else
index 2d8e497..1132953 100644 (file)
@@ -10,7 +10,7 @@
  * backing is indicated by flags in the high bits of the value.
  */
 typedef struct {
-       unsigned long val;
+       u64 val;
 } pfn_t;
 #endif
 
index 0703b53..9499481 100644 (file)
@@ -9,14 +9,13 @@
  * PFN_DEV - pfn is not covered by system memmap by default
  * PFN_MAP - pfn has a dynamic page mapping established by a device driver
  */
-#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \
-               << (BITS_PER_LONG - PAGE_SHIFT))
-#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1))
-#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2))
-#define PFN_DEV (1UL << (BITS_PER_LONG - 3))
-#define PFN_MAP (1UL << (BITS_PER_LONG - 4))
-
-static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags)
+#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
+#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
+#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
+#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
+#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
+
+static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
 {
        pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
 
@@ -29,7 +28,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
        return __pfn_to_pfn_t(pfn, 0);
 }
 
-extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags);
+extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags);
 
 static inline bool pfn_t_has_page(pfn_t pfn)
 {
@@ -48,7 +47,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn)
        return NULL;
 }
 
-static inline dma_addr_t pfn_t_to_phys(pfn_t pfn)
+static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
 {
        return PFN_PHYS(pfn_t_to_pfn(pfn));
 }
@@ -87,7 +86,7 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
 #ifdef __HAVE_ARCH_PTE_DEVMAP
 static inline bool pfn_t_devmap(pfn_t pfn)
 {
-       const unsigned long flags = PFN_DEV|PFN_MAP;
+       const u64 flags = PFN_DEV|PFN_MAP;
 
        return (pfn.val & flags) == flags;
 }
index 907f3fd..62d44c1 100644 (file)
@@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer);
 void run_posix_cpu_timers(struct task_struct *task);
 void posix_cpu_timers_exit(struct task_struct *task);
 void posix_cpu_timers_exit_group(struct task_struct *task);
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk);
-
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
                           cputime_t *newval, cputime_t *oldval);
 
index 998d8f1..b50c049 100644 (file)
@@ -49,6 +49,7 @@ struct bq27xxx_reg_cache {
 
 struct bq27xxx_device_info {
        struct device *dev;
+       int id;
        enum bq27xxx_chip chip;
        const char *name;
        struct bq27xxx_access_methods bus;
index 54bf148..35ac903 100644 (file)
@@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
        kt->nsec = ts.tv_nsec;
 }
 
-#ifdef CONFIG_NTP_PPS
-
 static inline void pps_get_ts(struct pps_event_time *ts)
 {
-       ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real);
-}
+       struct system_time_snapshot snap;
 
-#else /* CONFIG_NTP_PPS */
-
-static inline void pps_get_ts(struct pps_event_time *ts)
-{
-       ktime_get_real_ts64(&ts->ts_real);
+       ktime_get_snapshot(&snap);
+       ts->ts_real = ktime_to_timespec64(snap.real);
+#ifdef CONFIG_NTP_PPS
+       ts->ts_raw = ktime_to_timespec64(snap.raw);
+#endif
 }
 
-#endif /* CONFIG_NTP_PPS */
-
 /* Subtract known time delay from PPS event time(s) */
 static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta)
 {
index b8b7306..6b15e16 100644 (file)
@@ -38,6 +38,7 @@ struct ptp_clock_request {
        };
 };
 
+struct system_device_crosststamp;
 /**
  * struct ptp_clock_info - decribes a PTP hardware clock
  *
@@ -67,6 +68,11 @@ struct ptp_clock_request {
  * @gettime64:  Reads the current time from the hardware clock.
  *              parameter ts: Holds the result.
  *
+ * @getcrosststamp:  Reads the current time from the hardware clock and
+ *                   system clock simultaneously.
+ *                   parameter cts: Contains timestamp (device,system) pair,
+ *                   where system time is realtime and monotonic.
+ *
  * @settime64:  Set the current time on the hardware clock.
  *              parameter ts: Time value to set.
  *
@@ -105,6 +111,8 @@ struct ptp_clock_info {
        int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
        int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
        int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
+       int (*getcrosststamp)(struct ptp_clock_info *ptp,
+                             struct system_device_crosststamp *cts);
        int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts);
        int (*enable)(struct ptp_clock_info *ptp,
                      struct ptp_clock_request *request, int on);
index 7c88ad1..f54be70 100644 (file)
@@ -378,13 +378,29 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
 void **radix_tree_next_chunk(struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags);
 
+/**
+ * radix_tree_iter_retry - retry this chunk of the iteration
+ * @iter:      iterator state
+ *
+ * If we iterate over a tree protected only by the RCU lock, a race
+ * against deletion or creation may result in seeing a slot for which
+ * radix_tree_deref_retry() returns true.  If so, call this function
+ * and continue the iteration.
+ */
+static inline __must_check
+void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+{
+       iter->next_index = iter->index;
+       return NULL;
+}
+
 /**
  * radix_tree_chunk_size - get current chunk size
  *
  * @iter:      pointer to radix tree iterator
  * Returns:    current chunk size
  */
-static __always_inline unsigned
+static __always_inline long
 radix_tree_chunk_size(struct radix_tree_iter *iter)
 {
        return iter->next_index - iter->index;
@@ -418,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
                        return slot + offset + 1;
                }
        } else {
-               unsigned size = radix_tree_chunk_size(iter) - 1;
+               long size = radix_tree_chunk_size(iter);
 
-               while (size--) {
+               while (--size > 0) {
                        slot++;
                        iter->index++;
                        if (likely(*slot))
index a7a06d1..a0118d5 100644 (file)
@@ -152,6 +152,8 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 
 # define jiffies       raid6_jiffies()
 # define printk        printf
+# define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__)
+# define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__)
 # define GFP_KERNEL    0
 # define __get_free_pages(x, y)        ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \
                                                     PROT_READ|PROT_WRITE,   \
index a75840c..9c29122 100644 (file)
@@ -34,6 +34,7 @@ extern const struct file_operations random_fops, urandom_fops;
 #endif
 
 unsigned int get_random_int(void);
+unsigned long get_random_long(void);
 unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
 
 u32 prandom_u32(void);
index 14e6f47..2657aff 100644 (file)
@@ -332,9 +332,7 @@ void rcu_init(void);
 void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
-struct notifier_block;
-int rcu_cpu_notify(struct notifier_block *self,
-                  unsigned long action, void *hcpu);
+void rcu_report_dead(unsigned int cpu);
 
 #ifndef CONFIG_TINY_RCU
 void rcu_end_inkernel_boot(void);
@@ -360,8 +358,6 @@ void rcu_user_exit(void);
 #else
 static inline void rcu_user_enter(void) { }
 static inline void rcu_user_exit(void) { }
-static inline void rcu_user_hooks_switch(struct task_struct *prev,
-                                        struct task_struct *next) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
 #ifdef CONFIG_RCU_NOCB_CPU
index bdf597c..a07f42b 100644 (file)
@@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma)
                __put_anon_vma(anon_vma);
 }
 
-static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
-{
-       struct anon_vma *anon_vma = vma->anon_vma;
-       if (anon_vma)
-               down_write(&anon_vma->root->rwsem);
-}
-
-static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
-{
-       struct anon_vma *anon_vma = vma->anon_vma;
-       if (anon_vma)
-               up_write(&anon_vma->root->rwsem);
-}
-
 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
        down_write(&anon_vma->root->rwsem);
index a10494a..c617ea1 100644 (file)
@@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active);
 static inline void update_cpu_load_nohz(int active) { }
 #endif
 
-extern unsigned long get_parent_ip(unsigned long addr);
-
 extern void dump_cpu_task(int cpu);
 
 struct seq_file;
@@ -719,6 +717,10 @@ struct signal_struct {
        /* Earliest-expiration cache. */
        struct task_cputime cputime_expires;
 
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
+
        struct list_head cpu_timers[3];
 
        struct pid *tty_old_pgrp;
@@ -920,6 +922,10 @@ static inline int sched_info_on(void)
 #endif
 }
 
+#ifdef CONFIG_SCHEDSTATS
+void force_schedstat_enabled(void);
+#endif
+
 enum cpu_idle_type {
        CPU_IDLE,
        CPU_NOT_IDLE,
@@ -1289,6 +1295,8 @@ struct sched_rt_entity {
        unsigned long timeout;
        unsigned long watchdog_stamp;
        unsigned int time_slice;
+       unsigned short on_rq;
+       unsigned short on_list;
 
        struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -1329,10 +1337,6 @@ struct sched_dl_entity {
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
-        * @dl_new tells if a new instance arrived. If so we must
-        * start executing it with full runtime and reset its absolute
-        * deadline;
-        *
         * @dl_boosted tells if we are boosted due to DI. If so we are
         * outside bandwidth enforcement mechanism (but only until we
         * exit the critical section);
@@ -1340,7 +1344,7 @@ struct sched_dl_entity {
         * @dl_yielded tells if task gave up the cpu before consuming
         * all its available runtime during the last job.
         */
-       int dl_throttled, dl_new, dl_boosted, dl_yielded;
+       int dl_throttled, dl_boosted, dl_yielded;
 
        /*
         * Bandwidth enforcement timer. Each -deadline task has its
@@ -1542,6 +1546,10 @@ struct task_struct {
                VTIME_SYS,
        } vtime_snap_whence;
 #endif
+
+#ifdef CONFIG_NO_HZ_FULL
+       unsigned long tick_dep_mask;
+#endif
        unsigned long nvcsw, nivcsw; /* context switch counts */
        u64 start_time;         /* monotonic time in nsec */
        u64 real_start_time;    /* boot based time in nsec */
@@ -2356,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
-extern bool sched_can_stop_tick(void);
 extern u64 scheduler_tick_max_deferment(void);
-#else
-static inline bool sched_can_stop_tick(void) { return false; }
 #endif
 
 #ifdef CONFIG_SCHED_AUTOGROUP
index c9e4731..4f080ab 100644 (file)
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos);
 
+extern int sysctl_schedstats(struct ctl_table *table, int write,
+                                void __user *buffer, size_t *lenp,
+                                loff_t *ppos);
+
 #endif /* _SCHED_SYSCTL_H */
index 11f935c..d3fcd45 100644 (file)
@@ -299,6 +299,7 @@ struct sk_buff;
 #else
 #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
 #endif
+extern int sysctl_max_skb_frags;
 
 typedef struct skb_frag_struct skb_frag_t;
 
@@ -1984,6 +1985,30 @@ static inline void skb_reserve(struct sk_buff *skb, int len)
        skb->tail += len;
 }
 
+/**
+ *     skb_tailroom_reserve - adjust reserved_tailroom
+ *     @skb: buffer to alter
+ *     @mtu: maximum amount of headlen permitted
+ *     @needed_tailroom: minimum amount of reserved_tailroom
+ *
+ *     Set reserved_tailroom so that headlen can be as large as possible but
+ *     not larger than mtu and tailroom cannot be smaller than
+ *     needed_tailroom.
+ *     The required headroom should already have been reserved before using
+ *     this function.
+ */
+static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
+                                       unsigned int needed_tailroom)
+{
+       SKB_LINEAR_ASSERT(skb);
+       if (mtu < skb_tailroom(skb) - needed_tailroom)
+               /* use at most mtu */
+               skb->reserved_tailroom = skb_tailroom(skb) - mtu;
+       else
+               /* use up to all available space */
+               skb->reserved_tailroom = needed_tailroom;
+}
+
 #define ENCAP_TYPE_ETHER       0
 #define ENCAP_TYPE_IPPROTO     1
 
index 343c13a..35cb926 100644 (file)
@@ -44,6 +44,7 @@
 
 #define KNAV_DMA_NUM_EPIB_WORDS                        4
 #define KNAV_DMA_NUM_PS_WORDS                  16
+#define KNAV_DMA_NUM_SW_DATA_WORDS             4
 #define KNAV_DMA_FDQ_PER_CHAN                  4
 
 /* Tx channel scheduling priority */
@@ -142,6 +143,7 @@ struct knav_dma_cfg {
  * @orig_buff:                 buff pointer since 'buff' can be overwritten
  * @epib:                      Extended packet info block
  * @psdata:                    Protocol specific
+ * @sw_data:                   Software private data not touched by h/w
  */
 struct knav_dma_desc {
        __le32  desc_info;
@@ -154,7 +156,7 @@ struct knav_dma_desc {
        __le32  orig_buff;
        __le32  epib[KNAV_DMA_NUM_EPIB_WORDS];
        __le32  psdata[KNAV_DMA_NUM_PS_WORDS];
-       __le32  pad[4];
+       u32     sw_data[KNAV_DMA_NUM_SW_DATA_WORDS];
 } ____cacheline_aligned;
 
 #if IS_ENABLED(CONFIG_KEYSTONE_NAVIGATOR_DMA)
index f5f80c5..dc8eb63 100644 (file)
@@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work);
        }
 
 /*
- * define and init a srcu struct at build time.
- * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ * Define and initialize a srcu struct at build time.
+ * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ *
+ * Note that although DEFINE_STATIC_SRCU() hides the name from other
+ * files, the per-CPU variable rules nevertheless require that the
+ * chosen name be globally unique.  These rules also prohibit use of
+ * DEFINE_STATIC_SRCU() within a function.  If these rules are too
+ * restrictive, declare the srcu_struct manually.  For example, in
+ * each file:
+ *
+ *     static struct srcu_struct my_srcu;
+ *
+ * Then, before the first use of each my_srcu, manually initialize it:
+ *
+ *     init_srcu_struct(&my_srcu);
+ *
+ * See include/linux/percpu-defs.h for the rules on per-CPU variables.
  */
 #define __DEFINE_SRCU(name, is_static)                                 \
        static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
index eead8ab..881a79d 100644 (file)
@@ -100,6 +100,7 @@ struct plat_stmmacenet_data {
        int interface;
        struct stmmac_mdio_bus_data *mdio_bus_data;
        struct device_node *phy_node;
+       struct device_node *mdio_node;
        struct stmmac_dma_cfg *dma_cfg;
        int clk_csr;
        int has_gmac;
diff --git a/include/linux/swait.h b/include/linux/swait.h
new file mode 100644 (file)
index 0000000..c1f9c62
--- /dev/null
@@ -0,0 +1,172 @@
+#ifndef _LINUX_SWAIT_H
+#define _LINUX_SWAIT_H
+
+#include <linux/list.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <asm/current.h>
+
+/*
+ * Simple wait queues
+ *
+ * While these are very similar to the other/complex wait queues (wait.h) the
+ * most important difference is that the simple waitqueue allows for
+ * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
+ * times.
+ *
+ * In order to make this so, we had to drop a fair number of features of the
+ * other waitqueue code; notably:
+ *
+ *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
+ *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
+ *    sleeper state.
+ *
+ *  - the exclusive mode; because this requires preserving the list order
+ *    and this is hard.
+ *
+ *  - custom wake functions; because you cannot give any guarantees about
+ *    random code.
+ *
+ * As a side effect of this; the data structures are slimmer.
+ *
+ * One would recommend using this wait queue where possible.
+ */
+
+struct task_struct;
+
+struct swait_queue_head {
+       raw_spinlock_t          lock;
+       struct list_head        task_list;
+};
+
+struct swait_queue {
+       struct task_struct      *task;
+       struct list_head        task_list;
+};
+
+#define __SWAITQUEUE_INITIALIZER(name) {                               \
+       .task           = current,                                      \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAITQUEUE(name)                                       \
+       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
+
+#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
+       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
+       .task_list      = LIST_HEAD_INIT((name).task_list),             \
+}
+
+#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
+
+extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                                   struct lock_class_key *key);
+
+#define init_swait_queue_head(q)                               \
+       do {                                                    \
+               static struct lock_class_key __key;             \
+               __init_swait_queue_head((q), #q, &__key);       \
+       } while (0)
+
+#ifdef CONFIG_LOCKDEP
+# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
+       ({ init_swait_queue_head(&name); name; })
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+#else
+# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
+       DECLARE_SWAIT_QUEUE_HEAD(name)
+#endif
+
+static inline int swait_active(struct swait_queue_head *q)
+{
+       return !list_empty(&q->task_list);
+}
+
+extern void swake_up(struct swait_queue_head *q);
+extern void swake_up_all(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q);
+
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
+extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
+
+extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
+#define ___swait_event(wq, condition, state, ret, cmd)                 \
+({                                                                     \
+       struct swait_queue __wait;                                      \
+       long __ret = ret;                                               \
+                                                                       \
+       INIT_LIST_HEAD(&__wait.task_list);                              \
+       for (;;) {                                                      \
+               long __int = prepare_to_swait_event(&wq, &__wait, state);\
+                                                                       \
+               if (condition)                                          \
+                       break;                                          \
+                                                                       \
+               if (___wait_is_interruptible(state) && __int) {         \
+                       __ret = __int;                                  \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               cmd;                                                    \
+       }                                                               \
+       finish_swait(&wq, &__wait);                                     \
+       __ret;                                                          \
+})
+
+#define __swait_event(wq, condition)                                   \
+       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
+                           schedule())
+
+#define swait_event(wq, condition)                                     \
+do {                                                                   \
+       if (condition)                                                  \
+               break;                                                  \
+       __swait_event(wq, condition);                                   \
+} while (0)
+
+#define __swait_event_timeout(wq, condition, timeout)                  \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_UNINTERRUPTIBLE, timeout,                    \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_timeout(wq, condition, timeout)                    \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_timeout(wq, condition, timeout);  \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible(wq, condition)                     \
+       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
+                     schedule())
+
+#define swait_event_interruptible(wq, condition)                       \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __ret = __swait_event_interruptible(wq, condition);     \
+       __ret;                                                          \
+})
+
+#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
+       ___swait_event(wq, ___wait_cond_timeout(condition),             \
+                     TASK_INTERRUPTIBLE, timeout,                      \
+                     __ret = schedule_timeout(__ret))
+
+#define swait_event_interruptible_timeout(wq, condition, timeout)      \
+({                                                                     \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __swait_event_interruptible_timeout(wq,         \
+                                               condition, timeout);    \
+       __ret;                                                          \
+})
+
+#endif /* _LINUX_SWAIT_H */
index e7a018e..017fced 100644 (file)
@@ -1,10 +1,13 @@
 #ifndef __LINUX_SWIOTLB_H
 #define __LINUX_SWIOTLB_H
 
+#include <linux/dma-direction.h>
+#include <linux/init.h>
 #include <linux/types.h>
 
 struct device;
 struct dma_attrs;
+struct page;
 struct scatterlist;
 
 extern int swiotlb_force;
index 97fd4e5..21f7364 100644 (file)
@@ -97,6 +97,19 @@ static inline void tick_broadcast_exit(void)
        tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
 }
 
+enum tick_dep_bits {
+       TICK_DEP_BIT_POSIX_TIMER        = 0,
+       TICK_DEP_BIT_PERF_EVENTS        = 1,
+       TICK_DEP_BIT_SCHED              = 2,
+       TICK_DEP_BIT_CLOCK_UNSTABLE     = 3
+};
+
+#define TICK_DEP_MASK_NONE             0
+#define TICK_DEP_MASK_POSIX_TIMER      (1 << TICK_DEP_BIT_POSIX_TIMER)
+#define TICK_DEP_MASK_PERF_EVENTS      (1 << TICK_DEP_BIT_PERF_EVENTS)
+#define TICK_DEP_MASK_SCHED            (1 << TICK_DEP_BIT_SCHED)
+#define TICK_DEP_MASK_CLOCK_UNSTABLE   (1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
+
 #ifdef CONFIG_NO_HZ_COMMON
 extern int tick_nohz_enabled;
 extern int tick_nohz_tick_stopped(void);
@@ -154,9 +167,73 @@ static inline int housekeeping_any_cpu(void)
        return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 }
 
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_dep_set(enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_task(struct task_struct *tsk,
+                                  enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+                                    enum tick_dep_bits bit);
+extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit);
+
+/*
+ * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases
+ * on top of static keys.
+ */
+static inline void tick_dep_set(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set(bit);
+}
+
+static inline void tick_dep_clear(enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear(bit);
+}
+
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_set_cpu(cpu, bit);
+}
+
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_cpu(cpu))
+               tick_nohz_dep_clear_cpu(cpu, bit);
+}
+
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_task(tsk, bit);
+}
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_task(tsk, bit);
+}
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_set_signal(signal, bit);
+}
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit)
+{
+       if (tick_nohz_full_enabled())
+               tick_nohz_dep_clear_signal(signal, bit);
+}
+
 extern void tick_nohz_full_kick_cpu(int cpu);
-extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(void);
 #else
 static inline int housekeeping_any_cpu(void)
@@ -166,9 +243,21 @@ static inline int housekeeping_any_cpu(void)
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
+
+static inline void tick_dep_set(enum tick_dep_bits bit) { }
+static inline void tick_dep_clear(enum tick_dep_bits bit) { }
+static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { }
+static inline void tick_dep_set_task(struct task_struct *tsk,
+                                    enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_task(struct task_struct *tsk,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_set_signal(struct signal_struct *signal,
+                                      enum tick_dep_bits bit) { }
+static inline void tick_dep_clear_signal(struct signal_struct *signal,
+                                        enum tick_dep_bits bit) { }
+
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
-static inline void tick_nohz_full_kick(void) { }
-static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(void) { }
 #endif
 
index 2524722..e880054 100644 (file)
@@ -50,6 +50,7 @@ struct tk_read_base {
  * @offs_tai:          Offset clock monotonic -> clock tai
  * @tai_offset:                The current UTC to TAI offset in seconds
  * @clock_was_set_seq: The sequence number of clock was set events
+ * @cs_was_changed_seq:        The sequence number of clocksource change events
  * @next_leap_ktime:   CLOCK_MONOTONIC time value of a pending leap-second
  * @raw_time:          Monotonic raw base time in timespec64 format
  * @cycle_interval:    Number of clock cycles in one NTP interval
@@ -91,6 +92,7 @@ struct timekeeper {
        ktime_t                 offs_tai;
        s32                     tai_offset;
        unsigned int            clock_was_set_seq;
+       u8                      cs_was_changed_seq;
        ktime_t                 next_leap_ktime;
        struct timespec64       raw_time;
 
index ec89d84..96f37be 100644 (file)
@@ -266,6 +266,64 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta);
 extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
                                        struct timespec64 *ts_real);
 
+/*
+ * struct system_time_snapshot - simultaneous raw/real time capture with
+ *     counter value
+ * @cycles:    Clocksource counter value to produce the system times
+ * @real:      Realtime system time
+ * @raw:       Monotonic raw system time
+ * @clock_was_set_seq: The sequence number of clock was set events
+ * @cs_was_changed_seq:        The sequence number of clocksource change events
+ */
+struct system_time_snapshot {
+       cycle_t         cycles;
+       ktime_t         real;
+       ktime_t         raw;
+       unsigned int    clock_was_set_seq;
+       u8              cs_was_changed_seq;
+};
+
+/*
+ * struct system_device_crosststamp - system/device cross-timestamp
+ *     (syncronized capture)
+ * @device:            Device time
+ * @sys_realtime:      Realtime simultaneous with device time
+ * @sys_monoraw:       Monotonic raw simultaneous with device time
+ */
+struct system_device_crosststamp {
+       ktime_t device;
+       ktime_t sys_realtime;
+       ktime_t sys_monoraw;
+};
+
+/*
+ * struct system_counterval_t - system counter value with the pointer to the
+ *     corresponding clocksource
+ * @cycles:    System counter value
+ * @cs:                Clocksource corresponding to system counter value. Used by
+ *     timekeeping code to verify comparibility of two cycle values
+ */
+struct system_counterval_t {
+       cycle_t                 cycles;
+       struct clocksource      *cs;
+};
+
+/*
+ * Get cross timestamp between system clock and device clock
+ */
+extern int get_device_system_crosststamp(
+                       int (*get_time_fn)(ktime_t *device_time,
+                               struct system_counterval_t *system_counterval,
+                               void *ctx),
+                       void *ctx,
+                       struct system_time_snapshot *history,
+                       struct system_device_crosststamp *xtstamp);
+
+/*
+ * Simultaneously snapshot realtime and monotonic raw clocks
+ */
+extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);
+
 /*
  * Persistent clock related interfaces
  */
index 429fdfc..925730b 100644 (file)
@@ -568,6 +568,8 @@ enum {
        FILTER_DYN_STRING,
        FILTER_PTR_STRING,
        FILTER_TRACE_FN,
+       FILTER_COMM,
+       FILTER_CPU,
 };
 
 extern int trace_event_raw_init(struct trace_event_call *call);
index acd522a..be586c6 100644 (file)
  * See the file COPYING for more details.
  */
 
+#include <linux/smp.h>
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/cpumask.h>
 #include <linux/rcupdate.h>
 #include <linux/tracepoint-defs.h>
 
@@ -338,15 +340,19 @@ extern void syscall_unregfunc(void);
  * "void *__data, proto" as the callback prototype.
  */
 #define DECLARE_TRACE_NOARGS(name)                                     \
-               __DECLARE_TRACE(name, void, , 1, void *__data, __data)
+       __DECLARE_TRACE(name, void, ,                                   \
+                       cpu_online(raw_smp_processor_id()),             \
+                       void *__data, __data)
 
 #define DECLARE_TRACE(name, proto, args)                               \
-               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1,   \
-                               PARAMS(void *__data, proto),            \
-                               PARAMS(__data, args))
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()),             \
+                       PARAMS(void *__data, proto),                    \
+                       PARAMS(__data, args))
 
 #define DECLARE_TRACE_CONDITION(name, proto, args, cond)               \
-       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
+                       cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
                        PARAMS(void *__data, proto),                    \
                        PARAMS(__data, args))
 
index 2fd8708..d9fb4b0 100644 (file)
@@ -649,6 +649,7 @@ extern long vt_compat_ioctl(struct tty_struct *tty,
 /* tty_mutex.c */
 /* functions for preparation of BKL removal */
 extern void __lockfunc tty_lock(struct tty_struct *tty);
+extern int  tty_lock_interruptible(struct tty_struct *tty);
 extern void __lockfunc tty_unlock(struct tty_struct *tty);
 extern void __lockfunc tty_lock_slave(struct tty_struct *tty);
 extern void __lockfunc tty_unlock_slave(struct tty_struct *tty);
index cbb20af..bb679b4 100644 (file)
@@ -11,4 +11,8 @@ unsigned long ucs2_strlen(const ucs2_char_t *s);
 unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength);
 int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len);
 
+unsigned long ucs2_utf8size(const ucs2_char_t *src);
+unsigned long ucs2_as_utf8(u8 *dest, const ucs2_char_t *src,
+                          unsigned long maxlength);
+
 #endif /* _LINUX_UCS2_STRING_H_ */
index ae71a76..27d7a0a 100644 (file)
@@ -338,7 +338,7 @@ do {                                                                        \
                            schedule(); try_to_freeze())
 
 /**
- * wait_event - sleep (or freeze) until a condition gets true
+ * wait_event_freezable - sleep (or freeze) until a condition gets true
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  *
index 0e32bc7..ca73c50 100644 (file)
@@ -311,6 +311,7 @@ enum {
 
        __WQ_DRAINING           = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED            = 1 << 17, /* internal: workqueue is ordered */
+       __WQ_LEGACY             = 1 << 18, /* internal: create*_workqueue() */
 
        WQ_MAX_ACTIVE           = 512,    /* I like 512, better ideas? */
        WQ_MAX_UNBOUND_PER_CPU  = 4,      /* 4 * #cpus for unbound wq */
@@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
 
 #define create_workqueue(name)                                         \
-       alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
+       alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
 #define create_freezable_workqueue(name)                               \
-       alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
-                       1, (name))
+       alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
+                       WQ_MEM_RECLAIM, 1, (name))
 #define create_singlethread_workqueue(name)                            \
-       alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name)
+       alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
index b333c94..d0b5ca5 100644 (file)
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
                    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc,
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 /*
index ef03ae5..8a0f55b 100644 (file)
@@ -533,7 +533,8 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
                const unsigned int requested_sizes[]);
 int vb2_core_prepare_buf(struct vb2_queue *q, unsigned int index, void *pb);
 int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb);
-int vb2_core_dqbuf(struct vb2_queue *q, void *pb, bool nonblocking);
+int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb,
+                  bool nonblocking);
 
 int vb2_core_streamon(struct vb2_queue *q, unsigned int type);
 int vb2_core_streamoff(struct vb2_queue *q, unsigned int type);
index 2a91a05..9b4c418 100644 (file)
@@ -6,8 +6,8 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
-void unix_inflight(struct file *fp);
-void unix_notinflight(struct file *fp);
+void unix_inflight(struct user_struct *user, struct file *fp);
+void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
index 5289929..5ee3c68 100644 (file)
@@ -252,6 +252,12 @@ struct l2cap_conn_rsp {
 #define L2CAP_PSM_3DSP         0x0021
 #define L2CAP_PSM_IPSP         0x0023 /* 6LoWPAN */
 
+#define L2CAP_PSM_DYN_START    0x1001
+#define L2CAP_PSM_DYN_END      0xffff
+#define L2CAP_PSM_AUTO_END     0x10ff
+#define L2CAP_PSM_LE_DYN_START  0x0080
+#define L2CAP_PSM_LE_DYN_END   0x00ff
+
 /* channel identifier */
 #define L2CAP_CID_SIGNALING    0x0001
 #define L2CAP_CID_CONN_LESS    0x0002
index 6816f0f..30a56ab 100644 (file)
@@ -44,6 +44,24 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
        return dst && !(dst->flags & DST_METADATA);
 }
 
+static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
+                                      const struct sk_buff *skb_b)
+{
+       const struct metadata_dst *a, *b;
+
+       if (!(skb_a->_skb_refdst | skb_b->_skb_refdst))
+               return 0;
+
+       a = (const struct metadata_dst *) skb_dst(skb_a);
+       b = (const struct metadata_dst *) skb_dst(skb_b);
+
+       if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
+               return 1;
+
+       return memcmp(&a->u.tun_info, &b->u.tun_info,
+                     sizeof(a->u.tun_info) + a->u.tun_info.options_len);
+}
+
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
 
index 481fe1c..49dcad4 100644 (file)
@@ -270,8 +270,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
                                            struct sock *newsk,
                                            const struct request_sock *req);
 
-void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
-                             struct sock *child);
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+                                     struct request_sock *req,
+                                     struct sock *child);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
index 877f682..295d291 100644 (file)
@@ -64,8 +64,16 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 void ip6_route_input(struct sk_buff *skb);
 
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
-                                  struct flowi6 *fl6);
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+                                        struct flowi6 *fl6, int flags);
+
+static inline struct dst_entry *ip6_route_output(struct net *net,
+                                                const struct sock *sk,
+                                                struct flowi6 *fl6)
+{
+       return ip6_route_output_flags(net, sk, fl6, 0);
+}
+
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   int flags);
 
index 7029527..4079fc1 100644 (file)
@@ -61,6 +61,7 @@ struct fib_nh_exception {
        struct rtable __rcu             *fnhe_rth_input;
        struct rtable __rcu             *fnhe_rth_output;
        unsigned long                   fnhe_stamp;
+       struct rcu_head                 rcu;
 };
 
 struct fnhe_hash_bucket {
index 6db96ea..dda9abf 100644 (file)
@@ -230,6 +230,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
                    u8 *protocol, struct flowi4 *fl4);
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
index 8f81bbb..e0f4109 100644 (file)
@@ -439,6 +439,12 @@ int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length);
 /* Send a single event to user space */
 void wireless_send_event(struct net_device *dev, unsigned int cmd,
                         union iwreq_data *wrqu, const char *extra);
+#ifdef CONFIG_WEXT_CORE
+/* flush all previous wext events - if work is done from netdev notifiers */
+void wireless_nlevent_flush(void);
+#else
+static inline void wireless_nlevent_flush(void) {}
+#endif
 
 /* We may need a function to send a stream of events to user space.
  * More on that later... */
index 788ef58..62e17d1 100644 (file)
@@ -79,12 +79,10 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
             const struct nf_conntrack_l3proto *l3proto,
             const struct nf_conntrack_l4proto *proto);
 
-#ifdef CONFIG_LOCKDEP
-# define CONNTRACK_LOCKS 8
-#else
-# define CONNTRACK_LOCKS 1024
-#endif
+#define CONNTRACK_LOCKS 1024
+
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+void nf_conntrack_lock(spinlock_t *lock);
 
 extern spinlock_t nf_conntrack_expect_lock;
 
index 262532d..59fa93c 100644 (file)
@@ -21,6 +21,7 @@ struct scm_creds {
 struct scm_fp_list {
        short                   count;
        short                   max;
+       struct user_struct      *user;
        struct file             *fp[SCM_MAX_FD];
 };
 
index 20e7212..205630b 100644 (file)
@@ -756,7 +756,6 @@ struct sctp_transport {
 
        /* Reference counting. */
        atomic_t refcnt;
-       __u32    dead:1,
                /* RTO-Pending : A flag used to track if one of the DATA
                 *              chunks sent to this address is currently being
                 *              used to compute a RTT. If this flag is 0,
@@ -766,7 +765,7 @@ struct sctp_transport {
                 *              calculation completes (i.e. the DATA chunk
                 *              is SACK'd) clear this flag.
                 */
-                rto_pending:1,
+       __u32   rto_pending:1,
 
                /*
                 * hb_sent : a flag that signals that we have a pending
@@ -955,7 +954,7 @@ void sctp_transport_route(struct sctp_transport *, union sctp_addr *,
 void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk);
 void sctp_transport_free(struct sctp_transport *);
 void sctp_transport_reset_timers(struct sctp_transport *);
-void sctp_transport_hold(struct sctp_transport *);
+int sctp_transport_hold(struct sctp_transport *);
 void sctp_transport_put(struct sctp_transport *);
 void sctp_transport_update_rto(struct sctp_transport *, __u32);
 void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32);
index b9e7b3d..f5ea148 100644 (file)
@@ -1035,18 +1035,6 @@ struct proto {
        struct list_head        node;
 #ifdef SOCK_REFCNT_DEBUG
        atomic_t                socks;
-#endif
-#ifdef CONFIG_MEMCG_KMEM
-       /*
-        * cgroup specific init/deinit functions. Called once for all
-        * protocols that implement it, from cgroups populate function.
-        * This function has to setup any files the protocol want to
-        * appear in the kmem cgroup filesystem.
-        */
-       int                     (*init_cgroup)(struct mem_cgroup *memcg,
-                                              struct cgroup_subsys *ss);
-       void                    (*destroy_cgroup)(struct mem_cgroup *memcg);
-       struct cg_proto         *(*proto_cgroup)(struct mem_cgroup *memcg);
 #endif
        int                     (*diag_destroy)(struct sock *sk, int err);
 };
index 7dda3d7..aecd303 100644 (file)
@@ -16,7 +16,7 @@ struct sock_reuseport {
 };
 
 extern int reuseport_alloc(struct sock *sk);
-extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2);
+extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
 extern void reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
                                          u32 hash,
index 8ea1997..ae6468f 100644 (file)
@@ -216,7 +216,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* TCP thin-stream limits */
 #define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
 
-/* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
+/* TCP initial congestion window as per rfc6928 */
 #define TCP_INIT_CWND          10
 
 /* Bit Flags for sysctl_tcp_fastopen */
@@ -447,7 +447,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
 void tcp_v4_mtu_reduced(struct sock *sk);
-void tcp_req_err(struct sock *sk, u32 seq);
+void tcp_req_err(struct sock *sk, u32 seq, bool abort);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(const struct sock *sk,
                                      struct request_sock *req,
index e2b712c..c21c38c 100644 (file)
@@ -343,7 +343,7 @@ void snd_hdac_bus_enter_link_reset(struct hdac_bus *bus);
 void snd_hdac_bus_exit_link_reset(struct hdac_bus *bus);
 
 void snd_hdac_bus_update_rirb(struct hdac_bus *bus);
-void snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
+int snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
                                    void (*ack)(struct hdac_bus *,
                                                struct hdac_stream *));
 
index fdabbb4..f730b91 100644 (file)
@@ -167,6 +167,10 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
 int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count);
 int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream,
                         unsigned char *buffer, int count);
+int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
+                             unsigned char *buffer, int count);
+int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream,
+                              int count);
 
 /* main midi functions */
 
index 56cf8e4..28ee5c2 100644 (file)
@@ -94,5 +94,8 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
        sense_reason_t (*exec_cmd)(struct se_cmd *cmd));
 
 bool target_sense_desc_format(struct se_device *dev);
+sector_t target_to_linux_sector(struct se_device *dev, sector_t lb);
+bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
+                                      struct request_queue *q, int block_size);
 
 #endif /* TARGET_CORE_BACKEND_H */
index 5d82816..e8c8c08 100644 (file)
@@ -140,6 +140,8 @@ enum se_cmd_flags_table {
        SCF_COMPARE_AND_WRITE           = 0x00080000,
        SCF_COMPARE_AND_WRITE_POST      = 0x00100000,
        SCF_PASSTHROUGH_PROT_SG_TO_MEM_NOALLOC = 0x00200000,
+       SCF_ACK_KREF                    = 0x00400000,
+       SCF_USE_CPUID                   = 0x00800000,
 };
 
 /* struct se_dev_entry->lun_flags and struct se_lun->lun_access */
@@ -187,6 +189,7 @@ enum target_sc_flags_table {
        TARGET_SCF_BIDI_OP              = 0x01,
        TARGET_SCF_ACK_KREF             = 0x02,
        TARGET_SCF_UNKNOWN_SIZE         = 0x04,
+       TARGET_SCF_USE_CPUID    = 0x08,
 };
 
 /* fabric independent task management function values */
@@ -490,8 +493,9 @@ struct se_cmd {
 #define CMD_T_SENT             (1 << 4)
 #define CMD_T_STOP             (1 << 5)
 #define CMD_T_DEV_ACTIVE       (1 << 7)
-#define CMD_T_REQUEST_STOP     (1 << 8)
 #define CMD_T_BUSY             (1 << 9)
+#define CMD_T_TAS              (1 << 10)
+#define CMD_T_FABRIC_STOP      (1 << 11)
        spinlock_t              t_state_lock;
        struct kref             cmd_kref;
        struct completion       t_transport_stop_comp;
@@ -511,9 +515,6 @@ struct se_cmd {
 
        struct list_head        state_list;
 
-       /* old task stop completion, consider merging with some of the above */
-       struct completion       task_stop_comp;
-
        /* backend private data */
        void                    *priv;
 
index 317a1ed..9130dd5 100644 (file)
@@ -231,13 +231,13 @@ TRACE_EVENT(snd_soc_jack_report,
        TP_ARGS(jack, mask, val),
 
        TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                __field(        int,            mask                    )
                __field(        int,            val                     )
        ),
 
        TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                __entry->mask = mask;
                __entry->val = val;
        ),
@@ -253,12 +253,12 @@ TRACE_EVENT(snd_soc_jack_notify,
        TP_ARGS(jack, val),
 
        TP_STRUCT__entry(
-               __string(       name,           jack->jack->name        )
+               __string(       name,           jack->jack->id          )
                __field(        int,            val                     )
        ),
 
        TP_fast_assign(
-               __assign_str(name, jack->jack->name);
+               __assign_str(name, jack->jack->id);
                __entry->val = val;
        ),
 
diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h
new file mode 100644 (file)
index 0000000..a72bd93
--- /dev/null
@@ -0,0 +1,66 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpuhp
+
+#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUHP_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(cpuhp_enter,
+
+       TP_PROTO(unsigned int cpu,
+                int target,
+                int idx,
+                int (*fun)(unsigned int)),
+
+       TP_ARGS(cpu, target, idx, fun),
+
+       TP_STRUCT__entry(
+               __field( unsigned int,  cpu             )
+               __field( int,           target          )
+               __field( int,           idx             )
+               __field( void *,        fun             )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cpu;
+               __entry->target = target;
+               __entry->idx    = idx;
+               __entry->fun    = fun;
+       ),
+
+       TP_printk("cpu: %04u target: %3d step: %3d (%pf)",
+                 __entry->cpu, __entry->target, __entry->idx, __entry->fun)
+);
+
+TRACE_EVENT(cpuhp_exit,
+
+       TP_PROTO(unsigned int cpu,
+                int state,
+                int idx,
+                int ret),
+
+       TP_ARGS(cpu, state, idx, ret),
+
+       TP_STRUCT__entry(
+               __field( unsigned int,  cpu             )
+               __field( int,           state           )
+               __field( int,           idx             )
+               __field( int,           ret             )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cpu;
+               __entry->state  = state;
+               __entry->idx    = idx;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk(" cpu: %04u  state: %3d step: %3d ret: %d",
+                 __entry->cpu, __entry->state, __entry->idx,  __entry->ret)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 98feb1b..d6dfa05 100644 (file)
@@ -17,7 +17,7 @@ TRACE_EVENT(fence_annotate_wait_on,
 
        TP_STRUCT__entry(
                __string(driver, fence->ops->get_driver_name(fence))
-               __string(timeline, fence->ops->get_driver_name(fence))
+               __string(timeline, fence->ops->get_timeline_name(fence))
                __field(unsigned int, context)
                __field(unsigned int, seqno)
 
index 073b9ac..5144013 100644 (file)
@@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire,
 );
 
 #ifdef CONFIG_NO_HZ_COMMON
+
+#define TICK_DEP_NAMES                                 \
+               tick_dep_name(NONE)                     \
+               tick_dep_name(POSIX_TIMER)              \
+               tick_dep_name(PERF_EVENTS)              \
+               tick_dep_name(SCHED)                    \
+               tick_dep_name_end(CLOCK_UNSTABLE)
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+
+TICK_DEP_NAMES
+
+#undef tick_dep_name
+#undef tick_dep_name_end
+
+#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
+#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }
+
+#define show_tick_dep_name(val)                                \
+       __print_symbolic(val, TICK_DEP_NAMES)
+
 TRACE_EVENT(tick_stop,
 
-       TP_PROTO(int success, char *error_msg),
+       TP_PROTO(int success, int dependency),
 
-       TP_ARGS(success, error_msg),
+       TP_ARGS(success, dependency),
 
        TP_STRUCT__entry(
                __field( int ,          success )
-               __string( msg,          error_msg )
+               __field( int ,          dependency )
        ),
 
        TP_fast_assign(
                __entry->success        = success;
-               __assign_str(msg, error_msg);
+               __entry->dependency     = dependency;
        ),
 
-       TP_printk("success=%s msg=%s",  __entry->success ? "yes" : "no", __get_str(msg))
+       TP_printk("success=%d dependency=%s",  __entry->success, \
+                       show_tick_dep_name(__entry->dependency))
 );
 #endif
 
index 4cc989a..f95e1c4 100644 (file)
@@ -48,6 +48,8 @@ struct drm_etnaviv_timespec {
 #define ETNAVIV_PARAM_GPU_FEATURES_2                0x05
 #define ETNAVIV_PARAM_GPU_FEATURES_3                0x06
 #define ETNAVIV_PARAM_GPU_FEATURES_4                0x07
+#define ETNAVIV_PARAM_GPU_FEATURES_5                0x08
+#define ETNAVIV_PARAM_GPU_FEATURES_6                0x09
 
 #define ETNAVIV_PARAM_GPU_STREAM_COUNT              0x10
 #define ETNAVIV_PARAM_GPU_REGISTER_MAX              0x11
@@ -59,6 +61,7 @@ struct drm_etnaviv_timespec {
 #define ETNAVIV_PARAM_GPU_BUFFER_SIZE               0x17
 #define ETNAVIV_PARAM_GPU_INSTRUCTION_COUNT         0x18
 #define ETNAVIV_PARAM_GPU_NUM_CONSTANTS             0x19
+#define ETNAVIV_PARAM_GPU_NUM_VARYINGS              0x1a
 
 #define ETNA_MAX_PIPES 4
 
index aa6f857..5df4881 100644 (file)
@@ -292,6 +292,9 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6             (1ULL << 0)
 
+/* BPF_FUNC_skb_set_tunnel_key flags. */
+#define BPF_F_ZERO_CSUM_TX             (1ULL << 1)
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
index 41e0433..149bec8 100644 (file)
@@ -222,7 +222,6 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
-#define BLKDAXSET _IO(0x12,128)
 #define BLKDAXGET _IO(0x12,129)
 
 #define BMAP_IOCTL 1           /* obsolete - kept for compatibility */
index 1e3c8cb..a8e3a8c 100644 (file)
@@ -66,27 +66,33 @@ struct media_device_info {
 /*
  * DVB entities
  */
-#define MEDIA_ENT_F_DTV_DEMOD          (MEDIA_ENT_F_BASE + 1)
-#define MEDIA_ENT_F_TS_DEMUX           (MEDIA_ENT_F_BASE + 2)
-#define MEDIA_ENT_F_DTV_CA             (MEDIA_ENT_F_BASE + 3)
-#define MEDIA_ENT_F_DTV_NET_DECAP      (MEDIA_ENT_F_BASE + 4)
+#define MEDIA_ENT_F_DTV_DEMOD          (MEDIA_ENT_F_BASE + 0x00001)
+#define MEDIA_ENT_F_TS_DEMUX           (MEDIA_ENT_F_BASE + 0x00002)
+#define MEDIA_ENT_F_DTV_CA             (MEDIA_ENT_F_BASE + 0x00003)
+#define MEDIA_ENT_F_DTV_NET_DECAP      (MEDIA_ENT_F_BASE + 0x00004)
 
 /*
- * Connectors
+ * I/O entities
  */
-/* It is a responsibility of the entity drivers to add connectors and links */
-#define MEDIA_ENT_F_CONN_RF            (MEDIA_ENT_F_BASE + 21)
-#define MEDIA_ENT_F_CONN_SVIDEO                (MEDIA_ENT_F_BASE + 22)
-#define MEDIA_ENT_F_CONN_COMPOSITE     (MEDIA_ENT_F_BASE + 23)
-/* For internal test signal generators and other debug connectors */
-#define MEDIA_ENT_F_CONN_TEST          (MEDIA_ENT_F_BASE + 24)
+#define MEDIA_ENT_F_IO_DTV             (MEDIA_ENT_F_BASE + 0x01001)
+#define MEDIA_ENT_F_IO_VBI             (MEDIA_ENT_F_BASE + 0x01002)
+#define MEDIA_ENT_F_IO_SWRADIO         (MEDIA_ENT_F_BASE + 0x01003)
 
 /*
- * I/O entities
+ * Connectors
  */
-#define MEDIA_ENT_F_IO_DTV             (MEDIA_ENT_F_BASE + 31)
-#define MEDIA_ENT_F_IO_VBI             (MEDIA_ENT_F_BASE + 32)
-#define MEDIA_ENT_F_IO_SWRADIO         (MEDIA_ENT_F_BASE + 33)
+/* It is a responsibility of the entity drivers to add connectors and links */
+#ifdef __KERNEL__
+       /*
+        * For now, it should not be used in userspace, as some
+        * definitions may change
+        */
+
+#define MEDIA_ENT_F_CONN_RF            (MEDIA_ENT_F_BASE + 0x30001)
+#define MEDIA_ENT_F_CONN_SVIDEO                (MEDIA_ENT_F_BASE + 0x30002)
+#define MEDIA_ENT_F_CONN_COMPOSITE     (MEDIA_ENT_F_BASE + 0x30003)
+
+#endif
 
 /*
  * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and
@@ -114,7 +120,7 @@ struct media_device_info {
 
 #define MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN        MEDIA_ENT_F_OLD_SUBDEV_BASE
 
-#ifndef __KERNEL__
+#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API)
 
 /*
  * Legacy symbols used to avoid userspace compilation breakages
@@ -127,6 +133,10 @@ struct media_device_info {
 #define MEDIA_ENT_TYPE_MASK            0x00ff0000
 #define MEDIA_ENT_SUBTYPE_MASK         0x0000ffff
 
+/* End of the old subdev reserved numberspace */
+#define MEDIA_ENT_T_DEVNODE_UNKNOWN    (MEDIA_ENT_T_DEVNODE | \
+                                        MEDIA_ENT_SUBTYPE_MASK)
+
 #define MEDIA_ENT_T_DEVNODE            MEDIA_ENT_F_OLD_BASE
 #define MEDIA_ENT_T_DEVNODE_V4L                MEDIA_ENT_F_IO_V4L
 #define MEDIA_ENT_T_DEVNODE_FB         (MEDIA_ENT_T_DEVNODE + 2)
@@ -291,14 +301,14 @@ struct media_v2_entity {
        __u32 id;
        char name[64];          /* FIXME: move to a property? (RFC says so) */
        __u32 function;         /* Main function of the entity */
-       __u16 reserved[12];
-};
+       __u32 reserved[6];
+} __attribute__ ((packed));
 
 /* Should match the specific fields at media_intf_devnode */
 struct media_v2_intf_devnode {
        __u32 major;
        __u32 minor;
-};
+} __attribute__ ((packed));
 
 struct media_v2_interface {
        __u32 id;
@@ -310,22 +320,22 @@ struct media_v2_interface {
                struct media_v2_intf_devnode devnode;
                __u32 raw[16];
        };
-};
+} __attribute__ ((packed));
 
 struct media_v2_pad {
        __u32 id;
        __u32 entity_id;
        __u32 flags;
-       __u16 reserved[9];
-};
+       __u32 reserved[5];
+} __attribute__ ((packed));
 
 struct media_v2_link {
        __u32 id;
        __u32 source_id;
        __u32 sink_id;
        __u32 flags;
-       __u32 reserved[5];
-};
+       __u32 reserved[6];
+} __attribute__ ((packed));
 
 struct media_v2_topology {
        __u64 topology_version;
@@ -345,7 +355,7 @@ struct media_v2_topology {
        __u32 num_links;
        __u32 reserved4;
        __u64 ptr_links;
-};
+} __attribute__ ((packed));
 
 static inline void __user *media_get_uptr(__u64 arg)
 {
index 5b4a4be..cc68b92 100644 (file)
@@ -66,14 +66,18 @@ struct nd_cmd_ars_cap {
        __u64 length;
        __u32 status;
        __u32 max_ars_out;
+       __u32 clear_err_unit;
+       __u32 reserved;
 } __packed;
 
 struct nd_cmd_ars_start {
        __u64 address;
        __u64 length;
        __u16 type;
-       __u8 reserved[6];
+       __u8 flags;
+       __u8 reserved[5];
        __u32 status;
+       __u32 scrub_time;
 } __packed;
 
 struct nd_cmd_ars_status {
@@ -81,11 +85,14 @@ struct nd_cmd_ars_status {
        __u32 out_length;
        __u64 address;
        __u64 length;
+       __u64 restart_address;
+       __u64 restart_length;
        __u16 type;
+       __u16 flags;
        __u32 num_records;
        struct nd_ars_record {
                __u32 handle;
-               __u32 flags;
+               __u32 reserved;
                __u64 err_address;
                __u64 length;
        } __packed records[0];
index f0b7bfe..ac6dded 100644 (file)
@@ -51,7 +51,9 @@ struct ptp_clock_caps {
        int n_per_out; /* Number of programmable periodic signals. */
        int pps;       /* Whether the clock supports a PPS callback. */
        int n_pins;    /* Number of input/output pins. */
-       int rsv[14];   /* Reserved for future use. */
+       /* Whether the clock supports precise system-device cross timestamps */
+       int cross_timestamping;
+       int rsv[13];   /* Reserved for future use. */
 };
 
 struct ptp_extts_request {
@@ -81,6 +83,13 @@ struct ptp_sys_offset {
        struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1];
 };
 
+struct ptp_sys_offset_precise {
+       struct ptp_clock_time device;
+       struct ptp_clock_time sys_realtime;
+       struct ptp_clock_time sys_monoraw;
+       unsigned int rsv[4];    /* Reserved for future use. */
+};
+
 enum ptp_pin_function {
        PTP_PF_NONE,
        PTP_PF_EXTTS,
@@ -124,6 +133,8 @@ struct ptp_pin_desc {
 #define PTP_SYS_OFFSET     _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset)
 #define PTP_PIN_GETFUNC    _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc)
 #define PTP_PIN_SETFUNC    _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc)
+#define PTP_SYS_OFFSET_PRECISE \
+       _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
 
 struct ptp_extts_event {
        struct ptp_clock_time t; /* Time event occured. */
index 58c9e37..8dc93df 100644 (file)
@@ -93,9 +93,6 @@ static int kernel_init(void *);
 extern void init_IRQ(void);
 extern void fork_init(void);
 extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
 
 /*
  * Debug helper: via this flag we know that we are in 'early bootup code'
@@ -388,7 +385,6 @@ static noinline void __init_refok rest_init(void)
        int pid;
 
        rcu_scheduler_starting();
-       smpboot_thread_init();
        /*
         * We need to spawn init first so that it obtains pid 1, however
         * the init task will end up wanting to create kthreads, which, if
@@ -452,20 +448,6 @@ void __init parse_early_param(void)
        done = 1;
 }
 
-/*
- *     Activate the first processor.
- */
-
-static void __init boot_cpu_init(void)
-{
-       int cpu = smp_processor_id();
-       /* Mark the boot cpu "present", "online" etc for SMP and UP case */
-       set_cpu_online(cpu, true);
-       set_cpu_active(cpu, true);
-       set_cpu_present(cpu, true);
-       set_cpu_possible(cpu, true);
-}
-
 void __init __weak smp_setup_processor_id(void)
 {
 }
@@ -499,11 +481,6 @@ asmlinkage __visible void __init start_kernel(void)
        char *command_line;
        char *after_dashes;
 
-       /*
-        * Need to run as early as possible, to initialize the
-        * lockdep hash:
-        */
-       lockdep_init();
        set_task_stack_end_magic(&init_task);
        smp_setup_processor_id();
        debug_objects_early_init();
@@ -530,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
        setup_command_line(command_line);
        setup_nr_cpu_ids();
        setup_per_cpu_areas();
+       boot_cpu_state_init();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
 
        build_all_zonelists(NULL, NULL);
@@ -929,6 +907,28 @@ static int try_to_run_init_process(const char *init_filename)
 
 static noinline void __init kernel_init_freeable(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+       return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+       if (rodata_enabled)
+               mark_rodata_ro();
+       else
+               pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+       pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
 static int __ref kernel_init(void *unused)
 {
        int ret;
@@ -937,7 +937,7 @@ static int __ref kernel_init(void *unused)
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
        free_initmem();
-       mark_rodata_ro();
+       mark_readonly();
        system_state = SYSTEM_RUNNING;
        numa_default_policy();
 
index ed3027d..331fc1b 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -156,11 +156,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
        struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
 
        /*
-        * We raced in the idr lookup or with shm_destroy().  Either way, the
-        * ID is busted.
+        * Callers of shm_lock() must validate the status of the returned ipc
+        * object pointer (as returned by ipc_lock()), and error out as
+        * appropriate.
         */
-       WARN_ON(IS_ERR(ipcp));
-
+       if (IS_ERR(ipcp))
+               return (void *)ipcp;
        return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
@@ -186,18 +187,33 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 }
 
 
-/* This is called by fork, once for every shm attach. */
-static void shm_open(struct vm_area_struct *vma)
+static int __shm_open(struct vm_area_struct *vma)
 {
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);
        struct shmid_kernel *shp;
 
        shp = shm_lock(sfd->ns, sfd->id);
+
+       if (IS_ERR(shp))
+               return PTR_ERR(shp);
+
        shp->shm_atim = get_seconds();
        shp->shm_lprid = task_tgid_vnr(current);
        shp->shm_nattch++;
        shm_unlock(shp);
+       return 0;
+}
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+       int err = __shm_open(vma);
+       /*
+        * We raced in the idr lookup or with shm_destroy().
+        * Either way, the ID is busted.
+        */
+       WARN_ON_ONCE(err);
 }
 
 /*
@@ -260,6 +276,14 @@ static void shm_close(struct vm_area_struct *vma)
        down_write(&shm_ids(ns).rwsem);
        /* remove from the list of attaches of the shm segment */
        shp = shm_lock(ns, sfd->id);
+
+       /*
+        * We raced in the idr lookup or with shm_destroy().
+        * Either way, the ID is busted.
+        */
+       if (WARN_ON_ONCE(IS_ERR(shp)))
+               goto done; /* no-op */
+
        shp->shm_lprid = task_tgid_vnr(current);
        shp->shm_dtim = get_seconds();
        shp->shm_nattch--;
@@ -267,6 +291,7 @@ static void shm_close(struct vm_area_struct *vma)
                shm_destroy(ns, shp);
        else
                shm_unlock(shp);
+done:
        up_write(&shm_ids(ns).rwsem);
 }
 
@@ -388,17 +413,25 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
        struct shm_file_data *sfd = shm_file_data(file);
        int ret;
 
+       /*
+        * In case of remap_file_pages() emulation, the file can represent
+        * removed IPC ID: propogate shm_lock() error to caller.
+        */
+       ret =__shm_open(vma);
+       if (ret)
+               return ret;
+
        ret = sfd->file->f_op->mmap(sfd->file, vma);
-       if (ret != 0)
+       if (ret) {
+               shm_close(vma);
                return ret;
+       }
        sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
        WARN_ON(!sfd->vm_ops->fault);
 #endif
        vma->vm_ops = &shm_vm_ops;
-       shm_open(vma);
-
-       return ret;
+       return 0;
 }
 
 static int shm_release(struct inode *ino, struct file *file)
index b0799bc..89ebbc4 100644 (file)
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 {
        struct perf_event *event;
        const struct perf_event_attr *attr;
+       struct file *file;
 
-       event = perf_event_get(fd);
-       if (IS_ERR(event))
-               return event;
+       file = perf_event_get(fd);
+       if (IS_ERR(file))
+               return file;
+
+       event = file->private_data;
 
        attr = perf_event_attrs(event);
        if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
                goto err;
 
        if (attr->type == PERF_TYPE_RAW)
-               return event;
+               return file;
 
        if (attr->type == PERF_TYPE_HARDWARE)
-               return event;
+               return file;
 
        if (attr->type == PERF_TYPE_SOFTWARE &&
            attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-               return event;
+               return file;
 err:
-       perf_event_release_kernel(event);
+       fput(file);
        return ERR_PTR(-EINVAL);
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-       struct perf_event *event = ptr;
-
-       perf_event_release_kernel(event);
+       fput((struct file *)ptr);
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
index d1d3e8f..2e7f7ab 100644 (file)
@@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
                /* adjust offset of jmps if necessary */
                if (i < pos && i + insn->off + 1 > pos)
                        insn->off += delta;
-               else if (i > pos && i + insn->off + 1 < pos)
+               else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
                        insn->off -= delta;
        }
 }
index c03a640..d27904c 100644 (file)
@@ -58,6 +58,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
@@ -2739,6 +2740,7 @@ out_unlock_rcu:
 out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
        cgroup_kn_unlock(of->kn);
+       cpuset_post_attach_flush();
        return ret ?: nbytes;
 }
 
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
 
        if (ss) {
                /* css free path */
+               struct cgroup_subsys_state *parent = css->parent;
                int id = css->id;
 
-               if (css->parent)
-                       css_put(css->parent);
-
                ss->css_free(css);
                cgroup_idr_remove(&ss->css_idr, id);
                cgroup_put(cgrp);
+
+               if (parent)
+                       css_put(parent);
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
        css->serial_nr = css_serial_nr_next++;
+       atomic_set(&css->online_cnt, 0);
 
        if (cgroup_parent(cgrp)) {
                css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
        if (!ret) {
                css->flags |= CSS_ONLINE;
                rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+               atomic_inc(&css->online_cnt);
+               if (css->parent)
+                       atomic_inc(&css->parent->online_cnt);
        }
        return ret;
 }
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
                container_of(work, struct cgroup_subsys_state, destroy_work);
 
        mutex_lock(&cgroup_mutex);
-       offline_css(css);
-       mutex_unlock(&cgroup_mutex);
 
-       css_put(css);
+       do {
+               offline_css(css);
+               css_put(css);
+               /* @css can't go away while we're holding cgroup_mutex */
+               css = css->parent;
+       } while (css && atomic_dec_and_test(&css->online_cnt));
+
+       mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);
 
-       INIT_WORK(&css->destroy_work, css_killed_work_fn);
-       queue_work(cgroup_destroy_wq, &css->destroy_work);
+       if (atomic_dec_and_test(&css->online_cnt)) {
+               INIT_WORK(&css->destroy_work, css_killed_work_fn);
+               queue_work(cgroup_destroy_wq, &css->destroy_work);
+       }
 }
 
 /**
index 5b9d396..6ea42e8 100644 (file)
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/smpboot.h>
+
 #include <trace/events/power.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpuhp.h>
 
 #include "smpboot.h"
 
+/**
+ * cpuhp_cpu_state - Per cpu hotplug state storage
+ * @state:     The current cpu state
+ * @target:    The target state
+ * @thread:    Pointer to the hotplug thread
+ * @should_run:        Thread should execute
+ * @cb_stat:   The state for a single callback (install/uninstall)
+ * @cb:                Single callback function (install/uninstall)
+ * @result:    Result of the operation
+ * @done:      Signal completion to the issuer of the task
+ */
+struct cpuhp_cpu_state {
+       enum cpuhp_state        state;
+       enum cpuhp_state        target;
+#ifdef CONFIG_SMP
+       struct task_struct      *thread;
+       bool                    should_run;
+       enum cpuhp_state        cb_state;
+       int                     (*cb)(unsigned int cpu);
+       int                     result;
+       struct completion       done;
+#endif
+};
+
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+
+/**
+ * cpuhp_step - Hotplug state machine step
+ * @name:      Name of the step
+ * @startup:   Startup function of the step
+ * @teardown:  Teardown function of the step
+ * @skip_onerr:        Do not invoke the functions on error rollback
+ *             Will go away once the notifiers are gone
+ * @cant_stop: Bringup/teardown can't be stopped at this step
+ */
+struct cpuhp_step {
+       const char      *name;
+       int             (*startup)(unsigned int cpu);
+       int             (*teardown)(unsigned int cpu);
+       bool            skip_onerr;
+       bool            cant_stop;
+};
+
+static DEFINE_MUTEX(cpuhp_state_mutex);
+static struct cpuhp_step cpuhp_bp_states[];
+static struct cpuhp_step cpuhp_ap_states[];
+
+/**
+ * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * @cpu:       The cpu for which the callback should be invoked
+ * @step:      The step in the state machine
+ * @cb:                The callback function to invoke
+ *
+ * Called from cpu hotplug and from the state register machinery
+ */
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
+                                int (*cb)(unsigned int))
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       int ret = 0;
+
+       if (cb) {
+               trace_cpuhp_enter(cpu, st->target, step, cb);
+               ret = cb(cpu);
+               trace_cpuhp_exit(cpu, st->state, step, ret);
+       }
+       return ret;
+}
+
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
+bool cpuhp_tasks_frozen;
+EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
 
 /*
  * The following two APIs (cpu_maps_update_begin/done) must be used when
@@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb)
        return raw_notifier_chain_register(&cpu_chain, nb);
 }
 
-static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
                        int *nr_calls)
 {
+       unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
+       void *hcpu = (void *)(long)cpu;
+
        int ret;
 
-       ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+       ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
                                        nr_calls);
 
        return notifier_to_errno(ret);
 }
 
-static int cpu_notify(unsigned long val, void *v)
+static int cpu_notify(unsigned long val, unsigned int cpu)
 {
-       return __cpu_notify(val, v, -1, NULL);
+       return __cpu_notify(val, cpu, -1, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
+/* Notifier wrappers for transitioning to state machine */
+static int notify_prepare(unsigned int cpu)
+{
+       int nr_calls = 0;
+       int ret;
+
+       ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
+       if (ret) {
+               nr_calls--;
+               printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
+                               __func__, cpu);
+               __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
+       }
+       return ret;
+}
+
+static int notify_online(unsigned int cpu)
+{
+       cpu_notify(CPU_ONLINE, cpu);
+       return 0;
+}
+
+static int notify_starting(unsigned int cpu)
+{
+       cpu_notify(CPU_STARTING, cpu);
+       return 0;
+}
+
+static int bringup_wait_for_ap(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+       wait_for_completion(&st->done);
+       return st->result;
+}
+
+static int bringup_cpu(unsigned int cpu)
+{
+       struct task_struct *idle = idle_thread_get(cpu);
+       int ret;
+
+       /* Arch-specific enabling code. */
+       ret = __cpu_up(cpu, idle);
+       if (ret) {
+               cpu_notify(CPU_UP_CANCELED, cpu);
+               return ret;
+       }
+       ret = bringup_wait_for_ap(cpu);
+       BUG_ON(!cpu_online(cpu));
+       return ret;
+}
+
+/*
+ * Hotplug state machine related functions
+ */
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
+                         struct cpuhp_step *steps)
+{
+       for (st->state++; st->state < st->target; st->state++) {
+               struct cpuhp_step *step = steps + st->state;
+
+               if (!step->skip_onerr)
+                       cpuhp_invoke_callback(cpu, st->state, step->startup);
+       }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+                               struct cpuhp_step *steps, enum cpuhp_state target)
+{
+       enum cpuhp_state prev_state = st->state;
+       int ret = 0;
+
+       for (; st->state > target; st->state--) {
+               struct cpuhp_step *step = steps + st->state;
+
+               ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+               if (ret) {
+                       st->target = prev_state;
+                       undo_cpu_down(cpu, st, steps);
+                       break;
+               }
+       }
+       return ret;
+}
+
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
+                       struct cpuhp_step *steps)
+{
+       for (st->state--; st->state > st->target; st->state--) {
+               struct cpuhp_step *step = steps + st->state;
+
+               if (!step->skip_onerr)
+                       cpuhp_invoke_callback(cpu, st->state, step->teardown);
+       }
+}
+
+static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+                             struct cpuhp_step *steps, enum cpuhp_state target)
+{
+       enum cpuhp_state prev_state = st->state;
+       int ret = 0;
+
+       while (st->state < target) {
+               struct cpuhp_step *step;
+
+               st->state++;
+               step = steps + st->state;
+               ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+               if (ret) {
+                       st->target = prev_state;
+                       undo_cpu_up(cpu, st, steps);
+                       break;
+               }
+       }
+       return ret;
+}
+
+/*
+ * The cpu hotplug threads manage the bringup and teardown of the cpus
+ */
+static void cpuhp_create(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+       init_completion(&st->done);
+}
+
+static int cpuhp_should_run(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+       return st->should_run;
+}
+
+/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
+static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+       enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
+
+       return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
+}
+
+/* Execute the online startup callbacks. Used to be CPU_ONLINE */
+static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+       return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
+}
+
+/*
+ * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
+ * callbacks when a state gets [un]installed at runtime.
+ */
+static void cpuhp_thread_fun(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+       int ret = 0;
+
+       /*
+        * Paired with the mb() in cpuhp_kick_ap_work and
+        * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+        */
+       smp_mb();
+       if (!st->should_run)
+               return;
+
+       st->should_run = false;
+
+       /* Single callback invocation for [un]install ? */
+       if (st->cb) {
+               if (st->cb_state < CPUHP_AP_ONLINE) {
+                       local_irq_disable();
+                       ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+                       local_irq_enable();
+               } else {
+                       ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+               }
+       } else {
+               /* Cannot happen .... */
+               BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+               /* Regular hotplug work */
+               if (st->state < st->target)
+                       ret = cpuhp_ap_online(cpu, st);
+               else if (st->state > st->target)
+                       ret = cpuhp_ap_offline(cpu, st);
+       }
+       st->result = ret;
+       complete(&st->done);
+}
 
-static void cpu_notify_nofail(unsigned long val, void *v)
+/* Invoke a single callback on a remote cpu */
+static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
+                                   int (*cb)(unsigned int))
 {
-       BUG_ON(cpu_notify(val, v));
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+       if (!cpu_online(cpu))
+               return 0;
+
+       st->cb_state = state;
+       st->cb = cb;
+       /*
+        * Make sure the above stores are visible before should_run becomes
+        * true. Paired with the mb() above in cpuhp_thread_fun()
+        */
+       smp_mb();
+       st->should_run = true;
+       wake_up_process(st->thread);
+       wait_for_completion(&st->done);
+       return st->result;
 }
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
+{
+       st->result = 0;
+       st->cb = NULL;
+       /*
+        * Make sure the above stores are visible before should_run becomes
+        * true. Paired with the mb() above in cpuhp_thread_fun()
+        */
+       smp_mb();
+       st->should_run = true;
+       wake_up_process(st->thread);
+}
+
+static int cpuhp_kick_ap_work(unsigned int cpu)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       enum cpuhp_state state = st->state;
+
+       trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+       __cpuhp_kick_ap_work(st);
+       wait_for_completion(&st->done);
+       trace_cpuhp_exit(cpu, st->state, state, st->result);
+       return st->result;
+}
+
+static struct smp_hotplug_thread cpuhp_threads = {
+       .store                  = &cpuhp_state.thread,
+       .create                 = &cpuhp_create,
+       .thread_should_run      = cpuhp_should_run,
+       .thread_fn              = cpuhp_thread_fun,
+       .thread_comm            = "cpuhp/%u",
+       .selfparking            = true,
+};
+
+void __init cpuhp_threads_init(void)
+{
+       BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
+       kthread_unpark(this_cpu_read(cpuhp_state.thread));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
 EXPORT_SYMBOL(register_cpu_notifier);
 EXPORT_SYMBOL(__register_cpu_notifier);
-
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
        cpu_maps_update_begin();
@@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu)
        read_unlock(&tasklist_lock);
 }
 
-struct take_cpu_down_param {
-       unsigned long mod;
-       void *hcpu;
-};
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+       BUG_ON(cpu_notify(val, cpu));
+}
+
+static int notify_down_prepare(unsigned int cpu)
+{
+       int err, nr_calls = 0;
+
+       err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
+       if (err) {
+               nr_calls--;
+               __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
+               pr_warn("%s: attempt to take down CPU %u failed\n",
+                               __func__, cpu);
+       }
+       return err;
+}
+
+static int notify_dying(unsigned int cpu)
+{
+       cpu_notify(CPU_DYING, cpu);
+       return 0;
+}
 
 /* Take this CPU down. */
 static int take_cpu_down(void *_param)
 {
-       struct take_cpu_down_param *param = _param;
-       int err;
+       struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+       enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
+       int err, cpu = smp_processor_id();
 
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
                return err;
 
-       cpu_notify(CPU_DYING | param->mod, param->hcpu);
+       /* Invoke the former CPU_DYING callbacks */
+       for (; st->state > target; st->state--) {
+               struct cpuhp_step *step = cpuhp_ap_states + st->state;
+
+               cpuhp_invoke_callback(cpu, st->state, step->teardown);
+       }
        /* Give up timekeeping duties */
        tick_handover_do_timer();
        /* Park the stopper thread */
-       stop_machine_park((long)param->hcpu);
+       stop_machine_park(cpu);
        return 0;
 }
 
-/* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu, int tasks_frozen)
+static int takedown_cpu(unsigned int cpu)
 {
-       int err, nr_calls = 0;
-       void *hcpu = (void *)(long)cpu;
-       unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-       struct take_cpu_down_param tcd_param = {
-               .mod = mod,
-               .hcpu = hcpu,
-       };
-
-       if (num_online_cpus() == 1)
-               return -EBUSY;
-
-       if (!cpu_online(cpu))
-               return -EINVAL;
-
-       cpu_hotplug_begin();
-
-       err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
-       if (err) {
-               nr_calls--;
-               __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
-               pr_warn("%s: attempt to take down CPU %u failed\n",
-                       __func__, cpu);
-               goto out_release;
-       }
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       int err;
 
        /*
         * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
        else
                synchronize_rcu();
 
+       /* Park the smpboot threads */
+       kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
        smpboot_park_threads(cpu);
 
        /*
@@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
-       err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+       err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
-               cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+               cpu_notify_nofail(CPU_DOWN_FAILEDcpu);
                irq_unlock_sparse();
-               goto out_release;
+               return err;
        }
        BUG_ON(cpu_online(cpu));
 
@@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
         *
         * Wait for the stop thread to go away.
         */
-       while (!per_cpu(cpu_dead_idle, cpu))
-               cpu_relax();
-       smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
-       per_cpu(cpu_dead_idle, cpu) = false;
+       wait_for_completion(&st->done);
+       BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
 
        /* Interrupts are moved away from the dying cpu, reenable alloc/free */
        irq_unlock_sparse();
@@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
        /* This actually kills the CPU. */
        __cpu_die(cpu);
 
-       /* CPU is completely dead: tell everyone.  Too late to complain. */
        tick_cleanup_dead_cpu(cpu);
-       cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+       return 0;
+}
 
+static int notify_dead(unsigned int cpu)
+{
+       cpu_notify_nofail(CPU_DEAD, cpu);
        check_for_tasks(cpu);
+       return 0;
+}
 
-out_release:
+static void cpuhp_complete_idle_dead(void *arg)
+{
+       struct cpuhp_cpu_state *st = arg;
+
+       complete(&st->done);
+}
+
+void cpuhp_report_idle_dead(void)
+{
+       struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+       BUG_ON(st->state != CPUHP_AP_OFFLINE);
+       rcu_report_dead(smp_processor_id());
+       st->state = CPUHP_AP_IDLE_DEAD;
+       /*
+        * We cannot call complete after rcu_report_dead() so we delegate it
+        * to an online cpu.
+        */
+       smp_call_function_single(cpumask_first(cpu_online_mask),
+                                cpuhp_complete_idle_dead, st, 0);
+}
+
+#else
+#define notify_down_prepare    NULL
+#define takedown_cpu           NULL
+#define notify_dead            NULL
+#define notify_dying           NULL
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
+                          enum cpuhp_state target)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       int prev_state, ret = 0;
+       bool hasdied = false;
+
+       if (num_online_cpus() == 1)
+               return -EBUSY;
+
+       if (!cpu_present(cpu))
+               return -EINVAL;
+
+       cpu_hotplug_begin();
+
+       cpuhp_tasks_frozen = tasks_frozen;
+
+       prev_state = st->state;
+       st->target = target;
+       /*
+        * If the current CPU state is in the range of the AP hotplug thread,
+        * then we need to kick the thread.
+        */
+       if (st->state > CPUHP_TEARDOWN_CPU) {
+               ret = cpuhp_kick_ap_work(cpu);
+               /*
+                * The AP side has done the error rollback already. Just
+                * return the error code..
+                */
+               if (ret)
+                       goto out;
+
+               /*
+                * We might have stopped still in the range of the AP hotplug
+                * thread. Nothing to do anymore.
+                */
+               if (st->state > CPUHP_TEARDOWN_CPU)
+                       goto out;
+       }
+       /*
+        * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
+        * to do the further cleanups.
+        */
+       ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+
+       hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
+out:
        cpu_hotplug_done();
-       if (!err)
-               cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
-       return err;
+       /* This post dead nonsense must die */
+       if (!ret && hasdied)
+               cpu_notify_nofail(CPU_POST_DEAD, cpu);
+       return ret;
 }
 
-int cpu_down(unsigned int cpu)
+static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
 {
        int err;
 
@@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu)
                goto out;
        }
 
-       err = _cpu_down(cpu, 0);
+       err = _cpu_down(cpu, 0, target);
 
 out:
        cpu_maps_update_done();
        return err;
 }
+int cpu_down(unsigned int cpu)
+{
+       return do_cpu_down(cpu, CPUHP_OFFLINE);
+}
 EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-/*
- * Unpark per-CPU smpboot kthreads at CPU-online time.
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
  */
-static int smpboot_thread_call(struct notifier_block *nfb,
-                              unsigned long action, void *hcpu)
+void notify_cpu_starting(unsigned int cpu)
 {
-       int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+       enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
 
-       case CPU_DOWN_FAILED:
-       case CPU_ONLINE:
-               smpboot_unpark_threads(cpu);
-               break;
+       while (st->state < target) {
+               struct cpuhp_step *step;
 
-       default:
-               break;
+               st->state++;
+               step = cpuhp_ap_states + st->state;
+               cpuhp_invoke_callback(cpu, st->state, step->startup);
        }
-
-       return NOTIFY_OK;
 }
 
-static struct notifier_block smpboot_thread_notifier = {
-       .notifier_call = smpboot_thread_call,
-       .priority = CPU_PRI_SMPBOOT,
-};
-
-void smpboot_thread_init(void)
+/*
+ * Called from the idle task. We need to set active here, so we can kick off
+ * the stopper thread and unpark the smpboot threads. If the target state is
+ * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
+ * cpu further.
+ */
+void cpuhp_online_idle(enum cpuhp_state state)
 {
-       register_cpu_notifier(&smpboot_thread_notifier);
+       struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+       unsigned int cpu = smp_processor_id();
+
+       /* Happens for the boot cpu */
+       if (state != CPUHP_AP_ONLINE_IDLE)
+               return;
+
+       st->state = CPUHP_AP_ONLINE_IDLE;
+
+       /* The cpu is marked online, set it active now */
+       set_cpu_active(cpu, true);
+       /* Unpark the stopper thread and the hotplug thread of this cpu */
+       stop_machine_unpark(cpu);
+       kthread_unpark(st->thread);
+
+       /* Should we go further up ? */
+       if (st->target > CPUHP_AP_ONLINE_IDLE)
+               __cpuhp_kick_ap_work(st);
+       else
+               complete(&st->done);
 }
 
 /* Requires cpu_add_remove_lock to be held */
-static int _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 {
-       int ret, nr_calls = 0;
-       void *hcpu = (void *)(long)cpu;
-       unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle;
+       int ret = 0;
 
        cpu_hotplug_begin();
 
-       if (cpu_online(cpu) || !cpu_present(cpu)) {
+       if (!cpu_present(cpu)) {
                ret = -EINVAL;
                goto out;
        }
 
-       idle = idle_thread_get(cpu);
-       if (IS_ERR(idle)) {
-               ret = PTR_ERR(idle);
-               goto out;
-       }
-
-       ret = smpboot_create_threads(cpu);
-       if (ret)
+       /*
+        * The caller of do_cpu_up might have raced with another
+        * caller. Ignore it for now.
+        */
+       if (st->state >= target)
                goto out;
 
-       ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
-       if (ret) {
-               nr_calls--;
-               pr_warn("%s: attempt to bring up CPU %u failed\n",
-                       __func__, cpu);
-               goto out_notify;
+       if (st->state == CPUHP_OFFLINE) {
+               /* Let it fail before we try to bring the cpu up */
+               idle = idle_thread_get(cpu);
+               if (IS_ERR(idle)) {
+                       ret = PTR_ERR(idle);
+                       goto out;
+               }
        }
 
-       /* Arch-specific enabling code. */
-       ret = __cpu_up(cpu, idle);
-
-       if (ret != 0)
-               goto out_notify;
-       BUG_ON(!cpu_online(cpu));
+       cpuhp_tasks_frozen = tasks_frozen;
 
-       /* Now call notifier in preparation. */
-       cpu_notify(CPU_ONLINE | mod, hcpu);
+       st->target = target;
+       /*
+        * If the current CPU state is in the range of the AP hotplug thread,
+        * then we need to kick the thread once more.
+        */
+       if (st->state > CPUHP_BRINGUP_CPU) {
+               ret = cpuhp_kick_ap_work(cpu);
+               /*
+                * The AP side has done the error rollback already. Just
+                * return the error code..
+                */
+               if (ret)
+                       goto out;
+       }
 
-out_notify:
-       if (ret != 0)
-               __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+       /*
+        * Try to reach the target state. We max out on the BP at
+        * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
+        * responsible for bringing it up to the target state.
+        */
+       target = min((int)target, CPUHP_BRINGUP_CPU);
+       ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
 out:
        cpu_hotplug_done();
-
        return ret;
 }
 
-int cpu_up(unsigned int cpu)
+static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
 {
        int err = 0;
 
@@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu)
                goto out;
        }
 
-       err = _cpu_up(cpu, 0);
-
+       err = _cpu_up(cpu, 0, target);
 out:
        cpu_maps_update_done();
        return err;
 }
+
+int cpu_up(unsigned int cpu)
+{
+       return do_cpu_up(cpu, CPUHP_ONLINE);
+}
 EXPORT_SYMBOL_GPL(cpu_up);
 
 #ifdef CONFIG_PM_SLEEP_SMP
@@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void)
                if (cpu == first_cpu)
                        continue;
                trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
-               error = _cpu_down(cpu, 1);
+               error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
                trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
                if (!error)
                        cpumask_set_cpu(cpu, frozen_cpus);
@@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void)
 
        for_each_cpu(cpu, frozen_cpus) {
                trace_suspend_resume(TPS("CPU_ON"), cpu, true);
-               error = _cpu_up(cpu, 1);
+               error = _cpu_up(cpu, 1, CPUHP_ONLINE);
                trace_suspend_resume(TPS("CPU_ON"), cpu, false);
                if (!error) {
                        pr_info("CPU%d is up\n", cpu);
@@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init);
 
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+#endif /* CONFIG_SMP */
+
+/* Boot processor state steps */
+static struct cpuhp_step cpuhp_bp_states[] = {
+       [CPUHP_OFFLINE] = {
+               .name                   = "offline",
+               .startup                = NULL,
+               .teardown               = NULL,
+       },
+#ifdef CONFIG_SMP
+       [CPUHP_CREATE_THREADS]= {
+               .name                   = "threads:create",
+               .startup                = smpboot_create_threads,
+               .teardown               = NULL,
+               .cant_stop              = true,
+       },
+       /*
+        * Preparatory and dead notifiers. Will be replaced once the notifiers
+        * are converted to states.
+        */
+       [CPUHP_NOTIFY_PREPARE] = {
+               .name                   = "notify:prepare",
+               .startup                = notify_prepare,
+               .teardown               = notify_dead,
+               .skip_onerr             = true,
+               .cant_stop              = true,
+       },
+       /* Kicks the plugged cpu into life */
+       [CPUHP_BRINGUP_CPU] = {
+               .name                   = "cpu:bringup",
+               .startup                = bringup_cpu,
+               .teardown               = NULL,
+               .cant_stop              = true,
+       },
+       /*
+        * Handled on controll processor until the plugged processor manages
+        * this itself.
+        */
+       [CPUHP_TEARDOWN_CPU] = {
+               .name                   = "cpu:teardown",
+               .startup                = NULL,
+               .teardown               = takedown_cpu,
+               .cant_stop              = true,
+       },
+#endif
+};
+
+/* Application processor state steps */
+static struct cpuhp_step cpuhp_ap_states[] = {
+#ifdef CONFIG_SMP
+       /* Final state before CPU kills itself */
+       [CPUHP_AP_IDLE_DEAD] = {
+               .name                   = "idle:dead",
+       },
+       /*
+        * Last state before CPU enters the idle loop to die. Transient state
+        * for synchronization.
+        */
+       [CPUHP_AP_OFFLINE] = {
+               .name                   = "ap:offline",
+               .cant_stop              = true,
+       },
+       /*
+        * Low level startup/teardown notifiers. Run with interrupts
+        * disabled. Will be removed once the notifiers are converted to
+        * states.
+        */
+       [CPUHP_AP_NOTIFY_STARTING] = {
+               .name                   = "notify:starting",
+               .startup                = notify_starting,
+               .teardown               = notify_dying,
+               .skip_onerr             = true,
+               .cant_stop              = true,
+       },
+       /* Entry state on starting. Interrupts enabled from here on. Transient
+        * state for synchronsization */
+       [CPUHP_AP_ONLINE] = {
+               .name                   = "ap:online",
+       },
+       /* Handle smpboot threads park/unpark */
+       [CPUHP_AP_SMPBOOT_THREADS] = {
+               .name                   = "smpboot:threads",
+               .startup                = smpboot_unpark_threads,
+               .teardown               = NULL,
+       },
+       /*
+        * Online/down_prepare notifiers. Will be removed once the notifiers
+        * are converted to states.
+        */
+       [CPUHP_AP_NOTIFY_ONLINE] = {
+               .name                   = "notify:online",
+               .startup                = notify_online,
+               .teardown               = notify_down_prepare,
+       },
+#endif
+       /*
+        * The dynamically registered state space is here
+        */
+
+       /* CPU is fully up and running. */
+       [CPUHP_ONLINE] = {
+               .name                   = "online",
+               .startup                = NULL,
+               .teardown               = NULL,
+       },
+};
+
+/* Sanity check for callbacks */
+static int cpuhp_cb_check(enum cpuhp_state state)
+{
+       if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
+               return -EINVAL;
+       return 0;
+}
+
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+       /*
+        * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+        * purposes as that state is handled explicitely in cpu_down.
+        */
+       return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+       struct cpuhp_step *sp;
+
+       sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
+       return sp + state;
+}
+
+static void cpuhp_store_callbacks(enum cpuhp_state state,
+                                 const char *name,
+                                 int (*startup)(unsigned int cpu),
+                                 int (*teardown)(unsigned int cpu))
+{
+       /* (Un)Install the callbacks for further cpu hotplug operations */
+       struct cpuhp_step *sp;
+
+       mutex_lock(&cpuhp_state_mutex);
+       sp = cpuhp_get_step(state);
+       sp->startup = startup;
+       sp->teardown = teardown;
+       sp->name = name;
+       mutex_unlock(&cpuhp_state_mutex);
+}
+
+static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
+{
+       return cpuhp_get_step(state)->teardown;
+}
+
+/*
+ * Call the startup/teardown function for a step either on the AP or
+ * on the current CPU.
+ */
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
+                           int (*cb)(unsigned int), bool bringup)
+{
+       int ret;
+
+       if (!cb)
+               return 0;
+       /*
+        * The non AP bound callbacks can fail on bringup. On teardown
+        * e.g. module removal we crash for now.
+        */
+#ifdef CONFIG_SMP
+       if (cpuhp_is_ap_state(state))
+               ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+       else
+               ret = cpuhp_invoke_callback(cpu, state, cb);
+#else
+       ret = cpuhp_invoke_callback(cpu, state, cb);
+#endif
+       BUG_ON(ret && !bringup);
+       return ret;
+}
+
+/*
+ * Called from __cpuhp_setup_state on a recoverable failure.
+ *
+ * Note: The teardown callbacks for rollback are not allowed to fail!
+ */
+static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
+                                  int (*teardown)(unsigned int cpu))
+{
+       int cpu;
+
+       if (!teardown)
+               return;
+
+       /* Roll back the already executed steps on the other cpus */
+       for_each_present_cpu(cpu) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+               int cpustate = st->state;
+
+               if (cpu >= failedcpu)
+                       break;
+
+               /* Did we invoke the startup call on that cpu ? */
+               if (cpustate >= state)
+                       cpuhp_issue_call(cpu, state, teardown, false);
+       }
+}
+
+/*
+ * Returns a free for dynamic slot assignment of the Online state. The states
+ * are protected by the cpuhp_slot_states mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
+{
+       enum cpuhp_state i;
+
+       mutex_lock(&cpuhp_state_mutex);
+       for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
+               if (cpuhp_ap_states[i].name)
+                       continue;
+
+               cpuhp_ap_states[i].name = "Reserved";
+               mutex_unlock(&cpuhp_state_mutex);
+               return i;
+       }
+       mutex_unlock(&cpuhp_state_mutex);
+       WARN(1, "No more dynamic states available for CPU hotplug\n");
+       return -ENOSPC;
+}
+
 /**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
- * @cpu: cpu that just started
+ * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+ * @state:     The state to setup
+ * @invoke:    If true, the startup function is invoked for cpus where
+ *             cpu state >= @state
+ * @startup:   startup callback function
+ * @teardown:  teardown callback function
  *
- * This function calls the cpu_chain notifiers with CPU_STARTING.
- * It must be called by the arch code on the new cpu, before the new cpu
- * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ * Returns 0 if successful, otherwise a proper error code
  */
-void notify_cpu_starting(unsigned int cpu)
+int __cpuhp_setup_state(enum cpuhp_state state,
+                       const char *name, bool invoke,
+                       int (*startup)(unsigned int cpu),
+                       int (*teardown)(unsigned int cpu))
 {
-       unsigned long val = CPU_STARTING;
+       int cpu, ret = 0;
+       int dyn_state = 0;
 
-#ifdef CONFIG_PM_SLEEP_SMP
-       if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
-               val = CPU_STARTING_FROZEN;
-#endif /* CONFIG_PM_SLEEP_SMP */
-       cpu_notify(val, (void *)(long)cpu);
+       if (cpuhp_cb_check(state) || !name)
+               return -EINVAL;
+
+       get_online_cpus();
+
+       /* currently assignments for the ONLINE state are possible */
+       if (state == CPUHP_AP_ONLINE_DYN) {
+               dyn_state = 1;
+               ret = cpuhp_reserve_state(state);
+               if (ret < 0)
+                       goto out;
+               state = ret;
+       }
+
+       cpuhp_store_callbacks(state, name, startup, teardown);
+
+       if (!invoke || !startup)
+               goto out;
+
+       /*
+        * Try to call the startup callback for each present cpu
+        * depending on the hotplug state of the cpu.
+        */
+       for_each_present_cpu(cpu) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+               int cpustate = st->state;
+
+               if (cpustate < state)
+                       continue;
+
+               ret = cpuhp_issue_call(cpu, state, startup, true);
+               if (ret) {
+                       cpuhp_rollback_install(cpu, state, teardown);
+                       cpuhp_store_callbacks(state, NULL, NULL, NULL);
+                       goto out;
+               }
+       }
+out:
+       put_online_cpus();
+       if (!ret && dyn_state)
+               return state;
+       return ret;
 }
+EXPORT_SYMBOL(__cpuhp_setup_state);
 
-#endif /* CONFIG_SMP */
+/**
+ * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+ * @state:     The state to remove
+ * @invoke:    If true, the teardown function is invoked for cpus where
+ *             cpu state >= @state
+ *
+ * The teardown callback is currently not allowed to fail. Think
+ * about module removal!
+ */
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+       int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+       int cpu;
+
+       BUG_ON(cpuhp_cb_check(state));
+
+       get_online_cpus();
+
+       if (!invoke || !teardown)
+               goto remove;
+
+       /*
+        * Call the teardown callback for each present cpu depending
+        * on the hotplug state of the cpu. This function is not
+        * allowed to fail currently!
+        */
+       for_each_present_cpu(cpu) {
+               struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+               int cpustate = st->state;
+
+               if (cpustate >= state)
+                       cpuhp_issue_call(cpu, state, teardown, false);
+       }
+remove:
+       cpuhp_store_callbacks(state, NULL, NULL, NULL);
+       put_online_cpus();
+}
+EXPORT_SYMBOL(__cpuhp_remove_state);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
+static ssize_t show_cpuhp_state(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+       return sprintf(buf, "%d\n", st->state);
+}
+static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+
+static ssize_t write_cpuhp_target(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t count)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+       struct cpuhp_step *sp;
+       int target, ret;
+
+       ret = kstrtoint(buf, 10, &target);
+       if (ret)
+               return ret;
+
+#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
+       if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
+               return -EINVAL;
+#else
+       if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
+               return -EINVAL;
+#endif
+
+       ret = lock_device_hotplug_sysfs();
+       if (ret)
+               return ret;
+
+       mutex_lock(&cpuhp_state_mutex);
+       sp = cpuhp_get_step(target);
+       ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
+       mutex_unlock(&cpuhp_state_mutex);
+       if (ret)
+               return ret;
+
+       if (st->state < target)
+               ret = do_cpu_up(dev->id, target);
+       else
+               ret = do_cpu_down(dev->id, target);
+
+       unlock_device_hotplug();
+       return ret ? ret : count;
+}
+
+static ssize_t show_cpuhp_target(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+       return sprintf(buf, "%d\n", st->target);
+}
+static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static struct attribute *cpuhp_cpu_attrs[] = {
+       &dev_attr_state.attr,
+       &dev_attr_target.attr,
+       NULL
+};
+
+static struct attribute_group cpuhp_cpu_attr_group = {
+       .attrs = cpuhp_cpu_attrs,
+       .name = "hotplug",
+       NULL
+};
+
+static ssize_t show_cpuhp_states(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       ssize_t cur, res = 0;
+       int i;
+
+       mutex_lock(&cpuhp_state_mutex);
+       for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
+               struct cpuhp_step *sp = cpuhp_get_step(i);
+
+               if (sp->name) {
+                       cur = sprintf(buf, "%3d: %s\n", i, sp->name);
+                       buf += cur;
+                       res += cur;
+               }
+       }
+       mutex_unlock(&cpuhp_state_mutex);
+       return res;
+}
+static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+
+static struct attribute *cpuhp_cpu_root_attrs[] = {
+       &dev_attr_states.attr,
+       NULL
+};
+
+static struct attribute_group cpuhp_cpu_root_attr_group = {
+       .attrs = cpuhp_cpu_root_attrs,
+       .name = "hotplug",
+       NULL
+};
+
+static int __init cpuhp_sysfs_init(void)
+{
+       int cpu, ret;
+
+       ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+                                &cpuhp_cpu_root_attr_group);
+       if (ret)
+               return ret;
+
+       for_each_possible_cpu(cpu) {
+               struct device *dev = get_cpu_device(cpu);
+
+               if (!dev)
+                       continue;
+               ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+device_initcall(cpuhp_sysfs_init);
+#endif
 
 /*
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -789,3 +1673,25 @@ void init_cpu_online(const struct cpumask *src)
 {
        cpumask_copy(&__cpu_online_mask, src);
 }
+
+/*
+ * Activate the first processor.
+ */
+void __init boot_cpu_init(void)
+{
+       int cpu = smp_processor_id();
+
+       /* Mark the boot cpu "present", "online" etc for SMP and UP case */
+       set_cpu_online(cpu, true);
+       set_cpu_active(cpu, true);
+       set_cpu_present(cpu, true);
+       set_cpu_possible(cpu, true);
+}
+
+/*
+ * Must be called _AFTER_ setting up the per_cpu areas
+ */
+void __init boot_cpu_state_init(void)
+{
+       per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
+}
index 3e945fc..41989ab 100644 (file)
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
 
+struct cpuset_migrate_mm_work {
+       struct work_struct      work;
+       struct mm_struct        *mm;
+       nodemask_t              from;
+       nodemask_t              to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+       struct cpuset_migrate_mm_work *mwork =
+               container_of(work, struct cpuset_migrate_mm_work, work);
+
+       /* on a wq worker, no need to worry about %current's mems_allowed */
+       do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+       mmput(mwork->mm);
+       kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
 {
-       struct task_struct *tsk = current;
-
-       tsk->mems_allowed = *to;
+       struct cpuset_migrate_mm_work *mwork;
 
-       do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+       mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+       if (mwork) {
+               mwork->mm = mm;
+               mwork->from = *from;
+               mwork->to = *to;
+               INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+               queue_work(cpuset_migrate_mm_wq, &mwork->work);
+       } else {
+               mmput(mm);
+       }
+}
 
-       rcu_read_lock();
-       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-       rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+       flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-               mmput(mm);
+               else
+                       mmput(mm);
        }
        css_task_iter_end(&it);
 
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
-                       if (is_memory_migrate(cs)) {
+                       if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
-                       }
-                       mmput(mm);
+                       else
+                               mmput(mm);
                }
        }
 
@@ -1714,6 +1737,7 @@ out_unlock:
        mutex_unlock(&cpuset_mutex);
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
+       flush_workqueue(cpuset_migrate_mm_wq);
        return retval ?: nbytes;
 }
 
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
        top_cpuset.effective_mems = node_states[N_MEMORY];
 
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+       cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+       BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
index e1dbf4a..90ff129 100644 (file)
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
        } else {
                kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
                           __func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
                if (!bp->bp_type) {
                        kdb_printf("Software breakpoints are unavailable.\n"
-                                  "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+                                  "  Boot the kernel with rodata=off\n"
                                   "  OR use hw breaks: help bph\n");
                }
-#endif
                return 1;
        }
        return 0;
index 06ae52e..712570d 100644 (file)
@@ -49,8 +49,6 @@
 
 #include <asm/irq_regs.h>
 
-static struct workqueue_struct *perf_wq;
-
 typedef int (*remote_function_f)(void *);
 
 struct remote_function_call {
@@ -66,8 +64,17 @@ static void remote_function(void *data)
        struct task_struct *p = tfc->p;
 
        if (p) {
-               tfc->ret = -EAGAIN;
-               if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+               /* -EAGAIN */
+               if (task_cpu(p) != smp_processor_id())
+                       return;
+
+               /*
+                * Now that we're on right CPU with IRQs disabled, we can test
+                * if we hit the right task without races.
+                */
+
+               tfc->ret = -ESRCH; /* No such (running) process */
+               if (p != current)
                        return;
        }
 
@@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
                .p      = p,
                .func   = func,
                .info   = info,
-               .ret    = -ESRCH, /* No such (running) process */
+               .ret    = -EAGAIN,
        };
+       int ret;
 
-       if (task_curr(p))
-               smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+       do {
+               ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+               if (!ret)
+                       ret = data.ret;
+       } while (ret == -EAGAIN);
 
-       return data.ret;
+       return ret;
 }
 
 /**
@@ -126,44 +137,170 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
        return data.ret;
 }
 
-static void event_function_call(struct perf_event *event,
-                               int (*active)(void *),
-                               void (*inactive)(void *),
-                               void *data)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+                         struct perf_event_context *ctx)
+{
+       raw_spin_lock(&cpuctx->ctx.lock);
+       if (ctx)
+               raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+                           struct perf_event_context *ctx)
+{
+       if (ctx)
+               raw_spin_unlock(&ctx->lock);
+       raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+       return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ *  - removing the last event from a task ctx; this is relatively straight
+ *    forward and is done in __perf_remove_from_context.
+ *
+ *  - adding the first event to a task ctx; this is tricky because we cannot
+ *    rely on ctx->is_active and therefore cannot use event_function_call().
+ *    See perf_install_in_context().
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+                       struct perf_event_context *, void *);
+
+struct event_function_struct {
+       struct perf_event *event;
+       event_f func;
+       void *data;
+};
+
+static int event_function(void *info)
+{
+       struct event_function_struct *efs = info;
+       struct perf_event *event = efs->event;
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_event_context *task_ctx = cpuctx->task_ctx;
+       int ret = 0;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       perf_ctx_lock(cpuctx, task_ctx);
+       /*
+        * Since we do the IPI call without holding ctx->lock things can have
+        * changed, double check we hit the task we set out to hit.
+        */
+       if (ctx->task) {
+               if (ctx->task != current) {
+                       ret = -ESRCH;
+                       goto unlock;
+               }
+
+               /*
+                * We only use event_function_call() on established contexts,
+                * and event_function() is only ever called when active (or
+                * rather, we'll have bailed in task_function_call() or the
+                * above ctx->task != current test), therefore we must have
+                * ctx->is_active here.
+                */
+               WARN_ON_ONCE(!ctx->is_active);
+               /*
+                * And since we have ctx->is_active, cpuctx->task_ctx must
+                * match.
+                */
+               WARN_ON_ONCE(task_ctx != ctx);
+       } else {
+               WARN_ON_ONCE(&cpuctx->ctx != ctx);
+       }
+
+       efs->func(event, cpuctx, ctx, efs->data);
+unlock:
+       perf_ctx_unlock(cpuctx, task_ctx);
+
+       return ret;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+       struct event_function_struct efs = {
+               .event = event,
+               .func = func,
+               .data = data,
+       };
+
+       int ret = event_function(&efs);
+       WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
-       struct task_struct *task = ctx->task;
+       struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+       struct event_function_struct efs = {
+               .event = event,
+               .func = func,
+               .data = data,
+       };
+
+       if (!event->parent) {
+               /*
+                * If this is a !child event, we must hold ctx::mutex to
+                * stabilize the the event->ctx relation. See
+                * perf_event_ctx_lock().
+                */
+               lockdep_assert_held(&ctx->mutex);
+       }
 
        if (!task) {
-               cpu_function_call(event->cpu, active, data);
+               cpu_function_call(event->cpu, event_function, &efs);
                return;
        }
 
+       if (task == TASK_TOMBSTONE)
+               return;
+
 again:
-       if (!task_function_call(task, active, data))
+       if (!task_function_call(task, event_function, &efs))
                return;
 
        raw_spin_lock_irq(&ctx->lock);
+       /*
+        * Reload the task pointer, it might have been changed by
+        * a concurrent perf_event_context_sched_out().
+        */
+       task = ctx->task;
+       if (task == TASK_TOMBSTONE) {
+               raw_spin_unlock_irq(&ctx->lock);
+               return;
+       }
        if (ctx->is_active) {
-               /*
-                * Reload the task pointer, it might have been changed by
-                * a concurrent perf_event_context_sched_out().
-                */
-               task = ctx->task;
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
-       inactive(data);
+       func(event, NULL, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
 }
 
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
-       return event->owner == EVENT_OWNER_KERNEL;
-}
-
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
@@ -179,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event)
 enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
+       EVENT_TIME = 0x4,
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -186,7 +324,13 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -368,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
        return event->clock();
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
-                         struct perf_event_context *ctx)
-{
-       raw_spin_lock(&cpuctx->ctx.lock);
-       if (ctx)
-               raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
-                           struct perf_event_context *ctx)
-{
-       if (ctx)
-               raw_spin_unlock(&ctx->lock);
-       raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -579,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
         * we are holding the rcu lock
         */
        cgrp1 = perf_cgroup_from_task(task, NULL);
-
-       /*
-        * next is NULL when called from perf_event_enable_on_exec()
-        * that will systematically cause a cgroup_switch()
-        */
-       if (next)
-               cgrp2 = perf_cgroup_from_task(next, NULL);
+       cgrp2 = perf_cgroup_from_task(next, NULL);
 
        /*
         * only schedule out current cgroup events if we know
@@ -611,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
         * we are holding the rcu lock
         */
        cgrp1 = perf_cgroup_from_task(task, NULL);
-
-       /* prev can never be NULL */
        cgrp2 = perf_cgroup_from_task(prev, NULL);
 
        /*
@@ -917,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx)
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
-               if (ctx->task)
+               if (ctx->task && ctx->task != TASK_TOMBSTONE)
                        put_task_struct(ctx->task);
                call_rcu(&ctx->rcu_head, free_ctx);
        }
@@ -934,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx)
  * perf_event_context::mutex nests and those are:
  *
  *  - perf_event_exit_task_context()   [ child , 0 ]
- *      __perf_event_exit_task()
- *        sync_child_event()
- *          put_event()                        [ parent, 1 ]
+ *      perf_event_exit_event()
+ *        put_event()                  [ parent, 1 ]
  *
  *  - perf_event_init_context()                [ parent, 0 ]
  *      inherit_task_group()
@@ -979,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx)
  * Lock order:
  *     task_struct::perf_event_mutex
  *       perf_event_context::mutex
- *         perf_event_context::lock
  *         perf_event::child_mutex;
+ *           perf_event_context::lock
  *         perf_event::mmap_mutex
  *         mmap_sem
  */
@@ -1078,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event)
 
 /*
  * Get the perf_event_context for a task and lock it.
+ *
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
@@ -1118,9 +1232,12 @@ retry:
                        goto retry;
                }
 
-               if (!atomic_inc_not_zero(&ctx->refcount)) {
+               if (ctx->task == TASK_TOMBSTONE ||
+                   !atomic_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
+               } else {
+                       WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
@@ -1180,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        u64 run_end;
 
+       lockdep_assert_held(&ctx->lock);
+
        if (event->state < PERF_EVENT_STATE_INACTIVE ||
            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
                return;
+
        /*
         * in cgroup mode, time_enabled represents
         * the time the event was enabled AND active
@@ -1246,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+       lockdep_assert_held(&ctx->lock);
+
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;
 
@@ -1448,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
        if (is_cgroup_event(event)) {
                ctx->nr_cgroups--;
+               /*
+                * Because cgroup events are always per-cpu events, this will
+                * always be called from the right CPU.
+                */
                cpuctx = __get_cpu_context(ctx);
                /*
-                * if there are no more cgroup events
-                * then cler cgrp to avoid stale pointer
-                * in update_cgrp_time_from_cpuctx()
+                * If there are no more cgroup events then clear cgrp to avoid
+                * stale pointer in update_cgrp_time_from_cpuctx().
                 */
                if (!ctx->nr_cgroups)
                        cpuctx->cgrp = NULL;
@@ -1530,45 +1654,11 @@ out:
                perf_event__header_size(tmp);
 }
 
-/*
- * User event without the task.
- */
 static bool is_orphaned_event(struct perf_event *event)
 {
-       return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
-       return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
-       if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
-               return;
-
-       if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
-               get_ctx(ctx);
-               ctx->orphans_remove_sched = true;
-       }
-}
-
-static int __init perf_workqueue_init(void)
-{
-       perf_wq = create_singlethread_workqueue("perf");
-       WARN(!perf_wq, "failed to create perf workqueue\n");
-       return perf_wq ? 0 : -1;
+       return event->state == PERF_EVENT_STATE_DEAD;
 }
 
-core_initcall(perf_workqueue_init);
-
 static inline int pmu_filter_match(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
@@ -1611,14 +1701,14 @@ event_sched_out(struct perf_event *event,
 
        perf_pmu_disable(event->pmu);
 
+       event->tstamp_stopped = tstamp;
+       event->pmu->del(event, 0);
+       event->oncpu = -1;
        event->state = PERF_EVENT_STATE_INACTIVE;
        if (event->pending_disable) {
                event->pending_disable = 0;
                event->state = PERF_EVENT_STATE_OFF;
        }
-       event->tstamp_stopped = tstamp;
-       event->pmu->del(event, 0);
-       event->oncpu = -1;
 
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
@@ -1629,9 +1719,6 @@ event_sched_out(struct perf_event *event,
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
 
-       if (is_orphaned_child(event))
-               schedule_orphans_remove(ctx);
-
        perf_pmu_enable(event->pmu);
 }
 
@@ -1655,21 +1742,7 @@ group_sched_out(struct perf_event *group_event,
                cpuctx->exclusive = 0;
 }
 
-struct remove_event {
-       struct perf_event *event;
-       bool detach_group;
-};
-
-static void ___perf_remove_from_context(void *info)
-{
-       struct remove_event *re = info;
-       struct perf_event *event = re->event;
-       struct perf_event_context *ctx = event->ctx;
-
-       if (re->detach_group)
-               perf_group_detach(event);
-       list_del_event(event, ctx);
-}
+#define DETACH_GROUP   0x01UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -1677,33 +1750,31 @@ static void ___perf_remove_from_context(void *info)
  * We disable the event on the hardware level first. After that we
  * remove it from the context list.
  */
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event,
+                          struct perf_cpu_context *cpuctx,
+                          struct perf_event_context *ctx,
+                          void *info)
 {
-       struct remove_event *re = info;
-       struct perf_event *event = re->event;
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       unsigned long flags = (unsigned long)info;
 
-       raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
-       if (re->detach_group)
+       if (flags & DETACH_GROUP)
                perf_group_detach(event);
        list_del_event(event, ctx);
-       if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+
+       if (!ctx->nr_events && ctx->is_active) {
                ctx->is_active = 0;
-               cpuctx->task_ctx = NULL;
+               if (ctx->task) {
+                       WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+                       cpuctx->task_ctx = NULL;
+               }
        }
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
 }
 
 /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
- * CPU events are removed with a smp call. For task events we only
- * call when the task is on a CPU.
- *
  * If event->ctx is a cloned context, callers must make sure that
  * every task struct that event->ctx->task could possibly point to
  * remains valid.  This is OK when called from perf_release since
@@ -1711,73 +1782,32 @@ static int __perf_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 {
-       struct perf_event_context *ctx = event->ctx;
-       struct remove_event re = {
-               .event = event,
-               .detach_group = detach_group,
-       };
-
-       lockdep_assert_held(&ctx->mutex);
+       lockdep_assert_held(&event->ctx->mutex);
 
-       event_function_call(event, __perf_remove_from_context,
-                           ___perf_remove_from_context, &re);
+       event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
  * Cross CPU call to disable a performance event
  */
-int __perf_event_disable(void *info)
-{
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
-       /*
-        * If this is a per-task event, need to check whether this
-        * event's task is the current task on this cpu.
-        *
-        * Can trigger due to concurrent perf_event_context_sched_out()
-        * flipping contexts around.
-        */
-       if (ctx->task && cpuctx->task_ctx != ctx)
-               return -EINVAL;
-
-       raw_spin_lock(&ctx->lock);
-
-       /*
-        * If the event is on, turn it off.
-        * If it is in error state, leave it in error state.
-        */
-       if (event->state >= PERF_EVENT_STATE_INACTIVE) {
-               update_context_time(ctx);
-               update_cgrp_time_from_event(event);
-               update_group_times(event);
-               if (event == event->group_leader)
-                       group_sched_out(event, cpuctx, ctx);
-               else
-                       event_sched_out(event, cpuctx, ctx);
-               event->state = PERF_EVENT_STATE_OFF;
-       }
-
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
-}
-
-void ___perf_event_disable(void *info)
+static void __perf_event_disable(struct perf_event *event,
+                                struct perf_cpu_context *cpuctx,
+                                struct perf_event_context *ctx,
+                                void *info)
 {
-       struct perf_event *event = info;
+       if (event->state < PERF_EVENT_STATE_INACTIVE)
+               return;
 
-       /*
-        * Since we have the lock this context can't be scheduled
-        * in, so we can change the state safely.
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE) {
-               update_group_times(event);
-               event->state = PERF_EVENT_STATE_OFF;
-       }
+       update_context_time(ctx);
+       update_cgrp_time_from_event(event);
+       update_group_times(event);
+       if (event == event->group_leader)
+               group_sched_out(event, cpuctx, ctx);
+       else
+               event_sched_out(event, cpuctx, ctx);
+       event->state = PERF_EVENT_STATE_OFF;
 }
 
 /*
@@ -1788,7 +1818,8 @@ void ___perf_event_disable(void *info)
  * remains valid.  This condition is satisifed when called through
  * perf_event_for_each_child or perf_event_for_each because they
  * hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
  * When called from perf_pending_event it's OK because event->ctx
  * is the current context on this CPU and preemption is disabled,
  * hence we can't get into perf_event_task_sched_out for this context.
@@ -1804,8 +1835,12 @@ static void _perf_event_disable(struct perf_event *event)
        }
        raw_spin_unlock_irq(&ctx->lock);
 
-       event_function_call(event, __perf_event_disable,
-                           ___perf_event_disable, event);
+       event_function_call(event, __perf_event_disable, NULL);
+}
+
+void perf_event_disable_local(struct perf_event *event)
+{
+       event_function_local(event, __perf_event_disable, NULL);
 }
 
 /*
@@ -1918,9 +1953,6 @@ event_sched_in(struct perf_event *event,
        if (event->attr.exclusive)
                cpuctx->exclusive = 1;
 
-       if (is_orphaned_child(event))
-               schedule_orphans_remove(ctx);
-
 out:
        perf_pmu_enable(event->pmu);
 
@@ -2039,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
        event->tstamp_stopped = tstamp;
 }
 
-static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+                         struct perf_cpu_context *cpuctx,
+                         enum event_type_t event_type);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task);
 
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+                              struct perf_event_context *ctx)
+{
+       if (!cpuctx->task_ctx)
+               return;
+
+       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+               return;
+
+       ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                struct task_struct *task)
@@ -2058,22 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
-static void ___perf_install_in_context(void *info)
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+                       struct perf_event_context *task_ctx)
 {
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
-
-       /*
-        * Since the task isn't running, its safe to add the event, us holding
-        * the ctx->lock ensures the task won't get scheduled in.
-        */
-       add_event_to_ctx(event, ctx);
+       perf_pmu_disable(cpuctx->ctx.pmu);
+       if (task_ctx)
+               task_ctx_sched_out(cpuctx, task_ctx);
+       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+       perf_event_sched_in(cpuctx, task_ctx, current);
+       perf_pmu_enable(cpuctx->ctx.pmu);
 }
 
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int  __perf_install_in_context(void *info)
 {
@@ -2081,79 +2127,106 @@ static int  __perf_install_in_context(void *info)
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
-       struct task_struct *task = current;
-
-       perf_ctx_lock(cpuctx, task_ctx);
-       perf_pmu_disable(cpuctx->ctx.pmu);
-
-       /*
-        * If there was an active task_ctx schedule it out.
-        */
-       if (task_ctx)
-               task_ctx_sched_out(task_ctx);
+       bool activate = true;
+       int ret = 0;
 
-       /*
-        * If the context we're installing events in is not the
-        * active task_ctx, flip them.
-        */
-       if (ctx->task && task_ctx != ctx) {
-               if (task_ctx)
-                       raw_spin_unlock(&task_ctx->lock);
+       raw_spin_lock(&cpuctx->ctx.lock);
+       if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                task_ctx = ctx;
-       }
 
-       if (task_ctx) {
-               cpuctx->task_ctx = task_ctx;
-               task = task_ctx->task;
-       }
-
-       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+               /* If we're on the wrong CPU, try again */
+               if (task_cpu(ctx->task) != smp_processor_id()) {
+                       ret = -ESRCH;
+                       goto unlock;
+               }
 
-       update_context_time(ctx);
-       /*
-        * update cgrp time only if current cgrp
-        * matches event->cgrp. Must be done before
-        * calling add_event_to_ctx()
-        */
-       update_cgrp_time_from_event(event);
+               /*
+                * If we're on the right CPU, see if the task we target is
+                * current, if not we don't have to activate the ctx, a future
+                * context switch will do that for us.
+                */
+               if (ctx->task != current)
+                       activate = false;
+               else
+                       WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
 
-       add_event_to_ctx(event, ctx);
+       } else if (task_ctx) {
+               raw_spin_lock(&task_ctx->lock);
+       }
 
-       /*
-        * Schedule everything back in
-        */
-       perf_event_sched_in(cpuctx, task_ctx, task);
+       if (activate) {
+               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               add_event_to_ctx(event, ctx);
+               ctx_resched(cpuctx, task_ctx);
+       } else {
+               add_event_to_ctx(event, ctx);
+       }
 
-       perf_pmu_enable(cpuctx->ctx.pmu);
+unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
 
-       return 0;
+       return ret;
 }
 
 /*
- * Attach a performance event to a context
- *
- * First we add the event to the list with the hardware enable bit
- * in event->hw_config cleared.
+ * Attach a performance event to a context.
  *
- * If the event is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
 {
+       struct task_struct *task = READ_ONCE(ctx->task);
+
        lockdep_assert_held(&ctx->mutex);
 
        event->ctx = ctx;
        if (event->cpu != -1)
                event->cpu = cpu;
 
-       event_function_call(event, __perf_install_in_context,
-                           ___perf_install_in_context, event);
+       if (!task) {
+               cpu_function_call(cpu, __perf_install_in_context, event);
+               return;
+       }
+
+       /*
+        * Should not happen, we validate the ctx is still alive before calling.
+        */
+       if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+               return;
+
+       /*
+        * Installing events is tricky because we cannot rely on ctx->is_active
+        * to be set in case this is the nr_events 0 -> 1 transition.
+        */
+again:
+       /*
+        * Cannot use task_function_call() because we need to run on the task's
+        * CPU regardless of whether its current or not.
+        */
+       if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+               return;
+
+       raw_spin_lock_irq(&ctx->lock);
+       task = ctx->task;
+       if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+               /*
+                * Cannot happen because we already checked above (which also
+                * cannot happen), and we hold ctx->mutex, which serializes us
+                * against perf_event_exit_task_context().
+                */
+               raw_spin_unlock_irq(&ctx->lock);
+               return;
+       }
+       raw_spin_unlock_irq(&ctx->lock);
+       /*
+        * Since !ctx->is_active doesn't mean anything, we must IPI
+        * unconditionally.
+        */
+       goto again;
 }
 
 /*
@@ -2180,85 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event)
 /*
  * Cross CPU call to enable a performance event
  */
-static int __perf_event_enable(void *info)
+static void __perf_event_enable(struct perf_event *event,
+                               struct perf_cpu_context *cpuctx,
+                               struct perf_event_context *ctx,
+                               void *info)
 {
-       struct perf_event *event = info;
-       struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-       int err;
+       struct perf_event_context *task_ctx;
 
-       /*
-        * There's a time window between 'ctx->is_active' check
-        * in perf_event_enable function and this place having:
-        *   - IRQs on
-        *   - ctx->lock unlocked
-        *
-        * where the task could be killed and 'ctx' deactivated
-        * by perf_event_exit_task.
-        */
-       if (!ctx->is_active)
-               return -EINVAL;
-
-       raw_spin_lock(&ctx->lock);
-       update_context_time(ctx);
-
-       if (event->state >= PERF_EVENT_STATE_INACTIVE)
-               goto unlock;
+       if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+           event->state <= PERF_EVENT_STATE_ERROR)
+               return;
 
-       /*
-        * set current task's cgroup time reference point
-        */
-       perf_cgroup_set_timestamp(current, ctx);
+       if (ctx->is_active)
+               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
        __perf_event_mark_enabled(event);
 
+       if (!ctx->is_active)
+               return;
+
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
                        perf_cgroup_defer_enabled(event);
-               goto unlock;
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+               return;
        }
 
        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
-       if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
-               goto unlock;
-
-       if (!group_can_go_on(event, cpuctx, 1)) {
-               err = -EEXIST;
-       } else {
-               if (event == leader)
-                       err = group_sched_in(event, cpuctx, ctx);
-               else
-                       err = event_sched_in(event, cpuctx, ctx);
-       }
-
-       if (err) {
-               /*
-                * If this event can't go on and it's part of a
-                * group, then the whole group has to come off.
-                */
-               if (leader != event) {
-                       group_sched_out(leader, cpuctx, ctx);
-                       perf_mux_hrtimer_restart(cpuctx);
-               }
-               if (leader->attr.pinned) {
-                       update_group_times(leader);
-                       leader->state = PERF_EVENT_STATE_ERROR;
-               }
+       if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+               return;
        }
 
-unlock:
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
-}
+       task_ctx = cpuctx->task_ctx;
+       if (ctx->task)
+               WARN_ON_ONCE(task_ctx != ctx);
 
-void ___perf_event_enable(void *info)
-{
-       __perf_event_mark_enabled((struct perf_event *)info);
+       ctx_resched(cpuctx, task_ctx);
 }
 
 /*
@@ -2275,7 +2310,8 @@ static void _perf_event_enable(struct perf_event *event)
        struct perf_event_context *ctx = event->ctx;
 
        raw_spin_lock_irq(&ctx->lock);
-       if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+       if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+           event->state <  PERF_EVENT_STATE_ERROR) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
@@ -2291,8 +2327,7 @@ static void _perf_event_enable(struct perf_event *event)
                event->state = PERF_EVENT_STATE_OFF;
        raw_spin_unlock_irq(&ctx->lock);
 
-       event_function_call(event, __perf_event_enable,
-                           ___perf_event_enable, event);
+       event_function_call(event, __perf_event_enable, NULL);
 }
 
 /*
@@ -2342,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
 {
-       struct perf_event *event;
        int is_active = ctx->is_active;
+       struct perf_event *event;
 
-       ctx->is_active &= ~event_type;
-       if (likely(!ctx->nr_events))
+       lockdep_assert_held(&ctx->lock);
+
+       if (likely(!ctx->nr_events)) {
+               /*
+                * See __perf_remove_from_context().
+                */
+               WARN_ON_ONCE(ctx->is_active);
+               if (ctx->task)
+                       WARN_ON_ONCE(cpuctx->task_ctx);
                return;
+       }
 
-       update_context_time(ctx);
-       update_cgrp_time_from_cpuctx(cpuctx);
-       if (!ctx->nr_active)
+       ctx->is_active &= ~event_type;
+       if (!(ctx->is_active & EVENT_ALL))
+               ctx->is_active = 0;
+
+       if (ctx->task) {
+               WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+               if (!ctx->is_active)
+                       cpuctx->task_ctx = NULL;
+       }
+
+       is_active ^= ctx->is_active; /* changed bits */
+
+       if (is_active & EVENT_TIME) {
+               /* update (and stop) ctx time */
+               update_context_time(ctx);
+               update_cgrp_time_from_cpuctx(cpuctx);
+       }
+
+       if (!ctx->nr_active || !(is_active & EVENT_ALL))
                return;
 
        perf_pmu_disable(ctx->pmu);
-       if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+       if (is_active & EVENT_PINNED) {
                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
        }
 
-       if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+       if (is_active & EVENT_FLEXIBLE) {
                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
        }
@@ -2518,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
-                       /*
-                        * XXX do we need a memory barrier of sorts
-                        * wrt to rcu_dereference() of perf_event_ctxp
-                        */
-                       task->perf_event_ctxp[ctxn] = next_ctx;
-                       next->perf_event_ctxp[ctxn] = ctx;
-                       ctx->task = next;
-                       next_ctx->task = task;
+                       WRITE_ONCE(ctx->task, next);
+                       WRITE_ONCE(next_ctx->task, task);
 
                        swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
 
+                       /*
+                        * RCU_INIT_POINTER here is safe because we've not
+                        * modified the ctx and the above modification of
+                        * ctx->task and ctx->task_ctx_data are immaterial
+                        * since those values are always verified under
+                        * ctx->lock which we're now holding.
+                        */
+                       RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+                       RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
                        do_switch = 0;
 
                        perf_event_sync_stat(ctx, next_ctx);
@@ -2541,8 +2604,7 @@ unlock:
 
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
-               ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-               cpuctx->task_ctx = NULL;
+               task_ctx_sched_out(cpuctx, ctx);
                raw_spin_unlock(&ctx->lock);
        }
 }
@@ -2637,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
                perf_cgroup_sched_out(task, next);
 }
 
-static void task_ctx_sched_out(struct perf_event_context *ctx)
-{
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
-       if (!cpuctx->task_ctx)
-               return;
-
-       if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-               return;
-
-       ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-       cpuctx->task_ctx = NULL;
-}
-
 /*
  * Called with IRQs disabled
  */
@@ -2725,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx,
             enum event_type_t event_type,
             struct task_struct *task)
 {
-       u64 now;
        int is_active = ctx->is_active;
+       u64 now;
+
+       lockdep_assert_held(&ctx->lock);
 
-       ctx->is_active |= event_type;
        if (likely(!ctx->nr_events))
                return;
 
-       now = perf_clock();
-       ctx->timestamp = now;
-       perf_cgroup_set_timestamp(task, ctx);
+       ctx->is_active |= (event_type | EVENT_TIME);
+       if (ctx->task) {
+               if (!is_active)
+                       cpuctx->task_ctx = ctx;
+               else
+                       WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+       }
+
+       is_active ^= ctx->is_active; /* changed bits */
+
+       if (is_active & EVENT_TIME) {
+               /* start ctx time */
+               now = perf_clock();
+               ctx->timestamp = now;
+               perf_cgroup_set_timestamp(task, ctx);
+       }
+
        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
-       if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+       if (is_active & EVENT_PINNED)
                ctx_pinned_sched_in(ctx, cpuctx);
 
        /* Then walk through the lower prio flexible groups */
-       if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+       if (is_active & EVENT_FLEXIBLE)
                ctx_flexible_sched_in(ctx, cpuctx);
 }
 
@@ -2773,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
         * cpu flexible, task flexible.
         */
        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-
-       if (ctx->nr_events)
-               cpuctx->task_ctx = ctx;
-
-       perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
-
+       perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
        perf_ctx_unlock(cpuctx, ctx);
 }
@@ -2800,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev,
        struct perf_event_context *ctx;
        int ctxn;
 
+       /*
+        * If cgroup events exist on this CPU, then we need to check if we have
+        * to switch in PMU state; cgroup event are system-wide mode only.
+        *
+        * Since cgroup events are CPU events, we must schedule these in before
+        * we schedule in the task events.
+        */
+       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+               perf_cgroup_sched_in(prev, task);
+
        for_each_task_context_nr(ctxn) {
                ctx = task->perf_event_ctxp[ctxn];
                if (likely(!ctx))
@@ -2807,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
                perf_event_context_sched_in(ctx, task);
        }
-       /*
-        * if cgroup events exist on this CPU, then we need
-        * to check if we have to switch in PMU state.
-        * cgroup event are system-wide mode only
-        */
-       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-               perf_cgroup_sched_in(prev, task);
 
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);
@@ -3051,17 +3112,6 @@ done:
        return rotate;
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-bool perf_event_can_stop_tick(void)
-{
-       if (atomic_read(&nr_freq_events) ||
-           __this_cpu_read(perf_throttled_count))
-               return false;
-       else
-               return true;
-}
-#endif
-
 void perf_event_task_tick(void)
 {
        struct list_head *head = this_cpu_ptr(&active_ctx_list);
@@ -3072,6 +3122,7 @@ void perf_event_task_tick(void)
 
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
+       tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
 
        list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, throttled);
@@ -3099,46 +3150,31 @@ static int event_enable_on_exec(struct perf_event *event,
 static void perf_event_enable_on_exec(int ctxn)
 {
        struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
-       int ret;
 
        local_irq_save(flags);
        ctx = current->perf_event_ctxp[ctxn];
        if (!ctx || !ctx->nr_events)
                goto out;
 
-       /*
-        * We must ctxsw out cgroup events to avoid conflict
-        * when invoking perf_task_event_sched_in() later on
-        * in this function. Otherwise we end up trying to
-        * ctxswin cgroup events which are already scheduled
-        * in.
-        */
-       perf_cgroup_sched_out(current, NULL);
-
-       raw_spin_lock(&ctx->lock);
-       task_ctx_sched_out(ctx);
-
-       list_for_each_entry(event, &ctx->event_list, event_entry) {
-               ret = event_enable_on_exec(event, ctx);
-               if (ret)
-                       enabled = 1;
-       }
+       cpuctx = __get_cpu_context(ctx);
+       perf_ctx_lock(cpuctx, ctx);
+       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       list_for_each_entry(event, &ctx->event_list, event_entry)
+               enabled |= event_enable_on_exec(event, ctx);
 
        /*
-        * Unclone this context if we enabled any event.
+        * Unclone and reschedule this context if we enabled any event.
         */
-       if (enabled)
+       if (enabled) {
                clone_ctx = unclone_ctx(ctx);
+               ctx_resched(cpuctx, ctx);
+       }
+       perf_ctx_unlock(cpuctx, ctx);
 
-       raw_spin_unlock(&ctx->lock);
-
-       /*
-        * Also calls ctxswin for cgroup events, if any:
-        */
-       perf_event_context_sched_in(ctx, ctx->task);
 out:
        local_irq_restore(flags);
 
@@ -3334,7 +3370,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
-       INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
 }
 
 static struct perf_event_context *
@@ -3519,13 +3554,37 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+static DEFINE_SPINLOCK(nr_freq_lock);
+#endif
+
+static void unaccount_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       spin_lock(&nr_freq_lock);
+       if (atomic_dec_and_test(&nr_freq_events))
+               tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void unaccount_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               unaccount_freq_event_nohz();
+       else
+               atomic_dec(&nr_freq_events);
+}
+
 static void unaccount_event(struct perf_event *event)
 {
+       bool dec = false;
+
        if (event->parent)
                return;
 
        if (event->attach_state & PERF_ATTACH_TASK)
-               static_key_slow_dec_deferred(&perf_sched_events);
+               dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->attr.comm)
@@ -3533,19 +3592,32 @@ static void unaccount_event(struct perf_event *event)
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
-               atomic_dec(&nr_freq_events);
+               unaccount_freq_event();
        if (event->attr.context_switch) {
-               static_key_slow_dec_deferred(&perf_sched_events);
+               dec = true;
                atomic_dec(&nr_switch_events);
        }
        if (is_cgroup_event(event))
-               static_key_slow_dec_deferred(&perf_sched_events);
+               dec = true;
        if (has_branch_stack(event))
-               static_key_slow_dec_deferred(&perf_sched_events);
+               dec = true;
+
+       if (dec) {
+               if (!atomic_add_unless(&perf_sched_count, -1, 1))
+                       schedule_delayed_work(&perf_sched_work, HZ);
+       }
 
        unaccount_event_cpu(event, event->cpu);
 }
 
+static void perf_sched_delayed(struct work_struct *work)
+{
+       mutex_lock(&perf_sched_mutex);
+       if (atomic_dec_and_test(&perf_sched_count))
+               static_branch_disable(&perf_sched_events);
+       mutex_unlock(&perf_sched_mutex);
+}
+
 /*
  * The following implement mutual exclusion of events on "exclusive" pmus
  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3556,7 +3628,7 @@ static void unaccount_event(struct perf_event *event)
  *  3) two matching events on the same context.
  *
  * The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
  */
 static int exclusive_event_init(struct perf_event *event)
 {
@@ -3618,40 +3690,17 @@ static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
 {
        struct perf_event *iter_event;
-       struct pmu *pmu = event->pmu;
-
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
-               return true;
-
-       list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
-               if (exclusive_event_match(iter_event, event))
-                       return false;
-       }
-
-       return true;
-}
-
-static void __free_event(struct perf_event *event)
-{
-       if (!event->parent) {
-               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
-                       put_callchain_buffers();
-       }
-
-       perf_event_free_bpf_prog(event);
-
-       if (event->destroy)
-               event->destroy(event);
+       struct pmu *pmu = event->pmu;
 
-       if (event->ctx)
-               put_ctx(event->ctx);
+       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+               return true;
 
-       if (event->pmu) {
-               exclusive_event_destroy(event);
-               module_put(event->pmu->module);
+       list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+               if (exclusive_event_match(iter_event, event))
+                       return false;
        }
 
-       call_rcu(&event->rcu_head, free_event_rcu);
+       return true;
 }
 
 static void _free_event(struct perf_event *event)
@@ -3675,7 +3724,25 @@ static void _free_event(struct perf_event *event)
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
 
-       __free_event(event);
+       if (!event->parent) {
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                       put_callchain_buffers();
+       }
+
+       perf_event_free_bpf_prog(event);
+
+       if (event->destroy)
+               event->destroy(event);
+
+       if (event->ctx)
+               put_ctx(event->ctx);
+
+       if (event->pmu) {
+               exclusive_event_destroy(event);
+               module_put(event->pmu->module);
+       }
+
+       call_rcu(&event->rcu_head, free_event_rcu);
 }
 
 /*
@@ -3702,14 +3769,13 @@ static void perf_remove_from_owner(struct perf_event *event)
        struct task_struct *owner;
 
        rcu_read_lock();
-       owner = ACCESS_ONCE(event->owner);
        /*
-        * Matches the smp_wmb() in perf_event_exit_task(). If we observe
-        * !owner it means the list deletion is complete and we can indeed
-        * free this event, otherwise we need to serialize on
+        * Matches the smp_store_release() in perf_event_exit_task(). If we
+        * observe !owner it means the list deletion is complete and we can
+        * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
-       smp_read_barrier_depends();
+       owner = lockless_dereference(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
@@ -3737,8 +3803,10 @@ static void perf_remove_from_owner(struct perf_event *event)
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
-               if (event->owner)
+               if (event->owner) {
                        list_del_init(&event->owner_entry);
+                       smp_store_release(&event->owner, NULL);
+               }
                mutex_unlock(&owner->perf_event_mutex);
                put_task_struct(owner);
        }
@@ -3746,37 +3814,111 @@ static void perf_remove_from_owner(struct perf_event *event)
 
 static void put_event(struct perf_event *event)
 {
-       struct perf_event_context *ctx;
-
        if (!atomic_long_dec_and_test(&event->refcount))
                return;
 
+       _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_event *child, *tmp;
+
+       /*
+        * If we got here through err_file: fput(event_file); we will not have
+        * attached to a context yet.
+        */
+       if (!ctx) {
+               WARN_ON_ONCE(event->attach_state &
+                               (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+               goto no_ctx;
+       }
+
        if (!is_kernel_event(event))
                perf_remove_from_owner(event);
 
+       ctx = perf_event_ctx_lock(event);
+       WARN_ON_ONCE(ctx->parent_ctx);
+       perf_remove_from_context(event, DETACH_GROUP);
+
+       raw_spin_lock_irq(&ctx->lock);
        /*
-        * There are two ways this annotation is useful:
+        * Mark this even as STATE_DEAD, there is no external reference to it
+        * anymore.
         *
-        *  1) there is a lock recursion from perf_event_exit_task
-        *     see the comment there.
+        * Anybody acquiring event->child_mutex after the below loop _must_
+        * also see this, most importantly inherit_event() which will avoid
+        * placing more children on the list.
         *
-        *  2) there is a lock-inversion with mmap_sem through
-        *     perf_read_group(), which takes faults while
-        *     holding ctx->mutex, however this is called after
-        *     the last filedesc died, so there is no possibility
-        *     to trigger the AB-BA case.
+        * Thus this guarantees that we will in fact observe and kill _ALL_
+        * child events.
         */
-       ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
-       WARN_ON_ONCE(ctx->parent_ctx);
-       perf_remove_from_context(event, true);
+       event->state = PERF_EVENT_STATE_DEAD;
+       raw_spin_unlock_irq(&ctx->lock);
+
        perf_event_ctx_unlock(event, ctx);
 
-       _free_event(event);
-}
+again:
+       mutex_lock(&event->child_mutex);
+       list_for_each_entry(child, &event->child_list, child_list) {
 
-int perf_event_release_kernel(struct perf_event *event)
-{
-       put_event(event);
+               /*
+                * Cannot change, child events are not migrated, see the
+                * comment with perf_event_ctx_lock_nested().
+                */
+               ctx = lockless_dereference(child->ctx);
+               /*
+                * Since child_mutex nests inside ctx::mutex, we must jump
+                * through hoops. We start by grabbing a reference on the ctx.
+                *
+                * Since the event cannot get freed while we hold the
+                * child_mutex, the context must also exist and have a !0
+                * reference count.
+                */
+               get_ctx(ctx);
+
+               /*
+                * Now that we have a ctx ref, we can drop child_mutex, and
+                * acquire ctx::mutex without fear of it going away. Then we
+                * can re-acquire child_mutex.
+                */
+               mutex_unlock(&event->child_mutex);
+               mutex_lock(&ctx->mutex);
+               mutex_lock(&event->child_mutex);
+
+               /*
+                * Now that we hold ctx::mutex and child_mutex, revalidate our
+                * state, if child is still the first entry, it didn't get freed
+                * and we can continue doing so.
+                */
+               tmp = list_first_entry_or_null(&event->child_list,
+                                              struct perf_event, child_list);
+               if (tmp == child) {
+                       perf_remove_from_context(child, DETACH_GROUP);
+                       list_del(&child->child_list);
+                       free_event(child);
+                       /*
+                        * This matches the refcount bump in inherit_event();
+                        * this can't be the last reference.
+                        */
+                       put_event(event);
+               }
+
+               mutex_unlock(&event->child_mutex);
+               mutex_unlock(&ctx->mutex);
+               put_ctx(ctx);
+               goto again;
+       }
+       mutex_unlock(&event->child_mutex);
+
+no_ctx:
+       put_event(event); /* Must be the 'last' reference */
        return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -3786,46 +3928,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
  */
 static int perf_release(struct inode *inode, struct file *file)
 {
-       put_event(file->private_data);
+       perf_event_release_kernel(file->private_data);
        return 0;
 }
 
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
-       struct perf_event_context *ctx;
-       struct perf_event *event, *tmp;
-
-       ctx = container_of(work, struct perf_event_context,
-                          orphans_remove.work);
-
-       mutex_lock(&ctx->mutex);
-       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
-               struct perf_event *parent_event = event->parent;
-
-               if (!is_orphaned_child(event))
-                       continue;
-
-               perf_remove_from_context(event, true);
-
-               mutex_lock(&parent_event->child_mutex);
-               list_del_init(&event->child_list);
-               mutex_unlock(&parent_event->child_mutex);
-
-               free_event(event);
-               put_event(parent_event);
-       }
-
-       raw_spin_lock_irq(&ctx->lock);
-       ctx->orphans_remove_sched = false;
-       raw_spin_unlock_irq(&ctx->lock);
-       mutex_unlock(&ctx->mutex);
-
-       put_ctx(ctx);
-}
-
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
        struct perf_event *child;
@@ -3969,7 +4075,7 @@ static bool is_event_hup(struct perf_event *event)
 {
        bool no_children;
 
-       if (event->state != PERF_EVENT_STATE_EXIT)
+       if (event->state > PERF_EVENT_STATE_EXIT)
                return false;
 
        mutex_lock(&event->child_mutex);
@@ -4054,7 +4160,7 @@ static void _perf_event_reset(struct perf_event *event)
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
  * task existence requirements of perf_event_enable/disable.
  */
 static void perf_event_for_each_child(struct perf_event *event,
@@ -4086,36 +4192,14 @@ static void perf_event_for_each(struct perf_event *event,
                perf_event_for_each_child(sibling, func);
 }
 
-struct period_event {
-       struct perf_event *event;
-       u64 value;
-};
-
-static void ___perf_event_period(void *info)
-{
-       struct period_event *pe = info;
-       struct perf_event *event = pe->event;
-       u64 value = pe->value;
-
-       if (event->attr.freq) {
-               event->attr.sample_freq = value;
-       } else {
-               event->attr.sample_period = value;
-               event->hw.sample_period = value;
-       }
-
-       local64_set(&event->hw.period_left, 0);
-}
-
-static int __perf_event_period(void *info)
+static void __perf_event_period(struct perf_event *event,
+                               struct perf_cpu_context *cpuctx,
+                               struct perf_event_context *ctx,
+                               void *info)
 {
-       struct period_event *pe = info;
-       struct perf_event *event = pe->event;
-       struct perf_event_context *ctx = event->ctx;
-       u64 value = pe->value;
+       u64 value = *((u64 *)info);
        bool active;
 
-       raw_spin_lock(&ctx->lock);
        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
@@ -4135,14 +4219,10 @@ static int __perf_event_period(void *info)
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
-       raw_spin_unlock(&ctx->lock);
-
-       return 0;
 }
 
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
-       struct period_event pe = { .event = event, };
        u64 value;
 
        if (!is_sampling_event(event))
@@ -4157,10 +4237,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
                return -EINVAL;
 
-       pe.value = value;
-
-       event_function_call(event, __perf_event_period,
-                           ___perf_event_period, &pe);
+       event_function_call(event, __perf_event_period, &value);
 
        return 0;
 }
@@ -4932,7 +5009,7 @@ static void perf_pending_event(struct irq_work *entry)
 
        if (event->pending_disable) {
                event->pending_disable = 0;
-               __perf_event_disable(event);
+               perf_event_disable_local(event);
        }
 
        if (event->pending_wakeup) {
@@ -6359,9 +6436,9 @@ static int __perf_event_overflow(struct perf_event *event,
                if (unlikely(throttle
                             && hwc->interrupts >= max_samples_per_tick)) {
                        __this_cpu_inc(perf_throttled_count);
+                       tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
-                       tick_nohz_full_kick();
                        ret = 1;
                }
        }
@@ -6720,7 +6797,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
        kfree_rcu(hlist, rcu_head);
 }
 
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+static void swevent_hlist_put_cpu(int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -6732,15 +6809,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-static void swevent_hlist_put(struct perf_event *event)
+static void swevent_hlist_put(void)
 {
        int cpu;
 
        for_each_possible_cpu(cpu)
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
 }
 
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+static int swevent_hlist_get_cpu(int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        int err = 0;
@@ -6763,14 +6840,13 @@ exit:
        return err;
 }
 
-static int swevent_hlist_get(struct perf_event *event)
+static int swevent_hlist_get(void)
 {
-       int err;
-       int cpu, failed_cpu;
+       int err, cpu, failed_cpu;
 
        get_online_cpus();
        for_each_possible_cpu(cpu) {
-               err = swevent_hlist_get_cpu(event, cpu);
+               err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        goto fail;
@@ -6783,7 +6859,7 @@ fail:
        for_each_possible_cpu(cpu) {
                if (cpu == failed_cpu)
                        break;
-               swevent_hlist_put_cpu(event, cpu);
+               swevent_hlist_put_cpu(cpu);
        }
 
        put_online_cpus();
@@ -6799,7 +6875,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
        WARN_ON(event->parent);
 
        static_key_slow_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+       swevent_hlist_put();
 }
 
 static int perf_swevent_init(struct perf_event *event)
@@ -6830,7 +6906,7 @@ static int perf_swevent_init(struct perf_event *event)
        if (!event->parent) {
                int err;
 
-               err = swevent_hlist_get(event);
+               err = swevent_hlist_get();
                if (err)
                        return err;
 
@@ -7751,31 +7827,75 @@ static void account_event_cpu(struct perf_event *event, int cpu)
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
 
+/* Freq events need the tick to stay alive (see perf_event_task_tick). */
+static void account_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+       /* Lock so we don't race with concurrent unaccount */
+       spin_lock(&nr_freq_lock);
+       if (atomic_inc_return(&nr_freq_events) == 1)
+               tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
+       spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void account_freq_event(void)
+{
+       if (tick_nohz_full_enabled())
+               account_freq_event_nohz();
+       else
+               atomic_inc(&nr_freq_events);
+}
+
+
 static void account_event(struct perf_event *event)
 {
+       bool inc = false;
+
        if (event->parent)
                return;
 
        if (event->attach_state & PERF_ATTACH_TASK)
-               static_key_slow_inc(&perf_sched_events.key);
+               inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
-       if (event->attr.freq) {
-               if (atomic_inc_return(&nr_freq_events) == 1)
-                       tick_nohz_full_kick_all();
-       }
+       if (event->attr.freq)
+               account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
-               static_key_slow_inc(&perf_sched_events.key);
+               inc = true;
        }
        if (has_branch_stack(event))
-               static_key_slow_inc(&perf_sched_events.key);
+               inc = true;
        if (is_cgroup_event(event))
-               static_key_slow_inc(&perf_sched_events.key);
+               inc = true;
+
+       if (inc) {
+               if (atomic_inc_not_zero(&perf_sched_count))
+                       goto enabled;
+
+               mutex_lock(&perf_sched_mutex);
+               if (!atomic_read(&perf_sched_count)) {
+                       static_branch_enable(&perf_sched_events);
+                       /*
+                        * Guarantee that all CPUs observe they key change and
+                        * call the perf scheduling hooks before proceeding to
+                        * install events that need them.
+                        */
+                       synchronize_sched();
+               }
+               /*
+                * Now that we have waited for the sync_sched(), allow further
+                * increments to by-pass the mutex.
+                */
+               atomic_inc(&perf_sched_count);
+               mutex_unlock(&perf_sched_mutex);
+       }
+enabled:
 
        account_event_cpu(event, event->cpu);
 }
@@ -7911,6 +8031,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                }
        }
 
+       /* symmetric to unaccount_event() in _free_event() */
+       account_event(event);
+
        return event;
 
 err_per_task:
@@ -8274,8 +8397,6 @@ SYSCALL_DEFINE5(perf_event_open,
                }
        }
 
-       account_event(event);
-
        /*
         * Special case software events and allow them to be part of
         * any hardware group.
@@ -8394,10 +8515,19 @@ SYSCALL_DEFINE5(perf_event_open,
        if (move_group) {
                gctx = group_leader->ctx;
                mutex_lock_double(&gctx->mutex, &ctx->mutex);
+               if (gctx->task == TASK_TOMBSTONE) {
+                       err = -ESRCH;
+                       goto err_locked;
+               }
        } else {
                mutex_lock(&ctx->mutex);
        }
 
+       if (ctx->task == TASK_TOMBSTONE) {
+               err = -ESRCH;
+               goto err_locked;
+       }
+
        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_locked;
@@ -8422,11 +8552,11 @@ SYSCALL_DEFINE5(perf_event_open,
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
-               perf_remove_from_context(group_leader, false);
+               perf_remove_from_context(group_leader, 0);
 
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                       perf_remove_from_context(sibling, false);
+                       perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }
 
@@ -8479,6 +8609,8 @@ SYSCALL_DEFINE5(perf_event_open,
        perf_event__header_size(event);
        perf_event__id_header_size(event);
 
+       event->owner = current;
+
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
 
@@ -8488,8 +8620,6 @@ SYSCALL_DEFINE5(perf_event_open,
 
        put_online_cpus();
 
-       event->owner = current;
-
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
@@ -8514,7 +8644,12 @@ err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
 err_alloc:
-       free_event(event);
+       /*
+        * If event_file is set, the fput() above will have called ->release()
+        * and that will take care of freeing the event.
+        */
+       if (!event_file)
+               free_event(event);
 err_cpus:
        put_online_cpus();
 err_task:
@@ -8556,9 +8691,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        }
 
        /* Mark owner so we could distinguish it from user events. */
-       event->owner = EVENT_OWNER_KERNEL;
-
-       account_event(event);
+       event->owner = TASK_TOMBSTONE;
 
        ctx = find_get_context(event->pmu, task, event);
        if (IS_ERR(ctx)) {
@@ -8568,12 +8701,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
+       if (ctx->task == TASK_TOMBSTONE) {
+               err = -ESRCH;
+               goto err_unlock;
+       }
+
        if (!exclusive_event_installable(event, ctx)) {
-               mutex_unlock(&ctx->mutex);
-               perf_unpin_context(ctx);
-               put_ctx(ctx);
                err = -EBUSY;
-               goto err_free;
+               goto err_unlock;
        }
 
        perf_install_in_context(ctx, event, cpu);
@@ -8582,6 +8717,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
        return event;
 
+err_unlock:
+       mutex_unlock(&ctx->mutex);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
 err_free:
        free_event(event);
 err:
@@ -8606,7 +8745,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
-               perf_remove_from_context(event, false);
+               perf_remove_from_context(event, 0);
                unaccount_event_cpu(event, src_cpu);
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
@@ -8673,33 +8812,15 @@ static void sync_child_event(struct perf_event *child_event,
                     &parent_event->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent_event->child_total_time_running);
-
-       /*
-        * Remove this event from the parent's list
-        */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
-       list_del_init(&child_event->child_list);
-       mutex_unlock(&parent_event->child_mutex);
-
-       /*
-        * Make sure user/parent get notified, that we just
-        * lost one event.
-        */
-       perf_event_wakeup(parent_event);
-
-       /*
-        * Release the parent event, if this was the last
-        * reference to it.
-        */
-       put_event(parent_event);
 }
 
 static void
-__perf_event_exit_task(struct perf_event *child_event,
-                        struct perf_event_context *child_ctx,
-                        struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+                     struct perf_event_context *child_ctx,
+                     struct task_struct *child)
 {
+       struct perf_event *parent_event = child_event->parent;
+
        /*
         * Do not destroy the 'original' grouping; because of the context
         * switch optimization the original events could've ended up in a
@@ -8712,57 +8833,86 @@ __perf_event_exit_task(struct perf_event *child_event,
         * Do destroy all inherited groups, we don't care about those
         * and being thorough is better.
         */
-       perf_remove_from_context(child_event, !!child_event->parent);
+       raw_spin_lock_irq(&child_ctx->lock);
+       WARN_ON_ONCE(child_ctx->is_active);
+
+       if (parent_event)
+               perf_group_detach(child_event);
+       list_del_event(child_event, child_ctx);
+       child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+       raw_spin_unlock_irq(&child_ctx->lock);
 
        /*
-        * It can happen that the parent exits first, and has events
-        * that are still around due to the child reference. These
-        * events need to be zapped.
+        * Parent events are governed by their filedesc, retain them.
         */
-       if (child_event->parent) {
-               sync_child_event(child_event, child);
-               free_event(child_event);
-       } else {
-               child_event->state = PERF_EVENT_STATE_EXIT;
+       if (!parent_event) {
                perf_event_wakeup(child_event);
+               return;
        }
+       /*
+        * Child events can be cleaned up.
+        */
+
+       sync_child_event(child_event, child);
+
+       /*
+        * Remove this event from the parent's list
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_del_init(&child_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       /*
+        * Kick perf_poll() for is_event_hup().
+        */
+       perf_event_wakeup(parent_event);
+       free_event(child_event);
+       put_event(parent_event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
-       struct perf_event *child_event, *next;
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
-       unsigned long flags;
+       struct perf_event *child_event, *next;
 
-       if (likely(!child->perf_event_ctxp[ctxn]))
+       WARN_ON_ONCE(child != current);
+
+       child_ctx = perf_pin_task_context(child, ctxn);
+       if (!child_ctx)
                return;
 
-       local_irq_save(flags);
        /*
-        * We can't reschedule here because interrupts are disabled,
-        * and either child is current or it is a task that can't be
-        * scheduled, so we are now safe from rescheduling changing
-        * our context.
+        * In order to reduce the amount of tricky in ctx tear-down, we hold
+        * ctx::mutex over the entire thing. This serializes against almost
+        * everything that wants to access the ctx.
+        *
+        * The exception is sys_perf_event_open() /
+        * perf_event_create_kernel_count() which does find_get_context()
+        * without ctx::mutex (it cannot because of the move_group double mutex
+        * lock thing). See the comments in perf_install_in_context().
         */
-       child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+       mutex_lock(&child_ctx->mutex);
 
        /*
-        * Take the context lock here so that if find_get_context is
-        * reading child->perf_event_ctxp, we wait until it has
-        * incremented the context's refcount before we do put_ctx below.
+        * In a single ctx::lock section, de-schedule the events and detach the
+        * context from the task such that we cannot ever get it scheduled back
+        * in.
         */
-       raw_spin_lock(&child_ctx->lock);
-       task_ctx_sched_out(child_ctx);
-       child->perf_event_ctxp[ctxn] = NULL;
+       raw_spin_lock_irq(&child_ctx->lock);
+       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
 
        /*
-        * If this context is a clone; unclone it so it can't get
-        * swapped to another process while we're removing all
-        * the events from it.
+        * Now that the context is inactive, destroy the task <-> ctx relation
+        * and mark the context dead.
         */
+       RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+       put_ctx(child_ctx); /* cannot be last */
+       WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+       put_task_struct(current); /* cannot be last */
+
        clone_ctx = unclone_ctx(child_ctx);
-       update_context_time(child_ctx);
-       raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+       raw_spin_unlock_irq(&child_ctx->lock);
 
        if (clone_ctx)
                put_ctx(clone_ctx);
@@ -8774,20 +8924,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
         */
        perf_event_task(child, child_ctx, 0);
 
-       /*
-        * We can recurse on the same lock type through:
-        *
-        *   __perf_event_exit_task()
-        *     sync_child_event()
-        *       put_event()
-        *         mutex_lock(&ctx->mutex)
-        *
-        * But since its the parent context it won't be the same instance.
-        */
-       mutex_lock(&child_ctx->mutex);
-
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-               __perf_event_exit_task(child_event, child_ctx, child);
+               perf_event_exit_event(child_event, child_ctx, child);
 
        mutex_unlock(&child_ctx->mutex);
 
@@ -8812,8 +8950,7 @@ void perf_event_exit_task(struct task_struct *child)
                 * the owner, closes a race against perf_release() where
                 * we need to serialize on the owner->perf_event_mutex.
                 */
-               smp_wmb();
-               event->owner = NULL;
+               smp_store_release(&event->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);
 
@@ -8896,21 +9033,20 @@ void perf_event_delayed_put(struct task_struct *task)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
 {
-       int err;
-       struct fd f;
-       struct perf_event *event;
+       struct file *file;
 
-       err = perf_fget_light(fd, &f);
-       if (err)
-               return ERR_PTR(err);
+       file = fget_raw(fd);
+       if (!file)
+               return ERR_PTR(-EBADF);
 
-       event = f.file->private_data;
-       atomic_long_inc(&event->refcount);
-       fdput(f);
+       if (file->f_op != &perf_fops) {
+               fput(file);
+               return ERR_PTR(-EBADF);
+       }
 
-       return event;
+       return file;
 }
 
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
@@ -8953,8 +9089,16 @@ inherit_event(struct perf_event *parent_event,
        if (IS_ERR(child_event))
                return child_event;
 
+       /*
+        * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+        * must be under the same lock in order to serialize against
+        * perf_event_release_kernel(), such that either we must observe
+        * is_orphaned_event() or they will observe us on the child_list.
+        */
+       mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
+               mutex_unlock(&parent_event->child_mutex);
                free_event(child_event);
                return NULL;
        }
@@ -9002,8 +9146,6 @@ inherit_event(struct perf_event *parent_event,
        /*
         * Link this into the parent event's child list
         */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);
 
@@ -9208,7 +9350,7 @@ static void perf_event_init_cpu(int cpu)
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
        mutex_lock(&swhash->hlist_mutex);
-       if (swhash->hlist_refcount > 0) {
+       if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
                struct swevent_hlist *hlist;
 
                hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -9221,13 +9363,14 @@ static void perf_event_init_cpu(int cpu)
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
-       struct remove_event re = { .detach_group = true };
        struct perf_event_context *ctx = __info;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_event *event;
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
-               __perf_remove_from_context(&re);
-       rcu_read_unlock();
+       raw_spin_lock(&ctx->lock);
+       list_for_each_entry(event, &ctx->event_list, event_entry)
+               __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+       raw_spin_unlock(&ctx->lock);
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -9283,11 +9426,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        switch (action & ~CPU_TASKS_FROZEN) {
 
        case CPU_UP_PREPARE:
-       case CPU_DOWN_FAILED:
                perf_event_init_cpu(cpu);
                break;
 
-       case CPU_UP_CANCELED:
        case CPU_DOWN_PREPARE:
                perf_event_exit_cpu(cpu);
                break;
@@ -9316,9 +9457,6 @@ void __init perf_event_init(void)
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
-       /* do not patch jump label more than once per second */
-       jump_label_rate_limit(&perf_sched_events, HZ);
-
        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
@@ -9338,6 +9476,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
 
 static int __init perf_event_sysfs_init(void)
 {
index 92ce5f4..3f8cb1e 100644 (file)
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
         * current task.
         */
        if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
-               __perf_event_disable(bp);
+               perf_event_disable_local(bp);
        else
                perf_event_disable(bp);
 
index adfdc05..1faad2c 100644 (file)
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
        __free_page(page);
 }
 
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+       int pg;
+
+       if (rb->aux_priv) {
+               rb->free_aux(rb->aux_priv);
+               rb->free_aux = NULL;
+               rb->aux_priv = NULL;
+       }
+
+       if (rb->aux_nr_pages) {
+               for (pg = 0; pg < rb->aux_nr_pages; pg++)
+                       rb_free_aux_page(rb, pg);
+
+               kfree(rb->aux_pages);
+               rb->aux_nr_pages = 0;
+       }
+}
+
 int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
                 pgoff_t pgoff, int nr_pages, long watermark, int flags)
 {
@@ -547,30 +566,11 @@ out:
        if (!ret)
                rb->aux_pgoff = pgoff;
        else
-               rb_free_aux(rb);
+               __rb_free_aux(rb);
 
        return ret;
 }
 
-static void __rb_free_aux(struct ring_buffer *rb)
-{
-       int pg;
-
-       if (rb->aux_priv) {
-               rb->free_aux(rb->aux_priv);
-               rb->free_aux = NULL;
-               rb->aux_priv = NULL;
-       }
-
-       if (rb->aux_nr_pages) {
-               for (pg = 0; pg < rb->aux_nr_pages; pg++)
-                       rb_free_aux_page(rb, pg);
-
-               kfree(rb->aux_pages);
-               rb->aux_nr_pages = 0;
-       }
-}
-
 void rb_free_aux(struct ring_buffer *rb)
 {
        if (atomic_dec_and_test(&rb->aux_refcount))
index 0167679..5f6ce93 100644 (file)
@@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
                goto free_area;
 
        area->xol_mapping.name = "[uprobes]";
+       area->xol_mapping.fault = NULL;
        area->xol_mapping.pages = area->pages;
        area->pages[0] = alloc_page(GFP_HIGHUSER);
        if (!area->pages[0])
index 0773f2b..a5d2e74 100644 (file)
  *   futex_wait(futex, val);
  *
  *   waiters++; (a)
- *   mb(); (A) <-- paired with -.
- *                              |
- *   lock(hash_bucket(futex));  |
- *                              |
- *   uval = *futex;             |
- *                              |        *futex = newval;
- *                              |        sys_futex(WAKE, futex);
- *                              |          futex_wake(futex);
- *                              |
- *                              `------->  mb(); (B)
+ *   smp_mb(); (A) <-- paired with -.
+ *                                  |
+ *   lock(hash_bucket(futex));      |
+ *                                  |
+ *   uval = *futex;                 |
+ *                                  |        *futex = newval;
+ *                                  |        sys_futex(WAKE, futex);
+ *                                  |          futex_wake(futex);
+ *                                  |
+ *                                  `--------> smp_mb(); (B)
  *   if (uval == val)
  *     queue();
  *     unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
        /*
         * Ensure futex_get_mm() implies a full barrier such that
         * get_futex_key() implies a full barrier. This is relied upon
-        * as full barrier (B), see the ordering comment above.
+        * as smp_mb(); (B), see the ordering comment above.
         */
        smp_mb__after_atomic();
 }
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
 
        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
        case FUT_OFF_INODE:
-               ihold(key->shared.inode); /* implies MB (B) */
+               ihold(key->shared.inode); /* implies smp_mb(); (B) */
                break;
        case FUT_OFF_MMSHARED:
-               futex_get_mm(key); /* implies MB (B) */
+               futex_get_mm(key); /* implies smp_mb(); (B) */
                break;
        default:
                /*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
                 * mm, therefore the only purpose of calling get_futex_key_refs
                 * is because we need the barrier for the lockless waiter check.
                 */
-               smp_mb(); /* explicit MB (B) */
+               smp_mb(); /* explicit smp_mb(); (B) */
        }
 }
 
@@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
        if (!fshared) {
                key->private.mm = mm;
                key->private.address = address;
-               get_futex_key_refs(key);  /* implies MB (B) */
+               get_futex_key_refs(key);  /* implies smp_mb(); (B) */
                return 0;
        }
 
@@ -520,7 +520,20 @@ again:
        else
                err = 0;
 
-       lock_page(page);
+       /*
+        * The treatment of mapping from this point on is critical. The page
+        * lock protects many things but in this context the page lock
+        * stabilizes mapping, prevents inode freeing in the shared
+        * file-backed region case and guards against movement to swap cache.
+        *
+        * Strictly speaking the page lock is not needed in all cases being
+        * considered here and page lock forces unnecessarily serialization
+        * From this point on, mapping will be re-verified if necessary and
+        * page lock will be acquired only if it is unavoidable
+        */
+       page = compound_head(page);
+       mapping = READ_ONCE(page->mapping);
+
        /*
         * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
         * shmem_writepage move it from filecache to swapcache beneath us:
         * an unlikely race, but we do need to retry for page->mapping.
         */
-       mapping = compound_head(page)->mapping;
-       if (!mapping) {
-               int shmem_swizzled = PageSwapCache(page);
+       if (unlikely(!mapping)) {
+               int shmem_swizzled;
+
+               /*
+                * Page lock is required to identify which special case above
+                * applies. If this is really a shmem page then the page lock
+                * will prevent unexpected transitions.
+                */
+               lock_page(page);
+               shmem_swizzled = PageSwapCache(page) || page->mapping;
                unlock_page(page);
                put_page(page);
+
                if (shmem_swizzled)
                        goto again;
+
                return -EFAULT;
        }
 
        /*
         * Private mappings are handled in a simple way.
         *
+        * If the futex key is stored on an anonymous page, then the associated
+        * object is the mm which is implicitly pinned by the calling process.
+        *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
@@ -566,16 +591,74 @@ again:
                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
+
+               get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
        } else {
+               struct inode *inode;
+
+               /*
+                * The associated futex object in this case is the inode and
+                * the page->mapping must be traversed. Ordinarily this should
+                * be stabilised under page lock but it's not strictly
+                * necessary in this case as we just want to pin the inode, not
+                * update the radix tree or anything like that.
+                *
+                * The RCU read lock is taken as the inode is finally freed
+                * under RCU. If the mapping still matches expectations then the
+                * mapping->host can be safely accessed as being a valid inode.
+                */
+               rcu_read_lock();
+
+               if (READ_ONCE(page->mapping) != mapping) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               inode = READ_ONCE(mapping->host);
+               if (!inode) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               /*
+                * Take a reference unless it is about to be freed. Previously
+                * this reference was taken by ihold under the page lock
+                * pinning the inode in place so i_lock was unnecessary. The
+                * only way for this check to fail is if the inode was
+                * truncated in parallel so warn for now if this happens.
+                *
+                * We are not calling into get_futex_key_refs() in file-backed
+                * cases, therefore a successful atomic_inc return below will
+                * guarantee that get_futex_key() will still imply smp_mb(); (B).
+                */
+               if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+                       rcu_read_unlock();
+                       put_page(page);
+
+                       goto again;
+               }
+
+               /* Should be impossible but lets be paranoid for now */
+               if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+                       err = -EFAULT;
+                       rcu_read_unlock();
+                       iput(inode);
+
+                       goto out;
+               }
+
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = mapping->host;
+               key->shared.inode = inode;
                key->shared.pgoff = basepage_index(page);
+               rcu_read_unlock();
        }
 
-       get_futex_key_refs(key); /* implies MB (B) */
-
 out:
-       unlock_page(page);
        put_page(page);
        return err;
 }
@@ -1191,7 +1274,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
        if (pi_state->owner != current)
                return -EINVAL;
 
-       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
        /*
@@ -1217,22 +1300,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
        else if (curval != uval)
                ret = -EINVAL;
        if (ret) {
-               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                return ret;
        }
 
-       raw_spin_lock_irq(&pi_state->owner->pi_lock);
+       raw_spin_lock(&pi_state->owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
        list_del_init(&pi_state->list);
-       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+       raw_spin_unlock(&pi_state->owner->pi_lock);
 
-       raw_spin_lock_irq(&new_owner->pi_lock);
+       raw_spin_lock(&new_owner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &new_owner->pi_state_list);
        pi_state->owner = new_owner;
-       raw_spin_unlock_irq(&new_owner->pi_lock);
+       raw_spin_unlock(&new_owner->pi_lock);
 
-       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 
        deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
@@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 
        q->lock_ptr = &hb->lock;
 
-       spin_lock(&hb->lock); /* implies MB (A) */
+       spin_lock(&hb->lock); /* implies smp_mb(); (A) */
        return hb;
 }
 
@@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
 
        /* In the common case we don't take the spinlock, which is nice. */
 retry:
-       lock_ptr = q->lock_ptr;
-       barrier();
+       /*
+        * q->lock_ptr can change between this read and the following spin_lock.
+        * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+        * optimizing lock_ptr out of the logic below.
+        */
+       lock_ptr = READ_ONCE(q->lock_ptr);
        if (lock_ptr != NULL) {
                spin_lock(lock_ptr);
                /*
@@ -2127,11 +2214,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                 * we returned due to timeout or signal without taking the
                 * rt_mutex. Too late.
                 */
-               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
                if (!owner)
                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
                ret = fixup_pi_state_owner(uaddr, q, owner);
                goto out;
        }
index 3b48dab..3bbfd6a 100644 (file)
@@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY
        bool
        select IRQ_DOMAIN
 
+# Generic IRQ IPI support
+config GENERIC_IRQ_IPI
+       bool
+
 # Generic MSI interrupt support
 config GENERIC_MSI_IRQ
        bool
index 2fc9cbd..2ee42e9 100644 (file)
@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
index 5797909..2f9f2b0 100644 (file)
@@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data)
        data = data->parent_data;
        data->chip->irq_mask(data);
 }
+EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
 
 /**
  * irq_chip_unmask_parent - Unmask the parent interrupt
@@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data)
        data = data->parent_data;
        data->chip->irq_unmask(data);
 }
+EXPORT_SYMBOL_GPL(irq_chip_unmask_parent);
 
 /**
  * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
@@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
        data = data->parent_data;
        data->chip->irq_eoi(data);
 }
+EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
 
 /**
  * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
@@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
 
        return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);
 
 /**
  * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
index a302cf9..a15b548 100644 (file)
@@ -136,9 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
 {
        irqreturn_t retval = IRQ_NONE;
        unsigned int flags = 0, irq = desc->irq_data.irq;
-       struct irqaction *action = desc->action;
+       struct irqaction *action;
 
-       do {
+       for_each_action_of_desc(desc, action) {
                irqreturn_t res;
 
                trace_irq_handler_entry(irq, action);
@@ -172,8 +172,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
                }
 
                retval |= res;
-               action = action->next;
-       } while (action);
+       }
 
        add_interrupt_randomness(irq, flags);
 
index fcab63c..09be2c9 100644 (file)
@@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 #define IRQ_GET_DESC_CHECK_GLOBAL      (_IRQ_DESC_CHECK)
 #define IRQ_GET_DESC_CHECK_PERCPU      (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
 
+#define for_each_action_of_desc(desc, act)                     \
+       for (act = desc->act; act; act = act->next)
+
 struct irq_desc *
 __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
                    unsigned int check);
@@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
        __irq_put_desc_unlock(desc, flags, false);
 }
 
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
+
 /*
  * Manipulation functions for irq_data.state
  */
@@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
        return __irqd_to_state(d) & mask;
 }
 
+#undef __irqd_to_state
+
 static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
 {
        __this_cpu_inc(*desc->kstat_irqs);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
new file mode 100644 (file)
index 0000000..c37f34b
--- /dev/null
@@ -0,0 +1,326 @@
+/*
+ * linux/kernel/irq/ipi.c
+ *
+ * Copyright (C) 2015 Imagination Technologies Ltd
+ * Author: Qais Yousef <qais.yousef@imgtec.com>
+ *
+ * This file contains driver APIs to the IPI subsystem.
+ */
+
+#define pr_fmt(fmt) "genirq/ipi: " fmt
+
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
+/**
+ * irq_reserve_ipi() - Setup an IPI to destination cpumask
+ * @domain:    IPI domain
+ * @dest:      cpumask of cpus which can receive the IPI
+ *
+ * Allocate a virq that can be used to send IPI to any CPU in dest mask.
+ *
+ * On success it'll return linux irq number and 0 on failure
+ */
+unsigned int irq_reserve_ipi(struct irq_domain *domain,
+                            const struct cpumask *dest)
+{
+       unsigned int nr_irqs, offset;
+       struct irq_data *data;
+       int virq, i;
+
+       if (!domain ||!irq_domain_is_ipi(domain)) {
+               pr_warn("Reservation on a non IPI domain\n");
+               return 0;
+       }
+
+       if (!cpumask_subset(dest, cpu_possible_mask)) {
+               pr_warn("Reservation is not in possible_cpu_mask\n");
+               return 0;
+       }
+
+       nr_irqs = cpumask_weight(dest);
+       if (!nr_irqs) {
+               pr_warn("Reservation for empty destination mask\n");
+               return 0;
+       }
+
+       if (irq_domain_is_ipi_single(domain)) {
+               /*
+                * If the underlying implementation uses a single HW irq on
+                * all cpus then we only need a single Linux irq number for
+                * it. We have no restrictions vs. the destination mask. The
+                * underlying implementation can deal with holes nicely.
+                */
+               nr_irqs = 1;
+               offset = 0;
+       } else {
+               unsigned int next;
+
+               /*
+                * The IPI requires a seperate HW irq on each CPU. We require
+                * that the destination mask is consecutive. If an
+                * implementation needs to support holes, it can reserve
+                * several IPI ranges.
+                */
+               offset = cpumask_first(dest);
+               /*
+                * Find a hole and if found look for another set bit after the
+                * hole. For now we don't support this scenario.
+                */
+               next = cpumask_next_zero(offset, dest);
+               if (next < nr_cpu_ids)
+                       next = cpumask_next(next, dest);
+               if (next < nr_cpu_ids) {
+                       pr_warn("Destination mask has holes\n");
+                       return 0;
+               }
+       }
+
+       virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+       if (virq <= 0) {
+               pr_warn("Can't reserve IPI, failed to alloc descs\n");
+               return 0;
+       }
+
+       virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
+                                      (void *) dest, true);
+
+       if (virq <= 0) {
+               pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
+               goto free_descs;
+       }
+
+       for (i = 0; i < nr_irqs; i++) {
+               data = irq_get_irq_data(virq + i);
+               cpumask_copy(data->common->affinity, dest);
+               data->common->ipi_offset = offset;
+       }
+       return virq;
+
+free_descs:
+       irq_free_descs(virq, nr_irqs);
+       return 0;
+}
+
+/**
+ * irq_destroy_ipi() - unreserve an IPI that was previously allocated
+ * @irq:       linux irq number to be destroyed
+ *
+ * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
+ * all virqs associated with them.
+ */
+void irq_destroy_ipi(unsigned int irq)
+{
+       struct irq_data *data = irq_get_irq_data(irq);
+       struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+       struct irq_domain *domain;
+       unsigned int nr_irqs;
+
+       if (!irq || !data || !ipimask)
+               return;
+
+       domain = data->domain;
+       if (WARN_ON(domain == NULL))
+               return;
+
+       if (!irq_domain_is_ipi(domain)) {
+               pr_warn("Trying to destroy a non IPI domain!\n");
+               return;
+       }
+
+       if (irq_domain_is_ipi_per_cpu(domain))
+               nr_irqs = cpumask_weight(ipimask);
+       else
+               nr_irqs = 1;
+
+       irq_domain_free_irqs(irq, nr_irqs);
+}
+
+/**
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
+ * @irq:       linux irq number
+ * @cpu:       the target cpu
+ *
+ * When dealing with coprocessors IPI, we need to inform the coprocessor of
+ * the hwirq it needs to use to receive and send IPIs.
+ *
+ * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ */
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
+{
+       struct irq_data *data = irq_get_irq_data(irq);
+       struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+
+       if (!data || !ipimask || cpu > nr_cpu_ids)
+               return INVALID_HWIRQ;
+
+       if (!cpumask_test_cpu(cpu, ipimask))
+               return INVALID_HWIRQ;
+
+       /*
+        * Get the real hardware irq number if the underlying implementation
+        * uses a seperate irq per cpu. If the underlying implementation uses
+        * a single hardware irq for all cpus then the IPI send mechanism
+        * needs to take care of the cpu destinations.
+        */
+       if (irq_domain_is_ipi_per_cpu(data->domain))
+               data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
+
+       return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
+}
+EXPORT_SYMBOL_GPL(ipi_get_hwirq);
+
+static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
+                          const struct cpumask *dest, unsigned int cpu)
+{
+       struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+
+       if (!chip || !ipimask)
+               return -EINVAL;
+
+       if (!chip->ipi_send_single && !chip->ipi_send_mask)
+               return -EINVAL;
+
+       if (cpu > nr_cpu_ids)
+               return -EINVAL;
+
+       if (dest) {
+               if (!cpumask_subset(dest, ipimask))
+                       return -EINVAL;
+       } else {
+               if (!cpumask_test_cpu(cpu, ipimask))
+                       return -EINVAL;
+       }
+       return 0;
+}
+
+/**
+ * __ipi_send_single - send an IPI to a target Linux SMP CPU
+ * @desc:      pointer to irq_desc of the IRQ
+ * @cpu:       destination CPU, must in the destination mask passed to
+ *             irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
+{
+       struct irq_data *data = irq_desc_get_irq_data(desc);
+       struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+#ifdef DEBUG
+       /*
+        * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+        * Since the callers should be arch or core code which is generally
+        * trusted, only check for errors when debugging.
+        */
+       if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+               return -EINVAL;
+#endif
+       if (!chip->ipi_send_single) {
+               chip->ipi_send_mask(data, cpumask_of(cpu));
+               return 0;
+       }
+
+       /* FIXME: Store this information in irqdata flags */
+       if (irq_domain_is_ipi_per_cpu(data->domain) &&
+           cpu != data->common->ipi_offset) {
+               /* use the correct data for that cpu */
+               unsigned irq = data->irq + cpu - data->common->ipi_offset;
+
+               data = irq_get_irq_data(irq);
+       }
+       chip->ipi_send_single(data, cpu);
+       return 0;
+}
+
+/**
+ * ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * @desc:      pointer to irq_desc of the IRQ
+ * @dest:      dest CPU(s), must be a subset of the mask passed to
+ *             irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
+{
+       struct irq_data *data = irq_desc_get_irq_data(desc);
+       struct irq_chip *chip = irq_data_get_irq_chip(data);
+       unsigned int cpu;
+
+#ifdef DEBUG
+       /*
+        * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+        * Since the callers should be arch or core code which is generally
+        * trusted, only check for errors when debugging.
+        */
+       if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+               return -EINVAL;
+#endif
+       if (chip->ipi_send_mask) {
+               chip->ipi_send_mask(data, dest);
+               return 0;
+       }
+
+       if (irq_domain_is_ipi_per_cpu(data->domain)) {
+               unsigned int base = data->irq;
+
+               for_each_cpu(cpu, dest) {
+                       unsigned irq = base + cpu - data->common->ipi_offset;
+
+                       data = irq_get_irq_data(irq);
+                       chip->ipi_send_single(data, cpu);
+               }
+       } else {
+               for_each_cpu(cpu, dest)
+                       chip->ipi_send_single(data, cpu);
+       }
+       return 0;
+}
+
+/**
+ * ipi_send_single - Send an IPI to a single CPU
+ * @virq:      linux irq number from irq_reserve_ipi()
+ * @cpu:       destination CPU, must in the destination mask passed to
+ *             irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_single(unsigned int virq, unsigned int cpu)
+{
+       struct irq_desc *desc = irq_to_desc(virq);
+       struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+       struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+       if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+               return -EINVAL;
+
+       return __ipi_send_single(desc, cpu);
+}
+EXPORT_SYMBOL_GPL(ipi_send_single);
+
+/**
+ * ipi_send_mask - Send an IPI to target CPU(s)
+ * @virq:      linux irq number from irq_reserve_ipi()
+ * @dest:      dest CPU(s), must be a subset of the mask passed to
+ *             irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
+{
+       struct irq_desc *desc = irq_to_desc(virq);
+       struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+       struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+       if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+               return -EINVAL;
+
+       return __ipi_send_mask(desc, dest);
+}
+EXPORT_SYMBOL_GPL(ipi_send_mask);
index 0409da0..0ccd028 100644 (file)
 static struct lock_class_key irq_desc_lock_class;
 
 #if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+       cpulist_parse(str, irq_default_affinity);
+       /*
+        * Set at least the boot cpu. We don't want to end up with
+        * bugreports caused by random comandline masks
+        */
+       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+       return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
 static void __init init_irq_default_affinity(void)
 {
-       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-       cpumask_setall(irq_default_affinity);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+       if (!irq_default_affinity)
+               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#endif
+       if (cpumask_empty(irq_default_affinity))
+               cpumask_setall(irq_default_affinity);
 }
 #else
 static void __init init_irq_default_affinity(void)
index 6e655f7..3a519a0 100644 (file)
@@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex);
 static DEFINE_MUTEX(revmap_trees_mutex);
 static struct irq_domain *irq_default_domain;
 
-static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
-                                 irq_hw_number_t hwirq, int node);
 static void irq_domain_check_hierarchy(struct irq_domain *domain);
 
 struct irqchip_fwid {
@@ -575,10 +573,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
        unsigned int type = IRQ_TYPE_NONE;
        int virq;
 
-       if (fwspec->fwnode)
-               domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
-       else
+       if (fwspec->fwnode) {
+               domain = irq_find_matching_fwnode(fwspec->fwnode,
+                                                 DOMAIN_BUS_WIRED);
+               if (!domain)
+                       domain = irq_find_matching_fwnode(fwspec->fwnode,
+                                                         DOMAIN_BUS_ANY);
+       } else {
                domain = irq_default_domain;
+       }
 
        if (!domain) {
                pr_warn("no irq domain found for %s !\n",
@@ -835,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 };
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
-static int irq_domain_alloc_descs(int virq, unsigned int cnt,
-                                 irq_hw_number_t hwirq, int node)
+int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
+                          int node)
 {
        unsigned int hint;
 
@@ -890,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
 
        return domain;
 }
+EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
 
 static void irq_domain_insert_irq(int virq)
 {
@@ -1040,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
 
 /**
  * irq_domain_set_info - Set the complete data for a @virq in @domain
@@ -1073,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
        irq_data->chip = &no_irq_chip;
        irq_data->chip_data = NULL;
 }
+EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
 
 /**
  * irq_domain_free_irqs_common - Clear irq_data and free the parent
@@ -1270,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
                                                       nr_irqs, arg);
        return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
 
 /**
  * irq_domain_free_irqs_parent - Free interrupts from parent domain
@@ -1287,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
                irq_domain_free_irqs_recursive(domain->parent, irq_base,
                                               nr_irqs);
 }
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
 
 /**
  * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
index 8411872..3ddd229 100644 (file)
@@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq)
  */
 void irq_set_thread_affinity(struct irq_desc *desc)
 {
-       struct irqaction *action = desc->action;
+       struct irqaction *action;
 
-       while (action) {
+       for_each_action_of_desc(desc, action)
                if (action->thread)
                        set_bit(IRQTF_AFFINITY, &action->thread_flags);
-               action = action->next;
-       }
 }
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
                return;
 
        raw_spin_lock_irqsave(&desc->lock, flags);
-       for (action = desc->action; action; action = action->next) {
+       for_each_action_of_desc(desc, action) {
                if (action->dev_id == dev_id) {
                        if (action->thread)
                                __irq_wake_thread(desc, action);
index a2c02fd..4e1b947 100644 (file)
@@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
        int ret = 1;
 
        raw_spin_lock_irqsave(&desc->lock, flags);
-       for (action = desc->action ; action; action = action->next) {
+       for_each_action_of_desc(desc, action) {
                if ((action != new_action) && action->name &&
                                !strcmp(new_action->name, action->name)) {
                        ret = 0;
index 3214417..5707f97 100644 (file)
@@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
         * desc->lock here. See synchronize_irq().
         */
        raw_spin_lock_irqsave(&desc->lock, flags);
-       action = desc->action;
-       while (action) {
+       for_each_action_of_desc(desc, action) {
                printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
                if (action->thread_fn)
                        printk(KERN_CONT " threaded [<%p>] %pf",
                                        action->thread_fn, action->thread_fn);
                printk(KERN_CONT "\n");
-               action = action->next;
        }
        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
index 8dc6591..8d34308 100644 (file)
@@ -66,13 +66,15 @@ struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
 };
 struct resource crashk_low_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+       .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+       .desc  = IORES_DESC_CRASH_KERNEL
 };
 
 int kexec_should_crash(struct task_struct *p)
@@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size)
 
        ram_res->start = end;
        ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
        ram_res->name = "System RAM";
 
        crashk_res.end = end - 1;
index 007b791..56b18eb 100644 (file)
@@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
 
        /* Walk the RAM ranges and allocate a suitable range for the buffer */
        if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
+               ret = walk_iomem_res_desc(crashk_res.desc,
+                               IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+                               crashk_res.start, crashk_res.end, kbuf,
+                               locate_mem_hole_callback);
        else
                ret = walk_system_ram_res(0, -1, kbuf,
                                          locate_mem_hole_callback);
index a028127..b5c30d9 100644 (file)
  * of times)
  */
 
-#include <linux/latencytop.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
+#include <linux/latencytop.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
        proc_create("latency_stats", 0644, NULL, &lstats_fops);
        return 0;
 }
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+                       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int err;
+
+       err = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (latencytop_enabled)
+               force_schedstat_enabled();
+
+       return err;
+}
 device_initcall(init_lstats_procfs);
index 60ace56..f894a2c 100644 (file)
@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
        return ret;
 }
 
-static int lockdep_initialized;
-
 unsigned long nr_list_entries;
 static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 
@@ -292,7 +290,7 @@ LIST_HEAD(all_lock_classes);
 #define __classhashfn(key)     hash_long((unsigned long)key, CLASSHASH_BITS)
 #define classhashentry(key)    (classhash_table + __classhashfn((key)))
 
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
 
 /*
  * We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +301,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
 #define __chainhashfn(chain)   hash_long(chain, CHAINHASH_BITS)
 #define chainhashentry(chain)  (chainhash_table + __chainhashfn((chain)))
 
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
 
 /*
  * The hash key of the lock dependency chains is a hash itself too:
@@ -433,19 +431,6 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
-/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
-       .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
-       .entries = lockdep_init_trace_data,
-};
-
 /*
  * Various lockdep statistics:
  */
@@ -666,23 +651,9 @@ static inline struct lock_class *
 look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 {
        struct lockdep_subclass_key *key;
-       struct list_head *hash_head;
+       struct hlist_head *hash_head;
        struct lock_class *class;
 
-#ifdef CONFIG_DEBUG_LOCKDEP
-       /*
-        * If the architecture calls into lockdep before initializing
-        * the hashes then we'll warn about it later. (we cannot printk
-        * right now)
-        */
-       if (unlikely(!lockdep_initialized)) {
-               lockdep_init();
-               lockdep_init_error = 1;
-               lock_init_error = lock->name;
-               save_stack_trace(&lockdep_init_trace);
-       }
-#endif
-
        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
                debug_locks_off();
                printk(KERN_ERR
@@ -719,7 +690,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                return NULL;
 
-       list_for_each_entry_rcu(class, hash_head, hash_entry) {
+       hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
                if (class->key == key) {
                        /*
                         * Huh! same key, different name? Did someone trample
@@ -742,7 +713,7 @@ static inline struct lock_class *
 register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 {
        struct lockdep_subclass_key *key;
-       struct list_head *hash_head;
+       struct hlist_head *hash_head;
        struct lock_class *class;
 
        DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +745,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
         * We have to do the hash-walk again, to avoid races
         * with another CPU:
         */
-       list_for_each_entry_rcu(class, hash_head, hash_entry) {
+       hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
                if (class->key == key)
                        goto out_unlock_set;
        }
@@ -805,7 +776,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
         * We use RCU's safe list-add method to make
         * parallel walking of the hash-list safe:
         */
-       list_add_tail_rcu(&class->hash_entry, hash_head);
+       hlist_add_head_rcu(&class->hash_entry, hash_head);
        /*
         * Add it to the global list of classes:
         */
@@ -1822,7 +1793,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
  */
 static int
 check_prev_add(struct task_struct *curr, struct held_lock *prev,
-              struct held_lock *next, int distance, int trylock_loop)
+              struct held_lock *next, int distance, int *stack_saved)
 {
        struct lock_list *entry;
        int ret;
@@ -1883,8 +1854,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
                }
        }
 
-       if (!trylock_loop && !save_trace(&trace))
-               return 0;
+       if (!*stack_saved) {
+               if (!save_trace(&trace))
+                       return 0;
+               *stack_saved = 1;
+       }
 
        /*
         * Ok, all validations passed, add the new lock
@@ -1907,6 +1881,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
         * Debugging printouts:
         */
        if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+               /* We drop graph lock, so another thread can overwrite trace. */
+               *stack_saved = 0;
                graph_unlock();
                printk("\n new dependency: ");
                print_lock_name(hlock_class(prev));
@@ -1929,7 +1905,7 @@ static int
 check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
        int depth = curr->lockdep_depth;
-       int trylock_loop = 0;
+       int stack_saved = 0;
        struct held_lock *hlock;
 
        /*
@@ -1956,7 +1932,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
                 */
                if (hlock->read != 2 && hlock->check) {
                        if (!check_prev_add(curr, hlock, next,
-                                               distance, trylock_loop))
+                                               distance, &stack_saved))
                                return 0;
                        /*
                         * Stop after the first non-trylock entry,
@@ -1979,7 +1955,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
                if (curr->held_locks[depth].irq_context !=
                                curr->held_locks[depth-1].irq_context)
                        break;
-               trylock_loop = 1;
        }
        return 1;
 out_bug:
@@ -2006,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
        return lock_classes + chain_hlocks[chain->base + i];
 }
 
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+                                       struct held_lock *hlock)
+{
+       int i;
+       struct held_lock *hlock_curr;
+
+       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+               hlock_curr = curr->held_locks + i;
+               if (hlock_curr->irq_context != hlock->irq_context)
+                       break;
+
+       }
+
+       return ++i;
+}
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+                       struct held_lock *hlock,
+                       struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+       int i, j, id;
+
+       i = get_first_held_lock(curr, hlock);
+
+       if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+               return 0;
+
+       for (j = 0; j < chain->depth - 1; j++, i++) {
+               id = curr->held_locks[i].class_idx - 1;
+
+               if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+                       return 0;
+       }
+#endif
+       return 1;
+}
+
 /*
  * Look up a dependency chain. If the key is not present yet then
  * add it and return 1 - in this case the new dependency chain is
@@ -2017,9 +2039,8 @@ static inline int lookup_chain_cache(struct task_struct *curr,
                                     u64 chain_key)
 {
        struct lock_class *class = hlock_class(hlock);
-       struct list_head *hash_head = chainhashentry(chain_key);
+       struct hlist_head *hash_head = chainhashentry(chain_key);
        struct lock_chain *chain;
-       struct held_lock *hlock_curr;
        int i, j;
 
        /*
@@ -2033,10 +2054,13 @@ static inline int lookup_chain_cache(struct task_struct *curr,
         * We can walk it lock-free, because entries only get added
         * to the hash:
         */
-       list_for_each_entry_rcu(chain, hash_head, entry) {
+       hlist_for_each_entry_rcu(chain, hash_head, entry) {
                if (chain->chain_key == chain_key) {
 cache_hit:
                        debug_atomic_inc(chain_lookup_hits);
+                       if (!check_no_collision(curr, hlock, chain))
+                               return 0;
+
                        if (very_verbose(class))
                                printk("\nhash chain already cached, key: "
                                        "%016Lx tail class: [%p] %s\n",
@@ -2057,7 +2081,7 @@ cache_hit:
        /*
         * We have to walk the chain again locked - to avoid duplicates:
         */
-       list_for_each_entry(chain, hash_head, entry) {
+       hlist_for_each_entry(chain, hash_head, entry) {
                if (chain->chain_key == chain_key) {
                        graph_unlock();
                        goto cache_hit;
@@ -2074,13 +2098,7 @@ cache_hit:
        chain = lock_chains + nr_lock_chains++;
        chain->chain_key = chain_key;
        chain->irq_context = hlock->irq_context;
-       /* Find the first held_lock of current chain */
-       for (i = curr->lockdep_depth - 1; i >= 0; i--) {
-               hlock_curr = curr->held_locks + i;
-               if (hlock_curr->irq_context != hlock->irq_context)
-                       break;
-       }
-       i++;
+       i = get_first_held_lock(curr, hlock);
        chain->depth = curr->lockdep_depth + 1 - i;
        if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
                chain->base = nr_chain_hlocks;
@@ -2091,7 +2109,7 @@ cache_hit:
                }
                chain_hlocks[chain->base + j] = class - lock_classes;
        }
-       list_add_tail_rcu(&chain->entry, hash_head);
+       hlist_add_head_rcu(&chain->entry, hash_head);
        debug_atomic_inc(chain_lookup_misses);
        inc_chains();
 
@@ -2168,7 +2186,7 @@ static void check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
        struct held_lock *hlock, *prev_hlock = NULL;
-       unsigned int i, id;
+       unsigned int i;
        u64 chain_key = 0;
 
        for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2185,17 +2203,16 @@ static void check_chain_key(struct task_struct *curr)
                                (unsigned long long)hlock->prev_chain_key);
                        return;
                }
-               id = hlock->class_idx - 1;
                /*
                 * Whoops ran out of static storage again?
                 */
-               if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+               if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
                        return;
 
                if (prev_hlock && (prev_hlock->irq_context !=
                                                        hlock->irq_context))
                        chain_key = 0;
-               chain_key = iterate_chain_key(chain_key, id);
+               chain_key = iterate_chain_key(chain_key, hlock->class_idx);
                prev_hlock = hlock;
        }
        if (chain_key != curr->curr_chain_key) {
@@ -3073,7 +3090,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        struct task_struct *curr = current;
        struct lock_class *class = NULL;
        struct held_lock *hlock;
-       unsigned int depth, id;
+       unsigned int depth;
        int chain_head = 0;
        int class_idx;
        u64 chain_key;
@@ -3176,11 +3193,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
         * The 'key ID' is what is the most compact key value to drive
         * the hash, not class->key.
         */
-       id = class - lock_classes;
        /*
         * Whoops, we did it again.. ran straight out of our static allocation.
         */
-       if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+       if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
                return 0;
 
        chain_key = curr->curr_chain_key;
@@ -3198,7 +3214,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                chain_key = 0;
                chain_head = 1;
        }
-       chain_key = iterate_chain_key(chain_key, id);
+       chain_key = iterate_chain_key(chain_key, class_idx);
 
        if (nest_lock && !__lock_is_held(nest_lock))
                return print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -3875,7 +3891,7 @@ void lockdep_reset(void)
        nr_process_chains = 0;
        debug_locks = 1;
        for (i = 0; i < CHAINHASH_SIZE; i++)
-               INIT_LIST_HEAD(chainhash_table + i);
+               INIT_HLIST_HEAD(chainhash_table + i);
        raw_local_irq_restore(flags);
 }
 
@@ -3894,7 +3910,7 @@ static void zap_class(struct lock_class *class)
        /*
         * Unhash the class and remove it from the all_lock_classes list:
         */
-       list_del_rcu(&class->hash_entry);
+       hlist_del_rcu(&class->hash_entry);
        list_del_rcu(&class->lock_entry);
 
        RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3933,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
 void lockdep_free_key_range(void *start, unsigned long size)
 {
        struct lock_class *class;
-       struct list_head *head;
+       struct hlist_head *head;
        unsigned long flags;
        int i;
        int locked;
@@ -3930,9 +3946,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
         */
        for (i = 0; i < CLASSHASH_SIZE; i++) {
                head = classhash_table + i;
-               if (list_empty(head))
-                       continue;
-               list_for_each_entry_rcu(class, head, hash_entry) {
+               hlist_for_each_entry_rcu(class, head, hash_entry) {
                        if (within(class->key, start, size))
                                zap_class(class);
                        else if (within(class->name, start, size))
@@ -3962,7 +3976,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
        struct lock_class *class;
-       struct list_head *head;
+       struct hlist_head *head;
        unsigned long flags;
        int i, j;
        int locked;
@@ -3987,9 +4001,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
        locked = graph_lock();
        for (i = 0; i < CLASSHASH_SIZE; i++) {
                head = classhash_table + i;
-               if (list_empty(head))
-                       continue;
-               list_for_each_entry_rcu(class, head, hash_entry) {
+               hlist_for_each_entry_rcu(class, head, hash_entry) {
                        int match = 0;
 
                        for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4013,28 +4025,6 @@ out_restore:
        raw_local_irq_restore(flags);
 }
 
-void lockdep_init(void)
-{
-       int i;
-
-       /*
-        * Some architectures have their own start_kernel()
-        * code which calls lockdep_init(), while we also
-        * call lockdep_init() from the start_kernel() itself,
-        * and we want to initialize the hashes only once:
-        */
-       if (lockdep_initialized)
-               return;
-
-       for (i = 0; i < CLASSHASH_SIZE; i++)
-               INIT_LIST_HEAD(classhash_table + i);
-
-       for (i = 0; i < CHAINHASH_SIZE; i++)
-               INIT_LIST_HEAD(chainhash_table + i);
-
-       lockdep_initialized = 1;
-}
-
 void __init lockdep_info(void)
 {
        printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4051,6 @@ void __init lockdep_info(void)
 
        printk(" per task-struct memory footprint: %lu bytes\n",
                sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-       if (lockdep_init_error) {
-               printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-               printk("Call stack leading to lockdep invocation was:\n");
-               print_stack_trace(&lockdep_init_trace, 0);
-       }
-#endif
 }
 
 static void
index 5b9102a..c835270 100644 (file)
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
        node->locked = 0;
        node->next   = NULL;
 
-       prev = xchg_acquire(lock, node);
+       /*
+        * We rely on the full barrier with global transitivity implied by the
+        * below xchg() to order the initialization stores above against any
+        * observation of @node. And to provide the ACQUIRE ordering associated
+        * with a LOCK primitive.
+        */
+       prev = xchg(lock, node);
        if (likely(prev == NULL)) {
                /*
                 * Lock acquired, don't need to set node->locked to 1. Threads
index 0551c21..e364b42 100644 (file)
@@ -716,6 +716,7 @@ static inline void
 __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
 {
        unsigned long flags;
+       WAKE_Q(wake_q);
 
        /*
         * As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
                                           struct mutex_waiter, list);
 
                debug_mutex_wake_waiter(lock, waiter);
-
-               wake_up_process(waiter->task);
+               wake_q_add(&wake_q, waiter->task);
        }
 
        spin_unlock_mutex(&lock->wait_lock, flags);
+       wake_up_q(&wake_q);
 }
 
 /*
index 393d187..ce2f75e 100644 (file)
@@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
         * sequentiality; this is because not all clear_pending_set_locked()
         * implementations imply full barriers.
         */
-       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
-               cpu_relax();
+       smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
 
        /*
         * take ownership and clear the pending bit.
@@ -435,7 +434,7 @@ queue:
         *
         * The PV pv_wait_head_or_lock function, if active, will acquire
         * the lock and return a non-zero value. So we have to skip the
-        * smp_load_acquire() call. As the next PV queue head hasn't been
+        * smp_cond_acquire() call. As the next PV queue head hasn't been
         * designated yet, there is no way for the locked value to become
         * _Q_SLOW_VAL. So both the set_locked() and the
         * atomic_cmpxchg_relaxed() calls will be safe.
@@ -466,7 +465,7 @@ locked:
                        break;
                }
                /*
-                * The smp_load_acquire() call above has provided the necessary
+                * The smp_cond_acquire() call above has provided the necessary
                 * acquire semantics required for locking. At most two
                 * iterations of this loop may be ran.
                 */
index 87bb235..21ede57 100644 (file)
@@ -54,6 +54,11 @@ struct pv_node {
        u8                      state;
 };
 
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
 /*
  * By replacing the regular queued_spin_trylock() with the function below,
  * it will be called once when a lock waiter enter the PV slowpath before
@@ -65,9 +70,11 @@ struct pv_node {
 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 {
        struct __qspinlock *l = (void *)lock;
+       int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+                  (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
 
-       return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-               (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+       qstat_inc(qstat_pv_lock_stealing, ret);
+       return ret;
 }
 
 /*
@@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 }
 #endif /* _Q_PENDING_BITS == 8 */
 
-/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
 /*
  * Lock and MCS node addresses hash table for fast lookup
  *
@@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
        if (READ_ONCE(pn->state) == vcpu_hashed)
                lp = (struct qspinlock **)1;
 
+       /*
+        * Tracking # of slowpath locking operations
+        */
+       qstat_inc(qstat_pv_lock_slowpath, true);
+
        for (;; waitcnt++) {
                /*
                 * Set correct vCPU state to be used by queue node wait-early
index 640dcec..eb2a2c9 100644 (file)
@@ -22,6 +22,7 @@
  *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
  *   pv_latency_kick   - average latency (ns) of vCPU kick operation
  *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_slowpath  - # of locking operations via the slowpath
  *   pv_lock_stealing  - # of lock stealing operations
  *   pv_spurious_wakeup        - # of spurious wakeups
  *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
@@ -45,6 +46,7 @@ enum qlock_stats {
        qstat_pv_kick_wake,
        qstat_pv_latency_kick,
        qstat_pv_latency_wake,
+       qstat_pv_lock_slowpath,
        qstat_pv_lock_stealing,
        qstat_pv_spurious_wakeup,
        qstat_pv_wait_again,
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
        [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
        [qstat_pv_latency_kick]    = "pv_latency_kick",
        [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
        [qstat_pv_lock_stealing]   = "pv_lock_stealing",
        [qstat_pv_wait_again]      = "pv_wait_again",
        [qstat_pv_wait_early]      = "pv_wait_early",
@@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val)
 #define pv_kick(c)     __pv_kick(c)
 #define pv_wait(p, v)  __pv_wait(p, v)
 
-/*
- * PV unfair trylock count tracking function
- */
-static inline int qstat_spin_steal_lock(struct qspinlock *lock)
-{
-       int ret = pv_queued_spin_steal_lock(lock);
-
-       qstat_inc(qstat_pv_lock_stealing, ret);
-       return ret;
-}
-#undef  queued_spin_trylock
-#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
-
 #else /* CONFIG_QUEUED_LOCK_STAT */
 
 static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
index 8251e75..3e74660 100644 (file)
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  * 2) Drop lock->wait_lock
  * 3) Try to unlock the lock with cmpxchg
  */
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+                                       unsigned long flags)
        __releases(lock->wait_lock)
 {
        struct task_struct *owner = rt_mutex_owner(lock);
 
        clear_rt_mutex_waiters(lock);
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        /*
         * If a new waiter comes in between the unlock and the cmpxchg
         * we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 /*
  * Simple slow path only version: lock->owner is protected by lock->wait_lock.
  */
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+                                       unsigned long flags)
        __releases(lock->wait_lock)
 {
        lock->owner = NULL;
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
        return true;
 }
 #endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        int ret = 0, depth = 0;
        struct rt_mutex *lock;
        bool detect_deadlock;
-       unsigned long flags;
        bool requeue = true;
 
        detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        /*
         * [1] Task cannot go away as we did a get_task() before !
         */
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
+       raw_spin_lock_irq(&task->pi_lock);
 
        /*
         * [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
         * operations.
         */
        if (!raw_spin_trylock(&lock->wait_lock)) {
-               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+               raw_spin_unlock_irq(&task->pi_lock);
                cpu_relax();
                goto retry;
        }
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                /*
                 * No requeue[7] here. Just release @task [8]
                 */
-               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+               raw_spin_unlock(&task->pi_lock);
                put_task_struct(task);
 
                /*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                 * If there is no owner of the lock, end of chain.
                 */
                if (!rt_mutex_owner(lock)) {
-                       raw_spin_unlock(&lock->wait_lock);
+                       raw_spin_unlock_irq(&lock->wait_lock);
                        return 0;
                }
 
                /* [10] Grab the next task, i.e. owner of @lock */
                task = rt_mutex_owner(lock);
                get_task_struct(task);
-               raw_spin_lock_irqsave(&task->pi_lock, flags);
+               raw_spin_lock(&task->pi_lock);
 
                /*
                 * No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                top_waiter = rt_mutex_top_waiter(lock);
 
                /* [13] Drop locks */
-               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-               raw_spin_unlock(&lock->wait_lock);
+               raw_spin_unlock(&task->pi_lock);
+               raw_spin_unlock_irq(&lock->wait_lock);
 
                /* If owner is not blocked, end of chain. */
                if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        rt_mutex_enqueue(lock, waiter);
 
        /* [8] Release the task */
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+       raw_spin_unlock(&task->pi_lock);
        put_task_struct(task);
 
        /*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                 */
                if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
                        wake_up_process(rt_mutex_top_waiter(lock)->task);
-               raw_spin_unlock(&lock->wait_lock);
+               raw_spin_unlock_irq(&lock->wait_lock);
                return 0;
        }
 
        /* [10] Grab the next task, i.e. the owner of @lock */
        task = rt_mutex_owner(lock);
        get_task_struct(task);
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
+       raw_spin_lock(&task->pi_lock);
 
        /* [11] requeue the pi waiters if necessary */
        if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        top_waiter = rt_mutex_top_waiter(lock);
 
        /* [13] Drop the locks */
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock(&task->pi_lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
 
        /*
         * Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        goto again;
 
  out_unlock_pi:
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+       raw_spin_unlock_irq(&task->pi_lock);
  out_put_task:
        put_task_struct(task);
 
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 /*
  * Try to take an rt-mutex
  *
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
  *
  * @lock:   The lock to be acquired.
  * @task:   The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
                                struct rt_mutex_waiter *waiter)
 {
-       unsigned long flags;
-
        /*
         * Before testing whether we can acquire @lock, we set the
         * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
         * case, but conditionals are more expensive than a redundant
         * store.
         */
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
+       raw_spin_lock(&task->pi_lock);
        task->pi_blocked_on = NULL;
        /*
         * Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
         */
        if (rt_mutex_has_waiters(lock))
                rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+       raw_spin_unlock(&task->pi_lock);
 
 takeit:
        /* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
  *
  * Prepare waiter and propagate pi chain
  *
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                                   struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        struct rt_mutex_waiter *top_waiter = waiter;
        struct rt_mutex *next_lock;
        int chain_walk = 0, res;
-       unsigned long flags;
 
        /*
         * Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        if (owner == task)
                return -EDEADLK;
 
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
+       raw_spin_lock(&task->pi_lock);
        __rt_mutex_adjust_prio(task);
        waiter->task = task;
        waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 
        task->pi_blocked_on = waiter;
 
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+       raw_spin_unlock(&task->pi_lock);
 
        if (!owner)
                return 0;
 
-       raw_spin_lock_irqsave(&owner->pi_lock, flags);
+       raw_spin_lock(&owner->pi_lock);
        if (waiter == rt_mutex_top_waiter(lock)) {
                rt_mutex_dequeue_pi(owner, top_waiter);
                rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        /* Store the lock on which owner is blocked or NULL */
        next_lock = task_blocked_on_lock(owner);
 
-       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+       raw_spin_unlock(&owner->pi_lock);
        /*
         * Even if full deadlock detection is on, if the owner is not
         * blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
         */
        get_task_struct(owner);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
 
        res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
                                         next_lock, waiter, task);
 
-       raw_spin_lock(&lock->wait_lock);
+       raw_spin_lock_irq(&lock->wait_lock);
 
        return res;
 }
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
  * Remove the top waiter from the current tasks pi waiter tree and
  * queue it up.
  *
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
  */
 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
                                    struct rt_mutex *lock)
 {
        struct rt_mutex_waiter *waiter;
-       unsigned long flags;
 
-       raw_spin_lock_irqsave(&current->pi_lock, flags);
+       raw_spin_lock(&current->pi_lock);
 
        waiter = rt_mutex_top_waiter(lock);
 
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
         */
        lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+       raw_spin_unlock(&current->pi_lock);
 
        wake_q_add(wake_q, waiter->task);
 }
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 /*
  * Remove a waiter from a lock and give up
  *
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
  * have just failed to try_to_take_rt_mutex().
  */
 static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
        bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
        struct task_struct *owner = rt_mutex_owner(lock);
        struct rt_mutex *next_lock;
-       unsigned long flags;
 
-       raw_spin_lock_irqsave(&current->pi_lock, flags);
+       raw_spin_lock(&current->pi_lock);
        rt_mutex_dequeue(lock, waiter);
        current->pi_blocked_on = NULL;
-       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+       raw_spin_unlock(&current->pi_lock);
 
        /*
         * Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
        if (!owner || !is_top_waiter)
                return;
 
-       raw_spin_lock_irqsave(&owner->pi_lock, flags);
+       raw_spin_lock(&owner->pi_lock);
 
        rt_mutex_dequeue_pi(owner, waiter);
 
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
        /* Store the lock on which owner is blocked or NULL */
        next_lock = task_blocked_on_lock(owner);
 
-       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+       raw_spin_unlock(&owner->pi_lock);
 
        /*
         * Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
        /* gets dropped in rt_mutex_adjust_prio_chain()! */
        get_task_struct(owner);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
 
        rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
                                   next_lock, NULL, current);
 
-       raw_spin_lock(&lock->wait_lock);
+       raw_spin_lock_irq(&lock->wait_lock);
 }
 
 /*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
  * @lock:               the rt_mutex to take
  * @state:              the state the task should block in (TASK_INTERRUPTIBLE
- *                      or TASK_UNINTERRUPTIBLE)
+ *                      or TASK_UNINTERRUPTIBLE)
  * @timeout:            the pre-initialized and started timer, or NULL for none
  * @waiter:             the pre-initialized rt_mutex_waiter
  *
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
  */
 static int __sched
 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
                                break;
                }
 
-               raw_spin_unlock(&lock->wait_lock);
+               raw_spin_unlock_irq(&lock->wait_lock);
 
                debug_rt_mutex_print_deadlock(waiter);
 
                schedule();
 
-               raw_spin_lock(&lock->wait_lock);
+               raw_spin_lock_irq(&lock->wait_lock);
                set_current_state(state);
        }
 
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                  enum rtmutex_chainwalk chwalk)
 {
        struct rt_mutex_waiter waiter;
+       unsigned long flags;
        int ret = 0;
 
        debug_rt_mutex_init_waiter(&waiter);
        RB_CLEAR_NODE(&waiter.pi_tree_entry);
        RB_CLEAR_NODE(&waiter.tree_entry);
 
-       raw_spin_lock(&lock->wait_lock);
+       /*
+        * Technically we could use raw_spin_[un]lock_irq() here, but this can
+        * be called in early boot if the cmpxchg() fast path is disabled
+        * (debug, no architecture support). In this case we will acquire the
+        * rtmutex with lock->wait_lock held. But we cannot unconditionally
+        * enable interrupts in that early boot case. So we need to use the
+        * irqsave/restore variants.
+        */
+       raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
        /* Try to acquire the lock again: */
        if (try_to_take_rt_mutex(lock, current, NULL)) {
-               raw_spin_unlock(&lock->wait_lock);
+               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
                return 0;
        }
 
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
         */
        fixup_rt_mutex_waiters(lock);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
        /* Remove pending timer: */
        if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
  */
 static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 {
+       unsigned long flags;
        int ret;
 
        /*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
                return 0;
 
        /*
-        * The mutex has currently no owner. Lock the wait lock and
-        * try to acquire the lock.
+        * The mutex has currently no owner. Lock the wait lock and try to
+        * acquire the lock. We use irqsave here to support early boot calls.
         */
-       raw_spin_lock(&lock->wait_lock);
+       raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
        ret = try_to_take_rt_mutex(lock, current, NULL);
 
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
         */
        fixup_rt_mutex_waiters(lock);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
        return ret;
 }
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
                                        struct wake_q_head *wake_q)
 {
-       raw_spin_lock(&lock->wait_lock);
+       unsigned long flags;
+
+       /* irqsave required to support early boot calls */
+       raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
        debug_rt_mutex_unlock(lock);
 
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
         */
        while (!rt_mutex_has_waiters(lock)) {
                /* Drops lock->wait_lock ! */
-               if (unlock_rt_mutex_safe(lock) == true)
+               if (unlock_rt_mutex_safe(lock, flags) == true)
                        return false;
                /* Relock the rtmutex and try again */
-               raw_spin_lock(&lock->wait_lock);
+               raw_spin_lock_irqsave(&lock->wait_lock, flags);
        }
 
        /*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
         */
        mark_wakeup_next_waiter(wake_q, lock);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
        /* check PI boosting */
        return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 {
        int ret;
 
-       raw_spin_lock(&lock->wait_lock);
+       raw_spin_lock_irq(&lock->wait_lock);
 
        if (try_to_take_rt_mutex(lock, task, NULL)) {
-               raw_spin_unlock(&lock->wait_lock);
+               raw_spin_unlock_irq(&lock->wait_lock);
                return 1;
        }
 
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
        if (unlikely(ret))
                remove_waiter(lock, waiter);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
 
        debug_rt_mutex_print_deadlock(waiter);
 
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 {
        int ret;
 
-       raw_spin_lock(&lock->wait_lock);
+       raw_spin_lock_irq(&lock->wait_lock);
 
        set_current_state(TASK_INTERRUPTIBLE);
 
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
         */
        fixup_rt_mutex_waiters(lock);
 
-       raw_spin_unlock(&lock->wait_lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
 
        return ret;
 }
index e517a16..fb9b887 100644 (file)
@@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
-       struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+       unsigned long pfn = PHYS_PFN(offset);
 
        /* In the simple case just return the existing linear address */
-       if (!PageHighMem(page))
+       if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
                return __va(offset);
        return NULL; /* fallback to ioremap_cache */
 }
@@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * being mapped does not have i/o side effects and the __iomem
  * annotation is not applicable.
  *
- * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * MEMREMAP_WB - matches the default mapping for System RAM on
  * the architecture.  This is usually a read-allocate write-back cache.
  * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
  * memremap() will bypass establishing a new mapping and instead return
@@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * MEMREMAP_WT - establish a mapping whereby writes either bypass the
  * cache or are written through to memory and never exist in a
  * cache-dirty state with respect to program visibility.  Attempts to
- * map "System RAM" with this mapping type will fail.
+ * map System RAM with this mapping type will fail.
  */
 void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 {
-       int is_ram = region_intersects(offset, size, "System RAM");
+       int is_ram = region_intersects(offset, size,
+                                      IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
        void *addr = NULL;
 
        if (is_ram == REGION_MIXED) {
@@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
                 * MEMREMAP_WB is special in that it can be satisifed
                 * from the direct map.  Some archs depend on the
                 * capability of memremap() to autodetect cases where
-                * the requested range is potentially in "System RAM"
+                * the requested range is potentially in System RAM.
                 */
                if (is_ram == REGION_INTERSECTS)
                        addr = try_ram_remap(offset, size);
@@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
         * If we don't have a mapping yet and more request flags are
         * pending then we will be attempting to establish a new virtual
         * address mapping.  Enforce that this mapping is not aliasing
-        * "System RAM"
+        * System RAM.
         */
        if (!addr && is_ram == REGION_INTERSECTS && flags) {
                WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
@@ -114,7 +115,7 @@ EXPORT_SYMBOL(memunmap);
 
 static void devm_memremap_release(struct device *dev, void *res)
 {
-       memunmap(res);
+       memunmap(*(void **)res);
 }
 
 static int devm_memremap_match(struct device *dev, void *res, void *match_data)
@@ -136,8 +137,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
        if (addr) {
                *ptr = addr;
                devres_add(dev, ptr);
-       } else
+       } else {
                devres_free(ptr);
+               return ERR_PTR(-ENXIO);
+       }
 
        return addr;
 }
@@ -150,7 +153,7 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);
 
-pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
 {
        return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
 }
@@ -183,7 +186,11 @@ EXPORT_SYMBOL(put_zone_device_page);
 
 static void pgmap_radix_release(struct resource *res)
 {
-       resource_size_t key;
+       resource_size_t key, align_start, align_size, align_end;
+
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(resource_size(res), SECTION_SIZE);
+       align_end = align_start + align_size - 1;
 
        mutex_lock(&pgmap_lock);
        for (key = res->start; key <= res->end; key += SECTION_SIZE)
@@ -226,12 +233,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
                percpu_ref_put(pgmap->ref);
        }
 
-       pgmap_radix_release(res);
-
        /* pages are dead and unused, undo the arch mapping */
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(resource_size(res), SECTION_SIZE);
        arch_remove_memory(align_start, align_size);
+       pgmap_radix_release(res);
        dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
                        "%s: failed to free all reserved pages\n", __func__);
 }
@@ -265,13 +271,17 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 void *devm_memremap_pages(struct device *dev, struct resource *res,
                struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
-       int is_ram = region_intersects(res->start, resource_size(res),
-                       "System RAM");
-       resource_size_t key, align_start, align_size;
+       resource_size_t key, align_start, align_size, align_end;
        struct dev_pagemap *pgmap;
        struct page_map *page_map;
+       int error, nid, is_ram;
        unsigned long pfn;
-       int error, nid;
+
+       align_start = res->start & ~(SECTION_SIZE - 1);
+       align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+               - align_start;
+       is_ram = region_intersects(align_start, align_size,
+               IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
 
        if (is_ram == REGION_MIXED) {
                WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -309,7 +319,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 
        mutex_lock(&pgmap_lock);
        error = 0;
-       for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+       align_end = align_start + align_size - 1;
+       for (key = align_start; key <= align_end; key += SECTION_SIZE) {
                struct dev_pagemap *dup;
 
                rcu_read_lock();
@@ -336,8 +347,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        if (nid < 0)
                nid = numa_mem_id();
 
-       align_start = res->start & ~(SECTION_SIZE - 1);
-       align_size = ALIGN(resource_size(res), SECTION_SIZE);
        error = arch_add_memory(nid, align_start, align_size, true);
        if (error)
                goto err_add_memory;
@@ -345,8 +354,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        for_each_device_pfn(pfn, page_map) {
                struct page *page = pfn_to_page(pfn);
 
-               /* ZONE_DEVICE pages must never appear on a slab lru */
-               list_force_poison(&page->lru);
+               /*
+                * ZONE_DEVICE pages union ->lru with a ->pgmap back
+                * pointer.  It is a bug if a ZONE_DEVICE page is ever
+                * freed or placed on a driver-private list.  Seed the
+                * storage with LIST_POISON* values.
+                */
+               list_del(&page->lru);
                page->pgmap = pgmap;
        }
        devres_add(dev, page_map);
index 8358f46..794ebe8 100644 (file)
@@ -303,6 +303,9 @@ struct load_info {
        struct _ddebug *debug;
        unsigned int num_debug;
        bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+       unsigned long mod_kallsyms_init_off;
+#endif
        struct {
                unsigned int sym, str, mod, vers, info, pcpu;
        } index;
@@ -981,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                mod->exit();
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
+       ftrace_release_mod(mod);
+
        async_synchronize_full();
 
        /* Store the name of the last unloaded module for diagnostic purposes */
@@ -2480,10 +2485,21 @@ static void layout_symtab(struct module *mod, struct load_info *info)
        strsect->sh_flags |= SHF_ALLOC;
        strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
                                         info->index.str) | INIT_OFFSET_MASK;
-       mod->init_layout.size = debug_align(mod->init_layout.size);
        pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+       /* We'll tack temporary mod_kallsyms on the end. */
+       mod->init_layout.size = ALIGN(mod->init_layout.size,
+                                     __alignof__(struct mod_kallsyms));
+       info->mod_kallsyms_init_off = mod->init_layout.size;
+       mod->init_layout.size += sizeof(struct mod_kallsyms);
+       mod->init_layout.size = debug_align(mod->init_layout.size);
 }
 
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section.  Later we switch to the cut-down
+ * core-only ones.
+ */
 static void add_kallsyms(struct module *mod, const struct load_info *info)
 {
        unsigned int i, ndst;
@@ -2492,29 +2508,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
        char *s;
        Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
 
-       mod->symtab = (void *)symsec->sh_addr;
-       mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+       /* Set up to point into init section. */
+       mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+       mod->kallsyms->symtab = (void *)symsec->sh_addr;
+       mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
        /* Make sure we get permanent strtab: don't use info->strtab. */
-       mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+       mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
 
        /* Set types up while we still have access to sections. */
-       for (i = 0; i < mod->num_symtab; i++)
-               mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
-       mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
-       mod->core_strtab = s = mod->core_layout.base + info->stroffs;
-       src = mod->symtab;
-       for (ndst = i = 0; i < mod->num_symtab; i++) {
+       for (i = 0; i < mod->kallsyms->num_symtab; i++)
+               mod->kallsyms->symtab[i].st_info
+                       = elf_type(&mod->kallsyms->symtab[i], info);
+
+       /* Now populate the cut down core kallsyms for after init. */
+       mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+       mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+       src = mod->kallsyms->symtab;
+       for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
                if (i == 0 ||
                    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
                                   info->index.pcpu)) {
                        dst[ndst] = src[i];
-                       dst[ndst++].st_name = s - mod->core_strtab;
-                       s += strlcpy(s, &mod->strtab[src[i].st_name],
+                       dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+                       s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
                                     KSYM_NAME_LEN) + 1;
                }
        }
-       mod->core_num_syms = ndst;
+       mod->core_kallsyms.num_symtab = ndst;
 }
 #else
 static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -3263,9 +3284,8 @@ static noinline int do_init_module(struct module *mod)
        module_put(mod);
        trim_init_extable(mod);
 #ifdef CONFIG_KALLSYMS
-       mod->num_symtab = mod->core_num_syms;
-       mod->symtab = mod->core_symtab;
-       mod->strtab = mod->core_strtab;
+       /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+       rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
        mod_tree_remove_init(mod);
        disable_ro_nx(&mod->init_layout);
@@ -3295,6 +3315,7 @@ fail:
        module_put(mod);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
+       ftrace_release_mod(mod);
        free_module(mod);
        wake_up_all(&module_wq);
        return ret;
@@ -3371,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
        mod->state = MODULE_STATE_COMING;
        mutex_unlock(&module_mutex);
 
+       ftrace_module_enable(mod);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_COMING, mod);
        return 0;
@@ -3496,7 +3518,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
        /* Module is ready to execute: parsing args may do that. */
        after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-                                 -32768, 32767, NULL,
+                                 -32768, 32767, mod,
                                  unknown_module_param_cb);
        if (IS_ERR(after_dashes)) {
                err = PTR_ERR(after_dashes);
@@ -3627,6 +3649,11 @@ static inline int is_arm_mapping_symbol(const char *str)
               && (str[2] == '\0' || str[2] == '.');
 }
 
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+       return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
 static const char *get_ksymbol(struct module *mod,
                               unsigned long addr,
                               unsigned long *size,
@@ -3634,6 +3661,7 @@ static const char *get_ksymbol(struct module *mod,
 {
        unsigned int i, best = 0;
        unsigned long nextval;
+       struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
        /* At worse, next value is at end of module */
        if (within_module_init(addr, mod))
@@ -3643,32 +3671,32 @@ static const char *get_ksymbol(struct module *mod,
 
        /* Scan for closest preceding symbol, and next symbol. (ELF
           starts real symbols at 1). */
-       for (i = 1; i < mod->num_symtab; i++) {
-               if (mod->symtab[i].st_shndx == SHN_UNDEF)
+       for (i = 1; i < kallsyms->num_symtab; i++) {
+               if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
                        continue;
 
                /* We ignore unnamed symbols: they're uninformative
                 * and inserted at a whim. */
-               if (mod->symtab[i].st_value <= addr
-                   && mod->symtab[i].st_value > mod->symtab[best].st_value
-                   && *(mod->strtab + mod->symtab[i].st_name) != '\0'
-                   && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+               if (*symname(kallsyms, i) == '\0'
+                   || is_arm_mapping_symbol(symname(kallsyms, i)))
+                       continue;
+
+               if (kallsyms->symtab[i].st_value <= addr
+                   && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
                        best = i;
-               if (mod->symtab[i].st_value > addr
-                   && mod->symtab[i].st_value < nextval
-                   && *(mod->strtab + mod->symtab[i].st_name) != '\0'
-                   && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
-                       nextval = mod->symtab[i].st_value;
+               if (kallsyms->symtab[i].st_value > addr
+                   && kallsyms->symtab[i].st_value < nextval)
+                       nextval = kallsyms->symtab[i].st_value;
        }
 
        if (!best)
                return NULL;
 
        if (size)
-               *size = nextval - mod->symtab[best].st_value;
+               *size = nextval - kallsyms->symtab[best].st_value;
        if (offset)
-               *offset = addr - mod->symtab[best].st_value;
-       return mod->strtab + mod->symtab[best].st_name;
+               *offset = addr - kallsyms->symtab[best].st_value;
+       return symname(kallsyms, best);
 }
 
 /* For kallsyms to ask for address resolution.  NULL means not found.  Careful
@@ -3758,19 +3786,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
        preempt_disable();
        list_for_each_entry_rcu(mod, &modules, list) {
+               struct mod_kallsyms *kallsyms;
+
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;
-               if (symnum < mod->num_symtab) {
-                       *value = mod->symtab[symnum].st_value;
-                       *type = mod->symtab[symnum].st_info;
-                       strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
-                               KSYM_NAME_LEN);
+               kallsyms = rcu_dereference_sched(mod->kallsyms);
+               if (symnum < kallsyms->num_symtab) {
+                       *value = kallsyms->symtab[symnum].st_value;
+                       *type = kallsyms->symtab[symnum].st_info;
+                       strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
                        strlcpy(module_name, mod->name, MODULE_NAME_LEN);
                        *exported = is_exported(name, *value, mod);
                        preempt_enable();
                        return 0;
                }
-               symnum -= mod->num_symtab;
+               symnum -= kallsyms->num_symtab;
        }
        preempt_enable();
        return -ERANGE;
@@ -3779,11 +3809,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 static unsigned long mod_find_symname(struct module *mod, const char *name)
 {
        unsigned int i;
+       struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
 
-       for (i = 0; i < mod->num_symtab; i++)
-               if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
-                   mod->symtab[i].st_info != 'U')
-                       return mod->symtab[i].st_value;
+       for (i = 0; i < kallsyms->num_symtab; i++)
+               if (strcmp(name, symname(kallsyms, i)) == 0 &&
+                   kallsyms->symtab[i].st_info != 'U')
+                       return kallsyms->symtab[i].st_value;
        return 0;
 }
 
@@ -3822,11 +3853,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
        module_assert_mutex();
 
        list_for_each_entry(mod, &modules, list) {
+               /* We hold module_mutex: no need for rcu_dereference_sched */
+               struct mod_kallsyms *kallsyms = mod->kallsyms;
+
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;
-               for (i = 0; i < mod->num_symtab; i++) {
-                       ret = fn(data, mod->strtab + mod->symtab[i].st_name,
-                                mod, mod->symtab[i].st_value);
+               for (i = 0; i < kallsyms->num_symtab; i++) {
+                       ret = fn(data, symname(kallsyms, i),
+                                mod, kallsyms->symtab[i].st_value);
                        if (ret != 0)
                                return ret;
                }
index f4ad91b..4d73a83 100644 (file)
@@ -588,7 +588,7 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-       /* Veryify no one has done anything silly */
+       /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
 
        /* bump default and minimum pid_max based on number of cpus */
index 02e8dfa..68d3ebc 100644 (file)
@@ -235,7 +235,7 @@ config PM_TRACE_RTC
 
 config APM_EMULATION
        tristate "Advanced Power Management Emulation"
-       depends on PM && SYS_SUPPORTS_APM_EMULATION
+       depends on SYS_SUPPORTS_APM_EMULATION
        help
          APM is a BIOS specification for saving power using several different
          techniques. This is mostly useful for battery powered laptops with
index 99513e1..5136969 100644 (file)
@@ -59,6 +59,7 @@ int profile_setup(char *str)
 
        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
 #ifdef CONFIG_SCHEDSTATS
+               force_schedstat_enabled();
                prof_on = SLEEP_PROFILING;
                if (str[strlen(sleepstr)] == ',')
                        str += strlen(sleepstr) + 1;
index d2988d0..65ae0e5 100644 (file)
@@ -932,12 +932,14 @@ rcu_torture_writer(void *arg)
        int nsynctypes = 0;
 
        VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
-       pr_alert("%s" TORTURE_FLAG
-                " Grace periods expedited from boot/sysfs for %s,\n",
-                torture_type, cur_ops->name);
-       pr_alert("%s" TORTURE_FLAG
-                " Testing of dynamic grace-period expediting diabled.\n",
-                torture_type);
+       if (!can_expedite) {
+               pr_alert("%s" TORTURE_FLAG
+                        " Grace periods expedited from boot/sysfs for %s,\n",
+                        torture_type, cur_ops->name);
+               pr_alert("%s" TORTURE_FLAG
+                        " Disabled dynamic grace-period expediting.\n",
+                        torture_type);
+       }
 
        /* Initialize synctype[] array.  If none set, take default. */
        if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
index e492a52..196f030 100644 (file)
@@ -23,7 +23,7 @@
  */
 
 #include <linux/kthread.h>
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
@@ -122,18 +122,7 @@ free_out:
        debugfs_remove_recursive(rcudir);
        return 1;
 }
-
-static void __exit rcutiny_trace_cleanup(void)
-{
-       debugfs_remove_recursive(rcudir);
-}
-
-module_init(rcutiny_trace_init);
-module_exit(rcutiny_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
-MODULE_LICENSE("GPL");
+device_initcall(rcutiny_trace_init);
 
 static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 {
index e41dd41..9a535a8 100644 (file)
@@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
 
 static struct rcu_state *const rcu_state_p;
-static struct rcu_data __percpu *const rcu_data_p;
 LIST_HEAD(rcu_struct_flavors);
 
 /* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1083,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
        rcu_sysidle_check_cpu(rdp, isidle, maxj);
        if ((rdp->dynticks_snap & 0x1) == 0) {
                trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
-               return 1;
-       } else {
                if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
                                 rdp->mynode->gpnum))
                        WRITE_ONCE(rdp->gpwrap, true);
-               return 0;
+               return 1;
        }
+       return 0;
 }
 
 /*
@@ -1173,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
                        smp_mb(); /* ->cond_resched_completed before *rcrmp. */
                        WRITE_ONCE(*rcrmp,
                                   READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
-                       resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
-                       rdp->rsp->jiffies_resched += 5; /* Enable beating. */
-               } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-                       /* Time to beat on that CPU again! */
-                       resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
-                       rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
                }
+               rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
        }
 
+       /* And if it has been a really long time, kick the CPU as well. */
+       if (ULONG_CMP_GE(jiffies,
+                        rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
+           ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
+               resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
+
        return 0;
 }
 
@@ -1246,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
                                if (rnp->qsmask & (1UL << cpu))
                                        dump_cpu_task(rnp->grplo + cpu);
                }
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        }
 }
 
@@ -1266,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        delta = jiffies - READ_ONCE(rsp->jiffies_stall);
        if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
        WRITE_ONCE(rsp->jiffies_stall,
                   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
        /*
         * OK, time to rat on our buddy...
@@ -1292,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
                                        ndetected++;
                                }
                }
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        }
 
        print_cpu_stall_info_end();
@@ -1357,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
        if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
                WRITE_ONCE(rsp->jiffies_stall,
                           jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
        /*
         * Attempt to revive the RCU machinery by forcing a context switch.
@@ -1595,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
        }
 unlock_out:
        if (rnp != rnp_root)
-               raw_spin_unlock(&rnp_root->lock);
+               raw_spin_unlock_rcu_node(rnp_root);
 out:
        if (c_out != NULL)
                *c_out = c;
@@ -1614,7 +1613,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
        int needmore;
        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
-       rcu_nocb_gp_cleanup(rsp, rnp);
        rnp->need_future_gp[c & 0x1] = 0;
        needmore = rnp->need_future_gp[(c + 1) & 0x1];
        trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1633,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
            !READ_ONCE(rsp->gp_flags) ||
            !rsp->gp_kthread)
                return;
-       wake_up(&rsp->gp_wq);
+       swake_up(&rsp->gp_wq);
 }
 
 /*
@@ -1815,7 +1813,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
                return;
        }
        needwake = __note_gp_changes(rsp, rnp, rdp);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        if (needwake)
                rcu_gp_kthread_wake(rsp);
 }
@@ -1840,7 +1838,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
        raw_spin_lock_irq_rcu_node(rnp);
        if (!READ_ONCE(rsp->gp_flags)) {
                /* Spurious wakeup, tell caller to go back to sleep.  */
-               raw_spin_unlock_irq(&rnp->lock);
+               raw_spin_unlock_irq_rcu_node(rnp);
                return false;
        }
        WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
@@ -1850,7 +1848,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
                 * Grace period already in progress, don't start another.
                 * Not supposed to be able to happen.
                 */
-               raw_spin_unlock_irq(&rnp->lock);
+               raw_spin_unlock_irq_rcu_node(rnp);
                return false;
        }
 
@@ -1859,7 +1857,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
        /* Record GP times before starting GP, hence smp_store_release(). */
        smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
        trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
-       raw_spin_unlock_irq(&rnp->lock);
+       raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
         * Apply per-leaf buffered online and offline operations to the
@@ -1873,7 +1871,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
                if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
                    !rnp->wait_blkd_tasks) {
                        /* Nothing to do on this leaf rcu_node structure. */
-                       raw_spin_unlock_irq(&rnp->lock);
+                       raw_spin_unlock_irq_rcu_node(rnp);
                        continue;
                }
 
@@ -1907,7 +1905,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
                        rcu_cleanup_dead_rnp(rnp);
                }
 
-               raw_spin_unlock_irq(&rnp->lock);
+               raw_spin_unlock_irq_rcu_node(rnp);
        }
 
        /*
@@ -1938,7 +1936,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
                trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
                                            rnp->level, rnp->grplo,
                                            rnp->grphi, rnp->qsmask);
-               raw_spin_unlock_irq(&rnp->lock);
+               raw_spin_unlock_irq_rcu_node(rnp);
                cond_resched_rcu_qs();
                WRITE_ONCE(rsp->gp_activity, jiffies);
        }
@@ -1996,7 +1994,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
                raw_spin_lock_irq_rcu_node(rnp);
                WRITE_ONCE(rsp->gp_flags,
                           READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
-               raw_spin_unlock_irq(&rnp->lock);
+               raw_spin_unlock_irq_rcu_node(rnp);
        }
 }
 
@@ -2010,6 +2008,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        int nocb = 0;
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
+       struct swait_queue_head *sq;
 
        WRITE_ONCE(rsp->gp_activity, jiffies);
        raw_spin_lock_irq_rcu_node(rnp);
@@ -2025,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         * safe for us to drop the lock in order to mark the grace
         * period as completed in all of the rcu_node structures.
         */
-       raw_spin_unlock_irq(&rnp->lock);
+       raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
         * Propagate new ->completed value to rcu_node structures so
@@ -2046,7 +2045,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                        needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
                /* smp_mb() provided by prior unlock-lock pair. */
                nocb += rcu_future_gp_cleanup(rsp, rnp);
-               raw_spin_unlock_irq(&rnp->lock);
+               sq = rcu_nocb_gp_get(rnp);
+               raw_spin_unlock_irq_rcu_node(rnp);
+               rcu_nocb_gp_cleanup(sq);
                cond_resched_rcu_qs();
                WRITE_ONCE(rsp->gp_activity, jiffies);
                rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2068,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                                       READ_ONCE(rsp->gpnum),
                                       TPS("newreq"));
        }
-       raw_spin_unlock_irq(&rnp->lock);
+       raw_spin_unlock_irq_rcu_node(rnp);
 }
 
 /*
@@ -2092,7 +2093,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                               READ_ONCE(rsp->gpnum),
                                               TPS("reqwait"));
                        rsp->gp_state = RCU_GP_WAIT_GPS;
-                       wait_event_interruptible(rsp->gp_wq,
+                       swait_event_interruptible(rsp->gp_wq,
                                                 READ_ONCE(rsp->gp_flags) &
                                                 RCU_GP_FLAG_INIT);
                        rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2123,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                               READ_ONCE(rsp->gpnum),
                                               TPS("fqswait"));
                        rsp->gp_state = RCU_GP_WAIT_FQS;
-                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
+                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
                                        rcu_gp_fqs_check_wake(rsp, &gf), j);
                        rsp->gp_state = RCU_GP_DOING_FQS;
                        /* Locking provides needed memory barriers. */
@@ -2234,19 +2235,21 @@ static bool rcu_start_gp(struct rcu_state *rsp)
 }
 
 /*
- * Report a full set of quiescent states to the specified rcu_state
- * data structure.  This involves cleaning up after the prior grace
- * period and letting rcu_start_gp() start up the next grace period
- * if one is needed.  Note that the caller must hold rnp->lock, which
- * is released before return.
+ * Report a full set of quiescent states to the specified rcu_state data
+ * structure.  Invoke rcu_gp_kthread_wake() to awaken the grace-period
+ * kthread if another grace period is required.  Whether we wake
+ * the grace-period kthread or it awakens itself for the next round
+ * of quiescent-state forcing, that kthread will clean up after the
+ * just-completed grace period.  Note that the caller must hold rnp->lock,
+ * which is released before return.
  */
 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        __releases(rcu_get_root(rsp)->lock)
 {
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
        WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
-       raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
+       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -2275,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
                         * Our bit has already been cleared, or the
                         * relevant grace period is already over, so done.
                         */
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                        return;
                }
                WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
@@ -2287,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
                if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 
                        /* Other bits still set at this level, so done. */
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                        return;
                }
                mask = rnp->grpmask;
@@ -2297,7 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 
                        break;
                }
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                rnp_c = rnp;
                rnp = rnp->parent;
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -2329,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 
        if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
            rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;  /* Still need more quiescent states! */
        }
 
@@ -2346,19 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
        /* Report up the rest of the hierarchy, tracking current ->gpnum. */
        gps = rnp->gpnum;
        mask = rnp->grpmask;
-       raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
+       raw_spin_unlock_rcu_node(rnp);  /* irqs remain disabled. */
        raw_spin_lock_rcu_node(rnp_p);  /* irqs already disabled. */
        rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
 }
 
 /*
  * Record a quiescent state for the specified CPU to that CPU's rcu_data
- * structure.  This must be either called from the specified CPU, or
- * called when the specified CPU is known to be offline (and when it is
- * also known that no other CPU is concurrently trying to help the offline
- * CPU).  The lastcomp argument is used to make sure we are still in the
- * grace period of interest.  We don't want to end the current grace period
- * based on quiescent states detected in an earlier grace period!
+ * structure.  This must be called from the specified CPU.
  */
 static void
 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
@@ -2383,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
                 */
                rdp->cpu_no_qs.b.norm = true;   /* need qs for new gp. */
                rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
        mask = rdp->grpmask;
        if ((rnp->qsmask & mask) == 0) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        } else {
-               rdp->core_needs_qs = 0;
+               rdp->core_needs_qs = false;
 
                /*
                 * This GP can't end until cpu checks in, so all of our
@@ -2599,35 +2597,14 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
                rnp->qsmaskinit &= ~mask;
                rnp->qsmask &= ~mask;
                if (rnp->qsmaskinit) {
-                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                       raw_spin_unlock_rcu_node(rnp);
+                       /* irqs remain disabled. */
                        return;
                }
-               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
        }
 }
 
-/*
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function.  We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
- */
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-       unsigned long flags;
-       unsigned long mask;
-       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-       struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
-
-       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-               return;
-
-       /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-       mask = rdp->grpmask;
-       raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
-       rnp->qsmaskinitnext &= ~mask;
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
 /*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
@@ -2859,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
                        rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
                } else {
                        /* Nothing to do here, so just drop the lock. */
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                }
        }
 }
@@ -2895,12 +2872,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
        raw_spin_unlock(&rnp_old->fqslock);
        if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
                rsp->n_force_qs_lh++;
-               raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
                return;  /* Someone beat us to it. */
        }
        WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
-       raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
-       rcu_gp_kthread_wake(rsp);
+       raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
+       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
 }
 
 /*
@@ -2925,7 +2902,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
        if (cpu_needs_another_gp(rsp, rdp)) {
                raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
                needwake = rcu_start_gp(rsp);
-               raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
                if (needwake)
                        rcu_gp_kthread_wake(rsp);
        } else {
@@ -3016,7 +2993,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 
                        raw_spin_lock_rcu_node(rnp_root);
                        needwake = rcu_start_gp(rsp);
-                       raw_spin_unlock(&rnp_root->lock);
+                       raw_spin_unlock_rcu_node(rnp_root);
                        if (needwake)
                                rcu_gp_kthread_wake(rsp);
                } else {
@@ -3436,14 +3413,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
        rcu_for_each_leaf_node(rsp, rnp) {
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                if (rnp->expmaskinit == rnp->expmaskinitnext) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                        continue;  /* No new CPUs, nothing to do. */
                }
 
                /* Update this node's mask, track old value for propagation. */
                oldmask = rnp->expmaskinit;
                rnp->expmaskinit = rnp->expmaskinitnext;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
                /* If was already nonzero, nothing to propagate. */
                if (oldmask)
@@ -3458,7 +3435,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
                        if (rnp_up->expmaskinit)
                                done = true;
                        rnp_up->expmaskinit |= mask;
-                       raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
                        if (done)
                                break;
                        mask = rnp_up->grpmask;
@@ -3481,7 +3458,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                WARN_ON_ONCE(rnp->expmask);
                rnp->expmask = rnp->expmaskinit;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        }
 }
 
@@ -3522,19 +3499,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                        if (!rnp->expmask)
                                rcu_initiate_boost(rnp, flags);
                        else
-                               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                        break;
                }
                if (rnp->parent == NULL) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                        if (wake) {
                                smp_mb(); /* EGP done before wake_up(). */
-                               wake_up(&rsp->expedited_wq);
+                               swake_up(&rsp->expedited_wq);
                        }
                        break;
                }
                mask = rnp->grpmask;
-               raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+               raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
                rnp = rnp->parent;
                raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
                WARN_ON_ONCE(!(rnp->expmask & mask));
@@ -3569,7 +3546,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        if (!(rnp->expmask & mask)) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
        rnp->expmask &= ~mask;
@@ -3730,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
                 */
                if (rcu_preempt_has_tasks(rnp))
                        rnp->exp_tasks = rnp->blkd_tasks.next;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
                /* IPI the remaining CPUs for expedited quiescent state. */
                mask = 1;
@@ -3747,7 +3724,7 @@ retry_ipi:
                        raw_spin_lock_irqsave_rcu_node(rnp, flags);
                        if (cpu_online(cpu) &&
                            (rnp->expmask & mask)) {
-                               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                                schedule_timeout_uninterruptible(1);
                                if (cpu_online(cpu) &&
                                    (rnp->expmask & mask))
@@ -3756,7 +3733,7 @@ retry_ipi:
                        }
                        if (!(rnp->expmask & mask))
                                mask_ofl_ipi &= ~mask;
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                }
                /* Report quiescent states for those that went offline. */
                mask_ofl_test |= mask_ofl_ipi;
@@ -3780,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
        jiffies_start = jiffies;
 
        for (;;) {
-               ret = wait_event_interruptible_timeout(
+               ret = swait_event_timeout(
                                rsp->expedited_wq,
                                sync_rcu_preempt_exp_done(rnp_root),
                                jiffies_stall);
@@ -3788,7 +3765,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
                        return;
                if (ret < 0) {
                        /* Hit a signal, disable CPU stall warnings. */
-                       wait_event(rsp->expedited_wq,
+                       swait_event(rsp->expedited_wq,
                                   sync_rcu_preempt_exp_done(rnp_root));
                        return;
                }
@@ -4163,7 +4140,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
                        return;
                raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
                rnp->qsmaskinit |= mask;
-               raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+               raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
        }
 }
 
@@ -4187,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        rdp->rsp = rsp;
        mutex_init(&rdp->exp_funnel_mutex);
        rcu_boot_init_nocb_percpu_data(rdp);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
 /*
@@ -4215,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
        rcu_sysidle_init_percpu_data(rdp->dynticks);
        atomic_set(&rdp->dynticks->dynticks,
                   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
-       raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
+       raw_spin_unlock_rcu_node(rnp);          /* irqs remain disabled. */
 
        /*
         * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
@@ -4236,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
        rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
        rdp->core_needs_qs = false;
        trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
 static void rcu_prepare_cpu(int cpu)
@@ -4247,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu)
                rcu_init_percpu_data(cpu, rsp);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function.  We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function.  We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+       unsigned long flags;
+       unsigned long mask;
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+       struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
+
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+               return;
+
+       /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+       mask = rdp->grpmask;
+       raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
+       rnp->qsmaskinitnext &= ~mask;
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+void rcu_report_dead(unsigned int cpu)
+{
+       struct rcu_state *rsp;
+
+       /* QS for any half-done expedited RCU-sched GP. */
+       preempt_disable();
+       rcu_report_exp_rdp(&rcu_sched_state,
+                          this_cpu_ptr(rcu_sched_state.rda), true);
+       preempt_enable();
+       for_each_rcu_flavor(rsp)
+               rcu_cleanup_dying_idle_cpu(cpu, rsp);
+}
+#endif
+
 /*
  * Handle CPU online/offline notification events.
  */
@@ -4278,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self,
                for_each_rcu_flavor(rsp)
                        rcu_cleanup_dying_cpu(rsp);
                break;
-       case CPU_DYING_IDLE:
-               /* QS for any half-done expedited RCU-sched GP. */
-               preempt_disable();
-               rcu_report_exp_rdp(&rcu_sched_state,
-                                  this_cpu_ptr(rcu_sched_state.rda), true);
-               preempt_enable();
-
-               for_each_rcu_flavor(rsp) {
-                       rcu_cleanup_dying_idle_cpu(cpu, rsp);
-               }
-               break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        case CPU_UP_CANCELED:
@@ -4358,7 +4364,7 @@ static int __init rcu_spawn_gp_kthread(void)
                        sp.sched_priority = kthread_prio;
                        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
                }
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                wake_up_process(t);
        }
        rcu_spawn_nocb_kthreads();
@@ -4449,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                cpustride *= levelspread[i];
                rnp = rsp->level[i];
                for (j = 0; j < levelcnt[i]; j++, rnp++) {
-                       raw_spin_lock_init(&rnp->lock);
-                       lockdep_set_class_and_name(&rnp->lock,
+                       raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
+                       lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
                                                   &rcu_node_class[i], buf[i]);
                        raw_spin_lock_init(&rnp->fqslock);
                        lockdep_set_class_and_name(&rnp->fqslock,
@@ -4482,8 +4488,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                }
        }
 
-       init_waitqueue_head(&rsp->gp_wq);
-       init_waitqueue_head(&rsp->expedited_wq);
+       init_swait_queue_head(&rsp->gp_wq);
+       init_swait_queue_head(&rsp->expedited_wq);
        rnp = rsp->level[rcu_num_lvls - 1];
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
index 83360b4..df668c0 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#include <linux/swait.h>
 #include <linux/stop_machine.h>
 
 /*
@@ -149,8 +150,9 @@ struct rcu_dynticks {
  * Definition for node within the RCU grace-period-detection hierarchy.
  */
 struct rcu_node {
-       raw_spinlock_t lock;    /* Root rcu_node's lock protects some */
-                               /*  rcu_state fields as well as following. */
+       raw_spinlock_t __private lock;  /* Root rcu_node's lock protects */
+                                       /*  some rcu_state fields as well as */
+                                       /*  following. */
        unsigned long gpnum;    /* Current grace period for this node. */
                                /*  This will either be equal to or one */
                                /*  behind the root rcu_node's gpnum. */
@@ -243,7 +245,7 @@ struct rcu_node {
                                /* Refused to boost: not sure why, though. */
                                /*  This can happen due to race conditions. */
 #ifdef CONFIG_RCU_NOCB_CPU
-       wait_queue_head_t nocb_gp_wq[2];
+       struct swait_queue_head nocb_gp_wq[2];
                                /* Place for rcu_nocb_kthread() to wait GP. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
        int need_future_gp[2];
@@ -399,7 +401,7 @@ struct rcu_data {
        atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
        struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
        struct rcu_head **nocb_follower_tail;
-       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
+       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
        struct task_struct *nocb_kthread;
        int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
 
@@ -478,7 +480,7 @@ struct rcu_state {
        unsigned long gpnum;                    /* Current gp number. */
        unsigned long completed;                /* # of last completed gp. */
        struct task_struct *gp_kthread;         /* Task for grace periods. */
-       wait_queue_head_t gp_wq;                /* Where GP task waits. */
+       struct swait_queue_head gp_wq;          /* Where GP task waits. */
        short gp_flags;                         /* Commands for GP task. */
        short gp_state;                         /* GP kthread sleep state. */
 
@@ -506,7 +508,7 @@ struct rcu_state {
        unsigned long expedited_sequence;       /* Take a ticket. */
        atomic_long_t expedited_normal;         /* # fallbacks to normal. */
        atomic_t expedited_need_qs;             /* # CPUs left to check in. */
-       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
+       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
        int ncpus_snap;                         /* # CPUs seen last time. */
 
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -621,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
                            bool lazy, unsigned long flags);
@@ -680,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 #endif /* #else #ifdef CONFIG_PPC */
 
 /*
- * Wrappers for the rcu_node::lock acquire.
+ * Wrappers for the rcu_node::lock acquire and release.
  *
  * Because the rcu_nodes form a tree, the tree traversal locking will observe
  * different lock values, this in turn means that an UNLOCK of one level
@@ -689,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
  *
  * In order to restore full ordering between tree levels, augment the regular
  * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * As ->lock of struct rcu_node is a __private field, therefore one should use
+ * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
  */
 static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
 {
-       raw_spin_lock(&rnp->lock);
+       raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
        smp_mb__after_unlock_lock();
 }
 
+static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
+{
+       raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
+}
+
 static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
 {
-       raw_spin_lock_irq(&rnp->lock);
+       raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
        smp_mb__after_unlock_lock();
 }
 
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags)     \
-do {                                                   \
-       typecheck(unsigned long, flags);                \
-       raw_spin_lock_irqsave(&(rnp)->lock, flags);     \
-       smp_mb__after_unlock_lock();                    \
+static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
+{
+       raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags)                     \
+do {                                                                   \
+       typecheck(unsigned long, flags);                                \
+       raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags);       \
+       smp_mb__after_unlock_lock();                                    \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags)                        \
+do {                                                                   \
+       typecheck(unsigned long, flags);                                \
+       raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags);  \
 } while (0)
 
 static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
 {
-       bool locked = raw_spin_trylock(&rnp->lock);
+       bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
 
        if (locked)
                smp_mb__after_unlock_lock();
index 9467a8b..efdf7b6 100644 (file)
@@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
                rnp->gp_tasks = &t->rcu_node_entry;
        if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
                rnp->exp_tasks = &t->rcu_node_entry;
-       raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
+       raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
 
        /*
         * Report the quiescent state for the expedited GP.  This expedited
@@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
                                                         !!rnp->gp_tasks);
                        rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
                } else {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                }
 
                /* Unboost if we were boosted. */
@@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        if (!rcu_preempt_blocked_readers_cgp(rnp)) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
        t = list_entry(rnp->gp_tasks->prev,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
                sched_show_task(t);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
 /*
@@ -807,7 +807,6 @@ void exit_rcu(void)
 #else /* #ifdef CONFIG_PREEMPT_RCU */
 
 static struct rcu_state *const rcu_state_p = &rcu_sched_state;
-static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
 
 /*
  * Tell them what RCU they are running.
@@ -991,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp)
         * might exit their RCU read-side critical sections on their own.
         */
        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return 0;
        }
 
@@ -1028,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp)
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
        rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
@@ -1088,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 
        if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
                rnp->n_balk_exp_gp_tasks++;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                return;
        }
        if (rnp->exp_tasks != NULL ||
@@ -1098,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
             ULONG_CMP_GE(jiffies, rnp->boost_time))) {
                if (rnp->exp_tasks == NULL)
                        rnp->boost_tasks = rnp->gp_tasks;
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                t = rnp->boost_kthread_task;
                if (t)
                        rcu_wake_cond(t, rnp->boost_kthread_status);
        } else {
                rcu_initiate_boost_trace(rnp);
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        }
 }
 
@@ -1172,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
                return PTR_ERR(t);
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        rnp->boost_kthread_task = t;
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        sp.sched_priority = kthread_prio;
        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
        wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
@@ -1308,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu)
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        __releases(rnp->lock)
 {
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
 static void invoke_rcu_callbacks_kthread(void)
@@ -1559,7 +1558,7 @@ static void rcu_prepare_for_idle(void)
                rnp = rdp->mynode;
                raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
-               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
                if (needwake)
                        rcu_gp_kthread_wake(rsp);
        }
@@ -1811,9 +1810,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  * grace period.
  */
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
-       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+       swake_up_all(sq);
 }
 
 /*
@@ -1829,10 +1828,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
        rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
-       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
-       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1861,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
        if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                /* Prior smp_mb__after_atomic() orders against prior enqueue. */
                WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
-               wake_up(&rdp_leader->nocb_wq);
+               swake_up(&rdp_leader->nocb_wq);
        }
 }
 
@@ -2059,7 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
        needwake = rcu_start_future_gp(rnp, rdp, &c);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        if (needwake)
                rcu_gp_kthread_wake(rdp->rsp);
 
@@ -2069,7 +2073,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
         */
        trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
        for (;;) {
-               wait_event_interruptible(
+               swait_event_interruptible(
                        rnp->nocb_gp_wq[c & 0x1],
                        (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                if (likely(d))
@@ -2097,7 +2101,7 @@ wait_again:
        /* Wait for callbacks to appear. */
        if (!rcu_nocb_poll) {
                trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
-               wait_event_interruptible(my_rdp->nocb_wq,
+               swait_event_interruptible(my_rdp->nocb_wq,
                                !READ_ONCE(my_rdp->nocb_leader_sleep));
                /* Memory barrier handled by smp_mb() calls below and repoll. */
        } else if (firsttime) {
@@ -2172,7 +2176,7 @@ wait_again:
                         * List was empty, wake up the follower.
                         * Memory barriers supplied by atomic_long_add().
                         */
-                       wake_up(&rdp->nocb_wq);
+                       swake_up(&rdp->nocb_wq);
                }
        }
 
@@ -2193,7 +2197,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
                if (!rcu_nocb_poll) {
                        trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                            "FollowerSleep");
-                       wait_event_interruptible(rdp->nocb_wq,
+                       swait_event_interruptible(rdp->nocb_wq,
                                                 READ_ONCE(rdp->nocb_follower_head));
                } else if (firsttime) {
                        /* Don't drown trace log with "Poll"! */
@@ -2352,7 +2356,7 @@ void __init rcu_init_nohz(void)
 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
        rdp->nocb_tail = &rdp->nocb_head;
-       init_waitqueue_head(&rdp->nocb_wq);
+       init_swait_queue_head(&rdp->nocb_wq);
        rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 }
 
@@ -2502,7 +2506,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
        return false;
 }
 
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
 }
 
@@ -2510,6 +2514,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
 {
 }
 
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+       return NULL;
+}
+
 static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
index 76b94e1..ca828b4 100644 (file)
@@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void)
 {
        return READ_ONCE(rcu_normal);
 }
+EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
 
 static atomic_t rcu_expedited_nesting =
        ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
index 09c0597..4d46605 100644 (file)
@@ -333,13 +333,13 @@ int release_resource(struct resource *old)
 EXPORT_SYMBOL(release_resource);
 
 /*
- * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags and "name".
- * If found, returns 0, res is overwritten, if not found, returns -1.
- * This walks through whole tree and not just first level children
- * until and unless first_level_children_only is true.
+ * Finds the lowest iomem resource existing within [res->start.res->end).
+ * The caller must specify res->start, res->end, res->flags, and optionally
+ * desc.  If found, returns 0, res is overwritten, if not found, returns -1.
+ * This function walks the whole tree and not just first level children until
+ * and unless first_level_children_only is true.
  */
-static int find_next_iomem_res(struct resource *res, char *name,
+static int find_next_iomem_res(struct resource *res, unsigned long desc,
                               bool first_level_children_only)
 {
        resource_size_t start, end;
@@ -358,9 +358,9 @@ static int find_next_iomem_res(struct resource *res, char *name,
        read_lock(&resource_lock);
 
        for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
-               if (p->flags != res->flags)
+               if ((p->flags & res->flags) != res->flags)
                        continue;
-               if (name && strcmp(p->name, name))
+               if ((desc != IORES_DESC_NONE) && (desc != p->desc))
                        continue;
                if (p->start > end) {
                        p = NULL;
@@ -385,15 +385,18 @@ static int find_next_iomem_res(struct resource *res, char *name,
  * Walks through iomem resources and calls func() with matching resource
  * ranges. This walks through whole tree and not just first level children.
  * All the memory ranges which overlap start,end and also match flags and
- * name are valid candidates.
+ * desc are valid candidates.
  *
- * @name: name of resource
- * @flags: resource flags
+ * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
+ * @flags: I/O resource flags
  * @start: start addr
  * @end: end addr
+ *
+ * NOTE: For a new descriptor search, define a new IORES_DESC in
+ * <linux/ioport.h> and set it in 'desc' of a target resource entry.
  */
-int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
-               void *arg, int (*func)(u64, u64, void *))
+int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
+               u64 end, void *arg, int (*func)(u64, u64, void *))
 {
        struct resource res;
        u64 orig_end;
@@ -403,23 +406,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
        res.end = end;
        res.flags = flags;
        orig_end = res.end;
+
        while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, name, false))) {
+               (!find_next_iomem_res(&res, desc, false))) {
+
                ret = (*func)(res.start, res.end, arg);
                if (ret)
                        break;
+
                res.start = res.end + 1;
                res.end = orig_end;
        }
+
        return ret;
 }
 
 /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM". This function deals with
- * full ranges and not pfn. If resources are not pfn aligned, dealing
- * with pfn can truncate ranges.
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * Now, this function is only for System RAM, it deals with full ranges and
+ * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
+ * ranges.
  */
 int walk_system_ram_res(u64 start, u64 end, void *arg,
                                int (*func)(u64, u64, void *))
@@ -430,10 +437,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 
        res.start = start;
        res.end = end;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        orig_end = res.end;
        while ((res.start < res.end) &&
-               (!find_next_iomem_res(&res, "System RAM", true))) {
+               (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
                ret = (*func)(res.start, res.end, arg);
                if (ret)
                        break;
@@ -446,9 +453,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
 
 /*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM".
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * It is to be used only for System RAM.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -460,10 +467,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 
        res.start = (u64) start_pfn << PAGE_SHIFT;
        res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
-       res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        orig_end = res.end;
        while ((res.start < res.end) &&
-               (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
+               (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
                pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end_pfn = (res.end + 1) >> PAGE_SHIFT;
                if (end_pfn > pfn)
@@ -484,7 +491,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
 }
 /*
  * This generic page_is_ram() returns true if specified address is
- * registered as "System RAM" in iomem_resource list.
+ * registered as System RAM in iomem_resource list.
  */
 int __weak page_is_ram(unsigned long pfn)
 {
@@ -496,30 +503,34 @@ EXPORT_SYMBOL_GPL(page_is_ram);
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address
  * @size: size of region
- * @name: name of resource (in iomem_resource)
+ * @flags: flags of resource (in iomem_resource)
+ * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
  *
  * Check if the specified region partially overlaps or fully eclipses a
- * resource identified by @name.  Return REGION_DISJOINT if the region
- * does not overlap @name, return REGION_MIXED if the region overlaps
- * @type and another resource, and return REGION_INTERSECTS if the
- * region overlaps @type and no other defined resource. Note, that
- * REGION_INTERSECTS is also returned in the case when the specified
- * region overlaps RAM and undefined memory holes.
+ * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
+ * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
+ * return REGION_MIXED if the region overlaps @flags/@desc and another
+ * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
+ * and no other defined resource. Note that REGION_INTERSECTS is also
+ * returned in the case when the specified region overlaps RAM and undefined
+ * memory holes.
  *
  * region_intersect() is used by memory remapping functions to ensure
  * the user is not remapping RAM and is a vast speed up over walking
  * through the resource table page by page.
  */
-int region_intersects(resource_size_t start, size_t size, const char *name)
+int region_intersects(resource_size_t start, size_t size, unsigned long flags,
+                     unsigned long desc)
 {
-       unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
        resource_size_t end = start + size - 1;
        int type = 0; int other = 0;
        struct resource *p;
 
        read_lock(&resource_lock);
        for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
 
                if (start >= p->start && start <= p->end)
                        is_type ? type++ : other++;
@@ -538,6 +549,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name)
 
        return REGION_DISJOINT;
 }
+EXPORT_SYMBOL_GPL(region_intersects);
 
 void __weak arch_remove_reservations(struct resource *avail)
 {
@@ -948,6 +960,7 @@ static void __init __reserve_region_with_split(struct resource *root,
        res->start = start;
        res->end = end;
        res->flags = IORESOURCE_BUSY;
+       res->desc = IORES_DESC_NONE;
 
        while (1) {
 
@@ -982,6 +995,7 @@ static void __init __reserve_region_with_split(struct resource *root,
                                next_res->start = conflict->end + 1;
                                next_res->end = end;
                                next_res->flags = IORESOURCE_BUSY;
+                               next_res->desc = IORES_DESC_NONE;
                        }
                } else {
                        res->start = conflict->end + 1;
@@ -1071,8 +1085,9 @@ struct resource * __request_region(struct resource *parent,
        res->name = name;
        res->start = start;
        res->end = start + n - 1;
-       res->flags = resource_type(parent);
+       res->flags = resource_type(parent) | resource_ext_type(parent);
        res->flags |= IORESOURCE_BUSY | flags;
+       res->desc = IORES_DESC_NONE;
 
        write_lock(&resource_lock);
 
@@ -1083,9 +1098,10 @@ struct resource * __request_region(struct resource *parent,
                if (!conflict)
                        break;
                if (conflict != parent) {
-                       parent = conflict;
-                       if (!(conflict->flags & IORESOURCE_BUSY))
+                       if (!(conflict->flags & IORESOURCE_BUSY)) {
+                               parent = conflict;
                                continue;
+                       }
                }
                if (conflict->flags & flags & IORESOURCE_MUXED) {
                        add_wait_queue(&muxed_resource_wait, &wait);
@@ -1237,6 +1253,7 @@ int release_mem_region_adjustable(struct resource *parent,
                        new_res->start = end + 1;
                        new_res->end = res->end;
                        new_res->flags = res->flags;
+                       new_res->desc = res->desc;
                        new_res->parent = res->parent;
                        new_res->sibling = res->sibling;
                        new_res->child = NULL;
@@ -1412,6 +1429,7 @@ static int __init reserve_setup(char *str)
                        res->start = io_start;
                        res->end = io_start + io_num - 1;
                        res->flags = IORESOURCE_BUSY;
+                       res->desc = IORES_DESC_NONE;
                        res->child = NULL;
                        if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
                                reserved = x+1;
index 6768797..7d4cba2 100644 (file)
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
index bc54e84..fedb967 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
 #include <linux/compiler.h>
+#include <linux/tick.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)
 {
        if (!sched_clock_stable())
                static_key_slow_inc(&__sched_clock_stable);
+
+       tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
 void set_sched_clock_stable(void)
@@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)
        /* XXX worry about clock continuity */
        if (sched_clock_stable())
                static_key_slow_dec(&__sched_clock_stable);
+
+       tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
 static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
index 63d3a24..ea8f49a 100644 (file)
@@ -26,6 +26,7 @@
  *              Thomas Gleixner, Mike Kravetz
  */
 
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
-#include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
 
 #undef SCHED_FEAT
 
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled)      \
-       #name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
-       int i;
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (!(sysctl_sched_features & (1UL << i)))
-                       seq_puts(m, "NO_");
-               seq_printf(m, "%s ", sched_feat_names[i]);
-       }
-       seq_puts(m, "\n");
-
-       return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true  STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled)      \
-       jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
-       static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
-       static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
-       int i;
-       int neg = 0;
-
-       if (strncmp(cmp, "NO_", 3) == 0) {
-               neg = 1;
-               cmp += 3;
-       }
-
-       for (i = 0; i < __SCHED_FEAT_NR; i++) {
-               if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg) {
-                               sysctl_sched_features &= ~(1UL << i);
-                               sched_feat_disable(i);
-                       } else {
-                               sysctl_sched_features |= (1UL << i);
-                               sched_feat_enable(i);
-                       }
-                       break;
-               }
-       }
-
-       return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char buf[64];
-       char *cmp;
-       int i;
-       struct inode *inode;
-
-       if (cnt > 63)
-               cnt = 63;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt] = 0;
-       cmp = strstrip(buf);
-
-       /* Ensure the static_key remains in a consistent state */
-       inode = file_inode(filp);
-       inode_lock(inode);
-       i = sched_feat_set(cmp);
-       inode_unlock(inode);
-       if (i == __SCHED_FEAT_NR)
-               return -EINVAL;
-
-       *ppos += cnt;
-
-       return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
-       .open           = sched_feat_open,
-       .write          = sched_feat_write,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
-                       &sched_feat_fops);
-
-       return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -453,20 +320,6 @@ static inline void init_hrtick(void)
 }
 #endif /* CONFIG_SCHED_HRTICK */
 
-/*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#define fetch_or(ptr, val)                                             \
-({     typeof(*(ptr)) __old, __val = *(ptr);                           \
-       for (;;) {                                                      \
-               __old = cmpxchg((ptr), __val, __val | (val));           \
-               if (__old == __val)                                     \
-                       break;                                          \
-               __val = __old;                                          \
-       }                                                               \
-       __old;                                                          \
-})
-
 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)
 #endif /* CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+bool sched_can_stop_tick(struct rq *rq)
 {
+       int fifo_nr_running;
+
+       /* Deadline tasks, even if single, need the tick */
+       if (rq->dl.dl_nr_running)
+               return false;
+
        /*
-        * FIFO realtime policy runs the highest priority task. Other runnable
-        * tasks are of a lower priority. The scheduler tick does nothing.
+        * FIFO realtime policy runs the highest priority task (after DEADLINE).
+        * Other runnable tasks are of a lower priority. The scheduler tick
+        * isn't needed.
         */
-       if (current->policy == SCHED_FIFO)
+       fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+       if (fifo_nr_running)
                return true;
 
        /*
         * Round-robin realtime tasks time slice with other tasks at the same
-        * realtime priority. Is this task the only one at this priority?
+        * realtime priority.
         */
-       if (current->policy == SCHED_RR) {
-               struct sched_rt_entity *rt_se = &current->rt;
-
-               return list_is_singular(&rt_se->run_list);
+       if (rq->rt.rr_nr_running) {
+               if (rq->rt.rr_nr_running == 1)
+                       return true;
+               else
+                       return false;
        }
 
-       /*
-        * More than one running task need preemption.
-        * nr_running update is assumed to be visible
-        * after IPI is sent from wakers.
-        */
-       if (this_rq()->nr_running > 1)
+       /* Normal multitasking need periodic preemption checks */
+       if (rq->cfs.nr_running > 1)
                return false;
 
        return true;
@@ -2093,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        ttwu_queue(p, cpu);
 stat:
-       ttwu_stat(p, cpu, wake_flags);
+       if (schedstat_enabled())
+               ttwu_stat(p, cpu, wake_flags);
 out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -2141,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)
                ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
        ttwu_do_wakeup(rq, p, 0);
-       ttwu_stat(p, smp_processor_id(), 0);
+       if (schedstat_enabled())
+               ttwu_stat(p, smp_processor_id(), 0);
 out:
        raw_spin_unlock(&p->pi_lock);
 }
@@ -2183,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)
        dl_se->dl_bw = 0;
 
        dl_se->dl_throttled = 0;
-       dl_se->dl_new = 1;
        dl_se->dl_yielded = 0;
 }
 
@@ -2210,6 +2069,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
+       /* Even if schedstat is disabled, there should not be garbage */
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
@@ -2218,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        __dl_clear_params(p);
 
        INIT_LIST_HEAD(&p->rt.run_list);
+       p->rt.timeout           = 0;
+       p->rt.time_slice        = sched_rr_timeslice;
+       p->rt.on_rq             = 0;
+       p->rt.on_list           = 0;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2281,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+       if (enabled)
+               static_branch_enable(&sched_schedstats);
+       else
+               static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+       if (!schedstat_enabled()) {
+               pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+               static_branch_enable(&sched_schedstats);
+       }
+}
+
+static int __init setup_schedstats(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+
+       if (!strcmp(str, "enable")) {
+               set_schedstats(true);
+               ret = 1;
+       } else if (!strcmp(str, "disable")) {
+               set_schedstats(false);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               pr_warn("Unable to parse schedstats=\n");
+
+       return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = static_branch_likely(&sched_schedstats);
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_schedstats(state);
+       return err;
+}
+#endif
+#endif
+
 /*
  * fork()/clone()-time setup:
  */
@@ -3010,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)
 }
 #endif
 
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-       if (in_lock_functions(addr)) {
-               addr = CALLER_ADDR2;
-               if (in_lock_functions(addr))
-                       addr = CALLER_ADDR3;
-       }
-       return addr;
-}
-
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
 
@@ -3041,7 +2958,7 @@ void preempt_count_add(int val)
                                PREEMPT_MASK - 10);
 #endif
        if (preempt_count() == val) {
-               unsigned long ip = get_parent_ip(CALLER_ADDR1);
+               unsigned long ip = get_lock_parent_ip();
 #ifdef CONFIG_DEBUG_PREEMPT
                current->preempt_disable_ip = ip;
 #endif
@@ -3068,7 +2985,7 @@ void preempt_count_sub(int val)
 #endif
 
        if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
        __preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
@@ -3280,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)
 
                trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
-               cpu = cpu_of(rq);
        } else {
                lockdep_unpin_lock(&rq->lock);
                raw_spin_unlock_irq(&rq->lock);
@@ -3466,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
        struct rq *rq;
        const struct sched_class *prev_class;
 
@@ -3494,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
+
+       if (oldprio == prio)
+               queue_flag &= ~DEQUEUE_MOVE;
+
        prev_class = p->sched_class;
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flag);
        if (running)
                put_prev_task(rq, p);
 
@@ -3516,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag |= ENQUEUE_REPLENISH;
+                       queue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
@@ -3524,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag |= ENQUEUE_HEAD;
+                       queue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@ -3539,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, enqueue_flag);
+               enqueue_task(rq, p, queue_flag);
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -3895,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq *rq;
        int reset_on_fork;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
@@ -4077,17 +3998,14 @@ change:
                 * itself.
                 */
                new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-               if (new_effective_prio == oldprio) {
-                       __setscheduler_params(p, attr);
-                       task_rq_unlock(rq, p, &flags);
-                       return 0;
-               }
+               if (new_effective_prio == oldprio)
+                       queue_flags &= ~DEQUEUE_MOVE;
        }
 
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flags);
        if (running)
                put_prev_task(rq, p);
 
@@ -4097,15 +4015,14 @@ change:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
-               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               if (oldprio <= p->prio)
-                       enqueue_flags |= ENQUEUE_HEAD;
+               if (oldprio < p->prio)
+                       queue_flags |= ENQUEUE_HEAD;
 
-               enqueue_task(rq, p, enqueue_flags);
+               enqueue_task(rq, p, queue_flags);
        }
 
        check_class_changed(rq, p, prev_class, oldprio);
@@ -5096,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
+       kasan_unpoison_task_stack(idle);
+
 #ifdef CONFIG_SMP
        /*
         * Its possible that init_idle() gets called multiple times on a task,
@@ -5405,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-       {
-               .procname       = "sched_domain",
-               .mode           = 0555,
-       },
-       {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-       {
-               .procname       = "kernel",
-               .mode           = 0555,
-               .child          = sd_ctl_dir,
-       },
-       {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-       struct ctl_table *entry =
-               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-       return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-       struct ctl_table *entry;
-
-       /*
-        * In the intermediate directories, both the child directory and
-        * procname are dynamically allocated and could fail but the mode
-        * will always be set. In the lowest directory the names are
-        * static strings and all have proc handlers.
-        */
-       for (entry = *tablep; entry->mode; entry++) {
-               if (entry->child)
-                       sd_free_ctl_entry(&entry->child);
-               if (entry->proc_handler == NULL)
-                       kfree(entry->procname);
-       }
-
-       kfree(*tablep);
-       *tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
-               const char *procname, void *data, int maxlen,
-               umode_t mode, proc_handler *proc_handler,
-               bool load_idx)
-{
-       entry->procname = procname;
-       entry->data = data;
-       entry->maxlen = maxlen;
-       entry->mode = mode;
-       entry->proc_handler = proc_handler;
-
-       if (load_idx) {
-               entry->extra1 = &min_load_idx;
-               entry->extra2 = &max_load_idx;
-       }
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
-
-       if (table == NULL)
-               return NULL;
-
-       set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax, true);
-       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[9], "cache_nice_tries",
-               &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax, false);
-       set_table_entry(&table[11], "max_newidle_lb_cost",
-               &sd->max_newidle_lb_cost,
-               sizeof(long), 0644, proc_doulongvec_minmax, false);
-       set_table_entry(&table[12], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
-
-       return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-       struct ctl_table *entry, *table;
-       struct sched_domain *sd;
-       int domain_num = 0, i;
-       char buf[32];
-
-       for_each_domain(cpu, sd)
-               domain_num++;
-       entry = table = sd_alloc_ctl_entry(domain_num + 1);
-       if (table == NULL)
-               return NULL;
-
-       i = 0;
-       for_each_domain(cpu, sd) {
-               snprintf(buf, 32, "domain%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_domain_table(sd);
-               entry++;
-               i++;
-       }
-       return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-       int i, cpu_num = num_possible_cpus();
-       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-       char buf[32];
-
-       WARN_ON(sd_ctl_dir[0].child);
-       sd_ctl_dir[0].child = entry;
-
-       if (entry == NULL)
-               return;
-
-       for_each_possible_cpu(i) {
-               snprintf(buf, 32, "cpu%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_cpu_table(i);
-               entry++;
-       }
-
-       WARN_ON(sd_sysctl_header);
-       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-       unregister_sysctl_table(sd_sysctl_header);
-       sd_sysctl_header = NULL;
-       if (sd_ctl_dir[0].child)
-               sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
 static void set_rq_online(struct rq *rq)
 {
        if (!rq->online) {
@@ -5692,16 +5434,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
                set_cpu_rq_start_time();
                return NOTIFY_OK;
 
-       case CPU_ONLINE:
-               /*
-                * At this point a starting CPU has marked itself as online via
-                * set_cpu_online(). But it might not yet have marked itself
-                * as active, which is essential from here on.
-                */
-               set_cpu_active(cpu, true);
-               stop_machine_unpark(cpu);
-               return NOTIFY_OK;
-
        case CPU_DOWN_FAILED:
                set_cpu_active(cpu, true);
                return NOTIFY_OK;
@@ -6173,11 +5905,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+       int ret;
+
        alloc_bootmem_cpumask_var(&cpu_isolated_map);
-       cpulist_parse(str, cpu_isolated_map);
+       ret = cpulist_parse(str, cpu_isolated_map);
+       if (ret) {
+               pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+               return 0;
+       }
        return 1;
 }
-
 __setup("isolcpus=", isolated_cpu_setup);
 
 struct s_data {
@@ -6840,7 +6577,7 @@ static void sched_init_numa(void)
 
                        sched_domains_numa_masks[i][j] = mask;
 
-                       for (k = 0; k < nr_node_ids; k++) {
+                       for_each_node(k) {
                                if (node_distance(j, k) > sched_domains_numa_distance[i])
                                        continue;
 
@@ -7860,11 +7597,9 @@ void sched_destroy_group(struct task_group *tg)
 void sched_offline_group(struct task_group *tg)
 {
        unsigned long flags;
-       int i;
 
        /* end participation in shares distribution */
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
+       unregister_fair_sched_group(tg);
 
        spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
@@ -7890,7 +7625,7 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
@@ -7914,7 +7649,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
        task_rq_unlock(rq, tsk, &flags);
 }
index b2ab2ff..75f98c5 100644 (file)
@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal;
-               cputime_t steal_ct;
+               unsigned long steal_jiffies;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
                /*
-                * cputime_t may be less precise than nsecs (eg: if it's
-                * based on jiffies). Lets cast the result to cputime
+                * steal is in nsecs but our caller is expecting steal
+                * time in jiffies. Lets cast the result to jiffies
                 * granularity and account the rest on the next rounds.
                 */
-               steal_ct = nsecs_to_cputime(steal);
-               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+               steal_jiffies = nsecs_to_jiffies(steal);
+               this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
-               account_steal_time(steal_ct);
-               return steal_ct;
+               account_steal_time(jiffies_to_cputime(steal_jiffies));
+               return steal_jiffies;
        }
 #endif
        return false;
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
 {
-       unsigned long long clock;
+       unsigned long now = READ_ONCE(jiffies);
 
-       clock = local_clock();
-       if (clock < tsk->vtime_snap)
+       if (time_before(now, (unsigned long)tsk->vtime_snap))
                return 0;
 
-       return clock - tsk->vtime_snap;
+       return jiffies_to_cputime(now - tsk->vtime_snap);
 }
 
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
-       unsigned long long delta = vtime_delta(tsk);
+       unsigned long now = READ_ONCE(jiffies);
+       unsigned long delta = now - tsk->vtime_snap;
 
        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-       tsk->vtime_snap += delta;
+       tsk->vtime_snap = now;
 
-       /* CHECKME: always safe to convert nsecs to cputime? */
-       return nsecs_to_cputime(delta);
+       return jiffies_to_cputime(delta);
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
+       if (!vtime_delta(tsk))
+               return;
+
        write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        if (context_tracking_in_user())
                tsk->vtime_snap_whence = VTIME_USER;
        write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
        cputime_t delta_cpu;
 
        write_seqcount_begin(&tsk->vtime_seqcount);
-       delta_cpu = get_vtime_delta(tsk);
        tsk->vtime_snap_whence = VTIME_SYS;
-       account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       if (vtime_delta(tsk)) {
+               delta_cpu = get_vtime_delta(tsk);
+               account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+       }
        write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        tsk->vtime_snap_whence = VTIME_USER;
        write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
         * that can thus safely catch up with a tickless delta.
         */
        write_seqcount_begin(&tsk->vtime_seqcount);
-       __vtime_account_system(tsk);
+       if (vtime_delta(tsk))
+               __vtime_account_system(tsk);
        current->flags |= PF_VCPU;
        write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
        write_seqcount_begin(&current->vtime_seqcount);
        current->vtime_snap_whence = VTIME_SYS;
-       current->vtime_snap = sched_clock_cpu(smp_processor_id());
+       current->vtime_snap = jiffies;
        write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
        local_irq_save(flags);
        write_seqcount_begin(&t->vtime_seqcount);
        t->vtime_snap_whence = VTIME_SYS;
-       t->vtime_snap = sched_clock_cpu(cpu);
+       t->vtime_snap = jiffies;
        write_seqcount_end(&t->vtime_seqcount);
        local_irq_restore(flags);
 }
index cd64c97..c7a036f 100644 (file)
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
 
-       WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+       WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+       /*
+        * We are racing with the deadline timer. So, do nothing because
+        * the deadline timer handler will take care of properly recharging
+        * the runtime and postponing the deadline
+        */
+       if (dl_se->dl_throttled)
+               return;
 
        /*
         * We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
         */
        dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
        dl_se->runtime = pi_se->dl_runtime;
-       dl_se->dl_new = 0;
 }
 
 /*
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                dl_se->runtime = pi_se->dl_runtime;
        }
 
+       if (dl_se->dl_yielded && dl_se->runtime > 0)
+               dl_se->runtime = 0;
+
        /*
         * We keep moving the deadline away until we get some
         * available runtime for the entity. This ensures correct
@@ -420,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
         * entity.
         */
        if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
-               printk_deferred_once("sched: DL replenish lagged to much\n");
+               printk_deferred_once("sched: DL replenish lagged too much\n");
                dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
                dl_se->runtime = pi_se->dl_runtime;
        }
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
        struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
        struct rq *rq = rq_of_dl_rq(dl_rq);
 
-       /*
-        * The arrival of a new instance needs special treatment, i.e.,
-        * the actual scheduling parameters have to be "renewed".
-        */
-       if (dl_se->dl_new) {
-               setup_new_dl_entity(dl_se, pi_se);
-               return;
-       }
-
        if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
            dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
                dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -604,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                goto unlock;
        }
 
-       /*
-        * This is possible if switched_from_dl() raced against a running
-        * callback that took the above !dl_task() path and we've since then
-        * switched back into SCHED_DEADLINE.
-        *
-        * There's nothing to do except drop our task reference.
-        */
-       if (dl_se->dl_new)
-               goto unlock;
-
        /*
         * The task might have been boosted by someone else and might be in the
         * boosting/deboosting path, its not throttled.
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
         * approach need further study.
         */
        delta_exec = rq_clock_task(rq) - curr->se.exec_start;
-       if (unlikely((s64)delta_exec <= 0))
+       if (unlikely((s64)delta_exec <= 0)) {
+               if (unlikely(dl_se->dl_yielded))
+                       goto throttle;
                return;
+       }
 
        schedstat_set(curr->se.statistics.exec_max,
                      max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
 
        sched_rt_avg_update(rq, delta_exec);
 
-       dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-       if (dl_runtime_exceeded(dl_se)) {
+       dl_se->runtime -= delta_exec;
+
+throttle:
+       if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
                dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
                if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
         * parameters of the task might need updating. Otherwise,
         * we want a replenishment of its runtime.
         */
-       if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+       if (flags & ENQUEUE_WAKEUP)
                update_dl_entity(dl_se, pi_se);
        else if (flags & ENQUEUE_REPLENISH)
                replenish_dl_entity(dl_se, pi_se);
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
  */
 static void yield_task_dl(struct rq *rq)
 {
-       struct task_struct *p = rq->curr;
-
        /*
         * We make the task go to sleep until its current deadline by
         * forcing its runtime to zero. This way, update_curr_dl() stops
         * it and the bandwidth timer will wake it up and will give it
         * new scheduling parameters (thanks to dl_yielded=1).
         */
-       if (p->dl.runtime > 0) {
-               rq->curr->dl.dl_yielded = 1;
-               p->dl.runtime = 0;
-       }
+       rq->curr->dl.dl_yielded = 1;
+
        update_rq_clock(rq);
        update_curr_dl(rq);
        /*
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
+       if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+               setup_new_dl_entity(&p->dl, &p->dl);
+
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
                if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
                 */
                resched_curr(rq);
 #endif /* CONFIG_SMP */
-       } else
-               switched_to_dl(rq, p);
+       }
 }
 
 const struct sched_class dl_sched_class = {
index 6415117..4fbc3bd 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 #include <linux/mempolicy.h>
+#include <linux/debugfs.h>
 
 #include "sched.h"
 
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#define SCHED_FEAT(name, enabled)      \
+       #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+       int i;
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
+       }
+       seq_puts(m, "\n");
+
+       return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+       int i;
+       int neg = 0;
+
+       if (strncmp(cmp, "NO_", 3) == 0) {
+               neg = 1;
+               cmp += 3;
+       }
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
+                       if (neg) {
+                               sysctl_sched_features &= ~(1UL << i);
+                               sched_feat_disable(i);
+                       } else {
+                               sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
+                       break;
+               }
+       }
+
+       return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+               size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       char *cmp;
+       int i;
+       struct inode *inode;
+
+       if (cnt > 63)
+               cnt = 63;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+       cmp = strstrip(buf);
+
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       inode_lock(inode);
+       i = sched_feat_set(cmp);
+       inode_unlock(inode);
+       if (i == __SCHED_FEAT_NR)
+               return -EINVAL;
+
+       *ppos += cnt;
+
+       return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+       debugfs_create_file("sched_features", 0644, NULL, NULL,
+                       &sched_feat_fops);
+
+       return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+       {
+               .procname       = "sched_domain",
+               .mode           = 0555,
+       },
+       {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+       {
+               .procname       = "kernel",
+               .mode           = 0555,
+               .child          = sd_ctl_dir,
+       },
+       {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+       struct ctl_table *entry =
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+       return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+       struct ctl_table *entry;
+
+       /*
+        * In the intermediate directories, both the child directory and
+        * procname are dynamically allocated and could fail but the mode
+        * will always be set. In the lowest directory the names are
+        * static strings and all have proc handlers.
+        */
+       for (entry = *tablep; entry->mode; entry++) {
+               if (entry->child)
+                       sd_free_ctl_entry(&entry->child);
+               if (entry->proc_handler == NULL)
+                       kfree(entry->procname);
+       }
+
+       kfree(*tablep);
+       *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+               const char *procname, void *data, int maxlen,
+               umode_t mode, proc_handler *proc_handler,
+               bool load_idx)
+{
+       entry->procname = procname;
+       entry->data = data;
+       entry->maxlen = maxlen;
+       entry->mode = mode;
+       entry->proc_handler = proc_handler;
+
+       if (load_idx) {
+               entry->extra1 = &min_load_idx;
+               entry->extra2 = &max_load_idx;
+       }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "min_interval", &sd->min_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[1], "max_interval", &sd->max_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+               sizeof(int), 0644, proc_dointvec_minmax, true);
+       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[9], "cache_nice_tries",
+               &sd->cache_nice_tries,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[10], "flags", &sd->flags,
+               sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[11], "max_newidle_lb_cost",
+               &sd->max_newidle_lb_cost,
+               sizeof(long), 0644, proc_doulongvec_minmax, false);
+       set_table_entry(&table[12], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+       /* &table[13] is terminator */
+
+       return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+       struct ctl_table *entry, *table;
+       struct sched_domain *sd;
+       int domain_num = 0, i;
+       char buf[32];
+
+       for_each_domain(cpu, sd)
+               domain_num++;
+       entry = table = sd_alloc_ctl_entry(domain_num + 1);
+       if (table == NULL)
+               return NULL;
+
+       i = 0;
+       for_each_domain(cpu, sd) {
+               snprintf(buf, 32, "domain%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_domain_table(sd);
+               entry++;
+               i++;
+       }
+       return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+       int i, cpu_num = num_possible_cpus();
+       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+       char buf[32];
+
+       WARN_ON(sd_ctl_dir[0].child);
+       sd_ctl_dir[0].child = entry;
+
+       if (entry == NULL)
+               return;
+
+       for_each_possible_cpu(i) {
+               snprintf(buf, 32, "cpu%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_cpu_table(i);
+               entry++;
+       }
+
+       WARN_ON(sd_sysctl_header);
+       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+       unregister_sysctl_table(sd_sysctl_header);
+       sd_sysctl_header = NULL;
+       if (sd_ctl_dir[0].child)
+               sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-       PN(se->statistics.wait_start);
-       PN(se->statistics.sleep_start);
-       PN(se->statistics.block_start);
-       PN(se->statistics.sleep_max);
-       PN(se->statistics.block_max);
-       PN(se->statistics.exec_max);
-       PN(se->statistics.slice_max);
-       PN(se->statistics.wait_max);
-       PN(se->statistics.wait_sum);
-       P(se->statistics.wait_count);
+       if (schedstat_enabled()) {
+               PN(se->statistics.wait_start);
+               PN(se->statistics.sleep_start);
+               PN(se->statistics.block_start);
+               PN(se->statistics.sleep_max);
+               PN(se->statistics.block_max);
+               PN(se->statistics.exec_max);
+               PN(se->statistics.slice_max);
+               PN(se->statistics.wait_max);
+               PN(se->statistics.wait_sum);
+               P(se->statistics.wait_count);
+       }
 #endif
        P(se->load.weight);
 #ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                (long long)(p->nvcsw + p->nivcsw),
                p->prio);
 #ifdef CONFIG_SCHEDSTATS
-       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.statistics.wait_sum),
-               SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       if (schedstat_enabled()) {
+               SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+                       SPLIT_NS(p->se.statistics.wait_sum),
+                       SPLIT_NS(p->se.sum_exec_runtime),
+                       SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+       }
 #else
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 
 void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 {
+       struct dl_bw *dl_bw;
+
        SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
        SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+       dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+       dl_bw = &dl_rq->dl_bw;
+#endif
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+       SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
 }
 
 extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do {                                                                      \
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 #define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
-       P(yld_count);
-
-       P(sched_count);
-       P(sched_goidle);
 #ifdef CONFIG_SMP
        P64(avg_idle);
        P64(max_idle_balance_cost);
 #endif
 
-       P(ttwu_count);
-       P(ttwu_local);
+       if (schedstat_enabled()) {
+               P(yld_count);
+               P(sched_count);
+               P(sched_goidle);
+               P(ttwu_count);
+               P(ttwu_local);
+       }
 
 #undef P
 #undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-       PN(se.statistics.sum_sleep_runtime);
-       PN(se.statistics.wait_start);
-       PN(se.statistics.sleep_start);
-       PN(se.statistics.block_start);
-       PN(se.statistics.sleep_max);
-       PN(se.statistics.block_max);
-       PN(se.statistics.exec_max);
-       PN(se.statistics.slice_max);
-       PN(se.statistics.wait_max);
-       PN(se.statistics.wait_sum);
-       P(se.statistics.wait_count);
-       PN(se.statistics.iowait_sum);
-       P(se.statistics.iowait_count);
        P(se.nr_migrations);
-       P(se.statistics.nr_migrations_cold);
-       P(se.statistics.nr_failed_migrations_affine);
-       P(se.statistics.nr_failed_migrations_running);
-       P(se.statistics.nr_failed_migrations_hot);
-       P(se.statistics.nr_forced_migrations);
-       P(se.statistics.nr_wakeups);
-       P(se.statistics.nr_wakeups_sync);
-       P(se.statistics.nr_wakeups_migrate);
-       P(se.statistics.nr_wakeups_local);
-       P(se.statistics.nr_wakeups_remote);
-       P(se.statistics.nr_wakeups_affine);
-       P(se.statistics.nr_wakeups_affine_attempts);
-       P(se.statistics.nr_wakeups_passive);
-       P(se.statistics.nr_wakeups_idle);
 
-       {
+       if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
 
+               PN(se.statistics.sum_sleep_runtime);
+               PN(se.statistics.wait_start);
+               PN(se.statistics.sleep_start);
+               PN(se.statistics.block_start);
+               PN(se.statistics.sleep_max);
+               PN(se.statistics.block_max);
+               PN(se.statistics.exec_max);
+               PN(se.statistics.slice_max);
+               PN(se.statistics.wait_max);
+               PN(se.statistics.wait_sum);
+               P(se.statistics.wait_count);
+               PN(se.statistics.iowait_sum);
+               P(se.statistics.iowait_count);
+               P(se.statistics.nr_migrations_cold);
+               P(se.statistics.nr_failed_migrations_affine);
+               P(se.statistics.nr_failed_migrations_running);
+               P(se.statistics.nr_failed_migrations_hot);
+               P(se.statistics.nr_forced_migrations);
+               P(se.statistics.nr_wakeups);
+               P(se.statistics.nr_wakeups_sync);
+               P(se.statistics.nr_wakeups_migrate);
+               P(se.statistics.nr_wakeups_local);
+               P(se.statistics.nr_wakeups_remote);
+               P(se.statistics.nr_wakeups_affine);
+               P(se.statistics.nr_wakeups_affine_attempts);
+               P(se.statistics.nr_wakeups_passive);
+               P(se.statistics.nr_wakeups_idle);
+
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
                        avg_atom = div64_ul(avg_atom, nr_switches);
index 1926606..3313052 100644 (file)
@@ -20,8 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct task_struct *p;
-       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+       u64 delta;
+
+       delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
        if (entity_is_task(se)) {
                p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
        se->statistics.wait_sum += delta;
        se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        /*
         * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
         * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
+
+       if (flags & DEQUEUE_SLEEP) {
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       if (tsk->state & TASK_INTERRUPTIBLE)
+                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+                       if (tsk->state & TASK_UNINTERRUPTIBLE)
+                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+               }
+       }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
 }
 
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
 /*
  * We are picking a new current task - update its stats:
  */
@@ -907,10 +932,11 @@ struct numa_group {
        spinlock_t lock; /* nr_tasks, tasks */
        int nr_tasks;
        pid_t gid;
+       int active_nodes;
 
        struct rcu_head rcu;
-       nodemask_t active_nodes;
        unsigned long total_faults;
+       unsigned long max_faults_cpu;
        /*
         * Faults_cpu is used to decide whether memory should move
         * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+       return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                                        int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                return true;
 
        /*
-        * Do not migrate if the destination is not a node that
-        * is actively used by this numa group.
-        */
-       if (!node_isset(dst_nid, ng->active_nodes))
-               return false;
-
-       /*
-        * Source is a node that is not actively used by this
-        * numa group, while the destination is. Migrate.
+        * Destination node is much more heavily used than the source
+        * node? Allow migration.
         */
-       if (!node_isset(src_nid, ng->active_nodes))
+       if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+                                       ACTIVE_NODE_FRACTION)
                return true;
 
        /*
-        * Both source and destination are nodes in active
-        * use by this numa group. Maximize memory bandwidth
-        * by migrating from more heavily used groups, to less
-        * heavily used ones, spreading the load around.
-        * Use a 1/4 hysteresis to avoid spurious page movement.
+        * Distribute memory according to CPU & memory use on each node,
+        * with 3/4 hysteresis to avoid unnecessary memory migrations:
+        *
+        * faults_cpu(dst)   3   faults_cpu(src)
+        * --------------- * - > ---------------
+        * faults_mem(dst)   4   faults_mem(src)
         */
-       return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+       return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+              group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1220,8 +1254,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1289,20 +1321,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * The task_struct must be protected here to protect the
+                * p->numa_faults access in the task_weight since the
+                * numa_faults could already be freed in the following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1386,6 +1428,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1411,9 +1454,16 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * The dst_rq->curr isn't assigned. The protection for task_struct is
+        * finished.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -1468,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
                .best_task = NULL,
                .best_imp = 0,
-               .best_cpu = -1
+               .best_cpu = -1,
        };
        struct sched_domain *sd;
        unsigned long taskweight, groupweight;
@@ -1520,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
         *   multiple NUMA nodes; in order to better consolidate the group,
         *   we need to check other locations.
         */
-       if (env.best_cpu == -1 || (p->numa_group &&
-                       nodes_weight(p->numa_group->active_nodes) > 1)) {
+       if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
                for_each_online_node(nid) {
                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                continue;
@@ -1556,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
         * trying for a better one later. Do not set the preferred node here.
         */
        if (p->numa_group) {
+               struct numa_group *ng = p->numa_group;
+
                if (env.best_cpu == -1)
                        nid = env.src_nid;
                else
                        nid = env.dst_nid;
 
-               if (node_isset(nid, p->numa_group->active_nodes))
+               if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
                        sched_setnuma(p, env.dst_nid);
        }
 
@@ -1611,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
        unsigned long faults, max_faults = 0;
-       int nid;
+       int nid, active_nodes = 0;
 
        for_each_online_node(nid) {
                faults = group_faults_cpu(numa_group, nid);
@@ -1634,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
        for_each_online_node(nid) {
                faults = group_faults_cpu(numa_group, nid);
-               if (!node_isset(nid, numa_group->active_nodes)) {
-                       if (faults > max_faults * 6 / 16)
-                               node_set(nid, numa_group->active_nodes);
-               } else if (faults < max_faults * 3 / 16)
-                       node_clear(nid, numa_group->active_nodes);
+               if (faults * ACTIVE_NODE_FRACTION > max_faults)
+                       active_nodes++;
        }
+
+       numa_group->max_faults_cpu = max_faults;
+       numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1930,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
        update_task_scan_period(p, fault_types[0], fault_types[1]);
 
        if (p->numa_group) {
-               update_numa_active_node_mask(p->numa_group);
+               numa_group_count_active_nodes(p->numa_group);
                spin_unlock_irq(group_lock);
                max_nid = preferred_group_nid(p, max_group_nid);
        }
@@ -1974,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                        return;
 
                atomic_set(&grp->refcount, 1);
+               grp->active_nodes = 1;
+               grp->max_faults_cpu = 0;
                spin_lock_init(&grp->lock);
                grp->gid = p->pid;
                /* Second half of the array tracks nids where faults happen */
                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
                                                nr_node_ids;
 
-               node_set(task_node(current), grp->active_nodes);
-
                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
                        grp->faults[i] = p->numa_faults[i];
 
@@ -2095,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
        bool migrated = flags & TNF_MIGRATED;
        int cpu_node = task_node(current);
        int local = !!(flags & TNF_FAULT_LOCAL);
+       struct numa_group *ng;
        int priv;
 
        if (!static_branch_likely(&sched_numa_balancing))
@@ -2135,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         * actively using should be counted as local. This allows the
         * scan rate to slow down when a workload has settled down.
         */
-       if (!priv && !local && p->numa_group &&
-                       node_isset(cpu_node, p->numa_group->active_nodes) &&
-                       node_isset(mem_node, p->numa_group->active_nodes))
+       ng = p->numa_group;
+       if (!priv && !local && ng && ng->active_nodes > 1 &&
+                               numa_is_active_node(cpu_node, ng) &&
+                               numa_is_active_node(mem_node, ng))
                local = 1;
 
        task_numa_placement(p);
@@ -3086,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+       if (schedstat_enabled())
+               return;
+
+       /* Force schedstat enabled if a dependent tracepoint is active */
+       if (trace_sched_stat_wait_enabled()    ||
+                       trace_sched_stat_sleep_enabled()   ||
+                       trace_sched_stat_iowait_enabled()  ||
+                       trace_sched_stat_blocked_enabled() ||
+                       trace_sched_stat_runtime_enabled())  {
+               pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+                            "stat_blocked and stat_runtime require the "
+                            "kernel parameter schedstats=enabled or "
+                            "kernel.sched_schedstats=1\n");
+       }
+#endif
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3106,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
-               enqueue_sleeper(cfs_rq, se);
+               if (schedstat_enabled())
+                       enqueue_sleeper(cfs_rq, se);
        }
 
-       update_stats_enqueue(cfs_rq, se);
-       check_spread(cfs_rq, se);
+       check_schedstat_required();
+       if (schedstat_enabled()) {
+               update_stats_enqueue(cfs_rq, se);
+               check_spread(cfs_rq, se);
+       }
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
@@ -3177,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        update_curr(cfs_rq);
        dequeue_entity_load_avg(cfs_rq, se);
 
-       update_stats_dequeue(cfs_rq, se);
-       if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
-
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-               }
-#endif
-       }
+       if (schedstat_enabled())
+               update_stats_dequeue(cfs_rq, se, flags);
 
        clear_buddies(cfs_rq, se);
 
@@ -3263,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 * a CPU. So account for the time it spent waiting on the
                 * runqueue.
                 */
-               update_stats_wait_end(cfs_rq, se);
+               if (schedstat_enabled())
+                       update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
                update_load_avg(se, 1);
        }
@@ -3276,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * least twice that of our own weight (i.e. dont track it
         * when there are only lesser-weight tasks around):
         */
-       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+       if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                se->statistics.slice_max = max(se->statistics.slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
@@ -3359,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
        /* throttle cfs_rqs exceeding runtime */
        check_cfs_rq_runtime(cfs_rq);
 
-       check_spread(cfs_rq, prev);
+       if (schedstat_enabled()) {
+               check_spread(cfs_rq, prev);
+               if (prev->on_rq)
+                       update_stats_wait_start(cfs_rq, prev);
+       }
+
        if (prev->on_rq) {
-               update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
                /* in !on_rq case, update occurred at dequeue */
@@ -4443,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 
                /* scale is effectively 1 << i now, and >> i divides by scale */
 
-               old_load = this_rq->cpu_load[i] - tickless_load;
+               old_load = this_rq->cpu_load[i];
                old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               old_load += tickless_load;
+               if (tickless_load) {
+                       old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+                       /*
+                        * old_load can never be a negative value because a
+                        * decayed tickless_load cannot be greater than the
+                        * original tickless_load.
+                        */
+                       old_load += tickless_load;
+               }
                new_load = this_load;
                /*
                 * Round up the averaging division if load is increasing. This
@@ -4468,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+                                  unsigned long curr_jiffies,
+                                  unsigned long load,
+                                  int active)
+{
+       unsigned long pending_updates;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * In the regular NOHZ case, we were idle, this means load 0.
+                * In the NOHZ_FULL case, we were non-idle, we should consider
+                * its weighted load.
+                */
+               __update_cpu_load(this_rq, load, pending_updates, active);
+       }
+}
+
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4485,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
 {
-       unsigned long curr_jiffies = READ_ONCE(jiffies);
-       unsigned long load = weighted_cpuload(cpu_of(this_rq));
-       unsigned long pending_updates;
-
        /*
         * bail if there's load or we're actually up-to-date.
         */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
+       if (weighted_cpuload(cpu_of(this_rq)))
                return;
 
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates, 0);
+       __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
@@ -4511,22 +4597,12 @@ void update_cpu_load_nohz(int active)
        struct rq *this_rq = this_rq();
        unsigned long curr_jiffies = READ_ONCE(jiffies);
        unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-       unsigned long pending_updates;
 
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
        raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * In the regular NOHZ case, we were idle, this means load 0.
-                * In the NOHZ_FULL case, we were non-idle, we should consider
-                * its weighted load.
-                */
-               __update_cpu_load(this_rq, load, pending_updates, active);
-       }
+       __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
        raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4538,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
 {
        unsigned long load = weighted_cpuload(cpu_of(this_rq));
        /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
         */
        this_rq->last_load_update_tick = jiffies;
        __update_cpu_load(this_rq, load, 1, 1);
@@ -7832,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                if (time_after_eq(jiffies, rq->next_balance)) {
                        raw_spin_lock_irq(&rq->lock);
                        update_rq_clock(rq);
-                       update_idle_cpu_load(rq);
+                       update_cpu_load_idle(rq);
                        raw_spin_unlock_irq(&rq->lock);
                        rebalance_domains(rq, CPU_IDLE);
                }
@@ -8218,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
        for_each_possible_cpu(i) {
                if (tg->cfs_rq)
                        kfree(tg->cfs_rq[i]);
-               if (tg->se) {
-                       if (tg->se[i])
-                               remove_entity_load_avg(tg->se[i]);
+               if (tg->se)
                        kfree(tg->se[i]);
-               }
        }
 
        kfree(tg->cfs_rq);
@@ -8270,21 +8343,29 @@ err:
        return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-       struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
+       struct rq *rq;
+       int cpu;
 
-       /*
-       * Only empty task groups can be destroyed; so we can speculatively
-       * check on_list without danger of it being re-added.
-       */
-       if (!tg->cfs_rq[cpu]->on_list)
-               return;
+       for_each_possible_cpu(cpu) {
+               if (tg->se[cpu])
+                       remove_entity_load_avg(tg->se[cpu]);
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+               /*
+                * Only empty task groups can be destroyed; so we can speculatively
+                * check on_list without danger of it being re-added.
+                */
+               if (!tg->cfs_rq[cpu]->on_list)
+                       continue;
+
+               rq = cpu_rq(cpu);
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8366,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
index de0e786..bd12c6c 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 #include <linux/cpu.h>
 #include <linux/cpuidle.h>
+#include <linux/cpuhotplug.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -162,7 +163,7 @@ static void cpuidle_idle_call(void)
         */
        if (idle_should_freeze()) {
                entered_state = cpuidle_enter_freeze(drv, dev);
-               if (entered_state >= 0) {
+               if (entered_state > 0) {
                        local_irq_enable();
                        goto exit_idle;
                }
@@ -193,8 +194,6 @@ exit_idle:
        rcu_idle_exit();
 }
 
-DEFINE_PER_CPU(bool, cpu_dead_idle);
-
 /*
  * Generic idle loop implementation
  *
@@ -221,10 +220,7 @@ static void cpu_idle_loop(void)
                        rmb();
 
                        if (cpu_is_offline(smp_processor_id())) {
-                               rcu_cpu_notify(NULL, CPU_DYING_IDLE,
-                                              (void *)(long)smp_processor_id());
-                               smp_mb(); /* all activity before dead. */
-                               this_cpu_write(cpu_dead_idle, true);
+                               cpuhp_report_idle_dead();
                                arch_cpu_idle_dead();
                        }
 
@@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state)
        boot_init_stack_canary();
 #endif
        arch_cpu_idle_prepare();
+       cpuhp_online_idle(state);
        cpu_idle_loop();
 }
index 8ec86ab..5624713 100644 (file)
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        raw_spin_lock(&rt_b->rt_runtime_lock);
        if (!rt_b->rt_period_active) {
                rt_b->rt_period_active = 1;
-               hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+               /*
+                * SCHED_DEADLINE updates the bandwidth, as a run away
+                * RT task with a DL task could hog a CPU. But DL does
+                * not reset the period. If a deadline task was running
+                * without an RT task running, it can cause RT tasks to
+                * throttle when they start up. Kick the timer right away
+                * to update the period.
+                */
+               hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
                hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
        }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-       return !list_empty(&rt_se->run_list);
+       return rt_se->on_rq;
 }
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return rt_se->my_q;
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                if (!rt_se)
                        enqueue_top_rt_rq(rt_rq);
                else if (!on_rt_rq(rt_se))
-                       enqueue_rt_entity(rt_se, false);
+                       enqueue_rt_entity(rt_se, 0);
 
                if (rt_rq->highest_prio.curr < curr->prio)
                        resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
        if (!rt_se)
                dequeue_top_rt_rq(rt_rq);
        else if (on_rt_rq(rt_se))
-               dequeue_rt_entity(rt_se);
+               dequeue_rt_entity(rt_se, 0);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1141,6 +1149,20 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
                return 1;
 }
 
+static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct task_struct *tsk;
+
+       if (group_rq)
+               return group_rq->rr_nr_running;
+
+       tsk = rt_task_of(rt_se);
+
+       return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
@@ -1148,6 +1170,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
        WARN_ON(!rt_prio(prio));
        rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
        inc_rt_prio(rt_rq, prio);
        inc_rt_migration(rt_se, rt_rq);
@@ -1160,13 +1183,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+       rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
        dec_rt_migration(rt_se, rt_rq);
        dec_rt_group(rt_se, rt_rq);
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+               return false;
+
+       return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+       list_del_init(&rt_se->run_list);
+
+       if (list_empty(array->queue + rt_se_prio(rt_se)))
+               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+       rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1226,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
         * get throttled and the current group doesn't have any other
         * active members.
         */
-       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+               if (rt_se->on_list)
+                       __delist_rt_entity(rt_se, array);
                return;
+       }
 
-       if (head)
-               list_add(&rt_se->run_list, queue);
-       else
-               list_add_tail(&rt_se->run_list, queue);
-       __set_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(rt_se->on_list);
+               if (flags & ENQUEUE_HEAD)
+                       list_add(&rt_se->run_list, queue);
+               else
+                       list_add_tail(&rt_se->run_list, queue);
+
+               __set_bit(rt_se_prio(rt_se), array->bitmap);
+               rt_se->on_list = 1;
+       }
+       rt_se->on_rq = 1;
 
        inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
 
-       list_del_init(&rt_se->run_list);
-       if (list_empty(array->queue + rt_se_prio(rt_se)))
-               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(!rt_se->on_list);
+               __delist_rt_entity(rt_se, array);
+       }
+       rt_se->on_rq = 0;
 
        dec_rt_tasks(rt_se, rt_rq);
 }
@@ -1207,7 +1265,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct sched_rt_entity *back = NULL;
 
@@ -1220,31 +1278,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
-                       __dequeue_rt_entity(rt_se);
+                       __dequeue_rt_entity(rt_se, flags);
        }
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
        for_each_sched_rt_entity(rt_se)
-               __enqueue_rt_entity(rt_se, head);
+               __enqueue_rt_entity(rt_se, flags);
        enqueue_top_rt_rq(&rq->rt);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
 
        for_each_sched_rt_entity(rt_se) {
                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
                if (rt_rq && rt_rq->rt_nr_running)
-                       __enqueue_rt_entity(rt_se, false);
+                       __enqueue_rt_entity(rt_se, flags);
        }
        enqueue_top_rt_rq(&rq->rt);
 }
@@ -1260,7 +1318,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+       enqueue_rt_entity(rt_se, flags);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1271,7 +1329,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        struct sched_rt_entity *rt_se = &p->rt;
 
        update_curr_rt(rq);
-       dequeue_rt_entity(rt_se);
+       dequeue_rt_entity(rt_se, flags);
 
        dequeue_pushable_task(rq, p);
 }
index 10f1637..b2ff5a2 100644 (file)
@@ -3,6 +3,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
 
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                        struct sched_entity *se, int cpu,
                        struct sched_entity *parent);
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)
 struct rt_rq {
        struct rt_prio_array active;
        unsigned int rt_nr_running;
+       unsigned int rr_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        struct {
                int curr; /* highest queued rt task prio */
@@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
 
 static inline u64 global_rt_period(void)
 {
@@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 extern const int sched_prio_to_weight[40];
 extern const u32 sched_prio_to_wmult[40];
 
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING    - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP          0x01
+#define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+
 #define ENQUEUE_WAKEUP         0x01
-#define ENQUEUE_HEAD           0x02
+#define ENQUEUE_RESTORE                0x02
+#define ENQUEUE_MOVE           0x04
+
+#define ENQUEUE_HEAD           0x08
+#define ENQUEUE_REPLENISH      0x10
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING         0x04    /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING         0x20
 #else
 #define ENQUEUE_WAKING         0x00
 #endif
-#define ENQUEUE_REPLENISH      0x08
-#define ENQUEUE_RESTORE        0x10
-
-#define DEQUEUE_SLEEP          0x01
-#define DEQUEUE_SAVE           0x02
 
 #define RETRY_TASK             ((void *)-1UL)
 
@@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If tick is needed, lets send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+       int cpu;
+
+       if (!tick_nohz_full_enabled())
+               return;
+
+       cpu = cpu_of(rq);
+
+       if (!tick_nohz_full_cpu(cpu))
+               return;
+
+       if (sched_can_stop_tick(rq))
+               tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+       else
+               tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
        unsigned prev_nr = rq->nr_running;
@@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
                if (!rq->rd->overload)
                        rq->rd->overload = true;
 #endif
-
-#ifdef CONFIG_NO_HZ_FULL
-               if (tick_nohz_full_cpu(rq->cpu)) {
-                       /*
-                        * Tick is needed if more than one task runs on a CPU.
-                        * Send the target an IPI to kick it out of nohz mode.
-                        *
-                        * We assume that IPI implies full memory barrier and the
-                        * new value of rq->nr_running is visible on reception
-                        * from the target.
-                        */
-                       tick_nohz_full_kick_cpu(rq->cpu);
-               }
-#endif
        }
+
+       sched_update_tick_dependency(rq);
 }
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
        rq->nr_running -= count;
+       /* Check if we still need preemption */
+       sched_update_tick_dependency(rq);
 }
 
 static inline void rq_last_tick_reset(struct rq *rq)
index b0fbc76..70b3b6a 100644 (file)
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
        if (rq)
                rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)       do { var = (val); } while (0)
+# define schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field)      do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
+# define schedstat_enabled()           0
 # define schedstat_inc(rq, field)      do { } while (0)
 # define schedstat_add(rq, field, amt) do { } while (0)
 # define schedstat_set(var, val)       do { } while (0)
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644 (file)
index 0000000..82f0dff
--- /dev/null
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+                            struct lock_class_key *key)
+{
+       raw_spin_lock_init(&q->lock);
+       lockdep_set_class_and_name(&q->lock, key, name);
+       INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+
+       if (list_empty(&q->task_list))
+               return;
+
+       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+       wake_up_process(curr->task);
+       list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+       unsigned long flags;
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       swake_up_locked(q);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+       struct swait_queue *curr;
+       LIST_HEAD(tmp);
+
+       if (!swait_active(q))
+               return;
+
+       raw_spin_lock_irq(&q->lock);
+       list_splice_init(&q->task_list, &tmp);
+       while (!list_empty(&tmp)) {
+               curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+               wake_up_state(curr->task, TASK_NORMAL);
+               list_del_init(&curr->task_list);
+
+               if (list_empty(&tmp))
+                       break;
+
+               raw_spin_unlock_irq(&q->lock);
+               raw_spin_lock_irq(&q->lock);
+       }
+       raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       wait->task = current;
+       if (list_empty(&wait->task_list))
+               list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&q->lock, flags);
+       __prepare_to_swait(q, wait);
+       set_current_state(state);
+       raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+       if (signal_pending_state(state, current))
+               return -ERESTARTSYS;
+
+       prepare_to_swait(q, wait, state);
+
+       return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       __set_current_state(TASK_RUNNING);
+       if (!list_empty(&wait->task_list))
+               list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+       unsigned long flags;
+
+       __set_current_state(TASK_RUNNING);
+
+       if (!list_empty_careful(&wait->task_list)) {
+               raw_spin_lock_irqsave(&q->lock, flags);
+               list_del_init(&wait->task_list);
+               raw_spin_unlock_irqrestore(&q->lock, flags);
+       }
+}
+EXPORT_SYMBOL(finish_swait);
index 580ac2d..15a1795 100644 (file)
@@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void)
                put_seccomp_filter(thread);
                smp_store_release(&thread->seccomp.filter,
                                  caller->seccomp.filter);
+
+               /*
+                * Don't let an unprivileged task work around
+                * the no_new_privs restriction by creating
+                * a thread that sets it up, enters seccomp,
+                * then dies.
+                */
+               if (task_no_new_privs(caller))
+                       task_set_no_new_privs(thread);
+
                /*
                 * Opt the other thread into seccomp if needed.
                 * As threads are considered to be trust-realm
                 * equivalent (see ptrace_may_access), it is safe to
                 * allow one thread to transition the other.
                 */
-               if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
-                       /*
-                        * Don't let an unprivileged task work around
-                        * the no_new_privs restriction by creating
-                        * a thread that sets it up, enters seccomp,
-                        * then dies.
-                        */
-                       if (task_no_new_privs(caller))
-                               task_set_no_new_privs(thread);
-
+               if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
                        seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
-               }
        }
 }
 
index f3f1f7a..0508544 100644 (file)
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
        current->saved_sigmask = current->blocked;
        set_current_blocked(set);
 
-       __set_current_state(TASK_INTERRUPTIBLE);
-       schedule();
+       while (!signal_pending(current)) {
+               __set_current_state(TASK_INTERRUPTIBLE);
+               schedule();
+       }
        set_restore_sigmask();
        return -ERESTARTNOHAND;
 }
index d903c02..7416544 100644 (file)
@@ -105,13 +105,12 @@ void __init call_function_init(void)
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(struct call_single_data *csd)
 {
-       while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
-               cpu_relax();
+       smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
 }
 
-static void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(struct call_single_data *csd)
 {
        csd_lock_wait(csd);
        csd->flags |= CSD_FLAG_LOCK;
@@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd)
        smp_wmb();
 }
 
-static void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(struct call_single_data *csd)
 {
        WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
 
@@ -569,6 +568,7 @@ void __init smp_init(void)
        unsigned int cpu;
 
        idle_threads_init();
+       cpuhp_threads_init();
 
        /* FIXME: This should be done in userspace --RR */
        for_each_present_cpu(cpu) {
index d264f59..13bc43d 100644 (file)
@@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
                kthread_unpark(tsk);
 }
 
-void smpboot_unpark_threads(unsigned int cpu)
+int smpboot_unpark_threads(unsigned int cpu)
 {
        struct smp_hotplug_thread *cur;
 
@@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu)
                if (cpumask_test_cpu(cpu, cur->cpumask))
                        smpboot_unpark_thread(cur, cpu);
        mutex_unlock(&smpboot_threads_lock);
+       return 0;
 }
 
 static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
@@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
                kthread_park(tsk);
 }
 
-void smpboot_park_threads(unsigned int cpu)
+int smpboot_park_threads(unsigned int cpu)
 {
        struct smp_hotplug_thread *cur;
 
@@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu)
        list_for_each_entry_reverse(cur, &hotplug_threads, list)
                smpboot_park_thread(cur, cpu);
        mutex_unlock(&smpboot_threads_lock);
+       return 0;
 }
 
 static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
index 72415a0..485b81c 100644 (file)
@@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { }
 #endif
 
 int smpboot_create_threads(unsigned int cpu);
-void smpboot_park_threads(unsigned int cpu);
-void smpboot_unpark_threads(unsigned int cpu);
+int smpboot_park_threads(unsigned int cpu);
+int smpboot_unpark_threads(unsigned int cpu);
+
+void __init cpuhp_threads_init(void);
 
 #endif
index 479e443..8aae49d 100644 (file)
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 
        if (preempt_count() == cnt) {
 #ifdef CONFIG_DEBUG_PREEMPT
-               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+               current->preempt_disable_ip = get_lock_parent_ip();
 #endif
-               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
        }
 }
 EXPORT_SYMBOL(__local_bh_disable_ip);
index 97715fd..f5102fa 100644 (file)
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SCHEDSTATS
+       {
+               .procname       = "sched_schedstats",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_schedstats,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
        {
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
                .data           = &latencytop_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = sysctl_latencytop,
        },
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
index 664de53..56ece14 100644 (file)
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
                /* cs is a watchdog. */
                if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
                        cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+       }
+       spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_select_watchdog(bool fallback)
+{
+       struct clocksource *cs, *old_wd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&watchdog_lock, flags);
+       /* save current watchdog */
+       old_wd = watchdog;
+       if (fallback)
+               watchdog = NULL;
+
+       list_for_each_entry(cs, &clocksource_list, list) {
+               /* cs is a clocksource to be watched. */
+               if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
+                       continue;
+
+               /* Skip current if we were requested for a fallback. */
+               if (fallback && cs == old_wd)
+                       continue;
+
                /* Pick the best watchdog. */
-               if (!watchdog || cs->rating > watchdog->rating) {
+               if (!watchdog || cs->rating > watchdog->rating)
                        watchdog = cs;
-                       /* Reset watchdog cycles */
-                       clocksource_reset_watchdog();
-               }
        }
+       /* If we failed to find a fallback restore the old one. */
+       if (!watchdog)
+               watchdog = old_wd;
+
+       /* If we changed the watchdog we need to reset cycles. */
+       if (watchdog != old_wd)
+               clocksource_reset_watchdog();
+
        /* Check if the watchdog timer needs to be started. */
        clocksource_start_watchdog();
        spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 }
 
+static void clocksource_select_watchdog(bool fallback) { }
 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
 static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
        clocksource_enqueue(cs);
        clocksource_enqueue_watchdog(cs);
        clocksource_select();
+       clocksource_select_watchdog(false);
        mutex_unlock(&clocksource_mutex);
        return 0;
 }
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
        mutex_lock(&clocksource_mutex);
        __clocksource_change_rating(cs, rating);
        clocksource_select();
+       clocksource_select_watchdog(false);
        mutex_unlock(&clocksource_mutex);
 }
 EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
  */
 static int clocksource_unbind(struct clocksource *cs)
 {
-       /*
-        * I really can't convince myself to support this on hardware
-        * designed by lobotomized monkeys.
-        */
-       if (clocksource_is_watchdog(cs))
-               return -EBUSY;
+       if (clocksource_is_watchdog(cs)) {
+               /* Select and try to install a replacement watchdog. */
+               clocksource_select_watchdog(true);
+               if (clocksource_is_watchdog(cs))
+                       return -EBUSY;
+       }
 
        if (cs == curr_clocksource) {
                /* Select and try to install a replacement clock source */
index 435b885..fa909f9 100644 (file)
@@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer,
  */
 static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
-                            unsigned long newstate, int reprogram)
+                            u8 newstate, int reprogram)
 {
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-       unsigned int state = timer->state;
+       u8 state = timer->state;
 
        timer->state = newstate;
        if (!(state & HRTIMER_STATE_ENQUEUED))
@@ -930,7 +930,7 @@ static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
        if (hrtimer_is_queued(timer)) {
-               unsigned long state = timer->state;
+               u8 state = timer->state;
                int reprogram;
 
                /*
@@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
        return 0;
 }
 
+static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
+                                           const enum hrtimer_mode mode)
+{
+#ifdef CONFIG_TIME_LOW_RES
+       /*
+        * CONFIG_TIME_LOW_RES indicates that the system has no way to return
+        * granular time values. For relative timers we add hrtimer_resolution
+        * (i.e. one jiffie) to prevent short timeouts.
+        */
+       timer->is_rel = mode & HRTIMER_MODE_REL;
+       if (timer->is_rel)
+               tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+#endif
+       return tim;
+}
+
 /**
  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
  * @timer:     the timer to be added
@@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
        /* Remove an active timer from the queue: */
        remove_hrtimer(timer, base, true);
 
-       if (mode & HRTIMER_MODE_REL) {
+       if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());
-               /*
-                * CONFIG_TIME_LOW_RES is a temporary way for architectures
-                * to signal that they simply return xtime in
-                * do_gettimeoffset(). In this case we want to round up by
-                * resolution when starting a relative timer, to avoid short
-                * timeouts. This will go away with the GTOD framework.
-                */
-#ifdef CONFIG_TIME_LOW_RES
-               tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
-#endif
-       }
+
+       tim = hrtimer_update_lowres(timer, tim, mode);
 
        hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
@@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
 /**
  * hrtimer_get_remaining - get remaining time for the timer
  * @timer:     the timer to read
+ * @adjust:    adjust relative timers when CONFIG_TIME_LOW_RES=y
  */
-ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
 {
        unsigned long flags;
        ktime_t rem;
 
        lock_hrtimer_base(timer, &flags);
-       rem = hrtimer_expires_remaining(timer);
+       if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
+               rem = hrtimer_expires_remaining_adjusted(timer);
+       else
+               rem = hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);
 
        return rem;
 }
-EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
 
 #ifdef CONFIG_NO_HZ_COMMON
 /**
@@ -1219,6 +1230,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
 
+       /*
+        * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
+        * timer is restarted with a period then it becomes an absolute
+        * timer. If its not restarted it does not matter.
+        */
+       if (IS_ENABLED(CONFIG_TIME_LOW_RES))
+               timer->is_rel = false;
+
        /*
         * Because we run timers from hardirq context, there is no chance
         * they get migrated to another cpu, therefore its safe to unlock
index 8d262b4..1d5c720 100644 (file)
@@ -26,7 +26,7 @@
  */
 static struct timeval itimer_get_remtime(struct hrtimer *timer)
 {
-       ktime_t rem = hrtimer_get_remaining(timer);
+       ktime_t rem = __hrtimer_get_remaining(timer, true);
 
        /*
         * Racy but safe: if the itimer expires after the above
index 347fecf..555e21f 100644 (file)
@@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = {
        .name           = "jiffies",
        .rating         = 1, /* lowest valid rating*/
        .read           = jiffies_read,
-       .mask           = 0xffffffff, /*32bits*/
+       .mask           = CLOCKSOURCE_MASK(32),
        .mult           = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
        .shift          = JIFFIES_SHIFT,
        .max_cycles     = 10,
index 36f2ca0..6df8927 100644 (file)
@@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc)
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;
 
-               if (!timeval_inject_offset_valid(&txc->time))
-                       return -EINVAL;
+               if (txc->modes & ADJ_NANO) {
+                       struct timespec ts;
+
+                       ts.tv_sec = txc->time.tv_sec;
+                       ts.tv_nsec = txc->time.tv_usec;
+                       if (!timespec_inject_offset_valid(&ts))
+                               return -EINVAL;
+
+               } else {
+                       if (!timeval_inject_offset_valid(&txc->time))
+                               return -EINVAL;
+               }
        }
 
        /*
index f5e86d2..1cafba8 100644 (file)
@@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
        return err;
 }
 
-
 /*
  * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
@@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer)
                                cputime_expires->sched_exp = exp;
                        break;
                }
+               if (CPUCLOCK_PERTHREAD(timer->it_clock))
+                       tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
+               else
+                       tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
        }
 }
 
@@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
        return 0;
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-static void nohz_kick_work_fn(struct work_struct *work)
-{
-       tick_nohz_full_kick_all();
-}
-
-static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
-
-/*
- * We need the IPIs to be sent from sane process context.
- * The posix cpu timers are always set with irqs disabled.
- */
-static void posix_cpu_timer_kick_nohz(void)
-{
-       if (context_tracking_is_enabled())
-               schedule_work(&nohz_kick_work);
-}
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
-{
-       if (!task_cputime_zero(&tsk->cputime_expires))
-               return false;
-
-       /* Check if cputimer is running. This is accessed without locking. */
-       if (READ_ONCE(tsk->signal->cputimer.running))
-               return false;
-
-       return true;
-}
-#else
-static inline void posix_cpu_timer_kick_nohz(void) { }
-#endif
-
 /*
  * Guts of sys_timer_settime for CPU timers.
  * This is called with the timer locked and interrupts disabled.
@@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
                sample_to_timespec(timer->it_clock,
                                   old_incr, &old->it_interval);
        }
-       if (!ret)
-               posix_cpu_timer_kick_nohz();
+
        return ret;
 }
 
@@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk,
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                }
        }
+       if (task_cputime_zero(tsk_expires))
+               tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static inline void stop_process_timers(struct signal_struct *sig)
@@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
 
        /* Turn off cputimer->running. This is done without locking. */
        WRITE_ONCE(cputimer->running, false);
+       tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static u32 onecputick;
@@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
        arm_timer(timer);
        unlock_task_sighand(p, &flags);
 
-       /* Kick full dynticks CPUs in case they need to tick on the new timer */
-       posix_cpu_timer_kick_nohz();
 out:
        timer->it_overrun_last = timer->it_overrun;
        timer->it_overrun = -1;
@@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                }
 
                if (!*newval)
-                       goto out;
+                       return;
                *newval += now;
        }
 
@@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                        tsk->signal->cputime_expires.virt_exp = *newval;
                break;
        }
-out:
-       posix_cpu_timer_kick_nohz();
+
+       tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
index 31d11ac..f2826c3 100644 (file)
@@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
            (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
                timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
 
-       remaining = ktime_sub(hrtimer_get_expires(timer), now);
+       remaining = __hrtimer_expires_remaining_adjusted(timer, now);
        /* Return 0 only, when the timer is expired and not pending */
        if (remaining.tv64 <= 0) {
                /*
index 9d7a053..969e670 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
-#include <linux/perf_event.h>
 #include <linux/context_tracking.h>
 
 #include <asm/irq_regs.h>
  */
 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
 
-/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
- */
-static ktime_t last_jiffies_update;
-
 struct tick_sched *tick_get_tick_sched(int cpu)
 {
        return &per_cpu(tick_cpu_sched, cpu);
 }
 
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+/*
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ */
+static ktime_t last_jiffies_update;
+
 /*
  * Must be called with interrupts disabled !
  */
@@ -151,59 +151,69 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
 }
+#endif
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
 cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
+static unsigned long tick_dep_mask;
 
-static bool can_stop_full_tick(void)
+static void trace_tick_dependency(unsigned long dep)
+{
+       if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+               trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+               trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_SCHED) {
+               trace_tick_stop(0, TICK_DEP_MASK_SCHED);
+               return;
+       }
+
+       if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+               trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+}
+
+static bool can_stop_full_tick(struct tick_sched *ts)
 {
        WARN_ON_ONCE(!irqs_disabled());
 
-       if (!sched_can_stop_tick()) {
-               trace_tick_stop(0, "more than 1 task in runqueue\n");
+       if (tick_dep_mask) {
+               trace_tick_dependency(tick_dep_mask);
                return false;
        }
 
-       if (!posix_cpu_timers_can_stop_tick(current)) {
-               trace_tick_stop(0, "posix timers running\n");
+       if (ts->tick_dep_mask) {
+               trace_tick_dependency(ts->tick_dep_mask);
                return false;
        }
 
-       if (!perf_event_can_stop_tick()) {
-               trace_tick_stop(0, "perf events running\n");
+       if (current->tick_dep_mask) {
+               trace_tick_dependency(current->tick_dep_mask);
                return false;
        }
 
-       /* sched_clock_tick() needs us? */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-       /*
-        * TODO: kick full dynticks CPUs when
-        * sched_clock_stable is set.
-        */
-       if (!sched_clock_stable()) {
-               trace_tick_stop(0, "unstable sched clock\n");
-               /*
-                * Don't allow the user to think they can get
-                * full NO_HZ with this machine.
-                */
-               WARN_ONCE(tick_nohz_full_running,
-                         "NO_HZ FULL will not work with unstable sched clock");
+       if (current->signal->tick_dep_mask) {
+               trace_tick_dependency(current->signal->tick_dep_mask);
                return false;
        }
-#endif
 
        return true;
 }
 
-static void nohz_full_kick_work_func(struct irq_work *work)
+static void nohz_full_kick_func(struct irq_work *work)
 {
        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
 }
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
-       .func = nohz_full_kick_work_func,
+       .func = nohz_full_kick_func,
 };
 
 /*
@@ -212,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
  * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
  * is NMI safe.
  */
-void tick_nohz_full_kick(void)
+static void tick_nohz_full_kick(void)
 {
        if (!tick_nohz_full_cpu(smp_processor_id()))
                return;
@@ -232,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu)
        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
-static void nohz_full_kick_ipi(void *info)
-{
-       /* Empty, the tick restart happens on tick_nohz_irq_exit() */
-}
-
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
  */
-void tick_nohz_full_kick_all(void)
+static void tick_nohz_full_kick_all(void)
 {
+       int cpu;
+
        if (!tick_nohz_full_running)
                return;
 
        preempt_disable();
-       smp_call_function_many(tick_nohz_full_mask,
-                              nohz_full_kick_ipi, NULL, false);
-       tick_nohz_full_kick();
+       for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
+               tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
 }
 
+static void tick_nohz_dep_set_all(unsigned long *dep,
+                                 enum tick_dep_bits bit)
+{
+       unsigned long prev;
+
+       prev = fetch_or(dep, BIT_MASK(bit));
+       if (!prev)
+               tick_nohz_full_kick_all();
+}
+
+/*
+ * Set a global tick dependency. Used by perf events that rely on freq and
+ * by unstable clock.
+ */
+void tick_nohz_dep_set(enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear(enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tick_dep_mask);
+}
+
+/*
+ * Set per-CPU tick dependency. Used by scheduler and perf events in order to
+ * manage events throttling.
+ */
+void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+       unsigned long prev;
+       struct tick_sched *ts;
+
+       ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+       if (!prev) {
+               preempt_disable();
+               /* Perf needs local kick that is NMI safe */
+               if (cpu == smp_processor_id()) {
+                       tick_nohz_full_kick();
+               } else {
+                       /* Remote irq work not NMI-safe */
+                       if (!WARN_ON_ONCE(in_nmi()))
+                               tick_nohz_full_kick_cpu(cpu);
+               }
+               preempt_enable();
+       }
+}
+
+void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+       struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+       clear_bit(bit, &ts->tick_dep_mask);
+}
+
+/*
+ * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
+ * per task timers.
+ */
+void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       /*
+        * We could optimize this with just kicking the target running the task
+        * if that noise matters for nohz full users.
+        */
+       tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &tsk->tick_dep_mask);
+}
+
+/*
+ * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
+ * per process timers.
+ */
+void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+       clear_bit(bit, &sig->tick_dep_mask);
+}
+
 /*
  * Re-evaluate the need for the tick as we switch the current task.
  * It might need the tick due to per task/process properties:
@@ -261,15 +356,19 @@ void tick_nohz_full_kick_all(void)
 void __tick_nohz_task_switch(void)
 {
        unsigned long flags;
+       struct tick_sched *ts;
 
        local_irq_save(flags);
 
        if (!tick_nohz_full_cpu(smp_processor_id()))
                goto out;
 
-       if (tick_nohz_tick_stopped() && !can_stop_full_tick())
-               tick_nohz_full_kick();
+       ts = this_cpu_ptr(&tick_cpu_sched);
 
+       if (ts->tick_stopped) {
+               if (current->tick_dep_mask || current->signal->tick_dep_mask)
+                       tick_nohz_full_kick();
+       }
 out:
        local_irq_restore(flags);
 }
@@ -687,7 +786,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 
                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                ts->tick_stopped = 1;
-               trace_tick_stop(1, " ");
+               trace_tick_stop(1, TICK_DEP_MASK_NONE);
        }
 
        /*
@@ -738,7 +837,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
                return;
 
-       if (can_stop_full_tick())
+       if (can_stop_full_tick(ts))
                tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
        else if (ts->tick_stopped)
                tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
@@ -993,9 +1092,9 @@ static void tick_nohz_switch_to_nohz(void)
        /* Get the next period */
        next = tick_init_jiffy_update();
 
-       hrtimer_forward_now(&ts->sched_timer, tick_period);
        hrtimer_set_expires(&ts->sched_timer, next);
-       tick_program_event(next, 1);
+       hrtimer_forward_now(&ts->sched_timer, tick_period);
+       tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
        tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
index a4a8d4e..eb4e325 100644 (file)
@@ -60,6 +60,7 @@ struct tick_sched {
        u64                             next_timer;
        ktime_t                         idle_expires;
        int                             do_timer_last;
+       unsigned long                   tick_dep_mask;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
index 34b4ced..9c629bb 100644 (file)
@@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;
 
+       ++tk->cs_was_changed_seq;
        old_clock = tk->tkr_mono.clock;
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.read = clock->read;
@@ -298,17 +299,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
 static inline u32 arch_gettimeoffset(void) { return 0; }
 #endif
 
+static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+                                         cycle_t delta)
+{
+       s64 nsec;
+
+       nsec = delta * tkr->mult + tkr->xtime_nsec;
+       nsec >>= tkr->shift;
+
+       /* If arch requires, add in get_arch_timeoffset() */
+       return nsec + arch_gettimeoffset();
+}
+
 static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
 {
        cycle_t delta;
-       s64 nsec;
 
        delta = timekeeping_get_delta(tkr);
+       return timekeeping_delta_to_ns(tkr, delta);
+}
 
-       nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+                                           cycle_t cycles)
+{
+       cycle_t delta;
 
-       /* If arch requires, add in get_arch_timeoffset() */
-       return nsec + arch_gettimeoffset();
+       /* calculate the delta since the last update_wall_time */
+       delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+       return timekeeping_delta_to_ns(tkr, delta);
 }
 
 /**
@@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void)
        return tk->xtime_sec;
 }
 
+/**
+ * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
+ * @systime_snapshot:  pointer to struct receiving the system time snapshot
+ */
+void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
+{
+       struct timekeeper *tk = &tk_core.timekeeper;
+       unsigned long seq;
+       ktime_t base_raw;
+       ktime_t base_real;
+       s64 nsec_raw;
+       s64 nsec_real;
+       cycle_t now;
 
-#ifdef CONFIG_NTP_PPS
+       WARN_ON_ONCE(timekeeping_suspended);
+
+       do {
+               seq = read_seqcount_begin(&tk_core.seq);
+
+               now = tk->tkr_mono.read(tk->tkr_mono.clock);
+               systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
+               systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
+               base_real = ktime_add(tk->tkr_mono.base,
+                                     tk_core.timekeeper.offs_real);
+               base_raw = tk->tkr_raw.base;
+               nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
+               nsec_raw  = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+       } while (read_seqcount_retry(&tk_core.seq, seq));
+
+       systime_snapshot->cycles = now;
+       systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+       systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_snapshot);
+
+/* Scale base by mult/div checking for overflow */
+static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
+{
+       u64 tmp, rem;
+
+       tmp = div64_u64_rem(*base, div, &rem);
+
+       if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
+           ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
+               return -EOVERFLOW;
+       tmp *= mult;
+       rem *= mult;
+
+       do_div(rem, div);
+       *base = tmp + rem;
+       return 0;
+}
 
 /**
- * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
- * @ts_raw:    pointer to the timespec to be set to raw monotonic time
- * @ts_real:   pointer to the timespec to be set to the time of day
+ * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
+ * @history:                   Snapshot representing start of history
+ * @partial_history_cycles:    Cycle offset into history (fractional part)
+ * @total_history_cycles:      Total history length in cycles
+ * @discontinuity:             True indicates clock was set on history period
+ * @ts:                                Cross timestamp that should be adjusted using
+ *     partial/total ratio
  *
- * This function reads both the time of day and raw monotonic time at the
- * same time atomically and stores the resulting timestamps in timespec
- * format.
+ * Helper function used by get_device_system_crosststamp() to correct the
+ * crosstimestamp corresponding to the start of the current interval to the
+ * system counter value (timestamp point) provided by the driver. The
+ * total_history_* quantities are the total history starting at the provided
+ * reference point and ending at the start of the current interval. The cycle
+ * count between the driver timestamp point and the start of the current
+ * interval is partial_history_cycles.
  */
-void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
+static int adjust_historical_crosststamp(struct system_time_snapshot *history,
+                                        cycle_t partial_history_cycles,
+                                        cycle_t total_history_cycles,
+                                        bool discontinuity,
+                                        struct system_device_crosststamp *ts)
 {
        struct timekeeper *tk = &tk_core.timekeeper;
-       unsigned long seq;
-       s64 nsecs_raw, nsecs_real;
+       u64 corr_raw, corr_real;
+       bool interp_forward;
+       int ret;
 
-       WARN_ON_ONCE(timekeeping_suspended);
+       if (total_history_cycles == 0 || partial_history_cycles == 0)
+               return 0;
+
+       /* Interpolate shortest distance from beginning or end of history */
+       interp_forward = partial_history_cycles > total_history_cycles/2 ?
+               true : false;
+       partial_history_cycles = interp_forward ?
+               total_history_cycles - partial_history_cycles :
+               partial_history_cycles;
+
+       /*
+        * Scale the monotonic raw time delta by:
+        *      partial_history_cycles / total_history_cycles
+        */
+       corr_raw = (u64)ktime_to_ns(
+               ktime_sub(ts->sys_monoraw, history->raw));
+       ret = scale64_check_overflow(partial_history_cycles,
+                                    total_history_cycles, &corr_raw);
+       if (ret)
+               return ret;
+
+       /*
+        * If there is a discontinuity in the history, scale monotonic raw
+        *      correction by:
+        *      mult(real)/mult(raw) yielding the realtime correction
+        * Otherwise, calculate the realtime correction similar to monotonic
+        *      raw calculation
+        */
+       if (discontinuity) {
+               corr_real = mul_u64_u32_div
+                       (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
+       } else {
+               corr_real = (u64)ktime_to_ns(
+                       ktime_sub(ts->sys_realtime, history->real));
+               ret = scale64_check_overflow(partial_history_cycles,
+                                            total_history_cycles, &corr_real);
+               if (ret)
+                       return ret;
+       }
+
+       /* Fixup monotonic raw and real time time values */
+       if (interp_forward) {
+               ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
+               ts->sys_realtime = ktime_add_ns(history->real, corr_real);
+       } else {
+               ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
+               ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
+       }
+
+       return 0;
+}
+
+/*
+ * cycle_between - true if test occurs chronologically between before and after
+ */
+static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
+{
+       if (test > before && test < after)
+               return true;
+       if (test < before && before > after)
+               return true;
+       return false;
+}
+
+/**
+ * get_device_system_crosststamp - Synchronously capture system/device timestamp
+ * @get_time_fn:       Callback to get simultaneous device time and
+ *     system counter from the device driver
+ * @ctx:               Context passed to get_time_fn()
+ * @history_begin:     Historical reference point used to interpolate system
+ *     time when counter provided by the driver is before the current interval
+ * @xtstamp:           Receives simultaneously captured system and device time
+ *
+ * Reads a timestamp from a device and correlates it to system time
+ */
+int get_device_system_crosststamp(int (*get_time_fn)
+                                 (ktime_t *device_time,
+                                  struct system_counterval_t *sys_counterval,
+                                  void *ctx),
+                                 void *ctx,
+                                 struct system_time_snapshot *history_begin,
+                                 struct system_device_crosststamp *xtstamp)
+{
+       struct system_counterval_t system_counterval;
+       struct timekeeper *tk = &tk_core.timekeeper;
+       cycle_t cycles, now, interval_start;
+       unsigned int clock_was_set_seq = 0;
+       ktime_t base_real, base_raw;
+       s64 nsec_real, nsec_raw;
+       u8 cs_was_changed_seq;
+       unsigned long seq;
+       bool do_interp;
+       int ret;
 
        do {
                seq = read_seqcount_begin(&tk_core.seq);
+               /*
+                * Try to synchronously capture device time and a system
+                * counter value calling back into the device driver
+                */
+               ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
+               if (ret)
+                       return ret;
+
+               /*
+                * Verify that the clocksource associated with the captured
+                * system counter value is the same as the currently installed
+                * timekeeper clocksource
+                */
+               if (tk->tkr_mono.clock != system_counterval.cs)
+                       return -ENODEV;
+               cycles = system_counterval.cycles;
 
-               *ts_raw = tk->raw_time;
-               ts_real->tv_sec = tk->xtime_sec;
-               ts_real->tv_nsec = 0;
+               /*
+                * Check whether the system counter value provided by the
+                * device driver is on the current timekeeping interval.
+                */
+               now = tk->tkr_mono.read(tk->tkr_mono.clock);
+               interval_start = tk->tkr_mono.cycle_last;
+               if (!cycle_between(interval_start, cycles, now)) {
+                       clock_was_set_seq = tk->clock_was_set_seq;
+                       cs_was_changed_seq = tk->cs_was_changed_seq;
+                       cycles = interval_start;
+                       do_interp = true;
+               } else {
+                       do_interp = false;
+               }
 
-               nsecs_raw  = timekeeping_get_ns(&tk->tkr_raw);
-               nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
+               base_real = ktime_add(tk->tkr_mono.base,
+                                     tk_core.timekeeper.offs_real);
+               base_raw = tk->tkr_raw.base;
 
+               nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
+                                                    system_counterval.cycles);
+               nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
+                                                   system_counterval.cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
-       timespec64_add_ns(ts_raw, nsecs_raw);
-       timespec64_add_ns(ts_real, nsecs_real);
-}
-EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
+       xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
+       xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
 
-#endif /* CONFIG_NTP_PPS */
+       /*
+        * Interpolate if necessary, adjusting back from the start of the
+        * current interval
+        */
+       if (do_interp) {
+               cycle_t partial_history_cycles, total_history_cycles;
+               bool discontinuity;
+
+               /*
+                * Check that the counter value occurs after the provided
+                * history reference and that the history doesn't cross a
+                * clocksource change
+                */
+               if (!history_begin ||
+                   !cycle_between(history_begin->cycles,
+                                  system_counterval.cycles, cycles) ||
+                   history_begin->cs_was_changed_seq != cs_was_changed_seq)
+                       return -EINVAL;
+               partial_history_cycles = cycles - system_counterval.cycles;
+               total_history_cycles = cycles - history_begin->cycles;
+               discontinuity =
+                       history_begin->clock_was_set_seq != clock_was_set_seq;
+
+               ret = adjust_historical_crosststamp(history_begin,
+                                                   partial_history_cycles,
+                                                   total_history_cycles,
+                                                   discontinuity, xtstamp);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
 
 /**
  * do_gettimeofday - Returns the time of day in a timeval
index f75e35b..ba7d8b2 100644 (file)
@@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
        print_name_offset(m, taddr);
        SEQ_printf(m, ", ");
        print_name_offset(m, timer->function);
-       SEQ_printf(m, ", S:%02lx", timer->state);
+       SEQ_printf(m, ", S:%02x", timer->state);
 #ifdef CONFIG_TIMER_STATS
        SEQ_printf(m, ", ");
        print_name_offset(m, timer->start_site);
index 45dd798..326a75e 100644 (file)
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
        struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct perf_event *event;
+       struct file *file;
 
        if (unlikely(index >= array->map.max_entries))
                return -E2BIG;
 
-       event = (struct perf_event *)array->ptrs[index];
-       if (!event)
+       file = (struct file *)array->ptrs[index];
+       if (unlikely(!file))
                return -ENOENT;
 
+       event = file->private_data;
+
        /* make sure event is local and doesn't have pmu::count */
        if (event->oncpu != smp_processor_id() ||
            event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
        void *data = (void *) (long) r4;
        struct perf_sample_data sample_data;
        struct perf_event *event;
+       struct file *file;
        struct perf_raw_record raw = {
                .size = size,
                .data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
        if (unlikely(index >= array->map.max_entries))
                return -E2BIG;
 
-       event = (struct perf_event *)array->ptrs[index];
-       if (unlikely(!event))
+       file = (struct file *)array->ptrs[index];
+       if (unlikely(!file))
                return -ENOENT;
 
+       event = file->private_data;
+
        if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
                     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
                return -EINVAL;
index eca592f..57a6eea 100644 (file)
@@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod)
        mutex_unlock(&ftrace_lock);
 }
 
-static void ftrace_module_enable(struct module *mod)
+void ftrace_module_enable(struct module *mod)
 {
        struct dyn_ftrace *rec;
        struct ftrace_page *pg;
@@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod)
        ftrace_process_locs(mod, mod->ftrace_callsites,
                            mod->ftrace_callsites + mod->num_ftrace_callsites);
 }
-
-static int ftrace_module_notify(struct notifier_block *self,
-                               unsigned long val, void *data)
-{
-       struct module *mod = data;
-
-       switch (val) {
-       case MODULE_STATE_COMING:
-               ftrace_module_enable(mod);
-               break;
-       case MODULE_STATE_GOING:
-               ftrace_release_mod(mod);
-               break;
-       default:
-               break;
-       }
-
-       return 0;
-}
-#else
-static int ftrace_module_notify(struct notifier_block *self,
-                               unsigned long val, void *data)
-{
-       return 0;
-}
 #endif /* CONFIG_MODULES */
 
-struct notifier_block ftrace_module_nb = {
-       .notifier_call = ftrace_module_notify,
-       .priority = INT_MIN,    /* Run after anything that can remove kprobes */
-};
-
 void __init ftrace_init(void)
 {
        extern unsigned long __start_mcount_loc[];
@@ -5098,10 +5068,6 @@ void __init ftrace_init(void)
                                  __start_mcount_loc,
                                  __stop_mcount_loc);
 
-       ret = register_module_notifier(&ftrace_module_nb);
-       if (ret)
-               pr_warning("Failed to register trace ftrace module exit notifier\n");
-
        set_ftrace_early_filters();
 
        return;
index 87fb980..d929340 100644 (file)
@@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 {
        __buffer_unlock_commit(buffer, event);
 
-       ftrace_trace_stack(tr, buffer, flags, 6, pc, regs);
+       ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
        ftrace_trace_userstack(buffer, flags, pc);
 }
 EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
index f333e57..05ddc08 100644 (file)
@@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name)
        struct ftrace_event_field *field;
        struct list_head *head;
 
-       field = __find_event_field(&ftrace_generic_fields, name);
+       head = trace_get_fields(call);
+       field = __find_event_field(head, name);
        if (field)
                return field;
 
-       field = __find_event_field(&ftrace_common_fields, name);
+       field = __find_event_field(&ftrace_generic_fields, name);
        if (field)
                return field;
 
-       head = trace_get_fields(call);
-       return __find_event_field(head, name);
+       return __find_event_field(&ftrace_common_fields, name);
 }
 
 static int __trace_define_field(struct list_head *head, const char *type,
@@ -171,8 +171,10 @@ static int trace_define_generic_fields(void)
 {
        int ret;
 
-       __generic_field(int, cpu, FILTER_OTHER);
-       __generic_field(char *, comm, FILTER_PTR_STRING);
+       __generic_field(int, CPU, FILTER_CPU);
+       __generic_field(int, cpu, FILTER_CPU);
+       __generic_field(char *, COMM, FILTER_COMM);
+       __generic_field(char *, comm, FILTER_COMM);
 
        return ret;
 }
@@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                 * The ftrace subsystem is for showing formats only.
                 * They can not be enabled or disabled via the event files.
                 */
-               if (call->class && call->class->reg)
+               if (call->class && call->class->reg &&
+                   !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
                        return file;
        }
 
index f93a219..6816302 100644 (file)
@@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps,
                return -EINVAL;
        }
 
-       if (is_string_field(field)) {
+       if (field->filter_type == FILTER_COMM) {
+               filter_build_regex(pred);
+               fn = filter_pred_comm;
+               pred->regex.field_len = TASK_COMM_LEN;
+       } else if (is_string_field(field)) {
                filter_build_regex(pred);
 
-               if (!strcmp(field->name, "comm")) {
-                       fn = filter_pred_comm;
-                       pred->regex.field_len = TASK_COMM_LEN;
-               } else if (field->filter_type == FILTER_STATIC_STRING) {
+               if (field->filter_type == FILTER_STATIC_STRING) {
                        fn = filter_pred_string;
                        pred->regex.field_len = field->size;
                } else if (field->filter_type == FILTER_DYN_STRING)
@@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps,
                }
                pred->val = val;
 
-               if (!strcmp(field->name, "cpu"))
+               if (field->filter_type == FILTER_CPU)
                        fn = filter_pred_cpu;
                else
                        fn = select_comparison_fn(pred->op, field->size,
index c995644..21b81a4 100644 (file)
@@ -30,7 +30,7 @@
 struct trace_kprobe {
        struct list_head        list;
        struct kretprobe        rp;     /* Use rp.kp for kprobe use */
-       unsigned long           nhit;
+       unsigned long __percpu *nhit;
        const char              *symbol;        /* symbol name */
        struct trace_probe      tp;
 };
@@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
        if (!tk)
                return ERR_PTR(ret);
 
+       tk->nhit = alloc_percpu(unsigned long);
+       if (!tk->nhit)
+               goto error;
+
        if (symbol) {
                tk->symbol = kstrdup(symbol, GFP_KERNEL);
                if (!tk->symbol)
@@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
 error:
        kfree(tk->tp.call.name);
        kfree(tk->symbol);
+       free_percpu(tk->nhit);
        kfree(tk);
        return ERR_PTR(ret);
 }
@@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
        kfree(tk->tp.call.class->system);
        kfree(tk->tp.call.name);
        kfree(tk->symbol);
+       free_percpu(tk->nhit);
        kfree(tk);
 }
 
@@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = {
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
        struct trace_kprobe *tk = v;
+       unsigned long nhit = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               nhit += *per_cpu_ptr(tk->nhit, cpu);
 
        seq_printf(m, "  %-44s %15lu %15lu\n",
-                  trace_event_name(&tk->tp.call), tk->nhit,
+                  trace_event_name(&tk->tp.call), nhit,
                   tk->rp.kp.nmissed);
 
        return 0;
@@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
        struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
 
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
 
        if (tk->tp.flags & TP_FLAG_TRACE)
                kprobe_trace_func(tk, regs);
@@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
        struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
 
-       tk->nhit++;
+       raw_cpu_inc(*tk->nhit);
 
        if (tk->tp.flags & TP_FLAG_TRACE)
                kretprobe_trace_func(tk, ri, regs);
index dda9e67..2a1abba 100644 (file)
@@ -125,6 +125,13 @@ check_stack(unsigned long ip, unsigned long *stack)
                        break;
        }
 
+       /*
+        * Some archs may not have the passed in ip in the dump.
+        * If that happens, we need to show everything.
+        */
+       if (i == stack_trace_max.nr_entries)
+               i = 0;
+
        /*
         * Now find where in the stack these are.
         */
@@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack)
                for (; p < top && i < stack_trace_max.nr_entries; p++) {
                        if (stack_dump_trace[i] == ULONG_MAX)
                                break;
-                       if (*p == stack_dump_trace[i]) {
+                       /*
+                        * The READ_ONCE_NOCHECK is used to let KASAN know that
+                        * this is not a stack-out-of-bounds error.
+                        */
+                       if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
                                stack_dump_trace[x] = stack_dump_trace[i++];
                                this_size = stack_trace_index[x++] =
                                        (top - p) * sizeof(unsigned long);
index 0655afb..d166308 100644 (file)
@@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
 
 extern char *__bad_type_size(void);
 
-#define SYSCALL_FIELD(type, name)                                      \
-       sizeof(type) != sizeof(trace.name) ?                            \
+#define SYSCALL_FIELD(type, field, name)                               \
+       sizeof(type) != sizeof(trace.field) ?                           \
                __bad_type_size() :                                     \
-               #type, #name, offsetof(typeof(trace), name),            \
-               sizeof(trace.name), is_signed_type(type)
+               #type, #name, offsetof(typeof(trace), field),           \
+               sizeof(trace.field), is_signed_type(type)
 
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
        int i;
        int offset = offsetof(typeof(trace), args);
 
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
        if (ret)
                return ret;
 
@@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call)
        struct syscall_trace_exit trace;
        int ret;
 
-       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+                                FILTER_OTHER);
        if (ret)
                return ret;
 
-       ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+       ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
                                 FILTER_OTHER);
 
        return ret;
index 975cb49..f8e26ab 100644 (file)
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
        struct mm_struct *mm;
 
-       /* convert pages-usec to Mbyte-usec */
-       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+       /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+       stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+       do_div(stats->coremem, 1000 * KB);
+       stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+       do_div(stats->virtmem, 1000 * KB);
        mm = get_task_mm(p);
        if (mm) {
                /* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 static void __acct_update_integrals(struct task_struct *tsk,
                                    cputime_t utime, cputime_t stime)
 {
-       if (likely(tsk->mm)) {
-               cputime_t time, dtime;
-               struct timeval value;
-               unsigned long flags;
-               u64 delta;
-
-               local_irq_save(flags);
-               time = stime + utime;
-               dtime = time - tsk->acct_timexpd;
-               jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-               delta = value.tv_sec;
-               delta = delta * USEC_PER_SEC + value.tv_usec;
-
-               if (delta == 0)
-                       goto out;
-               tsk->acct_timexpd = time;
-               tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-               tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-       out:
-               local_irq_restore(flags);
-       }
+       cputime_t time, dtime;
+       u64 delta;
+
+       if (!likely(tsk->mm))
+               return;
+
+       time = stime + utime;
+       dtime = time - tsk->acct_timexpd;
+       /* Avoid division: cputime_t is often in nanoseconds already. */
+       delta = cputime_to_nsecs(dtime);
+
+       if (delta < TICK_NSEC)
+               return;
+
+       tsk->acct_timexpd = time;
+       /*
+        * Divide by 1024 to avoid overflow, and to avoid division.
+        * The final unit reported to userspace is Mbyte-usecs,
+        * the rest of the math is done in xacct_add_tsk.
+        */
+       tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+       tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
 }
 
 /**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
 void acct_update_integrals(struct task_struct *tsk)
 {
        cputime_t utime, stime;
+       unsigned long flags;
 
+       local_irq_save(flags);
        task_cputime(tsk, &utime, &stime);
        __acct_update_integrals(tsk, utime, stime);
+       local_irq_restore(flags);
 }
 
 /**
index 61a0264..7ff5dc7 100644 (file)
@@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock);    /* protects wq->maydays list */
 static LIST_HEAD(workqueues);          /* PR: list of all workqueues */
 static bool workqueue_freezing;                /* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
+
+/*
+ * Local execution of unbound work items is no longer guaranteed.  The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
                                                  int node)
 {
        assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+       /*
+        * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+        * delayed item is pending.  The plan is to keep CPU -> NODE
+        * mapping valid and stable across CPU on/offlines.  Once that
+        * happens, this workaround can be removed.
+        */
+       if (unlikely(node == NUMA_NO_NODE))
+               return wq->dfl_pwq;
+
        return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
 }
 
@@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq)
        return worker && worker->current_pwq->wq == wq;
 }
 
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+       static bool printed_dbg_warning;
+       int new_cpu;
+
+       if (likely(!wq_debug_force_rr_cpu)) {
+               if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+                       return cpu;
+       } else if (!printed_dbg_warning) {
+               pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+               printed_dbg_warning = true;
+       }
+
+       if (cpumask_empty(wq_unbound_cpumask))
+               return cpu;
+
+       new_cpu = __this_cpu_read(wq_rr_cpu_last);
+       new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+       if (unlikely(new_cpu >= nr_cpu_ids)) {
+               new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+               if (unlikely(new_cpu >= nr_cpu_ids))
+                       return cpu;
+       }
+       __this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+       return new_cpu;
+}
+
 static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
 {
@@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
                return;
 retry:
        if (req_cpu == WORK_CPU_UNBOUND)
-               cpu = raw_smp_processor_id();
+               cpu = wq_select_unbound_cpu(raw_smp_processor_id());
 
        /* pwq which will be used unless @work is executing elsewhere */
        if (!(wq->flags & WQ_UNBOUND))
@@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
        timer_stats_timer_set_start_info(&dwork->timer);
 
        dwork->wq = wq;
-       /* timer isn't guaranteed to run in this cpu, record earlier */
-       if (cpu == WORK_CPU_UNBOUND)
-               cpu = raw_smp_processor_id();
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;
 
-       add_timer_on(timer, cpu);
+       if (unlikely(cpu != WORK_CPU_UNBOUND))
+               add_timer_on(timer, cpu);
+       else
+               add_timer(timer);
 }
 
 /**
@@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
                  current->pid, current->comm, target_wq->name, target_func);
-       WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+       WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+                             (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
                  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
                  worker->current_pwq->wq->name, worker->current_func,
                  target_wq->name, target_func);
index ecb9e75..f28f7fa 100644 (file)
@@ -1400,6 +1400,21 @@ config RCU_EQS_DEBUG
 
 endmenu # "RCU Debugging"
 
+config DEBUG_WQ_FORCE_RR_CPU
+       bool "Force round-robin CPU selection for unbound work items"
+       depends on DEBUG_KERNEL
+       default n
+       help
+         Workqueue used to implicitly guarantee that work items queued
+         without explicit CPU specified are put on the local CPU.  This
+         guarantee is no longer true and while local CPU is still
+         preferred work items may be put on foreign CPUs.  Kernel
+         parameter "workqueue.debug_force_rr_cpu" is added to force
+         round-robin CPU selection to flush out usages which depend on the
+         now broken guarantee.  This config option enables the debug
+         feature by default.  When enabled, memory and cache locality will
+         be impacted.
+
 config DEBUG_BLOCK_EXT_DEVT
         bool "Force extended block device numbers and spread them"
        depends on DEBUG_KERNEL
@@ -1427,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT
 
          Say N if you are unsure.
 
+config CPU_HOTPLUG_STATE_CONTROL
+       bool "Enable CPU hotplug state control"
+       depends on DEBUG_KERNEL
+       depends on HOTPLUG_CPU
+       default n
+       help
+         Allows to write steps between "offline" and "online" to the CPUs
+         sysfs target file so states can be stepped granular. This is a debug
+         option for now as the hotplug machinery cannot be stopped and
+         restarted at arbitrary points yet.
+
+         Say N if your are unsure.
+
 config NOTIFIER_ERROR_INJECTION
        tristate "Notifier error injection"
        depends on DEBUG_KERNEL
index 49518fb..e07c1ba 100644 (file)
@@ -18,6 +18,8 @@ config UBSAN_SANITIZE_ALL
          This option activates instrumentation for the entire kernel.
          If you don't enable this option, you have to explicitly specify
          UBSAN_SANITIZE := y for the files/directories you want to check for UB.
+         Enabling this option will get kernel image size increased
+         significantly.
 
 config UBSAN_ALIGNMENT
        bool "Enable checking of pointers alignment"
@@ -25,5 +27,5 @@ config UBSAN_ALIGNMENT
        default y if !HAVE_EFFICIENT_UNALIGNED_ACCESS
        help
          This option enables detection of unaligned memory accesses.
-         Enabling this option on architectures that support unalligned
+         Enabling this option on architectures that support unaligned
          accesses may produce a lot of false positives.
index d62de8b..1234818 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/atomic.h>
 
 #ifdef CONFIG_X86
-#include <asm/processor.h>     /* for boot_cpu_has below */
+#include <asm/cpufeature.h>    /* for boot_cpu_has below */
 #endif
 
 #define TEST(bit, op, c_op, val)                               \
index 5a70f61..81dedaa 100644 (file)
@@ -41,6 +41,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
                        break;
        return i;
 }
+EXPORT_SYMBOL(cpumask_any_but);
 
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
index 547f7f9..519b5a1 100644 (file)
@@ -21,7 +21,7 @@
 #define ODEBUG_HASH_BITS       14
 #define ODEBUG_HASH_SIZE       (1 << ODEBUG_HASH_BITS)
 
-#define ODEBUG_POOL_SIZE       512
+#define ODEBUG_POOL_SIZE       1024
 #define ODEBUG_POOL_MIN_LEVEL  256
 
 #define ODEBUG_CHUNK_SHIFT     PAGE_SHIFT
index 6745c62..c30d07e 100644 (file)
@@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT(-1);
 
 asmlinkage __visible void dump_stack(void)
 {
+       unsigned long flags;
        int was_locked;
        int old;
        int cpu;
@@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(void)
         * Permit this cpu to perform nested stack dumps while serialising
         * against other CPUs
         */
-       preempt_disable();
-
 retry:
+       local_irq_save(flags);
        cpu = smp_processor_id();
        old = atomic_cmpxchg(&dump_lock, -1, cpu);
        if (old == -1) {
@@ -43,6 +43,7 @@ retry:
        } else if (old == cpu) {
                was_locked = 1;
        } else {
+               local_irq_restore(flags);
                cpu_relax();
                goto retry;
        }
@@ -52,7 +53,7 @@ retry:
        if (!was_locked)
                atomic_set(&dump_lock, -1);
 
-       preempt_enable();
+       local_irq_restore(flags);
 }
 #else
 asmlinkage __visible void dump_stack(void)
index d74cf7a..0507fa5 100644 (file)
@@ -282,9 +282,9 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i,
                          struct klist_node *n)
 {
        i->i_klist = k;
-       i->i_cur = n;
-       if (n)
-               kref_get(&n->n_ref);
+       i->i_cur = NULL;
+       if (n && kref_get_unless_zero(&n->n_ref))
+               i->i_cur = n;
 }
 EXPORT_SYMBOL_GPL(klist_iter_init_node);
 
index 3345a08..3859bf6 100644 (file)
 #include <linux/kernel.h>
 #include <linux/rculist.h>
 
-static struct list_head force_poison;
-void list_force_poison(struct list_head *entry)
-{
-       entry->next = &force_poison;
-       entry->prev = &force_poison;
-}
-
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -30,8 +23,6 @@ void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
 {
-       WARN(new->next == &force_poison || new->prev == &force_poison,
-               "list_add attempted on force-poisoned entry\n");
        WARN(next->prev != prev,
                "list_add corruption. next->prev should be "
                "prev (%p), but was %p. (next=%p).\n",
index fcf5d98..6b79e90 100644 (file)
@@ -1019,9 +1019,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                return 0;
 
        radix_tree_for_each_slot(slot, root, &iter, first_index) {
-               results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+               results[ret] = rcu_dereference_raw(*slot);
                if (!results[ret])
                        continue;
+               if (radix_tree_is_indirect_ptr(results[ret])) {
+                       slot = radix_tree_iter_retry(&iter);
+                       continue;
+               }
                if (++ret == max_items)
                        break;
        }
@@ -1098,9 +1102,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                return 0;
 
        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
-               results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+               results[ret] = rcu_dereference_raw(*slot);
                if (!results[ret])
                        continue;
+               if (radix_tree_is_indirect_ptr(results[ret])) {
+                       slot = radix_tree_iter_retry(&iter);
+                       continue;
+               }
                if (++ret == max_items)
                        break;
        }
index bafa993..004fc70 100644 (file)
@@ -598,9 +598,9 @@ EXPORT_SYMBOL(sg_miter_next);
  *
  * Description:
  *   Stops mapping iterator @miter.  @miter should have been started
- *   started using sg_miter_start().  A stopped iteration can be
- *   resumed by calling sg_miter_next() on it.  This is useful when
- *   resources (kmap) need to be released during iteration.
+ *   using sg_miter_start().  A stopped iteration can be resumed by
+ *   calling sg_miter_next() on it.  This is useful when resources (kmap)
+ *   need to be released during iteration.
  *
  * Context:
  *   Preemption disabled if the SG_MITER_ATOMIC is set.  Don't care
index 98866a7..25b5cbf 100644 (file)
@@ -327,36 +327,67 @@ out:
 }
 
 #define string_get_size_maxbuf 16
-#define test_string_get_size_one(size, blk_size, units, exp_result)            \
+#define test_string_get_size_one(size, blk_size, exp_result10, exp_result2)    \
        do {                                                                   \
-               BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf);    \
-               __test_string_get_size((size), (blk_size), (units),            \
-                                      (exp_result));                          \
+               BUILD_BUG_ON(sizeof(exp_result10) >= string_get_size_maxbuf);  \
+               BUILD_BUG_ON(sizeof(exp_result2) >= string_get_size_maxbuf);   \
+               __test_string_get_size((size), (blk_size), (exp_result10),     \
+                                      (exp_result2));                         \
        } while (0)
 
 
-static __init void __test_string_get_size(const u64 size, const u64 blk_size,
-                                         const enum string_size_units units,
-                                         const char *exp_result)
+static __init void test_string_get_size_check(const char *units,
+                                             const char *exp,
+                                             char *res,
+                                             const u64 size,
+                                             const u64 blk_size)
 {
-       char buf[string_get_size_maxbuf];
-
-       string_get_size(size, blk_size, units, buf, sizeof(buf));
-       if (!memcmp(buf, exp_result, strlen(exp_result) + 1))
+       if (!memcmp(res, exp, strlen(exp) + 1))
                return;
 
-       buf[sizeof(buf) - 1] = '\0';
-       pr_warn("Test 'test_string_get_size_one' failed!\n");
-       pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n",
+       res[string_get_size_maxbuf - 1] = '\0';
+
+       pr_warn("Test 'test_string_get_size' failed!\n");
+       pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %s)\n",
                size, blk_size, units);
-       pr_warn("expected: '%s', got '%s'\n", exp_result, buf);
+       pr_warn("expected: '%s', got '%s'\n", exp, res);
+}
+
+static __init void __test_string_get_size(const u64 size, const u64 blk_size,
+                                         const char *exp_result10,
+                                         const char *exp_result2)
+{
+       char buf10[string_get_size_maxbuf];
+       char buf2[string_get_size_maxbuf];
+
+       string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10));
+       string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2));
+
+       test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10,
+                                  size, blk_size);
+
+       test_string_get_size_check("STRING_UNITS_2", exp_result2, buf2,
+                                  size, blk_size);
 }
 
 static __init void test_string_get_size(void)
 {
-       test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB");
-       test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB");
-       test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B");
+       /* small values */
+       test_string_get_size_one(0, 512, "0 B", "0 B");
+       test_string_get_size_one(1, 512, "512 B", "512 B");
+       test_string_get_size_one(1100, 1, "1.10 kB", "1.07 KiB");
+
+       /* normal values */
+       test_string_get_size_one(16384, 512, "8.39 MB", "8.00 MiB");
+       test_string_get_size_one(500118192, 512, "256 GB", "238 GiB");
+       test_string_get_size_one(8192, 4096, "33.6 MB", "32.0 MiB");
+
+       /* weird block sizes */
+       test_string_get_size_one(3000, 1900, "5.70 MB", "5.44 MiB");
+
+       /* huge values */
+       test_string_get_size_one(U64_MAX, 4096, "75.6 ZB", "64.0 ZiB");
+       test_string_get_size_one(4096, U64_MAX, "75.6 ZB", "64.0 ZiB");
 }
 
 static int __init test_string_helpers_init(void)
index c61b299..915d75d 100644 (file)
@@ -46,8 +46,11 @@ struct test_key {
        bool                    (*test_key)(void);
 };
 
-#define test_key_func(key, branch) \
-       ({bool func(void) { return branch(key); } func; })
+#define test_key_func(key, branch)     \
+static bool key ## _ ## branch(void)   \
+{                                      \
+       return branch(&key);            \
+}
 
 static void invert_key(struct static_key *key)
 {
@@ -92,6 +95,25 @@ static int verify_keys(struct test_key *keys, int size, bool invert)
        return 0;
 }
 
+test_key_func(old_true_key, static_key_true)
+test_key_func(old_false_key, static_key_false)
+test_key_func(true_key, static_branch_likely)
+test_key_func(true_key, static_branch_unlikely)
+test_key_func(false_key, static_branch_likely)
+test_key_func(false_key, static_branch_unlikely)
+test_key_func(base_old_true_key, static_key_true)
+test_key_func(base_inv_old_true_key, static_key_true)
+test_key_func(base_old_false_key, static_key_false)
+test_key_func(base_inv_old_false_key, static_key_false)
+test_key_func(base_true_key, static_branch_likely)
+test_key_func(base_true_key, static_branch_unlikely)
+test_key_func(base_inv_true_key, static_branch_likely)
+test_key_func(base_inv_true_key, static_branch_unlikely)
+test_key_func(base_false_key, static_branch_likely)
+test_key_func(base_false_key, static_branch_unlikely)
+test_key_func(base_inv_false_key, static_branch_likely)
+test_key_func(base_inv_false_key, static_branch_unlikely)
+
 static int __init test_static_key_init(void)
 {
        int ret;
@@ -102,95 +124,95 @@ static int __init test_static_key_init(void)
                {
                        .init_state     = true,
                        .key            = &old_true_key,
-                       .test_key       = test_key_func(&old_true_key, static_key_true),
+                       .test_key       = &old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &old_false_key,
-                       .test_key       = test_key_func(&old_false_key, static_key_false),
+                       .test_key       = &old_false_key_static_key_false,
                },
                /* internal keys - new keys */
                {
                        .init_state     = true,
                        .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_likely),
+                       .test_key       = &true_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &true_key.key,
-                       .test_key       = test_key_func(&true_key, static_branch_unlikely),
+                       .test_key       = &true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_likely),
+                       .test_key       = &false_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &false_key.key,
-                       .test_key       = test_key_func(&false_key, static_branch_unlikely),
+                       .test_key       = &false_key_static_branch_unlikely,
                },
                /* external keys - old keys */
                {
                        .init_state     = true,
                        .key            = &base_old_true_key,
-                       .test_key       = test_key_func(&base_old_true_key, static_key_true),
+                       .test_key       = &base_old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_old_true_key,
-                       .test_key       = test_key_func(&base_inv_old_true_key, static_key_true),
+                       .test_key       = &base_inv_old_true_key_static_key_true,
                },
                {
                        .init_state     = false,
                        .key            = &base_old_false_key,
-                       .test_key       = test_key_func(&base_old_false_key, static_key_false),
+                       .test_key       = &base_old_false_key_static_key_false,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_old_false_key,
-                       .test_key       = test_key_func(&base_inv_old_false_key, static_key_false),
+                       .test_key       = &base_inv_old_false_key_static_key_false,
                },
                /* external keys - new keys */
                {
                        .init_state     = true,
                        .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_likely),
+                       .test_key       = &base_true_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &base_true_key.key,
-                       .test_key       = test_key_func(&base_true_key, static_branch_unlikely),
+                       .test_key       = &base_true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_likely),
+                       .test_key       = &base_inv_true_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &base_inv_true_key.key,
-                       .test_key       = test_key_func(&base_inv_true_key, static_branch_unlikely),
+                       .test_key       = &base_inv_true_key_static_branch_unlikely,
                },
                {
                        .init_state     = false,
                        .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_likely),
+                       .test_key       = &base_false_key_static_branch_likely,
                },
                {
                        .init_state     = false,
                        .key            = &base_false_key.key,
-                       .test_key       = test_key_func(&base_false_key, static_branch_unlikely),
+                       .test_key       = &base_false_key_static_branch_unlikely,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_likely),
+                       .test_key       = &base_inv_false_key_static_branch_likely,
                },
                {
                        .init_state     = true,
                        .key            = &base_inv_false_key.key,
-                       .test_key       = test_key_func(&base_inv_false_key, static_branch_unlikely),
+                       .test_key       = &base_inv_false_key_static_branch_unlikely,
                },
        };
 
index 6f500ef..f0b323a 100644 (file)
@@ -49,3 +49,65 @@ ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len)
         }
 }
 EXPORT_SYMBOL(ucs2_strncmp);
+
+unsigned long
+ucs2_utf8size(const ucs2_char_t *src)
+{
+       unsigned long i;
+       unsigned long j = 0;
+
+       for (i = 0; i < ucs2_strlen(src); i++) {
+               u16 c = src[i];
+
+               if (c >= 0x800)
+                       j += 3;
+               else if (c >= 0x80)
+                       j += 2;
+               else
+                       j += 1;
+       }
+
+       return j;
+}
+EXPORT_SYMBOL(ucs2_utf8size);
+
+/*
+ * copy at most maxlength bytes of whole utf8 characters to dest from the
+ * ucs2 string src.
+ *
+ * The return value is the number of characters copied, not including the
+ * final NUL character.
+ */
+unsigned long
+ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, unsigned long maxlength)
+{
+       unsigned int i;
+       unsigned long j = 0;
+       unsigned long limit = ucs2_strnlen(src, maxlength);
+
+       for (i = 0; maxlength && i < limit; i++) {
+               u16 c = src[i];
+
+               if (c >= 0x800) {
+                       if (maxlength < 3)
+                               break;
+                       maxlength -= 3;
+                       dest[j++] = 0xe0 | (c & 0xf000) >> 12;
+                       dest[j++] = 0x80 | (c & 0x0fc0) >> 6;
+                       dest[j++] = 0x80 | (c & 0x003f);
+               } else if (c >= 0x80) {
+                       if (maxlength < 2)
+                               break;
+                       maxlength -= 2;
+                       dest[j++] = 0xc0 | (c & 0x7c0) >> 6;
+                       dest[j++] = 0x80 | (c & 0x03f);
+               } else {
+                       maxlength -= 1;
+                       dest[j++] = c & 0x7f;
+               }
+       }
+       if (maxlength)
+               dest[j] = '\0';
+       return j;
+}
+EXPORT_SYMBOL(ucs2_as_utf8);
index 48ff9c3..f44e178 100644 (file)
@@ -1590,22 +1590,23 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
                        return buf;
                }
        case 'K':
-               /*
-                * %pK cannot be used in IRQ context because its test
-                * for CAP_SYSLOG would be meaningless.
-                */
-               if (kptr_restrict && (in_irq() || in_serving_softirq() ||
-                                     in_nmi())) {
-                       if (spec.field_width == -1)
-                               spec.field_width = default_width;
-                       return string(buf, end, "pK-error", spec);
-               }
-
                switch (kptr_restrict) {
                case 0:
                        /* Always print %pK values */
                        break;
                case 1: {
+                       const struct cred *cred;
+
+                       /*
+                        * kptr_restrict==1 cannot be used in IRQ context
+                        * because its test for CAP_SYSLOG would be meaningless.
+                        */
+                       if (in_irq() || in_serving_softirq() || in_nmi()) {
+                               if (spec.field_width == -1)
+                                       spec.field_width = default_width;
+                               return string(buf, end, "pK-error", spec);
+                       }
+
                        /*
                         * Only print the real pointer value if the current
                         * process has CAP_SYSLOG and is running with the
@@ -1615,8 +1616,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
                         * leak pointer values if a binary opens a file using
                         * %pK and then elevates privileges before reading it.
                         */
-                       const struct cred *cred = current_cred();
-
+                       cred = current_cred();
                        if (!has_capability_noaudit(current, CAP_SYSLOG) ||
                            !uid_eq(cred->euid, cred->uid) ||
                            !gid_eq(cred->egid, cred->gid))
index 97a4e06..03cbfa0 100644 (file)
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        bool
 
 config DEFERRED_STRUCT_PAGE_INIT
-       bool "Defer initialisation of struct pages to kswapd"
+       bool "Defer initialisation of struct pages to kthreads"
        default n
        depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
          single thread. On very large machines this can take a considerable
          amount of time. If this option is set, large machines will bring up
          a subset of memmap at boot and then initialise the rest in parallel
-         when kswapd starts. This has a potential performance impact on
-         processes running early in the lifetime of the systemm until kswapd
-         finishes the initialisation.
+         by starting one-off "pgdatinitX" kernel thread for each node X. This
+         has a potential performance impact on processes running early in the
+         lifetime of the system until these kthreads finish the
+         initialisation.
 
 config IDLE_PAGE_TRACKING
        bool "Enable idle page tracking"
index cc5d29d..c554d17 100644 (file)
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
        return 0;
 
 out_destroy_stat:
-       while (--i)
+       while (i--)
                percpu_counter_destroy(&wb->stat[i]);
        fprop_local_destroy_percpu(&wb->completions);
 out_put_cong:
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
                 * here rather than calling cond_resched().
                 */
                if (current->flags & PF_WQ_WORKER)
-                       schedule_timeout(1);
+                       schedule_timeout_uninterruptible(1);
                else
                        cond_resched();
 
index 8fc5081..ba5d8f3 100644 (file)
@@ -22,7 +22,7 @@
  * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
 
 /*
  * Counters available via /sys/kernel/debug/cleancache (if debugfs is
@@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
 /*
  * Register operations for cleancache. Returns 0 on success.
  */
-int cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
 {
        if (cmpxchg(&cleancache_ops, NULL, ops))
                return -EBUSY;
index bc94386..da7a35d 100644 (file)
@@ -195,6 +195,30 @@ void __delete_from_page_cache(struct page *page, void *shadow,
        else
                cleancache_invalidate_page(mapping, page);
 
+       VM_BUG_ON_PAGE(page_mapped(page), page);
+       if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+               int mapcount;
+
+               pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
+                        current->comm, page_to_pfn(page));
+               dump_page(page, "still mapped when deleted");
+               dump_stack();
+               add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+               mapcount = page_mapcount(page);
+               if (mapping_exiting(mapping) &&
+                   page_count(page) >= mapcount + 2) {
+                       /*
+                        * All vmas have already been torn down, so it's
+                        * a good bet that actually the page is unmapped,
+                        * and we'd prefer not to leak it: if we're wrong,
+                        * some other bad page check should catch it later.
+                        */
+                       page_mapcount_reset(page);
+                       atomic_sub(mapcount, &page->_count);
+               }
+       }
+
        page_cache_tree_delete(mapping, page, shadow);
 
        page->mapping = NULL;
@@ -205,7 +229,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
                __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
-       VM_BUG_ON_PAGE(page_mapped(page), page);
 
        /*
         * At this point page must be either written or cleaned by truncate.
@@ -446,7 +469,8 @@ int filemap_write_and_wait(struct address_space *mapping)
 {
        int err = 0;
 
-       if (mapping->nrpages) {
+       if ((!dax_mapping(mapping) && mapping->nrpages) ||
+           (dax_mapping(mapping) && mapping->nrexceptional)) {
                err = filemap_fdatawrite(mapping);
                /*
                 * Even if the above returned error, the pages may be
@@ -482,13 +506,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
        int err = 0;
 
-       if (dax_mapping(mapping) && mapping->nrexceptional) {
-               err = dax_writeback_mapping_range(mapping, lstart, lend);
-               if (err)
-                       return err;
-       }
-
-       if (mapping->nrpages) {
+       if ((!dax_mapping(mapping) && mapping->nrpages) ||
+           (dax_mapping(mapping) && mapping->nrexceptional)) {
                err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                 WB_SYNC_ALL);
                /* See comment of filemap_write_and_wait() */
@@ -1890,6 +1909,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * page_cache_read - adds requested page to the page cache if not already there
  * @file:      file to read
  * @offset:    page index
+ * @gfp_mask:  memory allocation flags
  *
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
index b64a361..7bf19ff 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -430,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                         * Anon pages in shared mappings are surprising: now
                         * just reject it.
                         */
-                       if (!is_cow_mapping(vm_flags)) {
-                               WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+                       if (!is_cow_mapping(vm_flags))
                                return -EFAULT;
-                       }
                }
        } else if (!(vm_flags & VM_READ)) {
                if (!(gup_flags & FOLL_FORCE))
index fd3a07b..e10a4fe 100644 (file)
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
 static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
-       pgtable_trans_huge_deposit(mm, pmd, pgtable);
+       if (pgtable)
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
        return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        pmd_t pmd;
-       pgtable_t pgtable;
+       pgtable_t pgtable = NULL;
        int ret;
 
-       ret = -ENOMEM;
-       pgtable = pte_alloc_one(dst_mm, addr);
-       if (unlikely(!pgtable))
-               goto out;
+       if (!vma_is_dax(vma)) {
+               ret = -ENOMEM;
+               pgtable = pte_alloc_one(dst_mm, addr);
+               if (unlikely(!pgtable))
+                       goto out;
+       }
 
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       if (pmd_trans_huge(pmd)) {
+       if (!vma_is_dax(vma)) {
                /* thp accounting separate from pmd_devmap accounting */
                src_page = pmd_page(pmd);
                VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
                VM_BUG_ON(!pmd_none(*new_pmd));
 
-               if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+               if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+                               vma_is_anonymous(vma)) {
                        pgtable_t pgtable;
                        pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -2835,6 +2836,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        pgtable_t pgtable;
        pmd_t _pmd;
        bool young, write, dirty;
+       unsigned long addr;
        int i;
 
        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2860,10 +2862,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        young = pmd_young(*pmd);
        dirty = pmd_dirty(*pmd);
 
+       pmdp_huge_split_prepare(vma, haddr, pmd);
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
 
-       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+       for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                /*
                 * Note that NUMA hinting access restrictions are not
@@ -2884,9 +2887,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                }
                if (dirty)
                        SetPageDirty(page + i);
-               pte = pte_offset_map(&_pmd, haddr);
+               pte = pte_offset_map(&_pmd, addr);
                BUG_ON(!pte_none(*pte));
-               set_pte_at(mm, haddr, pte, entry);
+               set_pte_at(mm, addr, pte, entry);
                atomic_inc(&page[i]._mapcount);
                pte_unmap(pte);
        }
@@ -2936,7 +2939,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        pmd_populate(mm, pmd, pgtable);
 
        if (freeze) {
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               for (i = 0; i < HPAGE_PMD_NR; i++) {
                        page_remove_rmap(page + i, false);
                        put_page(page + i);
                }
@@ -3358,6 +3361,7 @@ int total_mapcount(struct page *page)
 int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
        struct page *head = compound_head(page);
+       struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
        struct anon_vma *anon_vma;
        int count, mapcount, ret;
        bool mlocked;
@@ -3401,19 +3405,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                lru_add_drain();
 
        /* Prevent deferred_split_scan() touching ->_count */
-       spin_lock_irqsave(&split_queue_lock, flags);
+       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        count = page_count(head);
        mapcount = total_mapcount(head);
        if (!mapcount && count == 1) {
                if (!list_empty(page_deferred_list(head))) {
-                       split_queue_len--;
+                       pgdata->split_queue_len--;
                        list_del(page_deferred_list(head));
                }
-               spin_unlock_irqrestore(&split_queue_lock, flags);
+               spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                __split_huge_page(page, list);
                ret = 0;
        } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-               spin_unlock_irqrestore(&split_queue_lock, flags);
+               spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                pr_alert("total_mapcount: %u, page_count(): %u\n",
                                mapcount, count);
                if (PageTail(page))
@@ -3421,7 +3425,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                dump_page(page, "total_mapcount(head) > 0");
                BUG();
        } else {
-               spin_unlock_irqrestore(&split_queue_lock, flags);
+               spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                unfreeze_page(anon_vma, head);
                ret = -EBUSY;
        }
@@ -3436,64 +3440,65 @@ out:
 
 void free_transhuge_page(struct page *page)
 {
+       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
 
-       spin_lock_irqsave(&split_queue_lock, flags);
+       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (!list_empty(page_deferred_list(page))) {
-               split_queue_len--;
+               pgdata->split_queue_len--;
                list_del(page_deferred_list(page));
        }
-       spin_unlock_irqrestore(&split_queue_lock, flags);
+       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
        free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
+       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
 
        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-       spin_lock_irqsave(&split_queue_lock, flags);
+       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (list_empty(page_deferred_list(page))) {
-               list_add_tail(page_deferred_list(page), &split_queue);
-               split_queue_len++;
+               list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+               pgdata->split_queue_len++;
        }
-       spin_unlock_irqrestore(&split_queue_lock, flags);
+       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
 {
-       /*
-        * Split a page from split_queue will free up at least one page,
-        * at most HPAGE_PMD_NR - 1. We don't track exact number.
-        * Let's use HPAGE_PMD_NR / 2 as ballpark.
-        */
-       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+       struct pglist_data *pgdata = NODE_DATA(sc->nid);
+       return ACCESS_ONCE(pgdata->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
 {
+       struct pglist_data *pgdata = NODE_DATA(sc->nid);
        unsigned long flags;
        LIST_HEAD(list), *pos, *next;
        struct page *page;
        int split = 0;
 
-       spin_lock_irqsave(&split_queue_lock, flags);
-       list_splice_init(&split_queue, &list);
-
+       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
-       list_for_each_safe(pos, next, &list) {
+       list_for_each_safe(pos, next, &pgdata->split_queue) {
                page = list_entry((void *)pos, struct page, mapping);
                page = compound_head(page);
-               /* race with put_compound_page() */
-               if (!get_page_unless_zero(page)) {
+               if (get_page_unless_zero(page)) {
+                       list_move(page_deferred_list(page), &list);
+               } else {
+                       /* We lost race with put_compound_page() */
                        list_del_init(page_deferred_list(page));
-                       split_queue_len--;
+                       pgdata->split_queue_len--;
                }
+               if (!--sc->nr_to_scan)
+                       break;
        }
-       spin_unlock_irqrestore(&split_queue_lock, flags);
+       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3510,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                put_page(page);
        }
 
-       spin_lock_irqsave(&split_queue_lock, flags);
-       list_splice_tail(&list, &split_queue);
-       spin_unlock_irqrestore(&split_queue_lock, flags);
+       spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+       list_splice_tail(&list, &pgdata->split_queue);
+       spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
-       return split * HPAGE_PMD_NR / 2;
+       /*
+        * Stop shrinker if we didn't split any page, but the queue is empty.
+        * This can happen if pages were freed under us.
+        */
+       if (!split && list_empty(&pgdata->split_queue))
+               return SHRINK_STOP;
+       return split;
 }
 
 static struct shrinker deferred_split_shrinker = {
        .count_objects = deferred_split_count,
        .scan_objects = deferred_split_scan,
        .seeks = DEFAULT_SEEKS,
+       .flags = SHRINKER_NUMA_AWARE,
 };
 
 #ifdef CONFIG_DEBUG_FS
index 12908dc..aefba5a 100644 (file)
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                nr_nodes--)
 
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
                                        unsigned int order)
 {
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
 
        set_page_private(page, 0);
        page->mapping = NULL;
-       BUG_ON(page_count(page));
-       BUG_ON(page_mapcount(page));
+       VM_BUG_ON_PAGE(page_count(page), page);
+       VM_BUG_ON_PAGE(page_mapcount(page), page);
        restore_reserve = PagePrivate(page);
        ClearPagePrivate(page);
 
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                set_page_count(p, 0);
                set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 /*
@@ -2629,8 +2630,10 @@ static int __init hugetlb_init(void)
                        hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
        }
        default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
-       if (default_hstate_max_huge_pages)
-               default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+       if (default_hstate_max_huge_pages) {
+               if (!default_hstate.max_huge_pages)
+                       default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+       }
 
        hugetlb_init_hstates();
        gather_bootmem_prealloc();
@@ -2748,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
        int ret;
 
        if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
@@ -2789,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
        int ret;
 
        if (!hugepages_supported())
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
 
        tmp = h->nr_overcommit_huge_pages;
 
@@ -3499,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * COW. Warn that such a situation has occurred as it may not be obvious
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
-               pr_warning("PID %d killed due to inadequate hugepage pool\n",
+               pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
                return ret;
        }
index ed8b5ff..a38a21e 100644 (file)
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+/*
+ * These three helpers classifies VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+       return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - atomatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+       return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+       return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct rb_node *rb_parent);
index bc0a8d8..1ad20ad 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kmemleak.h>
+#include <linux/linkage.h>
 #include <linux/memblock.h>
 #include <linux/memory.h>
 #include <linux/mm.h>
@@ -60,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size)
        }
 }
 
+static void __kasan_unpoison_stack(struct task_struct *task, void *sp)
+{
+       void *base = task_stack_page(task);
+       size_t size = sp - base;
+
+       kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+       __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_remaining_stack(void *sp)
+{
+       __kasan_unpoison_stack(current, sp);
+}
 
 /*
  * All functions below always inlined so compiler could
index d2ed81e..dd79899 100644 (file)
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
  * Remaining API functions
  */
 
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
 {
        return memblock.memory.total_size;
 }
index 30991f8..906d8e3 100644 (file)
@@ -1550,9 +1550,30 @@ out:
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
+{
+       return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * to override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * cow mappings.  In general, using multiple vmas is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn, pgprot_t pgprot)
 {
        int ret;
-       pgprot_t pgprot = vma->vm_page_prot;
        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
@@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
        return ret;
 }
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
 
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn)
@@ -1591,10 +1612,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
         * without pte special, it would there be refcounted as a normal page.
         */
-       if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+       if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
                struct page *page;
 
-               page = pfn_t_to_page(pfn);
+               /*
+                * At this point we are committed to insert_page()
+                * regardless of whether the caller specified flags that
+                * result in pfn_t_has_page() == false.
+                */
+               page = pfn_to_page(pfn_t_to_pfn(pfn));
                return insert_page(vma, addr, page, vma->vm_page_prot);
        }
        return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2232,11 +2258,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page_cache_get(old_page);
 
-       /*
-        * Only catch write-faults on shared writable pages,
-        * read-only shared pages can get COWed by
-        * get_user_pages(.write=1, .force=1).
-        */
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                int tmp;
 
@@ -3404,8 +3425,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(pmd_none(*pmd)) &&
            unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+        * didn't become pmd_trans_huge under us and then back to pmd_none, as
+        * a result of MADV_DONTNEED running immediately after a huge pmd fault
+        * in a different thread of this mm, in turn leading to a misleading
+        * pmd_trans_huge() retval.  All we have to ensure is that it is a
+        * regular pmd that we can walk with pte_offset_map() and we can do that
+        * through an atomic read in C, which is what pmd_trans_unstable()
+        * provides.
+        */
+       if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
index 4af58a3..979b18c 100644 (file)
@@ -138,7 +138,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
        res->name = "System RAM";
        res->start = start;
        res->end = start + size - 1;
-       res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        if (request_resource(&iomem_resource, res) < 0) {
                pr_debug("System RAM resource %pR cannot be added\n", res);
                kfree(res);
index 27d1354..9a3f6b9 100644 (file)
@@ -532,7 +532,7 @@ retry:
                nid = page_to_nid(page);
                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
                        continue;
-               if (PageTail(page) && PageAnon(page)) {
+               if (PageTransCompound(page) && PageAnon(page)) {
                        get_page(page);
                        pte_unmap_unlock(pte, ptl);
                        lock_page(page);
@@ -548,8 +548,7 @@ retry:
                        goto retry;
                }
 
-               if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-                       migrate_page_add(page, qp->pagelist, flags);
+               migrate_page_add(page, qp->pagelist, flags);
        }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
        unsigned long endvma = vma->vm_end;
        unsigned long flags = qp->flags;
 
-       if (vma->vm_flags & VM_PFNMAP)
+       if (!vma_migratable(vma))
                return 1;
 
        if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
-               if (vma_migratable(vma) &&
-                       vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+               if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
                        change_prot_numa(vma, start, endvma);
                return 1;
        }
 
-       if ((flags & MPOL_MF_STRICT) ||
-           ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-            vma_migratable(vma)))
-               /* queue pages from current vma */
+       /* queue pages from current vma */
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                return 0;
        return 1;
 }
index 004d42b..7924f4f 100644 (file)
@@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool)
        void *element = pool->elements[--pool->curr_nr];
 
        BUG_ON(pool->curr_nr < 0);
-       check_element(pool, element);
        kasan_unpoison_element(pool, element);
+       check_element(pool, element);
        return element;
 }
 
index b1034f9..3ad0fea 100644 (file)
@@ -1582,7 +1582,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
                                         (GFP_HIGHUSER_MOVABLE |
                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
                                          __GFP_NORETRY | __GFP_NOWARN) &
-                                        ~(__GFP_IO | __GFP_FS), 0);
+                                        ~__GFP_RECLAIM, 0);
 
        return newpage;
 }
index 84b1262..90e3b86 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
 #include <linux/memory.h>
 #include <linux/printk.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
 static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -387,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
 {
+       struct rb_root *root = &mm->mm_rb;
        int i = 0, j, bug = 0;
        struct rb_node *nd, *pn = NULL;
        unsigned long prev = 0, pend = 0;
@@ -411,12 +415,14 @@ static int browse_rb(struct rb_root *root)
                                  vma->vm_start, vma->vm_end);
                        bug = 1;
                }
+               spin_lock(&mm->page_table_lock);
                if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
                        pr_emerg("free gap %lx, correct %lx\n",
                               vma->rb_subtree_gap,
                               vma_compute_subtree_gap(vma));
                        bug = 1;
                }
+               spin_unlock(&mm->page_table_lock);
                i++;
                pn = nd;
                prev = vma->vm_start;
@@ -453,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
        struct vm_area_struct *vma = mm->mmap;
 
        while (vma) {
+               struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;
 
-               vma_lock_anon_vma(vma);
-               list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
-                       anon_vma_interval_tree_verify(avc);
-               vma_unlock_anon_vma(vma);
+               if (anon_vma) {
+                       anon_vma_lock_read(anon_vma);
+                       list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                               anon_vma_interval_tree_verify(avc);
+                       anon_vma_unlock_read(anon_vma);
+               }
+
                highest_address = vma->vm_end;
                vma = vma->vm_next;
                i++;
@@ -472,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
                          mm->highest_vm_end, highest_address);
                bug = 1;
        }
-       i = browse_rb(&mm->mm_rb);
+       i = browse_rb(mm);
        if (i != mm->map_count) {
                if (i != -1)
                        pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2139,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
        struct mm_struct *mm = vma->vm_mm;
-       int error;
+       int error = 0;
 
        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;
 
-       /*
-        * We must make sure the anon_vma is allocated
-        * so that the anon_vma locking is not a noop.
-        */
+       /* Guard against wrapping around to address 0. */
+       if (address < PAGE_ALIGN(address+4))
+               address = PAGE_ALIGN(address+4);
+       else
+               return -ENOMEM;
+
+       /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma)))
                return -ENOMEM;
-       vma_lock_anon_vma(vma);
 
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_sem in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
-        * Also guard against wrapping around to address 0.
         */
-       if (address < PAGE_ALIGN(address+4))
-               address = PAGE_ALIGN(address+4);
-       else {
-               vma_unlock_anon_vma(vma);
-               return -ENOMEM;
-       }
-       error = 0;
+       anon_vma_lock_write(vma->anon_vma);
 
        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
@@ -2182,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                                 * updates, but we only hold a shared mmap_sem
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
-                                * vma_lock_anon_vma() doesn't help here, as
+                                * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
@@ -2205,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                        }
                }
        }
-       vma_unlock_anon_vma(vma);
+       anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
@@ -2221,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
        struct mm_struct *mm = vma->vm_mm;
        int error;
 
-       /*
-        * We must make sure the anon_vma is allocated
-        * so that the anon_vma locking is not a noop.
-        */
-       if (unlikely(anon_vma_prepare(vma)))
-               return -ENOMEM;
-
        address &= PAGE_MASK;
        error = security_mmap_addr(address);
        if (error)
                return error;
 
-       vma_lock_anon_vma(vma);
+       /* We must make sure the anon_vma is allocated. */
+       if (unlikely(anon_vma_prepare(vma)))
+               return -ENOMEM;
 
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_sem in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
+       anon_vma_lock_write(vma->anon_vma);
 
        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
@@ -2257,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
                                 * updates, but we only hold a shared mmap_sem
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
-                                * vma_lock_anon_vma() doesn't help here, as
+                                * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
@@ -2278,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
                        }
                }
        }
-       vma_unlock_anon_vma(vma);
+       anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
@@ -2663,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;
 
-       if (start < vma->vm_start || start + size > vma->vm_end)
+       if (start < vma->vm_start)
                goto out;
 
-       if (pgoff == linear_page_index(vma, start)) {
-               ret = 0;
-               goto out;
+       if (start + size > vma->vm_end) {
+               struct vm_area_struct *next;
+
+               for (next = vma->vm_next; next; next = next->vm_next) {
+                       /* hole between vmas ? */
+                       if (next->vm_start != next->vm_prev->vm_end)
+                               goto out;
+
+                       if (next->vm_file != vma->vm_file)
+                               goto out;
+
+                       if (next->vm_flags != vma->vm_flags)
+                               goto out;
+
+                       if (start + size <= next->vm_end)
+                               break;
+               }
+
+               if (!next)
+                       goto out;
        }
 
        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
@@ -2678,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED) {
+               struct vm_area_struct *tmp;
                flags |= MAP_LOCKED;
+
                /* drop PG_Mlocked flag for over-mapped range */
-               munlock_vma_pages_range(vma, start, start + size);
+               for (tmp = vma; tmp->vm_start >= start + size;
+                               tmp = tmp->vm_next) {
+                       munlock_vma_pages_range(tmp,
+                                       max(tmp->vm_start, start),
+                                       min(tmp->vm_end, start + size));
+               }
        }
 
        file = get_file(vma->vm_file);
@@ -2982,9 +3007,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;
 
-       if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
-                               (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
-               return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+       if (is_data_mapping(flags) &&
+           mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+               if (ignore_rlimit_data)
+                       pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+                                    "%lu. Will be forbidden soon.\n",
+                                    current->comm, current->pid,
+                                    (mm->data_vm + npages) << PAGE_SHIFT,
+                                    rlimit(RLIMIT_DATA));
+               else
+                       return false;
+       }
 
        return true;
 }
@@ -2993,11 +3026,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
        mm->total_vm += npages;
 
-       if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+       if (is_exec_mapping(flags))
                mm->exec_vm += npages;
-       else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+       else if (is_stack_mapping(flags))
                mm->stack_vm += npages;
-       else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+       else if (is_data_mapping(flags))
                mm->data_vm += npages;
 }
 
@@ -3033,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
        pgoff_t pgoff;
        struct page **pages;
 
-       if (vma->vm_ops == &legacy_special_mapping_vmops)
+       if (vma->vm_ops == &legacy_special_mapping_vmops) {
                pages = vma->vm_private_data;
-       else
-               pages = ((struct vm_special_mapping *)vma->vm_private_data)->
-                       pages;
+       } else {
+               struct vm_special_mapping *sm = vma->vm_private_data;
+
+               if (sm->fault)
+                       return sm->fault(sm, vma, vmf);
+
+               pages = sm->pages;
+       }
 
        for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
                pgoff--;
index 8eb7bb4..f7cb3d4 100644 (file)
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                }
 
                if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-                       if (next - addr != HPAGE_PMD_SIZE)
+                       if (next - addr != HPAGE_PMD_SIZE) {
                                split_huge_pmd(vma, pmd, addr);
-                       else {
+                               if (pmd_none(*pmd))
+                                       continue;
+                       } else {
                                int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                newprot, prot_numa);
 
index d77946a..8eeba02 100644 (file)
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                }
                        }
                        split_huge_pmd(vma, old_pmd, old_addr);
+                       if (pmd_none(*old_pmd))
+                               continue;
                        VM_BUG_ON(pmd_trans_huge(*old_pmd));
                }
                if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
index 63358d9..838ca8b 100644 (file)
@@ -5209,6 +5209,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
        spin_lock_init(&pgdat->numabalancing_migrate_lock);
        pgdat->numabalancing_migrate_nr_pages = 0;
        pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       spin_lock_init(&pgdat->split_queue_lock);
+       INIT_LIST_HEAD(&pgdat->split_queue);
+       pgdat->split_queue_len = 0;
 #endif
        init_waitqueue_head(&pgdat->kswapd_wait);
        init_waitqueue_head(&pgdat->pfmemalloc_wait);
@@ -6615,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
        return !has_unmovable_pages(zone, page, 0, true);
 }
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
index 9d47676..06a005b 100644 (file)
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
  * ARCHes with special requirements for evicting THP backing TLB entries can
  * implement this. Otherwise also, it can help optimize normal TLB flush in
  * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
  * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
 #define flush_pmd_tlb_range(vma, addr, end)    flush_tlb_range(vma, addr, end)
@@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+       /* collapse entails shooting down ptes not pmd */
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
 }
 #endif
index 6ecc697..621fbcb 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2275,7 +2275,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
        err = setup_cpu_cache(cachep, gfp);
        if (err) {
-               __kmem_cache_shutdown(cachep);
+               __kmem_cache_release(cachep);
                return err;
        }
 
@@ -2413,13 +2413,14 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 }
 
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
+{
+       return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
 {
        int i;
        struct kmem_cache_node *n;
-       int rc = __kmem_cache_shrink(cachep, false);
-
-       if (rc)
-               return rc;
 
        free_percpu(cachep->cpu_cache);
 
@@ -2430,7 +2431,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
                kfree(n);
                cachep->node[i] = NULL;
        }
-       return 0;
 }
 
 /*
index 834ad24..2eedace 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -140,6 +140,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *, bool);
 void slab_kmem_cache_release(struct kmem_cache *);
 
index b50aef0..065b7bd 100644 (file)
@@ -693,6 +693,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
 
 void slab_kmem_cache_release(struct kmem_cache *s)
 {
+       __kmem_cache_release(s);
        destroy_memcg_params(s);
        kfree_const(s->name);
        kmem_cache_free(kmem_cache, s);
index 17e8f8c..5ec1580 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
        return 0;
 }
 
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
 int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
        return 0;
index 2e1355a..d8fbd4a 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1592,18 +1592,12 @@ static inline void add_partial(struct kmem_cache_node *n,
        __add_partial(n, page, tail);
 }
 
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
-       list_del(&page->lru);
-       n->nr_partial--;
-}
-
 static inline void remove_partial(struct kmem_cache_node *n,
                                        struct page *page)
 {
        lockdep_assert_held(&n->list_lock);
-       __remove_partial(n, page);
+       list_del(&page->lru);
+       n->nr_partial--;
 }
 
 /*
@@ -3184,6 +3178,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
        }
 }
 
+void __kmem_cache_release(struct kmem_cache *s)
+{
+       free_percpu(s->cpu_slab);
+       free_kmem_cache_nodes(s);
+}
+
 static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
        int node;
@@ -3443,28 +3443,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 
 /*
  * Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
  */
 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
        struct page *page, *h;
 
+       BUG_ON(irqs_disabled());
+       spin_lock_irq(&n->list_lock);
        list_for_each_entry_safe(page, h, &n->partial, lru) {
                if (!page->inuse) {
-                       __remove_partial(n, page);
+                       remove_partial(n, page);
                        discard_slab(s, page);
                } else {
                        list_slab_objects(s, page,
-                       "Objects remaining in %s on kmem_cache_close()");
+                       "Objects remaining in %s on __kmem_cache_shutdown()");
                }
        }
+       spin_unlock_irq(&n->list_lock);
 }
 
 /*
  * Release all resources used by a slab cache.
  */
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
 {
        int node;
        struct kmem_cache_node *n;
@@ -3476,16 +3479,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
                if (n->nr_partial || slabs_node(s, node))
                        return 1;
        }
-       free_percpu(s->cpu_slab);
-       free_kmem_cache_nodes(s);
        return 0;
 }
 
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
-       return kmem_cache_close(s);
-}
-
 /********************************************************************
  *             Kmalloc subsystem
  *******************************************************************/
@@ -3980,7 +3976,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
        memcg_propagate_slab_attrs(s);
        err = sysfs_slab_add(s);
        if (err)
-               kmem_cache_close(s);
+               __kmem_cache_release(s);
 
        return err;
 }
index c108a65..4fb14ca 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
-                               struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
 {
        return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
-                               struct vm_area_struct *vma, bool in_group)
-{
-       if (vm_is_stack_for_task(task, vma))
-               return task;
-
-       if (in_group) {
-               struct task_struct *t;
-
-               for_each_thread(task, t) {
-                       if (vm_is_stack_for_task(t, vma))
-                               return t;
-               }
-       }
-
-       return NULL;
-}
-
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
index 9a6c070..149fdf6 100644 (file)
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 
        if (tree) {
                spin_lock(&vmpr->sr_lock);
-               vmpr->tree_scanned += scanned;
+               scanned = vmpr->tree_scanned += scanned;
                vmpr->tree_reclaimed += reclaimed;
-               scanned = vmpr->scanned;
                spin_unlock(&vmpr->sr_lock);
 
                if (scanned < vmpressure_win)
index eb3dd37..71b1c29 100644 (file)
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
        int ret = -EBUSY;
 
        VM_BUG_ON_PAGE(!page_count(page), page);
-       VM_BUG_ON_PAGE(PageTail(page), page);
+       WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);
index 40b2c74..084c672 100644 (file)
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
                 * Counters were updated so we expect more updates
                 * to occur in the future. Keep on running the
                 * update worker thread.
+                * If we were marked on cpu_stat_off clear the flag
+                * so that vmstat_shepherd doesn't schedule us again.
                 */
-               queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-                       this_cpu_ptr(&vmstat_work),
-                       round_jiffies_relative(sysctl_stat_interval));
+               if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+                                               cpu_stat_off)) {
+                       queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+                               this_cpu_ptr(&vmstat_work),
+                               round_jiffies_relative(sysctl_stat_interval));
+               }
        } else {
                /*
                 * We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
-{
-       if (system_state != SYSTEM_RUNNING)
-               return;
-
-       do {
-               if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-                       cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
-       } while (refresh_cpu_vm_stats(false));
-}
-
 /*
  * Check if the diffs for a certain cpu indicate that
  * an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
        return false;
 }
 
+void quiet_vmstat(void)
+{
+       if (system_state != SYSTEM_RUNNING)
+               return;
+
+       /*
+        * If we are already in hands of the shepherd then there
+        * is nothing for us to do here.
+        */
+       if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+               return;
+
+       if (!need_update(smp_processor_id()))
+               return;
+
+       /*
+        * Just refresh counters and do not care about the pending delayed
+        * vmstat_update. It doesn't fire that often to matter and canceling
+        * it would be too expensive from this path.
+        * vmstat_shepherd will take care about that for us.
+        */
+       refresh_cpu_vm_stats(false);
+}
+
 
 /*
  * Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
 
        get_online_cpus();
        /* Check processors whose vmstat worker threads have been disabled */
-       for_each_cpu(cpu, cpu_stat_off)
-               if (need_update(cpu) &&
-                       cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
-                       queue_delayed_work_on(cpu, vmstat_wq,
-                               &per_cpu(vmstat_work, cpu), 0);
+       for_each_cpu(cpu, cpu_stat_off) {
+               struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+               if (need_update(cpu)) {
+                       if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+                               queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+               } else {
+                       /*
+                        * Cancel the work if quiet_vmstat has put this
+                        * cpu on cpu_stat_off because the work item might
+                        * be still scheduled
+                        */
+                       cancel_delayed_work(dw);
+               }
+       }
        put_online_cpus();
 
        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
-
 }
 
 static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
        int cpu;
 
        for_each_possible_cpu(cpu)
-               INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+               INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                        vmstat_update);
 
        if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
index d5871ac..f066781 100644 (file)
@@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 
                rt = atrtr_find(&at_hint);
        }
-       err = ENETUNREACH;
+       err = -ENETUNREACH;
        if (!rt)
                goto out;
 
index e6c8382..ccf70be 100644 (file)
@@ -527,11 +527,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
                 * gets dereferenced.
                 */
                spin_lock_bh(&bat_priv->gw.list_lock);
-               hlist_del_init_rcu(&gw_node->list);
+               if (!hlist_unhashed(&gw_node->list)) {
+                       hlist_del_init_rcu(&gw_node->list);
+                       batadv_gw_node_free_ref(gw_node);
+               }
                spin_unlock_bh(&bat_priv->gw.list_lock);
 
-               batadv_gw_node_free_ref(gw_node);
-
                curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
                if (gw_node == curr_gw)
                        batadv_gw_reselect(bat_priv);
index 01acccc..57f7107 100644 (file)
@@ -75,6 +75,28 @@ out:
        return hard_iface;
 }
 
+/**
+ * batadv_mutual_parents - check if two devices are each others parent
+ * @dev1: 1st net_device
+ * @dev2: 2nd net_device
+ *
+ * veth devices come in pairs and each is the parent of the other!
+ *
+ * Return: true if the devices are each others parent, otherwise false
+ */
+static bool batadv_mutual_parents(const struct net_device *dev1,
+                                 const struct net_device *dev2)
+{
+       int dev1_parent_iflink = dev_get_iflink(dev1);
+       int dev2_parent_iflink = dev_get_iflink(dev2);
+
+       if (!dev1_parent_iflink || !dev2_parent_iflink)
+               return false;
+
+       return (dev1_parent_iflink == dev2->ifindex) &&
+              (dev2_parent_iflink == dev1->ifindex);
+}
+
 /**
  * batadv_is_on_batman_iface - check if a device is a batman iface descendant
  * @net_dev: the device to check
@@ -108,6 +130,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
        if (WARN(!parent_dev, "Cannot find parent device"))
                return false;
 
+       if (batadv_mutual_parents(net_dev, parent_dev))
+               return false;
+
        ret = batadv_is_on_batman_iface(parent_dev);
 
        return ret;
index cdfc85f..0e80fd1 100644 (file)
@@ -303,9 +303,11 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
 
        if (atomic_add_return(v, &vlan->tt.num_entries) == 0) {
                spin_lock_bh(&orig_node->vlan_list_lock);
-               hlist_del_init_rcu(&vlan->list);
+               if (!hlist_unhashed(&vlan->list)) {
+                       hlist_del_init_rcu(&vlan->list);
+                       batadv_orig_node_vlan_free_ref(vlan);
+               }
                spin_unlock_bh(&orig_node->vlan_list_lock);
-               batadv_orig_node_vlan_free_ref(vlan);
        }
 
        batadv_orig_node_vlan_free_ref(vlan);
index d040365..8a4cc2f 100644 (file)
@@ -307,6 +307,9 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
 
        /* check that it's our buffer */
        if (lowpan_is_ipv6(*skb_network_header(skb))) {
+               /* Pull off the 1-byte of 6lowpan header. */
+               skb_pull(skb, 1);
+
                /* Copy the packet so that the IPv6 header is
                 * properly aligned.
                 */
@@ -317,6 +320,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
 
                local_skb->protocol = htons(ETH_P_IPV6);
                local_skb->pkt_type = PACKET_HOST;
+               local_skb->dev = dev;
 
                skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
 
@@ -335,6 +339,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
                if (!local_skb)
                        goto drop;
 
+               local_skb->dev = dev;
+
                ret = iphc_decompress(local_skb, dev, chan);
                if (ret < 0) {
                        kfree_skb(local_skb);
@@ -343,7 +349,6 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
 
                local_skb->protocol = htons(ETH_P_IPV6);
                local_skb->pkt_type = PACKET_HOST;
-               local_skb->dev = dev;
 
                if (give_skb_to_upper(local_skb, dev)
                                != NET_RX_SUCCESS) {
index 47bcef7..883c821 100644 (file)
@@ -4112,8 +4112,10 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
                        break;
                }
 
-               *req_complete = bt_cb(skb)->hci.req_complete;
-               *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+               if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB)
+                       *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+               else
+                       *req_complete = bt_cb(skb)->hci.req_complete;
                kfree_skb(skb);
        }
        spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
index 41b5f38..c78ee2d 100644 (file)
@@ -688,21 +688,29 @@ static u8 update_white_list(struct hci_request *req)
         * command to remove it from the controller.
         */
        list_for_each_entry(b, &hdev->le_white_list, list) {
-               struct hci_cp_le_del_from_white_list cp;
+               /* If the device is neither in pend_le_conns nor
+                * pend_le_reports then remove it from the whitelist.
+                */
+               if (!hci_pend_le_action_lookup(&hdev->pend_le_conns,
+                                              &b->bdaddr, b->bdaddr_type) &&
+                   !hci_pend_le_action_lookup(&hdev->pend_le_reports,
+                                              &b->bdaddr, b->bdaddr_type)) {
+                       struct hci_cp_le_del_from_white_list cp;
+
+                       cp.bdaddr_type = b->bdaddr_type;
+                       bacpy(&cp.bdaddr, &b->bdaddr);
 
-               if (hci_pend_le_action_lookup(&hdev->pend_le_conns,
-                                             &b->bdaddr, b->bdaddr_type) ||
-                   hci_pend_le_action_lookup(&hdev->pend_le_reports,
-                                             &b->bdaddr, b->bdaddr_type)) {
-                       white_list_entries++;
+                       hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
+                                   sizeof(cp), &cp);
                        continue;
                }
 
-               cp.bdaddr_type = b->bdaddr_type;
-               bacpy(&cp.bdaddr, &b->bdaddr);
+               if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
+                       /* White list can not be used with RPAs */
+                       return 0x00;
+               }
 
-               hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
-                           sizeof(cp), &cp);
+               white_list_entries++;
        }
 
        /* Since all no longer valid white list entries have been
index 39a5149..eb4f5f2 100644 (file)
@@ -197,10 +197,20 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
                chan->sport = psm;
                err = 0;
        } else {
-               u16 p;
+               u16 p, start, end, incr;
+
+               if (chan->src_type == BDADDR_BREDR) {
+                       start = L2CAP_PSM_DYN_START;
+                       end = L2CAP_PSM_AUTO_END;
+                       incr = 2;
+               } else {
+                       start = L2CAP_PSM_LE_DYN_START;
+                       end = L2CAP_PSM_LE_DYN_END;
+                       incr = 1;
+               }
 
                err = -EINVAL;
-               for (p = 0x1001; p < 0x1100; p += 2)
+               for (p = start; p <= end; p += incr)
                        if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) {
                                chan->psm   = cpu_to_le16(p);
                                chan->sport = cpu_to_le16(p);
index 1bb5515..e4cae72 100644 (file)
@@ -58,7 +58,7 @@ static int l2cap_validate_bredr_psm(u16 psm)
                return -EINVAL;
 
        /* Restrict usage of well-known PSMs */
-       if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE))
+       if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE))
                return -EACCES;
 
        return 0;
@@ -67,11 +67,11 @@ static int l2cap_validate_bredr_psm(u16 psm)
 static int l2cap_validate_le_psm(u16 psm)
 {
        /* Valid LE_PSM ranges are defined only until 0x00ff */
-       if (psm > 0x00ff)
+       if (psm > L2CAP_PSM_LE_DYN_END)
                return -EINVAL;
 
        /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */
-       if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE))
+       if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE))
                return -EACCES;
 
        return 0;
@@ -125,6 +125,9 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
                        goto done;
        }
 
+       bacpy(&chan->src, &la.l2_bdaddr);
+       chan->src_type = la.l2_bdaddr_type;
+
        if (la.l2_cid)
                err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid));
        else
@@ -156,9 +159,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
                break;
        }
 
-       bacpy(&chan->src, &la.l2_bdaddr);
-       chan->src_type = la.l2_bdaddr_type;
-
        if (chan->psm && bdaddr_type_is_le(chan->src_type))
                chan->mode = L2CAP_MODE_LE_FLOWCTL;
 
index ffed8a1..4b175df 100644 (file)
@@ -1072,22 +1072,6 @@ static void smp_notify_keys(struct l2cap_conn *conn)
                        hcon->dst_type = smp->remote_irk->addr_type;
                        queue_work(hdev->workqueue, &conn->id_addr_update_work);
                }
-
-               /* When receiving an indentity resolving key for
-                * a remote device that does not use a resolvable
-                * private address, just remove the key so that
-                * it is possible to use the controller white
-                * list for scanning.
-                *
-                * Userspace will have been told to not store
-                * this key at this point. So it is safe to
-                * just remove it.
-                */
-               if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) {
-                       list_del_rcu(&smp->remote_irk->list);
-                       kfree_rcu(smp->remote_irk, rcu);
-                       smp->remote_irk = NULL;
-               }
        }
 
        if (smp->csrk) {
index a1abe49..3addc05 100644 (file)
@@ -121,6 +121,7 @@ static struct notifier_block br_device_notifier = {
        .notifier_call = br_device_event
 };
 
+/* called with RTNL */
 static int br_switchdev_event(struct notifier_block *unused,
                              unsigned long event, void *ptr)
 {
@@ -130,7 +131,6 @@ static int br_switchdev_event(struct notifier_block *unused,
        struct switchdev_notifier_fdb_info *fdb_info;
        int err = NOTIFY_DONE;
 
-       rtnl_lock();
        p = br_port_get_rtnl(dev);
        if (!p)
                goto out;
@@ -155,7 +155,6 @@ static int br_switchdev_event(struct notifier_block *unused,
        }
 
 out:
-       rtnl_unlock();
        return err;
 }
 
index 82e3e97..dcea4f4 100644 (file)
@@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb,
                struct net_bridge_fdb_entry *f;
 
                hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+                       int err;
+
                        if (idx < cb->args[0])
                                goto skip;
 
@@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb,
                        if (!filter_dev && f->dst)
                                goto skip;
 
-                       if (fdb_fill_info(skb, br, f,
-                                         NETLINK_CB(cb->skb).portid,
-                                         cb->nlh->nlmsg_seq,
-                                         RTM_NEWNEIGH,
-                                         NLM_F_MULTI) < 0)
+                       err = fdb_fill_info(skb, br, f,
+                                           NETLINK_CB(cb->skb).portid,
+                                           cb->nlh->nlmsg_seq,
+                                           RTM_NEWNEIGH,
+                                           NLM_F_MULTI);
+                       if (err < 0) {
+                               cb->args[1] = err;
                                break;
+                       }
 skip:
                        ++idx;
                }
index 30e105f..74c278e 100644 (file)
@@ -425,8 +425,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
        mp = br_mdb_ip_get(mdb, group);
        if (!mp) {
                mp = br_multicast_new_group(br, port, group);
-               err = PTR_ERR(mp);
-               if (IS_ERR(mp))
+               err = PTR_ERR_OR_ZERO(mp);
+               if (err)
                        return err;
        }
 
index 61d7617..b82440e 100644 (file)
@@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
                tmppkt = NULL;
 
                /* Verify that length is correct */
-               err = EPROTO;
+               err = -EPROTO;
                if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
                        goto out;
        }
index 393bfb2..5fcfb98 100644 (file)
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
  * @local_retries: localized retries
  * @local_fallback_retries: localized fallback retries
  * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
  * @vary_r: pass r to recursive calls
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  * @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
                               unsigned int local_fallback_retries,
                               int recurse_to_leaf,
                               unsigned int vary_r,
+                              unsigned int stable,
                               int *out2,
                               int parent_r)
 {
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
        int collide, reject;
        int count = out_size;
 
-       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
                recurse_to_leaf ? "_LEAF" : "",
                bucket->id, x, outpos, numrep,
                tries, recurse_tries, local_retries, local_fallback_retries,
-               parent_r);
+               parent_r, stable);
 
-       for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+       for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
                /* keep trying until we get a non-out, non-colliding item */
                ftotal = 0;
                skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
                                                if (crush_choose_firstn(map,
                                                         map->buckets[-1-item],
                                                         weight, weight_max,
-                                                        x, outpos+1, 0,
+                                                        x, stable ? 1 : outpos+1, 0,
                                                         out2, outpos, count,
                                                         recurse_tries, 0,
                                                         local_retries,
                                                         local_fallback_retries,
                                                         0,
                                                         vary_r,
+                                                        stable,
                                                         NULL,
                                                         sub_r) <= outpos)
                                                        /* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
        int choose_local_fallback_retries = map->choose_local_fallback_tries;
 
        int vary_r = map->chooseleaf_vary_r;
+       int stable = map->chooseleaf_stable;
 
        if ((__u32)ruleno >= map->max_rules) {
                dprintk(" bad ruleno %d\n", ruleno);
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
                case CRUSH_RULE_TAKE:
                        if ((curstep->arg1 >= 0 &&
                             curstep->arg1 < map->max_devices) ||
-                           (-1-curstep->arg1 < map->max_buckets &&
+                           (-1-curstep->arg1 >= 0 &&
+                            -1-curstep->arg1 < map->max_buckets &&
                             map->buckets[-1-curstep->arg1])) {
                                w[0] = curstep->arg1;
                                wsize = 1;
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
                                vary_r = curstep->arg1;
                        break;
 
+               case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+                       if (curstep->arg1 >= 0)
+                               stable = curstep->arg1;
+                       break;
+
                case CRUSH_RULE_CHOOSELEAF_FIRSTN:
                case CRUSH_RULE_CHOOSE_FIRSTN:
                        firstn = 1;
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
                        osize = 0;
 
                        for (i = 0; i < wsize; i++) {
+                               int bno;
                                /*
                                 * see CRUSH_N, CRUSH_N_MINUS macros.
                                 * basically, numrep <= 0 means relative to
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
                                                continue;
                                }
                                j = 0;
+                               /* make sure bucket id is valid */
+                               bno = -1 - w[i];
+                               if (bno < 0 || bno >= map->max_buckets) {
+                                       /* w[i] is probably CRUSH_ITEM_NONE */
+                                       dprintk("  bad w[i] %d\n", w[i]);
+                                       continue;
+                               }
                                if (firstn) {
                                        int recurse_tries;
                                        if (choose_leaf_tries)
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
                                                recurse_tries = choose_tries;
                                        osize += crush_choose_firstn(
                                                map,
-                                               map->buckets[-1-w[i]],
+                                               map->buckets[bno],
                                                weight, weight_max,
                                                x, numrep,
                                                curstep->arg2,
@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
                                                choose_local_fallback_retries,
                                                recurse_to_leaf,
                                                vary_r,
+                                               stable,
                                                c+osize,
                                                0);
                                } else {
@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map,
                                                    numrep : (result_max-osize));
                                        crush_choose_indep(
                                                map,
-                                               map->buckets[-1-w[i]],
+                                               map->buckets[bno],
                                                weight, weight_max,
                                                x, out_size, numrep,
                                                curstep->arg2,
index 9cfedf5..9382619 100644 (file)
@@ -1197,6 +1197,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
        return new_piece;
 }
 
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+       return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+           sizeof(struct ceph_msg_footer) :
+           sizeof(struct ceph_msg_footer_old);
+}
+
 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
        BUG_ON(!msg);
@@ -2335,9 +2342,9 @@ static int read_partial_message(struct ceph_connection *con)
                        ceph_pr_addr(&con->peer_addr.in_addr),
                        seq, con->in_seq + 1);
                con->in_base_pos = -front_len - middle_len - data_len -
-                       sizeof(m->footer);
+                       sizeof_footer(con);
                con->in_tag = CEPH_MSGR_TAG_READY;
-               return 0;
+               return 1;
        } else if ((s64)seq - (s64)con->in_seq > 1) {
                pr_err("read_partial_message bad seq %lld expected %lld\n",
                       seq, con->in_seq + 1);
@@ -2360,10 +2367,10 @@ static int read_partial_message(struct ceph_connection *con)
                        /* skip this message */
                        dout("alloc_msg said skip message\n");
                        con->in_base_pos = -front_len - middle_len - data_len -
-                               sizeof(m->footer);
+                               sizeof_footer(con);
                        con->in_tag = CEPH_MSGR_TAG_READY;
                        con->in_seq++;
-                       return 0;
+                       return 1;
                }
 
                BUG_ON(!con->in_msg);
index f8f2359..5bc0537 100644 (file)
@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
        u32 osdmap_epoch;
        int already_completed;
        u32 bytes;
+       u8 decode_redir;
        unsigned int i;
 
        tid = le64_to_cpu(msg->hdr.tid);
@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                p += 8 + 4; /* skip replay_version */
                p += 8; /* skip user_version */
 
+               if (le16_to_cpu(msg->hdr.version) >= 7)
+                       ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+               else
+                       decode_redir = 1;
+       } else {
+               decode_redir = 0;
+       }
+
+       if (decode_redir) {
                err = ceph_redirect_decode(&p, end, &redir);
                if (err)
                        goto bad_put;
@@ -2843,8 +2853,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        mutex_lock(&osdc->request_mutex);
        req = __lookup_request(osdc, tid);
        if (!req) {
-               pr_warn("%s osd%d tid %llu unknown, skipping\n",
-                       __func__, osd->o_osd, tid);
+               dout("%s osd%d tid %llu unknown, skipping\n", __func__,
+                    osd->o_osd, tid);
                m = NULL;
                *skip = 1;
                goto out;
index 7d8f581..243574c 100644 (file)
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
         c->choose_local_tries = ceph_decode_32(p);
         c->choose_local_fallback_tries =  ceph_decode_32(p);
         c->choose_total_tries = ceph_decode_32(p);
-        dout("crush decode tunable choose_local_tries = %d",
+        dout("crush decode tunable choose_local_tries = %d\n",
              c->choose_local_tries);
-        dout("crush decode tunable choose_local_fallback_tries = %d",
+        dout("crush decode tunable choose_local_fallback_tries = %d\n",
              c->choose_local_fallback_tries);
-        dout("crush decode tunable choose_total_tries = %d",
+        dout("crush decode tunable choose_total_tries = %d\n",
              c->choose_total_tries);
 
        ceph_decode_need(p, end, sizeof(u32), done);
        c->chooseleaf_descend_once = ceph_decode_32(p);
-       dout("crush decode tunable chooseleaf_descend_once = %d",
+       dout("crush decode tunable chooseleaf_descend_once = %d\n",
             c->chooseleaf_descend_once);
 
        ceph_decode_need(p, end, sizeof(u8), done);
        c->chooseleaf_vary_r = ceph_decode_8(p);
-       dout("crush decode tunable chooseleaf_vary_r = %d",
+       dout("crush decode tunable chooseleaf_vary_r = %d\n",
             c->chooseleaf_vary_r);
 
+       /* skip straw_calc_version, allowed_bucket_algs */
+       ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
+       *p += sizeof(u8) + sizeof(u32);
+
+       ceph_decode_need(p, end, sizeof(u8), done);
+       c->chooseleaf_stable = ceph_decode_8(p);
+       dout("crush decode tunable chooseleaf_stable = %d\n",
+            c->chooseleaf_stable);
+
 done:
        dout("crush_decode success\n");
        return c;
index cc9e365..0ef061b 100644 (file)
@@ -4351,6 +4351,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 
                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
                diffs |= p->vlan_tci ^ skb->vlan_tci;
+               diffs |= skb_metadata_dst_cmp(p, skb);
                if (maclen == ETH_HLEN)
                        diffs |= compare_ether_header(skb_mac_header(p),
                                                      skb_mac_header(skb));
@@ -4548,10 +4549,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
                break;
 
        case GRO_MERGED_FREE:
-               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+                       skb_dst_drop(skb);
                        kmem_cache_free(skbuff_head_cache, skb);
-               else
+               } else {
                        __kfree_skb(skb);
+               }
                break;
 
        case GRO_HELD:
@@ -5376,12 +5379,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 {
        struct netdev_adjacent *lower;
 
-       lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+       lower = list_entry(*iter, struct netdev_adjacent, list);
 
        if (&lower->list == &dev->adj_list.lower)
                return NULL;
 
-       *iter = &lower->list;
+       *iter = lower->list.next;
 
        return lower->dev;
 }
@@ -7419,8 +7422,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);
 
-       if (!dev->tx_queue_len)
+       if (!dev->tx_queue_len) {
                dev->priv_flags |= IFF_NO_QUEUE;
+               dev->tx_queue_len = 1;
+       }
 
        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
index 94d2620..bba502f 100644 (file)
@@ -1752,7 +1752,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
        u8 compat[sizeof(struct bpf_tunnel_key)];
        struct ip_tunnel_info *info;
 
-       if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6)))
+       if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
                return -EINVAL;
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                switch (size) {
@@ -1776,7 +1776,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
        info = &md->u.tun_info;
        info->mode = IP_TUNNEL_INFO_TX;
 
-       info->key.tun_flags = TUNNEL_KEY;
+       info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
        info->key.tun_id = cpu_to_be64(from->tunnel_id);
        info->key.tos = from->tunnel_tos;
        info->key.ttl = from->tunnel_ttl;
@@ -1787,6 +1787,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
                       sizeof(from->remote_ipv6));
        } else {
                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+               if (flags & BPF_F_ZERO_CSUM_TX)
+                       info->key.tun_flags &= ~TUNNEL_CSUM;
        }
 
        return 0;
index d79699c..12e7003 100644 (file)
@@ -208,7 +208,6 @@ ip:
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *iph;
                struct ipv6hdr _iph;
-               __be32 flow_label;
 
 ipv6:
                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
@@ -230,8 +229,12 @@ ipv6:
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                }
 
-               flow_label = ip6_flowlabel(iph);
-               if (flow_label) {
+               if ((dissector_uses_key(flow_dissector,
+                                       FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+                    (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+                   ip6_flowlabel(iph)) {
+                       __be32 flow_label = ip6_flowlabel(iph);
+
                        if (dissector_uses_key(flow_dissector,
                                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                                key_tags = skb_flow_dissector_target(flow_dissector,
@@ -396,6 +399,13 @@ ip_proto_again:
                                goto out_bad;
                        proto = eth->h_proto;
                        nhoff += sizeof(*eth);
+
+                       /* Cap headers that we access via pointers at the
+                        * end of the Ethernet header as our maximum alignment
+                        * at that point is only 2 bytes.
+                        */
+                       if (NET_IP_ALIGN)
+                               hlen = nhoff;
                }
 
                key_control->flags |= FLOW_DIS_ENCAPSULATION;
index d735e85..8261d95 100644 (file)
@@ -2911,6 +2911,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
        nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
 out:
        netif_addr_unlock_bh(dev);
+       cb->args[1] = err;
        return idx;
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -2944,6 +2945,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
                ops = br_dev->netdev_ops;
        }
 
+       cb->args[1] = 0;
        for_each_netdev(net, dev) {
                if (brport_idx && (dev->ifindex != brport_idx))
                        continue;
@@ -2971,12 +2973,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
                                idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
                                                         idx);
                }
+               if (cb->args[1] == -EMSGSIZE)
+                       break;
 
                if (dev->netdev_ops->ndo_fdb_dump)
                        idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
                                                            idx);
                else
                        idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+               if (cb->args[1] == -EMSGSIZE)
+                       break;
 
                cops = NULL;
        }
index 14596fb..2696aef 100644 (file)
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
                *fplp = fpl;
                fpl->count = 0;
                fpl->max = SCM_MAX_FD;
+               fpl->user = NULL;
        }
        fpp = &fpl->fp[fpl->count];
 
@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
                *fpp++ = file;
                fpl->count++;
        }
+
+       if (!fpl->user)
+               fpl->user = get_uid(current_user());
+
        return num;
 }
 
@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
                scm->fp = NULL;
                for (i=fpl->count-1; i>=0; i--)
                        fput(fpl->fp[i]);
+               free_uid(fpl->user);
                kfree(fpl);
        }
 }
@@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
                for (i = 0; i < fpl->count; i++)
                        get_file(fpl->fp[i]);
                new_fpl->max = new_fpl->count;
+               new_fpl->user = get_uid(fpl->user);
        }
        return new_fpl;
 }
index b2df375..8616d11 100644 (file)
@@ -79,6 +79,8 @@
 
 struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 /**
  *     skb_panic - private function for out-of-line support
@@ -2945,6 +2947,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 }
 EXPORT_SYMBOL_GPL(skb_append_pagefrags);
 
+/**
+ *     skb_push_rcsum - push skb and update receive checksum
+ *     @skb: buffer to update
+ *     @len: length of data pulled
+ *
+ *     This function performs an skb_push on the packet and updates
+ *     the CHECKSUM_COMPLETE checksum.  It should be used on
+ *     receive path processing instead of skb_push unless you know
+ *     that the checksum difference is zero (e.g., a valid IP header)
+ *     or you are setting ip_summed to CHECKSUM_NONE.
+ */
+static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
+{
+       skb_push(skb, len);
+       skb_postpush_rcsum(skb, skb->data, len);
+       return skb->data;
+}
+
 /**
  *     skb_pull_rcsum - pull skb and update receive checksum
  *     @skb: buffer to update
@@ -4082,9 +4102,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
        if (!pskb_may_pull(skb_chk, offset))
                goto err;
 
-       __skb_pull(skb_chk, offset);
+       skb_pull_rcsum(skb_chk, offset);
        ret = skb_chkf(skb_chk);
-       __skb_push(skb_chk, offset);
+       skb_push_rcsum(skb_chk, offset);
 
        if (ret)
                goto err;
index 1df98c5..e92b759 100644 (file)
@@ -93,10 +93,17 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
  *  @sk2: Socket belonging to the existing reuseport group.
  *  May return ENOMEM and not add socket to group under memory pressure.
  */
-int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2)
 {
        struct sock_reuseport *reuse;
 
+       if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
+               int err = reuseport_alloc(sk2);
+
+               if (err)
+                       return err;
+       }
+
        spin_lock_bh(&reuseport_lock);
        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
                                          lockdep_is_held(&reuseport_lock)),
index 95b6139..a6beb7b 100644 (file)
@@ -26,6 +26,7 @@ static int zero = 0;
 static int one = 1;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
 
 static int net_msg_warn;       /* Unused, but still a sysctl */
 
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "max_skb_frags",
+               .data           = &sysctl_max_skb_frags,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+               .extra2         = &max_skb_frags,
+       },
        { }
 };
 
index 5684e14..902d606 100644 (file)
@@ -824,26 +824,26 @@ lookup:
 
        if (sk->sk_state == DCCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
-               struct sock *nsk = NULL;
+               struct sock *nsk;
 
                sk = req->rsk_listener;
-               if (likely(sk->sk_state == DCCP_LISTEN)) {
-                       nsk = dccp_check_req(sk, skb, req);
-               } else {
+               if (unlikely(sk->sk_state != DCCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
+               sock_hold(sk);
+               nsk = dccp_check_req(sk, skb, req);
                if (!nsk) {
                        reqsk_put(req);
-                       goto discard_it;
+                       goto discard_and_relse;
                }
                if (nsk == sk) {
-                       sock_hold(sk);
                        reqsk_put(req);
                } else if (dccp_child_process(sk, nsk, skb)) {
                        dccp_v4_ctl_send_reset(sk, skb);
-                       goto discard_it;
+                       goto discard_and_relse;
                } else {
+                       sock_put(sk);
                        return 0;
                }
        }
index 9c6d050..b8608b7 100644 (file)
@@ -691,26 +691,26 @@ lookup:
 
        if (sk->sk_state == DCCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
-               struct sock *nsk = NULL;
+               struct sock *nsk;
 
                sk = req->rsk_listener;
-               if (likely(sk->sk_state == DCCP_LISTEN)) {
-                       nsk = dccp_check_req(sk, skb, req);
-               } else {
+               if (unlikely(sk->sk_state != DCCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
+               sock_hold(sk);
+               nsk = dccp_check_req(sk, skb, req);
                if (!nsk) {
                        reqsk_put(req);
-                       goto discard_it;
+                       goto discard_and_relse;
                }
                if (nsk == sk) {
-                       sock_hold(sk);
                        reqsk_put(req);
                } else if (dccp_child_process(sk, nsk, skb)) {
                        dccp_v6_ctl_send_reset(sk, skb);
-                       goto discard_it;
+                       goto discard_and_relse;
                } else {
+                       sock_put(sk);
                        return 0;
                }
        }
index 40b9ca7..ab24521 100644 (file)
@@ -1194,7 +1194,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
        if (ret) {
                netdev_err(master, "error %d registering interface %s\n",
                           ret, slave_dev->name);
-               phy_disconnect(p->phy);
                ds->ports[port] = NULL;
                free_netdev(slave_dev);
                return ret;
@@ -1205,6 +1204,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
        ret = dsa_slave_phy_setup(p, slave_dev);
        if (ret) {
                netdev_err(master, "error %d setting up slave phy\n", ret);
+               unregister_netdev(slave_dev);
                free_netdev(slave_dev);
                return ret;
        }
index c229205..7758247 100644 (file)
@@ -353,6 +353,7 @@ config INET_ESP
        select CRYPTO_CBC
        select CRYPTO_SHA1
        select CRYPTO_DES
+       select CRYPTO_ECHAINIV
        ---help---
          Support for IPsec ESP.
 
index cebd9d3..f6303b1 100644 (file)
@@ -1847,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
        if (err < 0)
                goto errout;
 
-       err = EINVAL;
+       err = -EINVAL;
        if (!tb[NETCONFA_IFINDEX])
                goto errout;
 
index 7aea0cc..d07fc07 100644 (file)
@@ -1394,9 +1394,10 @@ found:
                struct fib_info *fi = fa->fa_info;
                int nhsel, err;
 
-               if ((index >= (1ul << fa->fa_slen)) &&
-                   ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
-                       continue;
+               if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
+                       if (index >= (1ul << fa->fa_slen))
+                               continue;
+               }
                if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
                        continue;
                if (fi->fib_dead)
index 05e4cba..b3086cf 100644 (file)
@@ -356,9 +356,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
        skb_dst_set(skb, &rt->dst);
        skb->dev = dev;
 
-       skb->reserved_tailroom = skb_end_offset(skb) -
-                                min(mtu, skb_end_offset(skb));
        skb_reserve(skb, hlen);
+       skb_tailroom_reserve(skb, mtu, tlen);
 
        skb_reset_network_header(skb);
        pip = ip_hdr(skb);
index 46b9c88..6414891 100644 (file)
@@ -789,14 +789,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
        reqsk_put(req);
 }
 
-void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
-                             struct sock *child)
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+                                     struct request_sock *req,
+                                     struct sock *child)
 {
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 
        spin_lock(&queue->rskq_lock);
        if (unlikely(sk->sk_state != TCP_LISTEN)) {
                inet_child_forget(sk, req, child);
+               child = NULL;
        } else {
                req->sk = child;
                req->dl_next = NULL;
@@ -808,6 +810,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
                sk_acceptq_added(sk);
        }
        spin_unlock(&queue->rskq_lock);
+       return child;
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
@@ -817,11 +820,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
        if (own_req) {
                inet_csk_reqsk_queue_drop(sk, req);
                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-               inet_csk_reqsk_queue_add(sk, req, child);
-               /* Warning: caller must not call reqsk_put(req);
-                * child stole last reference on it.
-                */
-               return child;
+               if (inet_csk_reqsk_queue_add(sk, req, child))
+                       return child;
        }
        /* Too bad, another child took ownership of the request, undo. */
        bh_unlock_sock(child);
index 8bb8e7a..6029157 100644 (file)
@@ -361,13 +361,20 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
                                 req->id.idiag_dport, req->id.idiag_src[0],
                                 req->id.idiag_sport, req->id.idiag_if);
 #if IS_ENABLED(CONFIG_IPV6)
-       else if (req->sdiag_family == AF_INET6)
-               sk = inet6_lookup(net, hashinfo,
-                                 (struct in6_addr *)req->id.idiag_dst,
-                                 req->id.idiag_dport,
-                                 (struct in6_addr *)req->id.idiag_src,
-                                 req->id.idiag_sport,
-                                 req->id.idiag_if);
+       else if (req->sdiag_family == AF_INET6) {
+               if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+                   ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+                       sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+                                        req->id.idiag_dport, req->id.idiag_src[3],
+                                        req->id.idiag_sport, req->id.idiag_if);
+               else
+                       sk = inet6_lookup(net, hashinfo,
+                                         (struct in6_addr *)req->id.idiag_dst,
+                                         req->id.idiag_dport,
+                                         (struct in6_addr *)req->id.idiag_src,
+                                         req->id.idiag_sport,
+                                         req->id.idiag_if);
+       }
 #endif
        else
                return ERR_PTR(-EINVAL);
index 3f00810..187c6fc 100644 (file)
@@ -661,6 +661,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
        struct ipq *qp;
 
        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+       skb_orphan(skb);
 
        /* Lookup (or create) queue header */
        qp = ip_find(net, ip_hdr(skb), user, vif);
index 7c51c4e..41ba68d 100644 (file)
@@ -1054,8 +1054,9 @@ static const struct net_device_ops gre_tap_netdev_ops = {
 static void ipgre_tap_setup(struct net_device *dev)
 {
        ether_setup(dev);
-       dev->netdev_ops         = &gre_tap_netdev_ops;
-       dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
+       dev->netdev_ops = &gre_tap_netdev_ops;
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+       dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, gre_tap_net_id);
 }
 
@@ -1240,6 +1241,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
        err = ipgre_newlink(net, dev, tb, NULL);
        if (err < 0)
                goto out;
+
+       /* openvswitch users expect packet sizes to be unrestricted,
+        * so set the largest MTU we can.
+        */
+       err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+       if (err)
+               goto out;
+
        return dev;
 out:
        free_netdev(dev);
index b1209b6..d77eb0c 100644 (file)
@@ -316,7 +316,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
        const struct iphdr *iph = ip_hdr(skb);
        struct rtable *rt;
 
-       if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+       if (sysctl_ip_early_demux &&
+           !skb_dst(skb) &&
+           !skb->sk &&
+           !ip_is_fragment(iph)) {
                const struct net_protocol *ipprot;
                int protocol = iph->protocol;
 
index 64878ef..565bf64 100644 (file)
@@ -1236,13 +1236,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
        if (!skb)
                return -EINVAL;
 
-       cork->length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
+               if (skb->ip_summed != CHECKSUM_PARTIAL)
+                       return -EOPNOTSUPP;
+
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }
+       cork->length += size;
 
        while (size > 0) {
                if (skb_is_gso(skb)) {
index 5f73a7c..a501242 100644 (file)
@@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
                switch (cmsg->cmsg_type) {
                case IP_RETOPTS:
                        err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+
+                       /* Our caller is responsible for freeing ipc->opt */
                        err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
                                             err < 40 ? err : 40);
                        if (err)
index c7bd72e..336e689 100644 (file)
@@ -661,6 +661,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */
@@ -758,7 +760,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;
 
-                       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
@@ -943,17 +944,31 @@ done:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 
-int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 {
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+       int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 
-       if (new_mtu < 68 ||
-           new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+       if (new_mtu < 68)
                return -EINVAL;
+
+       if (new_mtu > max_mtu) {
+               if (strict)
+                       return -EINVAL;
+
+               new_mtu = max_mtu;
+       }
+
        dev->mtu = new_mtu;
        return 0;
 }
+EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+       return __ip_tunnel_change_mtu(dev, new_mtu, true);
+}
 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 
 static void ip_tunnel_dev_free(struct net_device *dev)
index 67f7c9d..2ed9dd2 100644 (file)
@@ -143,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata;
 
 /* Persistent data: */
 
+#ifdef IPCONFIG_DYNAMIC
 static int ic_proto_used;                      /* Protocol used, if any */
+#else
+#define ic_proto_used 0
+#endif
 static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
 static u8 ic_domain[64];               /* DNS (not NIS) domain name */
 
index 6fb869f..a04dee5 100644 (file)
@@ -27,8 +27,6 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
 {
        int err;
 
-       skb_orphan(skb);
-
        local_bh_disable();
        err = ip_defrag(net, skb, user);
        local_bh_enable();
index c117b21..d3a2716 100644 (file)
@@ -746,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
        if (msg->msg_controllen) {
                err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
-               if (err)
+               if (unlikely(err)) {
+                       kfree(ipc.opt);
                        return err;
+               }
                if (ipc.opt)
                        free = 1;
        }
index bc35f18..7113bae 100644 (file)
@@ -547,8 +547,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
        if (msg->msg_controllen) {
                err = ip_cmsg_send(net, msg, &ipc, false);
-               if (err)
+               if (unlikely(err)) {
+                       kfree(ipc.opt);
                        goto out;
+               }
                if (ipc.opt)
                        free = 1;
        }
index 85f184e..02c6229 100644 (file)
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly  = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
 
+static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
 /*
  *     Interface to generic destination cache.
  */
@@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
                                struct fib_nh *nh = &FIB_RES_NH(res);
 
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
-                                                     0, 0);
+                                               0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash;
+       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+       u32 hval = fnhe_hashfun(daddr);
+
+       spin_lock_bh(&fnhe_lock);
+
+       hash = rcu_dereference_protected(nh->nh_exceptions,
+                                        lockdep_is_held(&fnhe_lock));
+       hash += hval;
+
+       fnhe_p = &hash->chain;
+       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+       while (fnhe) {
+               if (fnhe->fnhe_daddr == daddr) {
+                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+                       fnhe_flush_routes(fnhe);
+                       kfree_rcu(fnhe, rcu);
+                       break;
+               }
+               fnhe_p = &fnhe->fnhe_next;
+               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+                                                lockdep_is_held(&fnhe_lock));
+       }
+
+       spin_unlock_bh(&fnhe_lock);
+}
+
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
@@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb,
 
        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
-               if (fnhe)
+               if (fnhe) {
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
-               else
-                       rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+                       if (rth && rth->dst.expires &&
+                           time_after(jiffies, rth->dst.expires)) {
+                               ip_del_fnhe(&FIB_RES_NH(*res), daddr);
+                               fnhe = NULL;
+                       } else {
+                               goto rt_cache;
+                       }
+               }
+
+               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
 
+rt_cache:
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
@@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
                fnhe = find_exception(nh, fl4->daddr);
-               if (fnhe)
+               if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
-               else {
-                       if (unlikely(fl4->flowi4_flags &
-                                    FLOWI_FLAG_KNOWN_NH &&
-                                    !(nh->nh_gw &&
-                                      nh->nh_scope == RT_SCOPE_LINK))) {
-                               do_cache = false;
-                               goto add;
+                       rth = rcu_dereference(*prth);
+                       if (rth && rth->dst.expires &&
+                           time_after(jiffies, rth->dst.expires)) {
+                               ip_del_fnhe(nh, fl4->daddr);
+                               fnhe = NULL;
+                       } else {
+                               goto rt_cache;
                        }
-                       prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                }
+
+               if (unlikely(fl4->flowi4_flags &
+                            FLOWI_FLAG_KNOWN_NH &&
+                            !(nh->nh_gw &&
+                              nh->nh_scope == RT_SCOPE_LINK))) {
+                       do_cache = false;
+                       goto add;
+               }
+               prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                rth = rcu_dereference(*prth);
+
+rt_cache:
                if (rt_cache_valid(rth)) {
                        dst_hold(&rth->dst);
                        return rth;
@@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 }
 
 #ifdef CONFIG_SYSCTL
-static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
 static int ip_rt_gc_elasticity __read_mostly   = 8;
index fd17eec..483ffdf 100644 (file)
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <asm/unaligned.h>
 #include <net/busy_poll.h>
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -939,7 +940,7 @@ new_segment:
 
                i = skb_shinfo(skb)->nr_frags;
                can_coalesce = skb_can_coalesce(skb, i, page, offset);
-               if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+               if (!can_coalesce && i >= sysctl_max_skb_frags) {
                        tcp_mark_push(tp, skb);
                        goto new_segment;
                }
@@ -1212,7 +1213,7 @@ new_segment:
 
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
-                               if (i == MAX_SKB_FRAGS || !sg) {
+                               if (i == sysctl_max_skb_frags || !sg) {
                                        tcp_mark_push(tp, skb);
                                        goto new_segment;
                                }
@@ -2638,6 +2639,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 now = tcp_time_stamp;
        unsigned int start;
+       u64 rate64;
        u32 rate;
 
        memset(info, 0, sizeof(*info));
@@ -2703,15 +2705,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
        info->tcpi_total_retrans = tp->total_retrans;
 
        rate = READ_ONCE(sk->sk_pacing_rate);
-       info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+       rate64 = rate != ~0U ? rate : ~0ULL;
+       put_unaligned(rate64, &info->tcpi_pacing_rate);
 
        rate = READ_ONCE(sk->sk_max_pacing_rate);
-       info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+       rate64 = rate != ~0U ? rate : ~0ULL;
+       put_unaligned(rate64, &info->tcpi_max_pacing_rate);
 
        do {
                start = u64_stats_fetch_begin_irq(&tp->syncp);
-               info->tcpi_bytes_acked = tp->bytes_acked;
-               info->tcpi_bytes_received = tp->bytes_received;
+               put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+               put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
        } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
        info->tcpi_segs_out = tp->segs_out;
        info->tcpi_segs_in = tp->segs_in;
@@ -2946,7 +2950,7 @@ static void __tcp_alloc_md5sig_pool(void)
                        struct crypto_hash *hash;
 
                        hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-                       if (IS_ERR_OR_NULL(hash))
+                       if (IS_ERR(hash))
                                return;
                        per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
                }
index 0003d40..3b2c8e9 100644 (file)
@@ -2164,8 +2164,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
-       int cnt, oldcnt;
-       int err;
+       int cnt, oldcnt, lost;
        unsigned int mss;
        /* Use SACK to deduce losses of new sequences sent during recovery */
        const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
@@ -2205,9 +2204,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
                                break;
 
                        mss = tcp_skb_mss(skb);
-                       err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
-                                          mss, GFP_ATOMIC);
-                       if (err < 0)
+                       /* If needed, chop off the prefix to mark as lost. */
+                       lost = (packets - oldcnt) * mss;
+                       if (lost < skb->len &&
+                           tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
                                break;
                        cnt = packets;
                }
@@ -2366,8 +2366,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
                        tp->snd_ssthresh = tp->prior_ssthresh;
                        tcp_ecn_withdraw_cwr(tp);
                }
-       } else {
-               tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
        }
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->undo_marker = 0;
@@ -2898,7 +2896,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
 {
        const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
        struct rtt_meas *m = tcp_sk(sk)->rtt_min;
-       struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+       struct rtt_meas rttm = {
+               .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
+               .ts = now,
+       };
        u32 elapsed;
 
        /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
index 5ced3e4..487ac67 100644 (file)
@@ -311,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
 
 
 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
-void tcp_req_err(struct sock *sk, u32 seq)
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 {
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);
@@ -323,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq)
 
        if (seq != tcp_rsk(req)->snt_isn) {
                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-       } else {
+       } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
@@ -383,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
-               return tcp_req_err(sk, seq);
+               return tcp_req_err(sk, seq,
+                                 type == ICMP_PARAMETERPROB ||
+                                 type == ICMP_TIME_EXCEEDED ||
+                                 (type == ICMP_DEST_UNREACH &&
+                                  (code == ICMP_NET_UNREACH ||
+                                   code == ICMP_HOST_UNREACH)));
 
        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
@@ -707,7 +712,8 @@ release_sk1:
    outside socket context is ugly, certainly. What can I do?
  */
 
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(struct net *net,
+                           struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
@@ -722,7 +728,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                        ];
        } rep;
        struct ip_reply_arg arg;
-       struct net *net = dev_net(skb_dst(skb)->dev);
 
        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));
@@ -784,7 +789,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
-       tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+       tcp_v4_send_ack(sock_net(sk), skb,
+                       tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
@@ -803,8 +809,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
-       tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
-                       tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+       u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+                                            tcp_sk(sk)->snd_nxt;
+
+       tcp_v4_send_ack(sock_net(sk), skb, seq,
                        tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
                        tcp_time_stamp,
                        req->ts_recent,
@@ -1589,28 +1597,30 @@ process:
 
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
-               struct sock *nsk = NULL;
+               struct sock *nsk;
 
                sk = req->rsk_listener;
-               if (tcp_v4_inbound_md5_hash(sk, skb))
-                       goto discard_and_relse;
-               if (likely(sk->sk_state == TCP_LISTEN)) {
-                       nsk = tcp_check_req(sk, skb, req, false);
-               } else {
+               if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+                       reqsk_put(req);
+                       goto discard_it;
+               }
+               if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
+               sock_hold(sk);
+               nsk = tcp_check_req(sk, skb, req, false);
                if (!nsk) {
                        reqsk_put(req);
-                       goto discard_it;
+                       goto discard_and_relse;
                }
                if (nsk == sk) {
-                       sock_hold(sk);
                        reqsk_put(req);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v4_send_reset(nsk, skb);
-                       goto discard_it;
+                       goto discard_and_relse;
                } else {
+                       sock_put(sk);
                        return 0;
                }
        }
index c8cbc2b..a726d78 100644 (file)
@@ -550,7 +550,7 @@ reset:
         */
        if (crtt > tp->srtt_us) {
                /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
-               crtt /= 8 * USEC_PER_MSEC;
+               crtt /= 8 * USEC_PER_SEC / HZ;
                inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
        } else if (tp->srtt_us == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
index 75632a9..9b02af2 100644 (file)
@@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
                newtp->rcv_wup = newtp->copied_seq =
                newtp->rcv_nxt = treq->rcv_isn + 1;
-               newtp->segs_in = 0;
+               newtp->segs_in = 1;
 
                newtp->snd_sml = newtp->snd_una =
                newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -815,6 +815,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
        int ret = 0;
        int state = child->sk_state;
 
+       tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb);
                /* Wakeup parent, send SIGIO */
index dc45b53..95d2f19 100644 (file)
@@ -499,6 +499,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
        struct sock *sk, *result;
        struct hlist_nulls_node *node;
        int score, badness, matches = 0, reuseport = 0;
+       bool select_ok = true;
        u32 hash = 0;
 
 begin:
@@ -512,14 +513,18 @@ begin:
                        badness = score;
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
-                               struct sock *sk2;
                                hash = udp_ehashfn(net, daddr, hnum,
                                                   saddr, sport);
-                               sk2 = reuseport_select_sock(sk, hash, skb,
-                                                           sizeof(struct udphdr));
-                               if (sk2) {
-                                       result = sk2;
-                                       goto found;
+                               if (select_ok) {
+                                       struct sock *sk2;
+
+                                       sk2 = reuseport_select_sock(sk, hash, skb,
+                                                       sizeof(struct udphdr));
+                                       if (sk2) {
+                                               result = sk2;
+                                               select_ok = false;
+                                               goto found;
+                                       }
                                }
                                matches = 1;
                        }
@@ -563,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
        unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
        struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
        int score, badness, matches = 0, reuseport = 0;
+       bool select_ok = true;
        u32 hash = 0;
 
        rcu_read_lock();
@@ -601,14 +607,18 @@ begin:
                        badness = score;
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
-                               struct sock *sk2;
                                hash = udp_ehashfn(net, daddr, hnum,
                                                   saddr, sport);
-                               sk2 = reuseport_select_sock(sk, hash, skb,
+                               if (select_ok) {
+                                       struct sock *sk2;
+
+                                       sk2 = reuseport_select_sock(sk, hash, skb,
                                                        sizeof(struct udphdr));
-                               if (sk2) {
-                                       result = sk2;
-                                       goto found;
+                                       if (sk2) {
+                                               result = sk2;
+                                               select_ok = false;
+                                               goto found;
+                                       }
                                }
                                matches = 1;
                        }
@@ -1038,8 +1048,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        if (msg->msg_controllen) {
                err = ip_cmsg_send(sock_net(sk), msg, &ipc,
                                   sk->sk_family == AF_INET6);
-               if (err)
+               if (unlikely(err)) {
+                       kfree(ipc.opt);
                        return err;
+               }
                if (ipc.opt)
                        free = 1;
                connected = 0;
index 0ec0881..96599d1 100644 (file)
@@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
        uh->source = src_port;
        uh->len = htons(skb->len);
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        udp_set_csum(nocheck, skb, src, dst, skb->len);
 
        iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
index bb7dabe..40c8975 100644 (file)
@@ -69,6 +69,7 @@ config INET6_ESP
        select CRYPTO_CBC
        select CRYPTO_SHA1
        select CRYPTO_DES
+       select CRYPTO_ECHAINIV
        ---help---
          Support for IPsec ESP.
 
index 38eedde..bdd7eac 100644 (file)
@@ -583,7 +583,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
        if (err < 0)
                goto errout;
 
-       err = EINVAL;
+       err = -EINVAL;
        if (!tb[NETCONFA_IFINDEX])
                goto errout;
 
@@ -3538,6 +3538,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 {
        struct inet6_dev *idev = ifp->idev;
        struct net_device *dev = idev->dev;
+       bool notify = false;
 
        addrconf_join_solict(dev, &ifp->addr);
 
@@ -3583,7 +3584,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
                        /* Because optimistic nodes can use this address,
                         * notify listeners. If DAD fails, RTM_DELADDR is sent.
                         */
-                       ipv6_ifa_notify(RTM_NEWADDR, ifp);
+                       notify = true;
                }
        }
 
@@ -3591,6 +3592,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 out:
        spin_unlock(&ifp->lock);
        read_unlock_bh(&idev->lock);
+       if (notify)
+               ipv6_ifa_notify(RTM_NEWADDR, ifp);
 }
 
 static void addrconf_dad_start(struct inet6_ifaddr *ifp)
index 517c55b..4281621 100644 (file)
@@ -162,6 +162,9 @@ ipv4_connected:
        fl6.fl6_dport = inet->inet_dport;
        fl6.fl6_sport = inet->inet_sport;
 
+       if (!fl6.flowi6_oif)
+               fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
        if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
                fl6.flowi6_oif = np->mcast_oif;
 
index 5c5d23e..9508a20 100644 (file)
@@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                                                *fragoff = _frag_off;
                                        return hp->nexthdr;
                                }
-                               return -ENOENT;
+                               if (!found)
+                                       return -ENOENT;
+                               if (fragoff)
+                                       *fragoff = _frag_off;
+                               break;
                        }
                        hdrlen = 8;
                } else if (nexthdr == NEXTHDR_AUTH) {
index 1f9ebe3..dc2db4f 100644 (file)
@@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
                }
                spin_lock_bh(&ip6_sk_fl_lock);
                for (sflp = &np->ipv6_fl_list;
-                    (sfl = rcu_dereference(*sflp)) != NULL;
+                    (sfl = rcu_dereference_protected(*sflp,
+                                                     lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
                     sflp = &sfl->next) {
                        if (sfl->fl->label == freq.flr_label) {
                                if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
                                        np->flow_label &= ~IPV6_FLOWLABEL_MASK;
-                               *sflp = rcu_dereference(sfl->next);
+                               *sflp = sfl->next;
                                spin_unlock_bh(&ip6_sk_fl_lock);
                                fl_release(sfl->fl);
                                kfree_rcu(sfl, rcu);
index f37f18b..c0d4dc1 100644 (file)
@@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
        __u32 mtu;
        int err;
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
                encap_limit = t->parms.encap_limit;
 
@@ -1512,6 +1514,7 @@ static void ip6gre_tap_setup(struct net_device *dev)
        dev->destructor = ip6gre_dev_free;
 
        dev->features |= NETIF_F_NETNS_LOCAL;
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 }
 
 static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
index 23de98f..a163102 100644 (file)
@@ -909,6 +909,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
        struct rt6_info *rt;
 #endif
        int err;
+       int flags = 0;
 
        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
@@ -940,10 +941,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                        dst_release(*dst);
                        *dst = NULL;
                }
+
+               if (fl6->flowi6_oif)
+                       flags |= RT6_LOOKUP_F_IFACE;
        }
 
        if (!*dst)
-               *dst = ip6_route_output(net, sk, fl6);
+               *dst = ip6_route_output_flags(net, sk, fl6, flags);
 
        err = (*dst)->error;
        if (err)
index 137fca4..6c5dfec 100644 (file)
@@ -1180,6 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
        u8 tproto;
        int err;
 
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
        tproto = ACCESS_ONCE(t->parms.proto);
        if (tproto != IPPROTO_IPIP && tproto != 0)
                return -1;
index 5ee56d0..d64ee7e 100644 (file)
@@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
                return NULL;
 
        skb->priority = TC_PRIO_CONTROL;
-       skb->reserved_tailroom = skb_end_offset(skb) -
-                                min(mtu, skb_end_offset(skb));
        skb_reserve(skb, hlen);
+       skb_tailroom_reserve(skb, mtu, tlen);
 
        if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
                /* <draft-ietf-magma-mld-source-05.txt>:
index 31ba7ca..051b6a6 100644 (file)
 #include <net/ipv6.h>
 #include <net/netfilter/ipv6/nf_nat_masquerade.h>
 
+#define MAX_WORK_COUNT 16
+
+static atomic_t v6_worker_count;
+
 unsigned int
 nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
                       const struct net_device *out)
@@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = {
        .notifier_call  = masq_device_event,
 };
 
+struct masq_dev_work {
+       struct work_struct work;
+       struct net *net;
+       int ifindex;
+};
+
+static void iterate_cleanup_work(struct work_struct *work)
+{
+       struct masq_dev_work *w;
+       long index;
+
+       w = container_of(work, struct masq_dev_work, work);
+
+       index = w->ifindex;
+       nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0);
+
+       put_net(w->net);
+       kfree(w);
+       atomic_dec(&v6_worker_count);
+       module_put(THIS_MODULE);
+}
+
+/* ipv6 inet notifier is an atomic notifier, i.e. we cannot
+ * schedule.
+ *
+ * Unfortunately, nf_ct_iterate_cleanup can run for a long
+ * time if there are lots of conntracks and the system
+ * handles high softirq load, so it frequently calls cond_resched
+ * while iterating the conntrack table.
+ *
+ * So we defer nf_ct_iterate_cleanup walk to the system workqueue.
+ *
+ * As we can have 'a lot' of inet_events (depending on amount
+ * of ipv6 addresses being deleted), we also need to add an upper
+ * limit to the number of queued work items.
+ */
 static int masq_inet_event(struct notifier_block *this,
                           unsigned long event, void *ptr)
 {
        struct inet6_ifaddr *ifa = ptr;
-       struct netdev_notifier_info info;
+       const struct net_device *dev;
+       struct masq_dev_work *w;
+       struct net *net;
+
+       if (event != NETDEV_DOWN ||
+           atomic_read(&v6_worker_count) >= MAX_WORK_COUNT)
+               return NOTIFY_DONE;
+
+       dev = ifa->idev->dev;
+       net = maybe_get_net(dev_net(dev));
+       if (!net)
+               return NOTIFY_DONE;
 
-       netdev_notifier_info_init(&info, ifa->idev->dev);
-       return masq_device_event(this, event, &info);
+       if (!try_module_get(THIS_MODULE))
+               goto err_module;
+
+       w = kmalloc(sizeof(*w), GFP_ATOMIC);
+       if (w) {
+               atomic_inc(&v6_worker_count);
+
+               INIT_WORK(&w->work, iterate_cleanup_work);
+               w->ifindex = dev->ifindex;
+               w->net = net;
+               schedule_work(&w->work);
+
+               return NOTIFY_DONE;
+       }
+
+       module_put(THIS_MODULE);
+ err_module:
+       put_net(net);
+       return NOTIFY_DONE;
 }
 
 static struct notifier_block masq_inet_notifier = {
index 3c8834b..ed44663 100644 (file)
@@ -1183,11 +1183,10 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 }
 
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
-                                   struct flowi6 *fl6)
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+                                        struct flowi6 *fl6, int flags)
 {
        struct dst_entry *dst;
-       int flags = 0;
        bool any_src;
 
        dst = l3mdev_rt6_dst_by_oif(net, fl6);
@@ -1208,7 +1207,7 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
 
        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 }
-EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
 
 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
index e794ef6..2066d1c 100644 (file)
@@ -201,14 +201,14 @@ static int ipip6_tunnel_create(struct net_device *dev)
        if ((__force u16)t->parms.i_flags & SIT_ISATAP)
                dev->priv_flags |= IFF_ISATAP;
 
+       dev->rtnl_link_ops = &sit_link_ops;
+
        err = register_netdevice(dev);
        if (err < 0)
                goto out;
 
        ipip6_tunnel_clone_6rd(dev, sitn);
 
-       dev->rtnl_link_ops = &sit_link_ops;
-
        dev_hold(dev);
 
        ipip6_tunnel_link(sitn, t);
index 006396e..5c8c842 100644 (file)
@@ -327,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        struct tcp_sock *tp;
        __u32 seq, snd_una;
        struct sock *sk;
+       bool fatal;
        int err;
 
        sk = __inet6_lookup_established(net, &tcp_hashinfo,
@@ -345,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                return;
        }
        seq = ntohl(th->seq);
+       fatal = icmpv6_err_convert(type, code, &err);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
-               return tcp_req_err(sk, seq);
+               return tcp_req_err(sk, seq, fatal);
 
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -400,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                goto out;
        }
 
-       icmpv6_err_convert(type, code, &err);
 
        /* Might be for an request_sock */
        switch (sk->sk_state) {
@@ -1386,7 +1387,7 @@ process:
 
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
-               struct sock *nsk = NULL;
+               struct sock *nsk;
 
                sk = req->rsk_listener;
                tcp_v6_fill_cb(skb, hdr, th);
@@ -1394,24 +1395,24 @@ process:
                        reqsk_put(req);
                        goto discard_it;
                }
-               if (likely(sk->sk_state == TCP_LISTEN)) {
-                       nsk = tcp_check_req(sk, skb, req, false);
-               } else {
+               if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
+               sock_hold(sk);
+               nsk = tcp_check_req(sk, skb, req, false);
                if (!nsk) {
                        reqsk_put(req);
-                       goto discard_it;
+                       goto discard_and_relse;
                }
                if (nsk == sk) {
-                       sock_hold(sk);
                        reqsk_put(req);
                        tcp_v6_restore_cb(skb);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v6_send_reset(nsk, skb);
-                       goto discard_it;
+                       goto discard_and_relse;
                } else {
+                       sock_put(sk);
                        return 0;
                }
        }
index 5d2c2af..422dd01 100644 (file)
@@ -257,6 +257,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
        struct sock *sk, *result;
        struct hlist_nulls_node *node;
        int score, badness, matches = 0, reuseport = 0;
+       bool select_ok = true;
        u32 hash = 0;
 
 begin:
@@ -270,14 +271,18 @@ begin:
                        badness = score;
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
-                               struct sock *sk2;
                                hash = udp6_ehashfn(net, daddr, hnum,
                                                    saddr, sport);
-                               sk2 = reuseport_select_sock(sk, hash, skb,
-                                                           sizeof(struct udphdr));
-                               if (sk2) {
-                                       result = sk2;
-                                       goto found;
+                               if (select_ok) {
+                                       struct sock *sk2;
+
+                                       sk2 = reuseport_select_sock(sk, hash, skb,
+                                                       sizeof(struct udphdr));
+                                       if (sk2) {
+                                               result = sk2;
+                                               select_ok = false;
+                                               goto found;
+                                       }
                                }
                                matches = 1;
                        }
@@ -321,6 +326,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
        unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
        struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
        int score, badness, matches = 0, reuseport = 0;
+       bool select_ok = true;
        u32 hash = 0;
 
        rcu_read_lock();
@@ -358,14 +364,18 @@ begin:
                        badness = score;
                        reuseport = sk->sk_reuseport;
                        if (reuseport) {
-                               struct sock *sk2;
                                hash = udp6_ehashfn(net, daddr, hnum,
                                                    saddr, sport);
-                               sk2 = reuseport_select_sock(sk, hash, skb,
+                               if (select_ok) {
+                                       struct sock *sk2;
+
+                                       sk2 = reuseport_select_sock(sk, hash, skb,
                                                        sizeof(struct udphdr));
-                               if (sk2) {
-                                       result = sk2;
-                                       goto found;
+                                       if (sk2) {
+                                               result = sk2;
+                                               select_ok = false;
+                                               goto found;
+                                       }
                                }
                                matches = 1;
                        }
@@ -952,11 +962,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                ret = udpv6_queue_rcv_skb(sk, skb);
                sock_put(sk);
 
-               /* a return value > 0 means to resubmit the input, but
-                * it wants the return to be -protocol, or 0
-                */
+               /* a return value > 0 means to resubmit the input */
                if (ret > 0)
-                       return -ret;
+                       return ret;
 
                return 0;
        }
index 3c4caa6..5728e76 100644 (file)
@@ -134,11 +134,10 @@ int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush)
                return -1;
        }
        skb_put(skb, count);
+       pr_debug("%s(), skb->len=%d\n", __func__, skb->len);
 
        spin_unlock_irqrestore(&self->spinlock, flags);
 
-       pr_debug("%s(), skb->len=%d\n", __func__ , skb->len);
-
        if (flush) {
                /* ircomm_tty_do_softint will take care of the rest */
                schedule_work(&self->tqueue);
index ef50a94..fc3598a 100644 (file)
@@ -708,6 +708,9 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
        if (!addr || addr->sa_family != AF_IUCV)
                return -EINVAL;
 
+       if (addr_len < sizeof(struct sockaddr_iucv))
+               return -EINVAL;
+
        lock_sock(sk);
        if (sk->sk_state != IUCV_OPEN) {
                err = -EBADFD;
index f93c5be..2caaa84 100644 (file)
@@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family,
        ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq,
                                  NLM_F_ACK, tunnel, cmd);
 
-       if (ret >= 0)
-               return genlmsg_multicast_allns(family, msg, 0,  0, GFP_ATOMIC);
+       if (ret >= 0) {
+               ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+               /* We don't care if no one is listening */
+               if (ret == -ESRCH)
+                       ret = 0;
+               return ret;
+       }
 
        nlmsg_free(msg);
 
@@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family,
        ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
                                   NLM_F_ACK, session, cmd);
 
-       if (ret >= 0)
-               return genlmsg_multicast_allns(family, msg, 0,  0, GFP_ATOMIC);
+       if (ret >= 0) {
+               ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+               /* We don't care if no one is listening */
+               if (ret == -ESRCH)
+                       ret = 0;
+               return ret;
+       }
 
        nlmsg_free(msg);
 
index 10ad4ac..367784b 100644 (file)
@@ -291,7 +291,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
        }
 
        /* prepare A-MPDU MLME for Rx aggregation */
-       tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+       tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
        if (!tid_agg_rx)
                goto end;
 
index f7fc0e0..978d3bc 100644 (file)
@@ -1733,7 +1733,6 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
                if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
                        continue;
                sdata->u.ibss.last_scan_completed = jiffies;
-               ieee80211_queue_work(&local->hw, &sdata->work);
        }
        mutex_unlock(&local->iflist_mtx);
 }
index b84f6aa..f006f4a 100644 (file)
@@ -92,7 +92,7 @@ struct ieee80211_fragment_entry {
        u16 extra_len;
        u16 last_frag;
        u8 rx_queue;
-       bool ccmp; /* Whether fragments were encrypted with CCMP */
+       bool check_sequential_pn; /* needed for CCMP/GCMP */
        u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
 };
 
index 6bcf0fa..8190bf2 100644 (file)
@@ -248,6 +248,7 @@ static void ieee80211_restart_work(struct work_struct *work)
 
        /* wait for scan work complete */
        flush_workqueue(local->workqueue);
+       flush_work(&local->sched_scan_stopped_work);
 
        WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
             "%s called with hardware scan in progress\n", __func__);
@@ -256,6 +257,11 @@ static void ieee80211_restart_work(struct work_struct *work)
        list_for_each_entry(sdata, &local->interfaces, list)
                flush_delayed_work(&sdata->dec_tailroom_needed_wk);
        ieee80211_scan_cancel(local);
+
+       /* make sure any new ROC will consider local->in_reconfig */
+       flush_delayed_work(&local->roc_work);
+       flush_work(&local->hw_roc_done);
+
        ieee80211_reconfig(local);
        rtnl_unlock();
 }
index fa28500..6f85b6a 100644 (file)
@@ -1370,17 +1370,6 @@ out:
        sdata_unlock(sdata);
 }
 
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
-{
-       struct ieee80211_sub_if_data *sdata;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(sdata, &local->interfaces, list)
-               if (ieee80211_vif_is_mesh(&sdata->vif) &&
-                   ieee80211_sdata_running(sdata))
-                       ieee80211_queue_work(&local->hw, &sdata->work);
-       rcu_read_unlock();
-}
 
 void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 {
index a159634..4a8019f 100644 (file)
@@ -362,14 +362,10 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
        return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
 }
 
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
-
 void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
 void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
 void ieee80211s_stop(void);
 #else
-static inline void
-ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
 static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
 { return false; }
 static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
index 1c342e2..bfbb1ac 100644 (file)
@@ -4005,8 +4005,6 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
                if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
                        ieee80211_queue_work(&sdata->local->hw,
                                             &sdata->u.mgd.monitor_work);
-               /* and do all the other regular work too */
-               ieee80211_queue_work(&sdata->local->hw, &sdata->work);
        }
 }
 
index 8b2f4ea..55a9c5b 100644 (file)
@@ -252,14 +252,11 @@ static bool ieee80211_recalc_sw_work(struct ieee80211_local *local,
 static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
                                         unsigned long start_time)
 {
-       struct ieee80211_local *local = roc->sdata->local;
-
        if (WARN_ON(roc->notified))
                return;
 
        roc->start_time = start_time;
        roc->started = true;
-       roc->hw_begun = true;
 
        if (roc->mgmt_tx_cookie) {
                if (!WARN_ON(!roc->frame)) {
@@ -274,9 +271,6 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
        }
 
        roc->notified = true;
-
-       if (!local->ops->remain_on_channel)
-               ieee80211_recalc_sw_work(local, start_time);
 }
 
 static void ieee80211_hw_roc_start(struct work_struct *work)
@@ -291,6 +285,7 @@ static void ieee80211_hw_roc_start(struct work_struct *work)
                if (!roc->started)
                        break;
 
+               roc->hw_begun = true;
                ieee80211_handle_roc_started(roc, local->hw_roc_start_time);
        }
 
@@ -413,6 +408,10 @@ void ieee80211_start_next_roc(struct ieee80211_local *local)
                return;
        }
 
+       /* defer roc if driver is not started (i.e. during reconfig) */
+       if (local->in_reconfig)
+               return;
+
        roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
                               list);
 
@@ -534,8 +533,10 @@ ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local,
         * begin, otherwise they'll both be marked properly by the work
         * struct that runs once the driver notifies us of the beginning
         */
-       if (cur_roc->hw_begun)
+       if (cur_roc->hw_begun) {
+               new_roc->hw_begun = true;
                ieee80211_handle_roc_started(new_roc, now);
+       }
 
        return true;
 }
@@ -658,6 +659,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local,
                        queued = true;
                        roc->on_channel = tmp->on_channel;
                        ieee80211_handle_roc_started(roc, now);
+                       ieee80211_recalc_sw_work(local, now);
                        break;
                }
 
index 3ece7d1..b54f398 100644 (file)
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
         * computing cur_tp
         */
        tmp_mrs = &mi->r[idx].stats;
-       tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+       tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
        tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
 
        return tmp_cur_tp;
index 3928dbd..370d677 100644 (file)
@@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
            (max_tp_group != MINSTREL_CCK_GROUP))
                return;
 
+       max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
+       max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+       max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+
        if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
                cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
                                                    mrs->prob_ewma);
                if (cur_tp_avg > tmp_tp_avg)
                        mi->max_prob_rate = index;
 
-               max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
-               max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
-               max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
                max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group,
                                                        max_gpr_idx,
                                                        max_gpr_prob);
@@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
        } else {
                if (mrs->prob_ewma > tmp_prob)
                        mi->max_prob_rate = index;
-               if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma)
+               if (mrs->prob_ewma > max_gpr_prob)
                        mg->max_group_prob_rate = index;
        }
 }
@@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
        if (likely(sta->ampdu_mlme.tid_tx[tid]))
                return;
 
-       ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+       ieee80211_start_tx_ba_session(pubsta, tid, 0);
 }
 
 static void
@@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
         *  - if station is in dynamic SMPS (and streams > 1)
         *  - for fallback rates, to increase chances of getting through
         */
-       if (offset > 0 &&
+       if (offset > 0 ||
            (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC &&
             group->streams > 1)) {
                ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
@@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
        prob = mi->groups[i].rates[j].prob_ewma;
 
        /* convert tp_avg from pkt per second in kbps */
-       tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+       tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+       tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
 
        return tp_avg;
 }
index bc08185..60d093f 100644 (file)
@@ -1753,7 +1753,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
        entry->seq = seq;
        entry->rx_queue = rx_queue;
        entry->last_frag = frag;
-       entry->ccmp = 0;
+       entry->check_sequential_pn = false;
        entry->extra_len = 0;
 
        return entry;
@@ -1849,15 +1849,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
                                                 rx->seqno_idx, &(rx->skb));
                if (rx->key &&
                    (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
-                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
                    ieee80211_has_protected(fc)) {
                        int queue = rx->security_idx;
-                       /* Store CCMP PN so that we can verify that the next
-                        * fragment has a sequential PN value. */
-                       entry->ccmp = 1;
+
+                       /* Store CCMP/GCMP PN so that we can verify that the
+                        * next fragment has a sequential PN value.
+                        */
+                       entry->check_sequential_pn = true;
                        memcpy(entry->last_pn,
                               rx->key->u.ccmp.rx_pn[queue],
                               IEEE80211_CCMP_PN_LEN);
+                       BUILD_BUG_ON(offsetof(struct ieee80211_key,
+                                             u.ccmp.rx_pn) !=
+                                    offsetof(struct ieee80211_key,
+                                             u.gcmp.rx_pn));
+                       BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+                                    sizeof(rx->key->u.gcmp.rx_pn[queue]));
+                       BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+                                    IEEE80211_GCMP_PN_LEN);
                }
                return RX_QUEUED;
        }
@@ -1872,15 +1884,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
                return RX_DROP_MONITOR;
        }
 
-       /* Verify that MPDUs within one MSDU have sequential PN values.
-        * (IEEE 802.11i, 8.3.3.4.5) */
-       if (entry->ccmp) {
+       /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+        *  MPDU PN values are not incrementing in steps of 1."
+        * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+        * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+        */
+       if (entry->check_sequential_pn) {
                int i;
                u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
                int queue;
+
                if (!rx->key ||
                    (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
-                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+                    rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
                        return RX_DROP_UNUSABLE;
                memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
                for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -3366,6 +3384,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
                                return false;
                        /* ignore action frames to TDLS-peers */
                        if (ieee80211_is_action(hdr->frame_control) &&
+                           !is_broadcast_ether_addr(bssid) &&
                            !ether_addr_equal(bssid, hdr->addr1))
                                return false;
                }
index a413e52..ae980ce 100644 (file)
@@ -314,6 +314,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
        bool was_scanning = local->scanning;
        struct cfg80211_scan_request *scan_req;
        struct ieee80211_sub_if_data *scan_sdata;
+       struct ieee80211_sub_if_data *sdata;
 
        lockdep_assert_held(&local->mtx);
 
@@ -373,7 +374,16 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 
        ieee80211_mlme_notify_scan_completed(local);
        ieee80211_ibss_notify_scan_completed(local);
-       ieee80211_mesh_notify_scan_completed(local);
+
+       /* Requeue all the work that might have been ignored while
+        * the scan was in progress; if there was none this will
+        * just be a no-op for the particular interface.
+        */
+       list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+               if (ieee80211_sdata_running(sdata))
+                       ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+       }
+
        if (was_scanning)
                ieee80211_start_next_roc(local);
 }
@@ -1213,6 +1223,14 @@ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw)
 
        trace_api_sched_scan_stopped(local);
 
+       /*
+        * this shouldn't really happen, so for simplicity
+        * simply ignore it, and let mac80211 reconfigure
+        * the sched scan later on.
+        */
+       if (local->in_reconfig)
+               return;
+
        schedule_work(&local->sched_scan_stopped_work);
 }
 EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
index 4402ad5..a4a4f89 100644 (file)
@@ -1453,7 +1453,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
 
        more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids);
 
-       if (reason == IEEE80211_FRAME_RELEASE_PSPOLL)
+       if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL)
                driver_release_tids =
                        BIT(find_highest_prio_tid(driver_release_tids));
 
index 5bad05e..6101deb 100644 (file)
@@ -51,6 +51,11 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
        struct ieee80211_hdr *hdr = (void *)skb->data;
        int ac;
 
+       if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) {
+               ieee80211_free_txskb(&local->hw, skb);
+               return;
+       }
+
        /*
         * This skb 'survived' a round-trip through the driver, and
         * hopefully the driver didn't mangle it too badly. However,
index 3943d4b..58f58bd 100644 (file)
@@ -2043,16 +2043,26 @@ int ieee80211_reconfig(struct ieee80211_local *local)
                 */
                if (sched_scan_req->n_scan_plans > 1 ||
                    __ieee80211_request_sched_scan_start(sched_scan_sdata,
-                                                        sched_scan_req))
+                                                        sched_scan_req)) {
+                       RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
+                       RCU_INIT_POINTER(local->sched_scan_req, NULL);
                        sched_scan_stopped = true;
+               }
        mutex_unlock(&local->mtx);
 
        if (sched_scan_stopped)
                cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
 
  wake_up:
-       local->in_reconfig = false;
-       barrier();
+       if (local->in_reconfig) {
+               local->in_reconfig = false;
+               barrier();
+
+               /* Restart deferred ROCs */
+               mutex_lock(&local->mtx);
+               ieee80211_start_next_roc(local);
+               mutex_unlock(&local->mtx);
+       }
 
        if (local->monitors == local->open_count && local->monitors > 0)
                ieee80211_add_virtual_monitor(local);
index 8c067e6..95e757c 100644 (file)
@@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE
        depends on IPV6 || IPV6=n
        depends on !NF_CONNTRACK || NF_CONNTRACK
        select NF_DUP_IPV4
-       select NF_DUP_IPV6 if IP6_NF_IPTABLES != n
+       select NF_DUP_IPV6 if IPV6
        ---help---
        This option adds a "TEE" target with which a packet can be cloned and
        this clone be rerouted to another nexthop.
index 43d8c98..f0f688d 100644 (file)
@@ -164,8 +164,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
        };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-       if (e.cidr == 0)
-               return -EINVAL;
        if (adt == IPSET_TEST)
                e.cidr = HOST_MASK;
 
@@ -377,8 +375,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
        };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-       if (e.cidr == 0)
-               return -EINVAL;
        if (adt == IPSET_TEST)
                e.cidr = HOST_MASK;
 
index 3cb3cb8..f60b4fd 100644 (file)
@@ -66,6 +66,21 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly bool nf_conntrack_locks_all;
+
+void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
+{
+       spin_lock(lock);
+       while (unlikely(nf_conntrack_locks_all)) {
+               spin_unlock(lock);
+               spin_lock(&nf_conntrack_locks_all_lock);
+               spin_unlock(&nf_conntrack_locks_all_lock);
+               spin_lock(lock);
+       }
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+
 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 {
        h1 %= CONNTRACK_LOCKS;
@@ -82,12 +97,12 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
        h1 %= CONNTRACK_LOCKS;
        h2 %= CONNTRACK_LOCKS;
        if (h1 <= h2) {
-               spin_lock(&nf_conntrack_locks[h1]);
+               nf_conntrack_lock(&nf_conntrack_locks[h1]);
                if (h1 != h2)
                        spin_lock_nested(&nf_conntrack_locks[h2],
                                         SINGLE_DEPTH_NESTING);
        } else {
-               spin_lock(&nf_conntrack_locks[h2]);
+               nf_conntrack_lock(&nf_conntrack_locks[h2]);
                spin_lock_nested(&nf_conntrack_locks[h1],
                                 SINGLE_DEPTH_NESTING);
        }
@@ -102,16 +117,19 @@ static void nf_conntrack_all_lock(void)
 {
        int i;
 
-       for (i = 0; i < CONNTRACK_LOCKS; i++)
-               spin_lock_nested(&nf_conntrack_locks[i], i);
+       spin_lock(&nf_conntrack_locks_all_lock);
+       nf_conntrack_locks_all = true;
+
+       for (i = 0; i < CONNTRACK_LOCKS; i++) {
+               spin_lock(&nf_conntrack_locks[i]);
+               spin_unlock(&nf_conntrack_locks[i]);
+       }
 }
 
 static void nf_conntrack_all_unlock(void)
 {
-       int i;
-
-       for (i = 0; i < CONNTRACK_LOCKS; i++)
-               spin_unlock(&nf_conntrack_locks[i]);
+       nf_conntrack_locks_all = false;
+       spin_unlock(&nf_conntrack_locks_all_lock);
 }
 
 unsigned int nf_conntrack_htable_size __read_mostly;
@@ -757,7 +775,7 @@ restart:
        hash = hash_bucket(_hash, net);
        for (; i < net->ct.htable_size; i++) {
                lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
-               spin_lock(lockp);
+               nf_conntrack_lock(lockp);
                if (read_seqcount_retry(&net->ct.generation, sequence)) {
                        spin_unlock(lockp);
                        goto restart;
@@ -1382,7 +1400,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
        for (; *bucket < net->ct.htable_size; (*bucket)++) {
                lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
                local_bh_disable();
-               spin_lock(lockp);
+               nf_conntrack_lock(lockp);
                if (*bucket < net->ct.htable_size) {
                        hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
                                if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -1394,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
                }
                spin_unlock(lockp);
                local_bh_enable();
+               cond_resched();
        }
 
        for_each_possible_cpu(cpu) {
@@ -1406,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
                                set_bit(IPS_DYING_BIT, &ct->status);
                }
                spin_unlock_bh(&pcpu->lock);
+               cond_resched();
        }
        return NULL;
 found:
@@ -1422,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net,
        struct nf_conn *ct;
        unsigned int bucket = 0;
 
+       might_sleep();
+
        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
                /* Time to push up daises... */
                if (del_timer(&ct->timeout))
@@ -1430,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net,
                /* ... else the timer will get him soon. */
 
                nf_ct_put(ct);
+               cond_resched();
        }
 }
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
index bd9d315..3b40ec5 100644 (file)
@@ -425,7 +425,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
        }
        local_bh_disable();
        for (i = 0; i < net->ct.htable_size; i++) {
-               spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+               nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
                if (i < net->ct.htable_size) {
                        hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
                                unhelp(h, me);
index dbb1bb3..355e855 100644 (file)
@@ -840,7 +840,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
                lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
-               spin_lock(lockp);
+               nf_conntrack_lock(lockp);
                if (cb->args[0] >= net->ct.htable_size) {
                        spin_unlock(lockp);
                        goto out;
index b6605e0..5eefe4a 100644 (file)
@@ -224,12 +224,12 @@ static int __init nf_tables_netdev_init(void)
 
        nft_register_chain_type(&nft_filter_chain_netdev);
        ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
-       if (ret < 0)
+       if (ret < 0) {
                nft_unregister_chain_type(&nft_filter_chain_netdev);
-
+               return ret;
+       }
        register_netdevice_notifier(&nf_tables_netdev_notifier);
-
-       return ret;
+       return 0;
 }
 
 static void __exit nf_tables_netdev_exit(void)
index a7ba233..857ae89 100644 (file)
@@ -311,14 +311,14 @@ replay:
 #endif
                {
                        nfnl_unlock(subsys_id);
-                       netlink_ack(skb, nlh, -EOPNOTSUPP);
+                       netlink_ack(oskb, nlh, -EOPNOTSUPP);
                        return kfree_skb(skb);
                }
        }
 
        if (!ss->commit || !ss->abort) {
                nfnl_unlock(subsys_id);
-               netlink_ack(skb, nlh, -EOPNOTSUPP);
+               netlink_ack(oskb, nlh, -EOPNOTSUPP);
                return kfree_skb(skb);
        }
 
@@ -328,10 +328,12 @@ replay:
                nlh = nlmsg_hdr(skb);
                err = 0;
 
-               if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
-                   skb->len < nlh->nlmsg_len) {
-                       err = -EINVAL;
-                       goto ack;
+               if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+                   skb->len < nlh->nlmsg_len ||
+                   nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+                       nfnl_err_reset(&err_list);
+                       status |= NFNL_BATCH_FAILURE;
+                       goto done;
                }
 
                /* Only requests are handled by the kernel */
@@ -406,7 +408,7 @@ ack:
                                 * pointing to the batch header.
                                 */
                                nfnl_err_reset(&err_list);
-                               netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
+                               netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM);
                                status |= NFNL_BATCH_FAILURE;
                                goto done;
                        }
index 5d010f2..2671b9d 100644 (file)
@@ -307,7 +307,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
 
        local_bh_disable();
        for (i = 0; i < net->ct.htable_size; i++) {
-               spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+               nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
                if (i < net->ct.htable_size) {
                        hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
                                untimeout(h, timeout);
index 383c171..b78c28b 100644 (file)
@@ -46,16 +46,14 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
                switch (priv->op) {
                case NFT_BYTEORDER_NTOH:
                        for (i = 0; i < priv->len / 8; i++) {
-                               src64 = get_unaligned_be64(&src[i]);
-                               src64 = be64_to_cpu((__force __be64)src64);
+                               src64 = get_unaligned((u64 *)&src[i]);
                                put_unaligned_be64(src64, &dst[i]);
                        }
                        break;
                case NFT_BYTEORDER_HTON:
                        for (i = 0; i < priv->len / 8; i++) {
                                src64 = get_unaligned_be64(&src[i]);
-                               src64 = (__force u64)cpu_to_be64(src64);
-                               put_unaligned_be64(src64, &dst[i]);
+                               put_unaligned(src64, (u64 *)&dst[i]);
                        }
                        break;
                }
index c7808fc..c9743f7 100644 (file)
@@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx,
 
        cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
        if (cpu_stats == NULL)
-               return ENOMEM;
+               return -ENOMEM;
 
        preempt_disable();
        this_cpu = this_cpu_ptr(cpu_stats);
@@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
        cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
                                              GFP_ATOMIC);
        if (cpu_stats == NULL)
-               return ENOMEM;
+               return -ENOMEM;
 
        preempt_disable();
        this_cpu = this_cpu_ptr(cpu_stats);
index a0eb216..d4a4619 100644 (file)
@@ -127,6 +127,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
                               NF_CT_LABELS_MAX_SIZE - size);
                return;
        }
+#endif
        case NFT_CT_BYTES: /* fallthrough */
        case NFT_CT_PKTS: {
                const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
@@ -138,7 +139,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
                memcpy(dest, &count, sizeof(count));
                return;
        }
-#endif
        default:
                break;
        }
index b7c43de..e118397 100644 (file)
@@ -228,7 +228,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
        struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        u8 nexthdr;
-       __be16 frag_off;
+       __be16 frag_off, oldlen, newlen;
        int tcphoff;
        int ret;
 
@@ -244,7 +244,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
                return NF_DROP;
        if (ret > 0) {
                ipv6h = ipv6_hdr(skb);
-               ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret);
+               oldlen = ipv6h->payload_len;
+               newlen = htons(ntohs(oldlen) + ret);
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->csum = csum_add(csum_sub(skb->csum, oldlen),
+                                            newlen);
+               ipv6h->payload_len = newlen;
        }
        return XT_CONTINUE;
 }
index 3eff7b6..6e57a39 100644 (file)
@@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
        return XT_CONTINUE;
 }
 
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 static unsigned int
 tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
                .destroy    = tee_tg_destroy,
                .me         = THIS_MODULE,
        },
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
        {
                .name       = "TEE",
                .revision   = 1,
index 81dc1bb..f1ffb34 100644 (file)
@@ -2831,7 +2831,8 @@ static int netlink_dump(struct sock *sk)
         * reasonable static buffer based on the expected largest dump of a
         * single netdev. The outcome is MSG_TRUNC error.
         */
-       skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+       if (!netlink_rx_is_mmaped(sk))
+               skb_reserve(skb, skb_tailroom(skb) - alloc_size);
        netlink_skb_set_owner_r(skb, sk);
 
        len = cb->dump(skb, cb);
index 1605691..5eb7694 100644 (file)
@@ -90,7 +90,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
        int err;
        struct vxlan_config conf = {
                .no_share = true,
-               .flags = VXLAN_F_COLLECT_METADATA,
+               .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX,
+               /* Don't restrict the packets that can be sent by MTU */
+               .mtu = IP_MAX_MTU,
        };
 
        if (!options) {
index f53bf3b..cf5b69a 100644 (file)
@@ -1095,17 +1095,6 @@ static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait)
        return res;
 }
 
-static bool rfkill_readable(struct rfkill_data *data)
-{
-       bool r;
-
-       mutex_lock(&data->mtx);
-       r = !list_empty(&data->events);
-       mutex_unlock(&data->mtx);
-
-       return r;
-}
-
 static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
                               size_t count, loff_t *pos)
 {
@@ -1122,8 +1111,11 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
                        goto out;
                }
                mutex_unlock(&data->mtx);
+               /* since we re-check and it just compares pointers,
+                * using !list_empty() without locking isn't a problem
+                */
                ret = wait_event_interruptible(data->read_wait,
-                                              rfkill_readable(data));
+                                              !list_empty(&data->events));
                mutex_lock(&data->mtx);
 
                if (ret)
index d058696..6b70399 100644 (file)
@@ -62,6 +62,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
        struct xt_tgdtor_param par = {
                .target   = t->u.kernel.target,
                .targinfo = t->data,
+               .family   = NFPROTO_IPV4,
        };
        if (par.target->destroy != NULL)
                par.target->destroy(&par);
@@ -195,6 +196,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
        par.hooknum  = ipt->tcfi_hook;
        par.target   = ipt->tcfi_t->u.kernel.target;
        par.targinfo = ipt->tcfi_t->data;
+       par.family   = NFPROTO_IPV4;
        ret = par.target->target(skb, &par);
 
        switch (ret) {
index b5c2cf2..af1acf0 100644 (file)
@@ -1852,6 +1852,7 @@ reset:
        }
 
        tp = old_tp;
+       protocol = tc_skb_protocol(skb);
        goto reclassify;
 #endif
 }
index f26bdea..a1cd778 100644 (file)
@@ -403,6 +403,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
                if (len <= cl->deficit) {
                        cl->deficit -= len;
                        skb = qdisc_dequeue_peeked(cl->qdisc);
+                       if (unlikely(skb == NULL))
+                               goto out;
                        if (cl->qdisc->q.qlen == 0)
                                list_del(&cl->alist);
 
index bf61dfb..49d2cc7 100644 (file)
@@ -935,15 +935,22 @@ static struct sctp_association *__sctp_lookup_association(
                                        struct sctp_transport **pt)
 {
        struct sctp_transport *t;
+       struct sctp_association *asoc = NULL;
 
+       rcu_read_lock();
        t = sctp_addrs_lookup_transport(net, local, peer);
-       if (!t || t->dead)
-               return NULL;
+       if (!t || !sctp_transport_hold(t))
+               goto out;
 
-       sctp_association_hold(t->asoc);
+       asoc = t->asoc;
+       sctp_association_hold(asoc);
        *pt = t;
 
-       return t->asoc;
+       sctp_transport_put(t);
+
+out:
+       rcu_read_unlock();
+       return asoc;
 }
 
 /* Look up an association. protected by RCU read lock */
@@ -955,9 +962,7 @@ struct sctp_association *sctp_lookup_association(struct net *net,
 {
        struct sctp_association *asoc;
 
-       rcu_read_lock();
        asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
-       rcu_read_unlock();
 
        return asoc;
 }
index ec52912..ce46f1c 100644 (file)
@@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
                }
                return 0;
        }
+       if (addr1->v6.sin6_port != addr2->v6.sin6_port)
+               return 0;
        if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
                return 0;
        /* If this is a linklocal address, compare the scope_id. */
index 684c5b3..963dffc 100644 (file)
@@ -165,8 +165,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
        list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list,
                        transports) {
                addr = &transport->ipaddr;
-               if (transport->dead)
-                       continue;
 
                af = sctp_get_af_specific(addr->sa.sa_family);
                if (af->cmp_addr(addr, primary)) {
@@ -380,6 +378,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
        }
 
        transport = (struct sctp_transport *)v;
+       if (!sctp_transport_hold(transport))
+               return 0;
        assoc = transport->asoc;
        epb = &assoc->base;
        sk = epb->sk;
@@ -412,6 +412,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
                sk->sk_rcvbuf);
        seq_printf(seq, "\n");
 
+       sctp_transport_put(transport);
+
        return 0;
 }
 
@@ -480,7 +482,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
        struct sctp_association *assoc;
-       struct sctp_transport *tsp;
+       struct sctp_transport *transport, *tsp;
 
        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
@@ -488,13 +490,13 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
                return 0;
        }
 
-       tsp = (struct sctp_transport *)v;
-       assoc = tsp->asoc;
+       transport = (struct sctp_transport *)v;
+       if (!sctp_transport_hold(transport))
+               return 0;
+       assoc = transport->asoc;
 
        list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
                                transports) {
-               if (tsp->dead)
-                       continue;
                /*
                 * The remote address (ADDR)
                 */
@@ -544,6 +546,8 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
                seq_printf(seq, "\n");
        }
 
+       sctp_transport_put(transport);
+
        return 0;
 }
 
index ab0d538..1099e99 100644 (file)
@@ -60,6 +60,8 @@
 #include <net/inet_common.h>
 #include <net/inet_ecn.h>
 
+#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
+
 /* Global data structures. */
 struct sctp_globals sctp_globals __read_mostly;
 
@@ -1355,6 +1357,8 @@ static __init int sctp_init(void)
        unsigned long limit;
        int max_share;
        int order;
+       int num_entries;
+       int max_entry_order;
 
        sock_skb_cb_check_size(sizeof(struct sctp_ulpevent));
 
@@ -1407,14 +1411,24 @@ static __init int sctp_init(void)
 
        /* Size and allocate the association hash table.
         * The methodology is similar to that of the tcp hash tables.
+        * Though not identical.  Start by getting a goal size
         */
        if (totalram_pages >= (128 * 1024))
                goal = totalram_pages >> (22 - PAGE_SHIFT);
        else
                goal = totalram_pages >> (24 - PAGE_SHIFT);
 
-       for (order = 0; (1UL << order) < goal; order++)
-               ;
+       /* Then compute the page order for said goal */
+       order = get_order(goal);
+
+       /* Now compute the required page order for the maximum sized table we
+        * want to create
+        */
+       max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES *
+                                   sizeof(struct sctp_bind_hashbucket));
+
+       /* Limit the page order by that maximum hash table size */
+       order = min(order, max_entry_order);
 
        /* Allocate and initialize the endpoint hash table.  */
        sctp_ep_hashsize = 64;
@@ -1430,20 +1444,35 @@ static __init int sctp_init(void)
                INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
        }
 
-       /* Allocate and initialize the SCTP port hash table.  */
+       /* Allocate and initialize the SCTP port hash table.
+        * Note that order is initalized to start at the max sized
+        * table we want to support.  If we can't get that many pages
+        * reduce the order and try again
+        */
        do {
-               sctp_port_hashsize = (1UL << order) * PAGE_SIZE /
-                                       sizeof(struct sctp_bind_hashbucket);
-               if ((sctp_port_hashsize > (64 * 1024)) && order > 0)
-                       continue;
                sctp_port_hashtable = (struct sctp_bind_hashbucket *)
                        __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order);
        } while (!sctp_port_hashtable && --order > 0);
+
        if (!sctp_port_hashtable) {
                pr_err("Failed bind hash alloc\n");
                status = -ENOMEM;
                goto err_bhash_alloc;
        }
+
+       /* Now compute the number of entries that will fit in the
+        * port hash space we allocated
+        */
+       num_entries = (1UL << order) * PAGE_SIZE /
+                     sizeof(struct sctp_bind_hashbucket);
+
+       /* And finish by rounding it down to the nearest power of two
+        * this wastes some memory of course, but its needed because
+        * the hash function operates based on the assumption that
+        * that the number of entries is a power of two
+        */
+       sctp_port_hashsize = rounddown_pow_of_two(num_entries);
+
        for (i = 0; i < sctp_port_hashsize; i++) {
                spin_lock_init(&sctp_port_hashtable[i].lock);
                INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
@@ -1452,7 +1481,8 @@ static __init int sctp_init(void)
        if (sctp_transport_hashtable_init())
                goto err_thash_alloc;
 
-       pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize);
+       pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize,
+               num_entries);
 
        sctp_sysctl_register();
 
index 2e21384..b5327bb 100644 (file)
@@ -259,12 +259,6 @@ void sctp_generate_t3_rtx_event(unsigned long peer)
                goto out_unlock;
        }
 
-       /* Is this transport really dead and just waiting around for
-        * the timer to let go of the reference?
-        */
-       if (transport->dead)
-               goto out_unlock;
-
        /* Run through the state machine.  */
        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX),
@@ -380,12 +374,6 @@ void sctp_generate_heartbeat_event(unsigned long data)
                goto out_unlock;
        }
 
-       /* Is this structure just waiting around for us to actually
-        * get destroyed?
-        */
-       if (transport->dead)
-               goto out_unlock;
-
        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
                           asoc->state, asoc->ep, asoc,
index 9bb80ec..e878da0 100644 (file)
@@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
        struct sctp_hmac_algo_param *hmacs;
        __u16 data_len = 0;
        u32 num_idents;
+       int i;
 
        if (!ep->auth_enable)
                return -EACCES;
@@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
                return -EFAULT;
        if (put_user(num_idents, &p->shmac_num_idents))
                return -EFAULT;
-       if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len))
-               return -EFAULT;
+       for (i = 0; i < num_idents; i++) {
+               __u16 hmacid = ntohs(hmacs->hmac_ids[i]);
+
+               if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16)))
+                       return -EFAULT;
+       }
        return 0;
 }
 
@@ -6636,6 +6641,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 
                        if (cmsgs->srinfo->sinfo_flags &
                            ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+                             SCTP_SACK_IMMEDIATELY |
                              SCTP_ABORT | SCTP_EOF))
                                return -EINVAL;
                        break;
@@ -6659,6 +6665,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 
                        if (cmsgs->sinfo->snd_flags &
                            ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+                             SCTP_SACK_IMMEDIATELY |
                              SCTP_ABORT | SCTP_EOF))
                                return -EINVAL;
                        break;
index aab9e3f..a431c14 100644 (file)
@@ -132,8 +132,6 @@ fail:
  */
 void sctp_transport_free(struct sctp_transport *transport)
 {
-       transport->dead = 1;
-
        /* Try to delete the heartbeat timer.  */
        if (del_timer(&transport->hb_timer))
                sctp_transport_put(transport);
@@ -169,7 +167,7 @@ static void sctp_transport_destroy_rcu(struct rcu_head *head)
  */
 static void sctp_transport_destroy(struct sctp_transport *transport)
 {
-       if (unlikely(!transport->dead)) {
+       if (unlikely(atomic_read(&transport->refcnt))) {
                WARN(1, "Attempt to destroy undead transport %p!\n", transport);
                return;
        }
@@ -296,9 +294,9 @@ void sctp_transport_route(struct sctp_transport *transport,
 }
 
 /* Hold a reference to a transport.  */
-void sctp_transport_hold(struct sctp_transport *transport)
+int sctp_transport_hold(struct sctp_transport *transport)
 {
-       atomic_inc(&transport->refcnt);
+       return atomic_add_unless(&transport->refcnt, 1, 0);
 }
 
 /* Release a reference to a transport and clean up
index 799e65b..cabf586 100644 (file)
@@ -740,7 +740,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
                default:
                        printk(KERN_CRIT "%s: bad return from "
                                "gss_fill_context: %zd\n", __func__, err);
-                       BUG();
+                       gss_msg->msg.errno = -EIO;
                }
                goto err_release_msg;
        }
index 2b32fd6..273bc3a 100644 (file)
@@ -1225,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
        if (bp[0] == '\\' && bp[1] == 'x') {
                /* HEX STRING */
                bp += 2;
-               while (len < bufsize) {
+               while (len < bufsize - 1) {
                        int h, l;
 
                        h = hex_to_bin(bp[0]);
index cc1251d..2dcd764 100644 (file)
@@ -341,6 +341,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
        rqst->rq_reply_bytes_recvd = 0;
        rqst->rq_bytes_sent = 0;
        rqst->rq_xid = headerp->rm_xid;
+
+       rqst->rq_private_buf.len = size;
        set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
 
        buf = &rqst->rq_rcv_buf;
index ebc661d..8b5833c 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
 #include <net/ip_fib.h>
 #include <net/switchdev.h>
 
@@ -567,7 +568,6 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj,
 }
 EXPORT_SYMBOL_GPL(switchdev_port_obj_dump);
 
-static DEFINE_MUTEX(switchdev_mutex);
 static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
 
 /**
@@ -582,9 +582,9 @@ int register_switchdev_notifier(struct notifier_block *nb)
 {
        int err;
 
-       mutex_lock(&switchdev_mutex);
+       rtnl_lock();
        err = raw_notifier_chain_register(&switchdev_notif_chain, nb);
-       mutex_unlock(&switchdev_mutex);
+       rtnl_unlock();
        return err;
 }
 EXPORT_SYMBOL_GPL(register_switchdev_notifier);
@@ -600,9 +600,9 @@ int unregister_switchdev_notifier(struct notifier_block *nb)
 {
        int err;
 
-       mutex_lock(&switchdev_mutex);
+       rtnl_lock();
        err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb);
-       mutex_unlock(&switchdev_mutex);
+       rtnl_unlock();
        return err;
 }
 EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
@@ -616,16 +616,17 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
  *     Call all network notifier blocks. This should be called by driver
  *     when it needs to propagate hardware event.
  *     Return values are same as for atomic_notifier_call_chain().
+ *     rtnl_lock must be held.
  */
 int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
                             struct switchdev_notifier_info *info)
 {
        int err;
 
+       ASSERT_RTNL();
+
        info->dev = dev;
-       mutex_lock(&switchdev_mutex);
        err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
-       mutex_unlock(&switchdev_mutex);
        return err;
 }
 EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
@@ -1092,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                .cb = cb,
                .idx = idx,
        };
+       int err;
 
-       switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb);
+       err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
+                                     switchdev_port_fdb_dump_cb);
+       cb->args[1] = err;
        return dump.idx;
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
index 0c2944f..347cdc9 100644 (file)
@@ -1973,8 +1973,10 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
 
        hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
                          NLM_F_MULTI, TIPC_NL_LINK_GET);
-       if (!hdr)
+       if (!hdr) {
+               tipc_bcast_unlock(net);
                return -EMSGSIZE;
+       }
 
        attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
        if (!attrs)
index fa97d96..9d7a16f 100644 (file)
@@ -346,12 +346,6 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
        skb_queue_head_init(&n->bc_entry.inputq2);
        for (i = 0; i < MAX_BEARERS; i++)
                spin_lock_init(&n->links[i].lock);
-       hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]);
-       list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
-               if (n->addr < temp_node->addr)
-                       break;
-       }
-       list_add_tail_rcu(&n->list, &temp_node->list);
        n->state = SELF_DOWN_PEER_LEAVING;
        n->signature = INVALID_NODE_SIG;
        n->active_links[0] = INVALID_BEARER_ID;
@@ -372,6 +366,12 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
        tipc_node_get(n);
        setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n);
        n->keepalive_intv = U32_MAX;
+       hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]);
+       list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
+               if (n->addr < temp_node->addr)
+                       break;
+       }
+       list_add_tail_rcu(&n->list, &temp_node->list);
 exit:
        spin_unlock_bh(&tn->node_list_lock);
        return n;
index 69c2905..4d420bb 100644 (file)
@@ -673,7 +673,7 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
        struct tipc_sock *tsk = tipc_sk(sk);
        struct net *net = sock_net(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        struct iov_iter save = msg->msg_iter;
        uint mtu;
        int rc;
@@ -687,14 +687,16 @@ static int tipc_sendmcast(struct  socket *sock, struct tipc_name_seq *seq,
        msg_set_nameupper(mhdr, seq->upper);
        msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
 
+       skb_queue_head_init(&pktchain);
+
 new_mtu:
        mtu = tipc_bcast_get_mtu(net);
-       rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
        if (unlikely(rc < 0))
                return rc;
 
        do {
-               rc = tipc_bcast_xmit(net, pktchain);
+               rc = tipc_bcast_xmit(net, &pktchain);
                if (likely(!rc))
                        return dsz;
 
@@ -704,7 +706,7 @@ new_mtu:
                        if (!rc)
                                continue;
                }
-               __skb_queue_purge(pktchain);
+               __skb_queue_purge(&pktchain);
                if (rc == -EMSGSIZE) {
                        msg->msg_iter = save;
                        goto new_mtu;
@@ -863,7 +865,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
        struct net *net = sock_net(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
        u32 dnode, dport;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        struct sk_buff *skb;
        struct tipc_name_seq *seq;
        struct iov_iter save;
@@ -924,17 +926,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
                msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
        }
 
+       skb_queue_head_init(&pktchain);
        save = m->msg_iter;
 new_mtu:
        mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
-       rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
        if (rc < 0)
                return rc;
 
        do {
-               skb = skb_peek(pktchain);
+               skb = skb_peek(&pktchain);
                TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
-               rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+               rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
                if (likely(!rc)) {
                        if (sock->state != SS_READY)
                                sock->state = SS_CONNECTING;
@@ -946,7 +949,7 @@ new_mtu:
                        if (!rc)
                                continue;
                }
-               __skb_queue_purge(pktchain);
+               __skb_queue_purge(&pktchain);
                if (rc == -EMSGSIZE) {
                        m->msg_iter = save;
                        goto new_mtu;
@@ -1016,7 +1019,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
        struct net *net = sock_net(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_msg *mhdr = &tsk->phdr;
-       struct sk_buff_head *pktchain = &sk->sk_write_queue;
+       struct sk_buff_head pktchain;
        DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
        u32 portid = tsk->portid;
        int rc = -EINVAL;
@@ -1044,17 +1047,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
 
        timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
        dnode = tsk_peer_node(tsk);
+       skb_queue_head_init(&pktchain);
 
 next:
        save = m->msg_iter;
        mtu = tsk->max_pkt;
        send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
-       rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain);
+       rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
        if (unlikely(rc < 0))
                return rc;
+
        do {
                if (likely(!tsk_conn_cong(tsk))) {
-                       rc = tipc_node_xmit(net, pktchain, dnode, portid);
+                       rc = tipc_node_xmit(net, &pktchain, dnode, portid);
                        if (likely(!rc)) {
                                tsk->sent_unacked++;
                                sent += send;
@@ -1063,7 +1068,7 @@ next:
                                goto next;
                        }
                        if (rc == -EMSGSIZE) {
-                               __skb_queue_purge(pktchain);
+                               __skb_queue_purge(&pktchain);
                                tsk->max_pkt = tipc_node_get_mtu(net, dnode,
                                                                 portid);
                                m->msg_iter = save;
@@ -1077,7 +1082,7 @@ next:
                rc = tipc_wait_for_sndpkt(sock, &timeo);
        } while (!rc);
 
-       __skb_queue_purge(pktchain);
+       __skb_queue_purge(&pktchain);
        return sent ? sent : rc;
 }
 
index 350cca3..f9ff73a 100644 (file)
@@ -289,15 +289,15 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
                                struct sockaddr_tipc *addr, void *usr_data,
                                void *buf, size_t len)
 {
-       struct tipc_subscriber *subscriber = usr_data;
+       struct tipc_subscriber *subscrb = usr_data;
        struct tipc_subscription *sub = NULL;
        struct tipc_net *tn = net_generic(net, tipc_net_id);
 
-       tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub);
+       if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub))
+               return tipc_conn_terminate(tn->topsrv, subscrb->conid);
+
        if (sub)
                tipc_nametbl_subscribe(sub);
-       else
-               tipc_conn_terminate(tn->topsrv, subscriber->conid);
 }
 
 /* Handle one request to establish a new subscriber */
index c5bf5ef..f75f847 100644 (file)
@@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
        UNIXCB(skb).fp = NULL;
 
        for (i = scm->fp->count-1; i >= 0; i--)
-               unix_notinflight(scm->fp->fp[i]);
+               unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
 
 static void unix_destruct_scm(struct sk_buff *skb)
@@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
                return -ENOMEM;
 
        for (i = scm->fp->count - 1; i >= 0; i--)
-               unix_inflight(scm->fp->fp[i]);
+               unix_inflight(scm->fp->user, scm->fp->fp[i]);
        return max_level;
 }
 
@@ -1781,7 +1781,12 @@ restart_locked:
                        goto out_unlock;
        }
 
-       if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+       /* other == sk && unix_peer(other) != sk if
+        * - unix_peer(sk) == NULL, destination address bound to sk
+        * - unix_peer(sk) == sk by time of get but disconnected before lock
+        */
+       if (other != sk &&
+           unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
                if (timeo) {
                        timeo = unix_wait_for_peer(other, timeo);
 
@@ -2277,13 +2282,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
        size_t size = state->size;
        unsigned int last_len;
 
-       err = -EINVAL;
-       if (sk->sk_state != TCP_ESTABLISHED)
+       if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+               err = -EINVAL;
                goto out;
+       }
 
-       err = -EOPNOTSUPP;
-       if (flags & MSG_OOB)
+       if (unlikely(flags & MSG_OOB)) {
+               err = -EOPNOTSUPP;
                goto out;
+       }
 
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
        timeo = sock_rcvtimeo(sk, noblock);
@@ -2305,6 +2312,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
                bool drop_skb;
                struct sk_buff *skb, *last;
 
+redo:
                unix_state_lock(sk);
                if (sock_flag(sk, SOCK_DEAD)) {
                        err = -ECONNRESET;
@@ -2329,9 +2337,11 @@ again:
                                goto unlock;
 
                        unix_state_unlock(sk);
-                       err = -EAGAIN;
-                       if (!timeo)
+                       if (!timeo) {
+                               err = -EAGAIN;
                                break;
+                       }
+
                        mutex_unlock(&u->readlock);
 
                        timeo = unix_stream_data_wait(sk, timeo, last,
@@ -2339,11 +2349,12 @@ again:
 
                        if (signal_pending(current)) {
                                err = sock_intr_errno(timeo);
+                               scm_destroy(&scm);
                                goto out;
                        }
 
                        mutex_lock(&u->readlock);
-                       continue;
+                       goto redo;
 unlock:
                        unix_state_unlock(sk);
                        break;
index c512f64..4d96797 100644 (file)
@@ -220,7 +220,7 @@ done:
        return skb->len;
 }
 
-static struct sock *unix_lookup_by_ino(int ino)
+static struct sock *unix_lookup_by_ino(unsigned int ino)
 {
        int i;
        struct sock *sk;
index 8fcdc22..6a0d485 100644 (file)
@@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp)
  * descriptor if it is for an AF_UNIX socket.
  */
 
-void unix_inflight(struct file *fp)
+void unix_inflight(struct user_struct *user, struct file *fp)
 {
        struct sock *s = unix_get_socket(fp);
 
@@ -133,11 +133,11 @@ void unix_inflight(struct file *fp)
                }
                unix_tot_inflight++;
        }
-       fp->f_cred->user->unix_inflight++;
+       user->unix_inflight++;
        spin_unlock(&unix_gc_lock);
 }
 
-void unix_notinflight(struct file *fp)
+void unix_notinflight(struct user_struct *user, struct file *fp)
 {
        struct sock *s = unix_get_socket(fp);
 
@@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp)
                        list_del_init(&u->link);
                unix_tot_inflight--;
        }
-       fp->f_cred->user->unix_inflight--;
+       user->unix_inflight--;
        spin_unlock(&unix_gc_lock);
 }
 
index 7fd1220..bbe65dc 100644 (file)
@@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
        if (err < 0)
                goto out;
 
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
        while (total_written < len) {
                ssize_t written;
 
@@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                                goto out_wait;
 
                        release_sock(sk);
+                       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                        timeout = schedule_timeout(timeout);
+                       finish_wait(sk_sleep(sk), &wait);
                        lock_sock(sk);
                        if (signal_pending(current)) {
                                err = sock_intr_errno(timeout);
@@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                                goto out_wait;
                        }
 
-                       prepare_to_wait(sk_sleep(sk), &wait,
-                                       TASK_INTERRUPTIBLE);
                }
 
                /* These checks occur both as part of and after the loop
@@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 out_wait:
        if (total_written > 0)
                err = total_written;
-       finish_wait(sk_sleep(sk), &wait);
 out:
        release_sock(sk);
        return err;
@@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
        if (err < 0)
                goto out;
 
-       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
        while (1) {
                s64 ready = vsock_stream_has_data(vsk);
@@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                         */
 
                        err = -ENOMEM;
-                       goto out_wait;
+                       goto out;
                } else if (ready > 0) {
                        ssize_t read;
 
@@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                                        vsk, target, read,
                                        !(flags & MSG_PEEK), &recv_data);
                        if (err < 0)
-                               goto out_wait;
+                               goto out;
 
                        if (read >= target || flags & MSG_PEEK)
                                break;
@@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                                break;
 
                        release_sock(sk);
+                       prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                        timeout = schedule_timeout(timeout);
+                       finish_wait(sk_sleep(sk), &wait);
                        lock_sock(sk);
 
                        if (signal_pending(current)) {
@@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                                err = -EAGAIN;
                                break;
                        }
-
-                       prepare_to_wait(sk_sleep(sk), &wait,
-                                       TASK_INTERRUPTIBLE);
                }
        }
 
@@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                err = copied;
        }
 
-out_wait:
-       finish_wait(sk_sleep(sk), &wait);
 out:
        release_sock(sk);
        return err;
index b091551..8f0bac7 100644 (file)
@@ -1147,6 +1147,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
                return NOTIFY_DONE;
        }
 
+       wireless_nlevent_flush();
+
        return NOTIFY_OK;
 }
 
index d4786f2..711cb7a 100644 (file)
@@ -7547,7 +7547,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 
                if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
                    no_ht) {
-                       kfree(connkeys);
+                       kzfree(connkeys);
                        return -EINVAL;
                }
        }
index 3b0ce1c..547ceec 100644 (file)
@@ -231,20 +231,22 @@ static const struct ieee80211_regdomain world_regdom = {
                /* IEEE 802.11b/g, channels 1..11 */
                REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
                /* IEEE 802.11b/g, channels 12..13. */
-               REG_RULE(2467-10, 2472+10, 40, 6, 20,
-                       NL80211_RRF_NO_IR),
+               REG_RULE(2467-10, 2472+10, 20, 6, 20,
+                       NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW),
                /* IEEE 802.11 channel 14 - Only JP enables
                 * this and for 802.11b only */
                REG_RULE(2484-10, 2484+10, 20, 6, 20,
                        NL80211_RRF_NO_IR |
                        NL80211_RRF_NO_OFDM),
                /* IEEE 802.11a, channel 36..48 */
-               REG_RULE(5180-10, 5240+10, 160, 6, 20,
-                        NL80211_RRF_NO_IR),
+               REG_RULE(5180-10, 5240+10, 80, 6, 20,
+                        NL80211_RRF_NO_IR |
+                        NL80211_RRF_AUTO_BW),
 
                /* IEEE 802.11a, channel 52..64 - DFS required */
-               REG_RULE(5260-10, 5320+10, 160, 6, 20,
+               REG_RULE(5260-10, 5320+10, 80, 6, 20,
                        NL80211_RRF_NO_IR |
+                       NL80211_RRF_AUTO_BW |
                        NL80211_RRF_DFS),
 
                /* IEEE 802.11a, channel 100..144 - DFS required */
@@ -2745,7 +2747,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
        const struct ieee80211_power_rule *power_rule = NULL;
        char bw[32], cac_time[32];
 
-       pr_info("  (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");
+       pr_debug("  (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");
 
        for (i = 0; i < rd->n_reg_rules; i++) {
                reg_rule = &rd->reg_rules[i];
@@ -2772,7 +2774,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
                 * in certain regions
                 */
                if (power_rule->max_antenna_gain)
-                       pr_info("  (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
+                       pr_debug("  (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
                                freq_range->start_freq_khz,
                                freq_range->end_freq_khz,
                                bw,
@@ -2780,7 +2782,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
                                power_rule->max_eirp,
                                cac_time);
                else
-                       pr_info("  (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
+                       pr_debug("  (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
                                freq_range->start_freq_khz,
                                freq_range->end_freq_khz,
                                bw,
@@ -2813,35 +2815,35 @@ static void print_regdomain(const struct ieee80211_regdomain *rd)
                        struct cfg80211_registered_device *rdev;
                        rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx);
                        if (rdev) {
-                               pr_info("Current regulatory domain updated by AP to: %c%c\n",
+                               pr_debug("Current regulatory domain updated by AP to: %c%c\n",
                                        rdev->country_ie_alpha2[0],
                                        rdev->country_ie_alpha2[1]);
                        } else
-                               pr_info("Current regulatory domain intersected:\n");
+                               pr_debug("Current regulatory domain intersected:\n");
                } else
-                       pr_info("Current regulatory domain intersected:\n");
+                       pr_debug("Current regulatory domain intersected:\n");
        } else if (is_world_regdom(rd->alpha2)) {
-               pr_info("World regulatory domain updated:\n");
+               pr_debug("World regulatory domain updated:\n");
        } else {
                if (is_unknown_alpha2(rd->alpha2))
-                       pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n");
+                       pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n");
                else {
                        if (reg_request_cell_base(lr))
-                               pr_info("Regulatory domain changed to country: %c%c by Cell Station\n",
+                               pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n",
                                        rd->alpha2[0], rd->alpha2[1]);
                        else
-                               pr_info("Regulatory domain changed to country: %c%c\n",
+                               pr_debug("Regulatory domain changed to country: %c%c\n",
                                        rd->alpha2[0], rd->alpha2[1]);
                }
        }
 
-       pr_info(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
+       pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
        print_rd_rules(rd);
 }
 
 static void print_regdomain_info(const struct ieee80211_regdomain *rd)
 {
-       pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
+       pr_debug("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
        print_rd_rules(rd);
 }
 
@@ -2862,7 +2864,8 @@ static int reg_set_rd_user(const struct ieee80211_regdomain *rd,
                return -EALREADY;
 
        if (!is_valid_rd(rd)) {
-               pr_err("Invalid regulatory domain detected:\n");
+               pr_err("Invalid regulatory domain detected: %c%c\n",
+                      rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }
@@ -2898,7 +2901,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
                return -EALREADY;
 
        if (!is_valid_rd(rd)) {
-               pr_err("Invalid regulatory domain detected:\n");
+               pr_err("Invalid regulatory domain detected: %c%c\n",
+                      rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }
@@ -2956,7 +2960,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd,
         */
 
        if (!is_valid_rd(rd)) {
-               pr_err("Invalid regulatory domain detected:\n");
+               pr_err("Invalid regulatory domain detected: %c%c\n",
+                      rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }
index 8020b5b..d49ed76 100644 (file)
@@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 
        nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
 
+       /* stop critical protocol if supported */
+       if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) {
+               rdev->crit_proto_nlportid = 0;
+               rdev_crit_proto_stop(rdev, wdev);
+       }
+
        /*
         * Delete all the keys ... pairwise keys can't really
         * exist any more anyway, but default keys might.
index c8717c1..b50ee5d 100644 (file)
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
 
 /* IW event code */
 
+void wireless_nlevent_flush(void)
+{
+       struct sk_buff *skb;
+       struct net *net;
+
+       ASSERT_RTNL();
+
+       for_each_net(net) {
+               while ((skb = skb_dequeue(&net->wext_nlevents)))
+                       rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+                                   GFP_KERNEL);
+       }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+                                    unsigned long state, void *ptr)
+{
+       /*
+        * When a netdev changes state in any way, flush all pending messages
+        * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+        * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close()
+        * or similar - all of which could otherwise happen due to delays from
+        * schedule_work().
+        */
+       wireless_nlevent_flush();
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+       .notifier_call = wext_netdev_notifier_call,
+};
+
 static int __net_init wext_pernet_init(struct net *net)
 {
        skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
 
 static int __init wireless_nlevent_init(void)
 {
-       return register_pernet_subsys(&wext_pernet_ops);
+       int err = register_pernet_subsys(&wext_pernet_ops);
+
+       if (err)
+               return err;
+
+       return register_netdevice_notifier(&wext_netdev_notifier);
 }
 
 subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
 /* Process events generated by the wireless layer or the driver. */
 static void wireless_nlevent_process(struct work_struct *work)
 {
-       struct sk_buff *skb;
-       struct net *net;
-
        rtnl_lock();
-
-       for_each_net(net) {
-               while ((skb = skb_dequeue(&net->wext_nlevents)))
-                       rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
-                                   GFP_KERNEL);
-       }
-
+       wireless_nlevent_flush();
        rtnl_unlock();
 }
 
index 0147c91..874132b 100755 (executable)
@@ -269,7 +269,8 @@ our $Sparse = qr{
                        __init_refok|
                        __kprobes|
                        __ref|
-                       __rcu
+                       __rcu|
+                       __private
                }x;
 our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)};
 our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)};
index d154f08..7bfe9fa 100755 (executable)
@@ -1,7 +1,7 @@
 #!/usr/bin/awk -f
 # extract linker version number from stdin and turn into single number
        {
-       gsub(".*)", "");
+       gsub(".*\\)", "");
        gsub(".*version ", "");
        gsub("-.*", "");
        split($1,a, ".");
index e080746..48958d3 100644 (file)
@@ -594,7 +594,8 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname)
                if (strncmp(symname, "_restgpr0_", sizeof("_restgpr0_") - 1) == 0 ||
                    strncmp(symname, "_savegpr0_", sizeof("_savegpr0_") - 1) == 0 ||
                    strncmp(symname, "_restvr_", sizeof("_restvr_") - 1) == 0 ||
-                   strncmp(symname, "_savevr_", sizeof("_savevr_") - 1) == 0)
+                   strncmp(symname, "_savevr_", sizeof("_savevr_") - 1) == 0 ||
+                   strcmp(symname, ".TOC.") == 0)
                        return 1;
        /* Do not ignore this symbol */
        return 0;
diff --git a/scripts/prune-kernel b/scripts/prune-kernel
new file mode 100755 (executable)
index 0000000..ab5034e
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# because I use CONFIG_LOCALVERSION_AUTO, not the same version again and
+# again, /boot and /lib/modules/ eventually fill up.
+# Dumb script to purge that stuff:
+
+for f in "$@"
+do
+        if rpm -qf "/lib/modules/$f" >/dev/null; then
+                echo "keeping $f (installed from rpm)"
+        elif [ $(uname -r) = "$f" ]; then
+                echo "keeping $f (running kernel) "
+        else
+                echo "removing $f"
+                rm -f "/boot/initramfs-$f.img" "/boot/System.map-$f"
+                rm -f "/boot/vmlinuz-$f"   "/boot/config-$f"
+                rm -rf "/lib/modules/$f"
+                new-kernel-pkg --remove $f
+        fi
+done
index c2423d9..7b29fb1 100644 (file)
@@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b)
        return 0;
 }
 
+static void x86_sort_relative_table(char *extab_image, int image_size)
+{
+       int i;
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) + i, loc);
+               w(r(loc + 1) + i + 4, loc + 1);
+               w(r(loc + 2) + i + 8, loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+
+       qsort(extab_image, image_size / 12, 12, compare_relative_table);
+
+       i = 0;
+       while (i < image_size) {
+               uint32_t *loc = (uint32_t *)(extab_image + i);
+
+               w(r(loc) - i, loc);
+               w(r(loc + 1) - (i + 4), loc + 1);
+               w(r(loc + 2) - (i + 8), loc + 2);
+
+               i += sizeof(uint32_t) * 3;
+       }
+}
+
 static void sort_relative_table(char *extab_image, int image_size)
 {
        int i;
@@ -281,6 +310,9 @@ do_file(char const *const fname)
                break;
        case EM_386:
        case EM_X86_64:
+               custom_sort = x86_sort_relative_table;
+               break;
+
        case EM_S390:
                custom_sort = sort_relative_table;
                break;
index f716025..e6ea9d4 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/integrity.h>
 #include <linux/evm.h>
 #include <crypto/hash.h>
+#include <crypto/algapi.h>
 #include "evm.h"
 
 int evm_initialized;
@@ -148,7 +149,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                                   xattr_value_len, calc.digest);
                if (rc)
                        break;
-               rc = memcmp(xattr_data->digest, calc.digest,
+               rc = crypto_memneq(xattr_data->digest, calc.digest,
                            sizeof(calc.digest));
                if (rc)
                        rc = -EINVAL;
index 07a8731..09ef276 100644 (file)
@@ -430,7 +430,8 @@ static int __key_instantiate_and_link(struct key *key,
 
                        /* and link it into the destination keyring */
                        if (keyring) {
-                               set_bit(KEY_FLAG_KEEP, &key->flags);
+                               if (test_bit(KEY_FLAG_KEEP, &keyring->flags))
+                                       set_bit(KEY_FLAG_KEEP, &key->flags);
 
                                __key_link(key, _edit);
                        }
index f8110cf..f1ab715 100644 (file)
@@ -3249,7 +3249,7 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t
 
 static void selinux_inode_getsecid(struct inode *inode, u32 *secid)
 {
-       struct inode_security_struct *isec = inode_security(inode);
+       struct inode_security_struct *isec = inode_security_novalidate(inode);
        *secid = isec->sid;
 }
 
index 2bbb418..8495b93 100644 (file)
@@ -83,6 +83,7 @@ static struct nlmsg_perm nlmsg_tcpdiag_perms[] =
        { TCPDIAG_GETSOCK,      NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { DCCPDIAG_GETSOCK,     NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
        { SOCK_DIAG_BY_FAMILY,  NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+       { SOCK_DESTROY,         NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
 };
 
 static struct nlmsg_perm nlmsg_xfrm_perms[] =
index e9b98af..e8da3b8 100644 (file)
@@ -141,10 +141,8 @@ int pxa2xx_pcm_mmap(struct snd_pcm_substream *substream,
        struct vm_area_struct *vma)
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 EXPORT_SYMBOL(pxa2xx_pcm_mmap);
 
@@ -156,8 +154,7 @@ int pxa2xx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
        buf->bytes = size;
@@ -178,8 +175,7 @@ void pxa2xx_pcm_free_dma_buffers(struct snd_pcm *pcm)
                buf = &substream->dma_buffer;
                if (!buf->area)
                        continue;
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
index e3e9491..a2a1e24 100644 (file)
@@ -97,11 +97,11 @@ config SND_PCM_TIMER
        bool "PCM timer interface" if EXPERT
        default y
        help
-         If you disable this option, pcm timer will be inavailable, so
-         those stubs used pcm timer (e.g. dmix, dsnoop & co) may work
+         If you disable this option, pcm timer will be unavailable, so
+         those stubs that use pcm timer (e.g. dmix, dsnoop & co) may work
          incorrectlly.
 
-         For some embedded device, we may disable it to reduce memory
+         For some embedded devices, we may disable it to reduce memory
          footprint, about 20KB on x86_64 platform.
 
 config SND_SEQUENCER_OSS
index 18b8dc4..7fac3ca 100644 (file)
 #include <sound/compress_offload.h>
 #include <sound/compress_driver.h>
 
+/* struct snd_compr_codec_caps overflows the ioctl bit size for some
+ * architectures, so we need to disable the relevant ioctls.
+ */
+#if _IOC_SIZEBITS < 14
+#define COMPR_CODEC_CAPS_OVERFLOW
+#endif
+
 /* TODO:
  * - add substream support for multiple devices in case of
  *     SND_DYNAMIC_MINORS is not used
@@ -440,6 +447,7 @@ out:
        return retval;
 }
 
+#ifndef COMPR_CODEC_CAPS_OVERFLOW
 static int
 snd_compr_get_codec_caps(struct snd_compr_stream *stream, unsigned long arg)
 {
@@ -463,6 +471,7 @@ out:
        kfree(caps);
        return retval;
 }
+#endif /* !COMPR_CODEC_CAPS_OVERFLOW */
 
 /* revisit this with snd_pcm_preallocate_xxx */
 static int snd_compr_allocate_buffer(struct snd_compr_stream *stream,
@@ -801,9 +810,11 @@ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
        case _IOC_NR(SNDRV_COMPRESS_GET_CAPS):
                retval = snd_compr_get_caps(stream, arg);
                break;
+#ifndef COMPR_CODEC_CAPS_OVERFLOW
        case _IOC_NR(SNDRV_COMPRESS_GET_CODEC_CAPS):
                retval = snd_compr_get_codec_caps(stream, arg);
                break;
+#endif
        case _IOC_NR(SNDRV_COMPRESS_SET_PARAMS):
                retval = snd_compr_set_params(stream, arg);
                break;
index b9c0910..0608f21 100644 (file)
@@ -170,6 +170,19 @@ struct snd_ctl_elem_value32 {
         unsigned char reserved[128];
 };
 
+#ifdef CONFIG_X86_X32
+/* x32 has a different alignment for 64bit values from ia32 */
+struct snd_ctl_elem_value_x32 {
+       struct snd_ctl_elem_id id;
+       unsigned int indirect;  /* bit-field causes misalignment */
+       union {
+               s32 integer[128];
+               unsigned char data[512];
+               s64 integer64[64];
+       } value;
+       unsigned char reserved[128];
+};
+#endif /* CONFIG_X86_X32 */
 
 /* get the value type and count of the control */
 static int get_ctl_type(struct snd_card *card, struct snd_ctl_elem_id *id,
@@ -219,9 +232,11 @@ static int get_elem_size(int type, int count)
 
 static int copy_ctl_value_from_user(struct snd_card *card,
                                    struct snd_ctl_elem_value *data,
-                                   struct snd_ctl_elem_value32 __user *data32,
+                                   void __user *userdata,
+                                   void __user *valuep,
                                    int *typep, int *countp)
 {
+       struct snd_ctl_elem_value32 __user *data32 = userdata;
        int i, type, size;
        int uninitialized_var(count);
        unsigned int indirect;
@@ -239,8 +254,9 @@ static int copy_ctl_value_from_user(struct snd_card *card,
        if (type == SNDRV_CTL_ELEM_TYPE_BOOLEAN ||
            type == SNDRV_CTL_ELEM_TYPE_INTEGER) {
                for (i = 0; i < count; i++) {
+                       s32 __user *intp = valuep;
                        int val;
-                       if (get_user(val, &data32->value.integer[i]))
+                       if (get_user(val, &intp[i]))
                                return -EFAULT;
                        data->value.integer.value[i] = val;
                }
@@ -250,8 +266,7 @@ static int copy_ctl_value_from_user(struct snd_card *card,
                        dev_err(card->dev, "snd_ioctl32_ctl_elem_value: unknown type %d\n", type);
                        return -EINVAL;
                }
-               if (copy_from_user(data->value.bytes.data,
-                                  data32->value.data, size))
+               if (copy_from_user(data->value.bytes.data, valuep, size))
                        return -EFAULT;
        }
 
@@ -261,7 +276,8 @@ static int copy_ctl_value_from_user(struct snd_card *card,
 }
 
 /* restore the value to 32bit */
-static int copy_ctl_value_to_user(struct snd_ctl_elem_value32 __user *data32,
+static int copy_ctl_value_to_user(void __user *userdata,
+                                 void __user *valuep,
                                  struct snd_ctl_elem_value *data,
                                  int type, int count)
 {
@@ -270,22 +286,22 @@ static int copy_ctl_value_to_user(struct snd_ctl_elem_value32 __user *data32,
        if (type == SNDRV_CTL_ELEM_TYPE_BOOLEAN ||
            type == SNDRV_CTL_ELEM_TYPE_INTEGER) {
                for (i = 0; i < count; i++) {
+                       s32 __user *intp = valuep;
                        int val;
                        val = data->value.integer.value[i];
-                       if (put_user(val, &data32->value.integer[i]))
+                       if (put_user(val, &intp[i]))
                                return -EFAULT;
                }
        } else {
                size = get_elem_size(type, count);
-               if (copy_to_user(data32->value.data,
-                                data->value.bytes.data, size))
+               if (copy_to_user(valuep, data->value.bytes.data, size))
                        return -EFAULT;
        }
        return 0;
 }
 
-static int snd_ctl_elem_read_user_compat(struct snd_card *card, 
-                                        struct snd_ctl_elem_value32 __user *data32)
+static int ctl_elem_read_user(struct snd_card *card,
+                             void __user *userdata, void __user *valuep)
 {
        struct snd_ctl_elem_value *data;
        int err, type, count;
@@ -294,7 +310,9 @@ static int snd_ctl_elem_read_user_compat(struct snd_card *card,
        if (data == NULL)
                return -ENOMEM;
 
-       if ((err = copy_ctl_value_from_user(card, data, data32, &type, &count)) < 0)
+       err = copy_ctl_value_from_user(card, data, userdata, valuep,
+                                      &type, &count);
+       if (err < 0)
                goto error;
 
        snd_power_lock(card);
@@ -303,14 +321,15 @@ static int snd_ctl_elem_read_user_compat(struct snd_card *card,
                err = snd_ctl_elem_read(card, data);
        snd_power_unlock(card);
        if (err >= 0)
-               err = copy_ctl_value_to_user(data32, data, type, count);
+               err = copy_ctl_value_to_user(userdata, valuep, data,
+                                            type, count);
  error:
        kfree(data);
        return err;
 }
 
-static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file,
-                                         struct snd_ctl_elem_value32 __user *data32)
+static int ctl_elem_write_user(struct snd_ctl_file *file,
+                              void __user *userdata, void __user *valuep)
 {
        struct snd_ctl_elem_value *data;
        struct snd_card *card = file->card;
@@ -320,7 +339,9 @@ static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file,
        if (data == NULL)
                return -ENOMEM;
 
-       if ((err = copy_ctl_value_from_user(card, data, data32, &type, &count)) < 0)
+       err = copy_ctl_value_from_user(card, data, userdata, valuep,
+                                      &type, &count);
+       if (err < 0)
                goto error;
 
        snd_power_lock(card);
@@ -329,12 +350,39 @@ static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file,
                err = snd_ctl_elem_write(card, file, data);
        snd_power_unlock(card);
        if (err >= 0)
-               err = copy_ctl_value_to_user(data32, data, type, count);
+               err = copy_ctl_value_to_user(userdata, valuep, data,
+                                            type, count);
  error:
        kfree(data);
        return err;
 }
 
+static int snd_ctl_elem_read_user_compat(struct snd_card *card,
+                                        struct snd_ctl_elem_value32 __user *data32)
+{
+       return ctl_elem_read_user(card, data32, &data32->value);
+}
+
+static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file,
+                                         struct snd_ctl_elem_value32 __user *data32)
+{
+       return ctl_elem_write_user(file, data32, &data32->value);
+}
+
+#ifdef CONFIG_X86_X32
+static int snd_ctl_elem_read_user_x32(struct snd_card *card,
+                                     struct snd_ctl_elem_value_x32 __user *data32)
+{
+       return ctl_elem_read_user(card, data32, &data32->value);
+}
+
+static int snd_ctl_elem_write_user_x32(struct snd_ctl_file *file,
+                                      struct snd_ctl_elem_value_x32 __user *data32)
+{
+       return ctl_elem_write_user(file, data32, &data32->value);
+}
+#endif /* CONFIG_X86_X32 */
+
 /* add or replace a user control */
 static int snd_ctl_elem_add_compat(struct snd_ctl_file *file,
                                   struct snd_ctl_elem_info32 __user *data32,
@@ -393,6 +441,10 @@ enum {
        SNDRV_CTL_IOCTL_ELEM_WRITE32 = _IOWR('U', 0x13, struct snd_ctl_elem_value32),
        SNDRV_CTL_IOCTL_ELEM_ADD32 = _IOWR('U', 0x17, struct snd_ctl_elem_info32),
        SNDRV_CTL_IOCTL_ELEM_REPLACE32 = _IOWR('U', 0x18, struct snd_ctl_elem_info32),
+#ifdef CONFIG_X86_X32
+       SNDRV_CTL_IOCTL_ELEM_READ_X32 = _IOWR('U', 0x12, struct snd_ctl_elem_value_x32),
+       SNDRV_CTL_IOCTL_ELEM_WRITE_X32 = _IOWR('U', 0x13, struct snd_ctl_elem_value_x32),
+#endif /* CONFIG_X86_X32 */
 };
 
 static inline long snd_ctl_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg)
@@ -431,6 +483,12 @@ static inline long snd_ctl_ioctl_compat(struct file *file, unsigned int cmd, uns
                return snd_ctl_elem_add_compat(ctl, argp, 0);
        case SNDRV_CTL_IOCTL_ELEM_REPLACE32:
                return snd_ctl_elem_add_compat(ctl, argp, 1);
+#ifdef CONFIG_X86_X32
+       case SNDRV_CTL_IOCTL_ELEM_READ_X32:
+               return snd_ctl_elem_read_user_x32(ctl->card, argp);
+       case SNDRV_CTL_IOCTL_ELEM_WRITE_X32:
+               return snd_ctl_elem_write_user_x32(ctl, argp);
+#endif /* CONFIG_X86_X32 */
        }
 
        down_read(&snd_ioctl_rwsem);
index 0e73d03..ebc9fdf 100644 (file)
@@ -835,7 +835,8 @@ static int choose_rate(struct snd_pcm_substream *substream,
        return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE, best_rate, NULL);
 }
 
-static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream)
+static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream,
+                                    bool trylock)
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_hw_params *params, *sparams;
@@ -849,7 +850,10 @@ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream)
        struct snd_mask sformat_mask;
        struct snd_mask mask;
 
-       if (mutex_lock_interruptible(&runtime->oss.params_lock))
+       if (trylock) {
+               if (!(mutex_trylock(&runtime->oss.params_lock)))
+                       return -EAGAIN;
+       } else if (mutex_lock_interruptible(&runtime->oss.params_lock))
                return -EINTR;
        sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL);
        params = kmalloc(sizeof(*params), GFP_KERNEL);
@@ -1092,7 +1096,7 @@ static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_fil
                if (asubstream == NULL)
                        asubstream = substream;
                if (substream->runtime->oss.params) {
-                       err = snd_pcm_oss_change_params(substream);
+                       err = snd_pcm_oss_change_params(substream, false);
                        if (err < 0)
                                return err;
                }
@@ -1132,7 +1136,7 @@ static int snd_pcm_oss_make_ready(struct snd_pcm_substream *substream)
                return 0;
        runtime = substream->runtime;
        if (runtime->oss.params) {
-               err = snd_pcm_oss_change_params(substream);
+               err = snd_pcm_oss_change_params(substream, false);
                if (err < 0)
                        return err;
        }
@@ -2163,7 +2167,7 @@ static int snd_pcm_oss_get_space(struct snd_pcm_oss_file *pcm_oss_file, int stre
        runtime = substream->runtime;
 
        if (runtime->oss.params &&
-           (err = snd_pcm_oss_change_params(substream)) < 0)
+           (err = snd_pcm_oss_change_params(substream, false)) < 0)
                return err;
 
        info.fragsize = runtime->oss.period_bytes;
@@ -2804,7 +2808,12 @@ static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area)
                return -EIO;
        
        if (runtime->oss.params) {
-               if ((err = snd_pcm_oss_change_params(substream)) < 0)
+               /* use mutex_trylock() for params_lock for avoiding a deadlock
+                * between mmap_sem and params_lock taken by
+                * copy_from/to_user() in snd_pcm_oss_write/read()
+                */
+               err = snd_pcm_oss_change_params(substream, true);
+               if (err < 0)
                        return err;
        }
 #ifdef CONFIG_SND_PCM_OSS_PLUGINS
index 9630e9f..1f64ab0 100644 (file)
@@ -183,6 +183,14 @@ static int snd_pcm_ioctl_channel_info_compat(struct snd_pcm_substream *substream
        return err;
 }
 
+#ifdef CONFIG_X86_X32
+/* X32 ABI has the same struct as x86-64 for snd_pcm_channel_info */
+static int snd_pcm_channel_info_user(struct snd_pcm_substream *substream,
+                                    struct snd_pcm_channel_info __user *src);
+#define snd_pcm_ioctl_channel_info_x32(s, p)   \
+       snd_pcm_channel_info_user(s, p)
+#endif /* CONFIG_X86_X32 */
+
 struct snd_pcm_status32 {
        s32 state;
        struct compat_timespec trigger_tstamp;
@@ -243,6 +251,71 @@ static int snd_pcm_status_user_compat(struct snd_pcm_substream *substream,
        return err;
 }
 
+#ifdef CONFIG_X86_X32
+/* X32 ABI has 64bit timespec and 64bit alignment */
+struct snd_pcm_status_x32 {
+       s32 state;
+       u32 rsvd; /* alignment */
+       struct timespec trigger_tstamp;
+       struct timespec tstamp;
+       u32 appl_ptr;
+       u32 hw_ptr;
+       s32 delay;
+       u32 avail;
+       u32 avail_max;
+       u32 overrange;
+       s32 suspended_state;
+       u32 audio_tstamp_data;
+       struct timespec audio_tstamp;
+       struct timespec driver_tstamp;
+       u32 audio_tstamp_accuracy;
+       unsigned char reserved[52-2*sizeof(struct timespec)];
+} __packed;
+
+#define put_timespec(src, dst) copy_to_user(dst, src, sizeof(*dst))
+
+static int snd_pcm_status_user_x32(struct snd_pcm_substream *substream,
+                                  struct snd_pcm_status_x32 __user *src,
+                                  bool ext)
+{
+       struct snd_pcm_status status;
+       int err;
+
+       memset(&status, 0, sizeof(status));
+       /*
+        * with extension, parameters are read/write,
+        * get audio_tstamp_data from user,
+        * ignore rest of status structure
+        */
+       if (ext && get_user(status.audio_tstamp_data,
+                               (u32 __user *)(&src->audio_tstamp_data)))
+               return -EFAULT;
+       err = snd_pcm_status(substream, &status);
+       if (err < 0)
+               return err;
+
+       if (clear_user(src, sizeof(*src)))
+               return -EFAULT;
+       if (put_user(status.state, &src->state) ||
+           put_timespec(&status.trigger_tstamp, &src->trigger_tstamp) ||
+           put_timespec(&status.tstamp, &src->tstamp) ||
+           put_user(status.appl_ptr, &src->appl_ptr) ||
+           put_user(status.hw_ptr, &src->hw_ptr) ||
+           put_user(status.delay, &src->delay) ||
+           put_user(status.avail, &src->avail) ||
+           put_user(status.avail_max, &src->avail_max) ||
+           put_user(status.overrange, &src->overrange) ||
+           put_user(status.suspended_state, &src->suspended_state) ||
+           put_user(status.audio_tstamp_data, &src->audio_tstamp_data) ||
+           put_timespec(&status.audio_tstamp, &src->audio_tstamp) ||
+           put_timespec(&status.driver_tstamp, &src->driver_tstamp) ||
+           put_user(status.audio_tstamp_accuracy, &src->audio_tstamp_accuracy))
+               return -EFAULT;
+
+       return err;
+}
+#endif /* CONFIG_X86_X32 */
+
 /* both for HW_PARAMS and HW_REFINE */
 static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream,
                                          int refine, 
@@ -469,6 +542,93 @@ static int snd_pcm_ioctl_sync_ptr_compat(struct snd_pcm_substream *substream,
        return 0;
 }
 
+#ifdef CONFIG_X86_X32
+/* X32 ABI has 64bit timespec and 64bit alignment */
+struct snd_pcm_mmap_status_x32 {
+       s32 state;
+       s32 pad1;
+       u32 hw_ptr;
+       u32 pad2; /* alignment */
+       struct timespec tstamp;
+       s32 suspended_state;
+       struct timespec audio_tstamp;
+} __packed;
+
+struct snd_pcm_mmap_control_x32 {
+       u32 appl_ptr;
+       u32 avail_min;
+};
+
+struct snd_pcm_sync_ptr_x32 {
+       u32 flags;
+       u32 rsvd; /* alignment */
+       union {
+               struct snd_pcm_mmap_status_x32 status;
+               unsigned char reserved[64];
+       } s;
+       union {
+               struct snd_pcm_mmap_control_x32 control;
+               unsigned char reserved[64];
+       } c;
+} __packed;
+
+static int snd_pcm_ioctl_sync_ptr_x32(struct snd_pcm_substream *substream,
+                                     struct snd_pcm_sync_ptr_x32 __user *src)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       volatile struct snd_pcm_mmap_status *status;
+       volatile struct snd_pcm_mmap_control *control;
+       u32 sflags;
+       struct snd_pcm_mmap_control scontrol;
+       struct snd_pcm_mmap_status sstatus;
+       snd_pcm_uframes_t boundary;
+       int err;
+
+       if (snd_BUG_ON(!runtime))
+               return -EINVAL;
+
+       if (get_user(sflags, &src->flags) ||
+           get_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
+           get_user(scontrol.avail_min, &src->c.control.avail_min))
+               return -EFAULT;
+       if (sflags & SNDRV_PCM_SYNC_PTR_HWSYNC) {
+               err = snd_pcm_hwsync(substream);
+               if (err < 0)
+                       return err;
+       }
+       status = runtime->status;
+       control = runtime->control;
+       boundary = recalculate_boundary(runtime);
+       if (!boundary)
+               boundary = 0x7fffffff;
+       snd_pcm_stream_lock_irq(substream);
+       /* FIXME: we should consider the boundary for the sync from app */
+       if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL))
+               control->appl_ptr = scontrol.appl_ptr;
+       else
+               scontrol.appl_ptr = control->appl_ptr % boundary;
+       if (!(sflags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN))
+               control->avail_min = scontrol.avail_min;
+       else
+               scontrol.avail_min = control->avail_min;
+       sstatus.state = status->state;
+       sstatus.hw_ptr = status->hw_ptr % boundary;
+       sstatus.tstamp = status->tstamp;
+       sstatus.suspended_state = status->suspended_state;
+       sstatus.audio_tstamp = status->audio_tstamp;
+       snd_pcm_stream_unlock_irq(substream);
+       if (put_user(sstatus.state, &src->s.status.state) ||
+           put_user(sstatus.hw_ptr, &src->s.status.hw_ptr) ||
+           put_timespec(&sstatus.tstamp, &src->s.status.tstamp) ||
+           put_user(sstatus.suspended_state, &src->s.status.suspended_state) ||
+           put_timespec(&sstatus.audio_tstamp, &src->s.status.audio_tstamp) ||
+           put_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
+           put_user(scontrol.avail_min, &src->c.control.avail_min))
+               return -EFAULT;
+
+       return 0;
+}
+#endif /* CONFIG_X86_X32 */
 
 /*
  */
@@ -487,7 +647,12 @@ enum {
        SNDRV_PCM_IOCTL_WRITEN_FRAMES32 = _IOW('A', 0x52, struct snd_xfern32),
        SNDRV_PCM_IOCTL_READN_FRAMES32 = _IOR('A', 0x53, struct snd_xfern32),
        SNDRV_PCM_IOCTL_SYNC_PTR32 = _IOWR('A', 0x23, struct snd_pcm_sync_ptr32),
-
+#ifdef CONFIG_X86_X32
+       SNDRV_PCM_IOCTL_CHANNEL_INFO_X32 = _IOR('A', 0x32, struct snd_pcm_channel_info),
+       SNDRV_PCM_IOCTL_STATUS_X32 = _IOR('A', 0x20, struct snd_pcm_status_x32),
+       SNDRV_PCM_IOCTL_STATUS_EXT_X32 = _IOWR('A', 0x24, struct snd_pcm_status_x32),
+       SNDRV_PCM_IOCTL_SYNC_PTR_X32 = _IOWR('A', 0x23, struct snd_pcm_sync_ptr_x32),
+#endif /* CONFIG_X86_X32 */
 };
 
 static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg)
@@ -559,6 +724,16 @@ static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned l
                return snd_pcm_ioctl_rewind_compat(substream, argp);
        case SNDRV_PCM_IOCTL_FORWARD32:
                return snd_pcm_ioctl_forward_compat(substream, argp);
+#ifdef CONFIG_X86_X32
+       case SNDRV_PCM_IOCTL_STATUS_X32:
+               return snd_pcm_status_user_x32(substream, argp, false);
+       case SNDRV_PCM_IOCTL_STATUS_EXT_X32:
+               return snd_pcm_status_user_x32(substream, argp, true);
+       case SNDRV_PCM_IOCTL_SYNC_PTR_X32:
+               return snd_pcm_ioctl_sync_ptr_x32(substream, argp);
+       case SNDRV_PCM_IOCTL_CHANNEL_INFO_X32:
+               return snd_pcm_ioctl_channel_info_x32(substream, argp);
+#endif /* CONFIG_X86_X32 */
        }
 
        return -ENOIOCTLCMD;
index fadd3eb..9106d8e 100644 (file)
@@ -74,6 +74,18 @@ static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream);
 static DEFINE_RWLOCK(snd_pcm_link_rwlock);
 static DECLARE_RWSEM(snd_pcm_link_rwsem);
 
+/* Writer in rwsem may block readers even during its waiting in queue,
+ * and this may lead to a deadlock when the code path takes read sem
+ * twice (e.g. one in snd_pcm_action_nonatomic() and another in
+ * snd_pcm_stream_lock()).  As a (suboptimal) workaround, let writer to
+ * spin until it gets the lock.
+ */
+static inline void down_write_nonblock(struct rw_semaphore *lock)
+{
+       while (!down_write_trylock(lock))
+               cond_resched();
+}
+
 /**
  * snd_pcm_stream_lock - Lock the PCM stream
  * @substream: PCM substream
@@ -1813,7 +1825,7 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)
                res = -ENOMEM;
                goto _nolock;
        }
-       down_write(&snd_pcm_link_rwsem);
+       down_write_nonblock(&snd_pcm_link_rwsem);
        write_lock_irq(&snd_pcm_link_rwlock);
        if (substream->runtime->status->state == SNDRV_PCM_STATE_OPEN ||
            substream->runtime->status->state != substream1->runtime->status->state ||
@@ -1860,7 +1872,7 @@ static int snd_pcm_unlink(struct snd_pcm_substream *substream)
        struct snd_pcm_substream *s;
        int res = 0;
 
-       down_write(&snd_pcm_link_rwsem);
+       down_write_nonblock(&snd_pcm_link_rwsem);
        write_lock_irq(&snd_pcm_link_rwlock);
        if (!snd_pcm_stream_linked(substream)) {
                res = -EALREADY;
index a775984..795437b 100644 (file)
@@ -942,31 +942,36 @@ static long snd_rawmidi_kernel_read1(struct snd_rawmidi_substream *substream,
        unsigned long flags;
        long result = 0, count1;
        struct snd_rawmidi_runtime *runtime = substream->runtime;
+       unsigned long appl_ptr;
 
+       spin_lock_irqsave(&runtime->lock, flags);
        while (count > 0 && runtime->avail) {
                count1 = runtime->buffer_size - runtime->appl_ptr;
                if (count1 > count)
                        count1 = count;
-               spin_lock_irqsave(&runtime->lock, flags);
                if (count1 > (int)runtime->avail)
                        count1 = runtime->avail;
+
+               /* update runtime->appl_ptr before unlocking for userbuf */
+               appl_ptr = runtime->appl_ptr;
+               runtime->appl_ptr += count1;
+               runtime->appl_ptr %= runtime->buffer_size;
+               runtime->avail -= count1;
+
                if (kernelbuf)
-                       memcpy(kernelbuf + result, runtime->buffer + runtime->appl_ptr, count1);
+                       memcpy(kernelbuf + result, runtime->buffer + appl_ptr, count1);
                if (userbuf) {
                        spin_unlock_irqrestore(&runtime->lock, flags);
                        if (copy_to_user(userbuf + result,
-                                        runtime->buffer + runtime->appl_ptr, count1)) {
+                                        runtime->buffer + appl_ptr, count1)) {
                                return result > 0 ? result : -EFAULT;
                        }
                        spin_lock_irqsave(&runtime->lock, flags);
                }
-               runtime->appl_ptr += count1;
-               runtime->appl_ptr %= runtime->buffer_size;
-               runtime->avail -= count1;
-               spin_unlock_irqrestore(&runtime->lock, flags);
                result += count1;
                count -= count1;
        }
+       spin_unlock_irqrestore(&runtime->lock, flags);
        return result;
 }
 
@@ -1055,23 +1060,16 @@ int snd_rawmidi_transmit_empty(struct snd_rawmidi_substream *substream)
 EXPORT_SYMBOL(snd_rawmidi_transmit_empty);
 
 /**
- * snd_rawmidi_transmit_peek - copy data from the internal buffer
+ * __snd_rawmidi_transmit_peek - copy data from the internal buffer
  * @substream: the rawmidi substream
  * @buffer: the buffer pointer
  * @count: data size to transfer
  *
- * Copies data from the internal output buffer to the given buffer.
- *
- * Call this in the interrupt handler when the midi output is ready,
- * and call snd_rawmidi_transmit_ack() after the transmission is
- * finished.
- *
- * Return: The size of copied data, or a negative error code on failure.
+ * This is a variant of snd_rawmidi_transmit_peek() without spinlock.
  */
-int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
+int __snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
                              unsigned char *buffer, int count)
 {
-       unsigned long flags;
        int result, count1;
        struct snd_rawmidi_runtime *runtime = substream->runtime;
 
@@ -1081,7 +1079,6 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
                return -EINVAL;
        }
        result = 0;
-       spin_lock_irqsave(&runtime->lock, flags);
        if (runtime->avail >= runtime->buffer_size) {
                /* warning: lowlevel layer MUST trigger down the hardware */
                goto __skip;
@@ -1106,25 +1103,47 @@ int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
                }
        }
       __skip:
+       return result;
+}
+EXPORT_SYMBOL(__snd_rawmidi_transmit_peek);
+
+/**
+ * snd_rawmidi_transmit_peek - copy data from the internal buffer
+ * @substream: the rawmidi substream
+ * @buffer: the buffer pointer
+ * @count: data size to transfer
+ *
+ * Copies data from the internal output buffer to the given buffer.
+ *
+ * Call this in the interrupt handler when the midi output is ready,
+ * and call snd_rawmidi_transmit_ack() after the transmission is
+ * finished.
+ *
+ * Return: The size of copied data, or a negative error code on failure.
+ */
+int snd_rawmidi_transmit_peek(struct snd_rawmidi_substream *substream,
+                             unsigned char *buffer, int count)
+{
+       struct snd_rawmidi_runtime *runtime = substream->runtime;
+       int result;
+       unsigned long flags;
+
+       spin_lock_irqsave(&runtime->lock, flags);
+       result = __snd_rawmidi_transmit_peek(substream, buffer, count);
        spin_unlock_irqrestore(&runtime->lock, flags);
        return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit_peek);
 
 /**
- * snd_rawmidi_transmit_ack - acknowledge the transmission
+ * __snd_rawmidi_transmit_ack - acknowledge the transmission
  * @substream: the rawmidi substream
  * @count: the transferred count
  *
- * Advances the hardware pointer for the internal output buffer with
- * the given size and updates the condition.
- * Call after the transmission is finished.
- *
- * Return: The advanced size if successful, or a negative error code on failure.
+ * This is a variant of __snd_rawmidi_transmit_ack() without spinlock.
  */
-int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
+int __snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
 {
-       unsigned long flags;
        struct snd_rawmidi_runtime *runtime = substream->runtime;
 
        if (runtime->buffer == NULL) {
@@ -1132,7 +1151,6 @@ int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
                          "snd_rawmidi_transmit_ack: output is not active!!!\n");
                return -EINVAL;
        }
-       spin_lock_irqsave(&runtime->lock, flags);
        snd_BUG_ON(runtime->avail + count > runtime->buffer_size);
        runtime->hw_ptr += count;
        runtime->hw_ptr %= runtime->buffer_size;
@@ -1142,9 +1160,32 @@ int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
                if (runtime->drain || snd_rawmidi_ready(substream))
                        wake_up(&runtime->sleep);
        }
-       spin_unlock_irqrestore(&runtime->lock, flags);
        return count;
 }
+EXPORT_SYMBOL(__snd_rawmidi_transmit_ack);
+
+/**
+ * snd_rawmidi_transmit_ack - acknowledge the transmission
+ * @substream: the rawmidi substream
+ * @count: the transferred count
+ *
+ * Advances the hardware pointer for the internal output buffer with
+ * the given size and updates the condition.
+ * Call after the transmission is finished.
+ *
+ * Return: The advanced size if successful, or a negative error code on failure.
+ */
+int snd_rawmidi_transmit_ack(struct snd_rawmidi_substream *substream, int count)
+{
+       struct snd_rawmidi_runtime *runtime = substream->runtime;
+       int result;
+       unsigned long flags;
+
+       spin_lock_irqsave(&runtime->lock, flags);
+       result = __snd_rawmidi_transmit_ack(substream, count);
+       spin_unlock_irqrestore(&runtime->lock, flags);
+       return result;
+}
 EXPORT_SYMBOL(snd_rawmidi_transmit_ack);
 
 /**
@@ -1160,12 +1201,22 @@ EXPORT_SYMBOL(snd_rawmidi_transmit_ack);
 int snd_rawmidi_transmit(struct snd_rawmidi_substream *substream,
                         unsigned char *buffer, int count)
 {
+       struct snd_rawmidi_runtime *runtime = substream->runtime;
+       int result;
+       unsigned long flags;
+
+       spin_lock_irqsave(&runtime->lock, flags);
        if (!substream->opened)
-               return -EBADFD;
-       count = snd_rawmidi_transmit_peek(substream, buffer, count);
-       if (count < 0)
-               return count;
-       return snd_rawmidi_transmit_ack(substream, count);
+               result = -EBADFD;
+       else {
+               count = __snd_rawmidi_transmit_peek(substream, buffer, count);
+               if (count <= 0)
+                       result = count;
+               else
+                       result = __snd_rawmidi_transmit_ack(substream, count);
+       }
+       spin_unlock_irqrestore(&runtime->lock, flags);
+       return result;
 }
 EXPORT_SYMBOL(snd_rawmidi_transmit);
 
@@ -1177,8 +1228,9 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
        unsigned long flags;
        long count1, result;
        struct snd_rawmidi_runtime *runtime = substream->runtime;
+       unsigned long appl_ptr;
 
-       if (snd_BUG_ON(!kernelbuf && !userbuf))
+       if (!kernelbuf && !userbuf)
                return -EINVAL;
        if (snd_BUG_ON(!runtime->buffer))
                return -EINVAL;
@@ -1197,12 +1249,19 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
                        count1 = count;
                if (count1 > (long)runtime->avail)
                        count1 = runtime->avail;
+
+               /* update runtime->appl_ptr before unlocking for userbuf */
+               appl_ptr = runtime->appl_ptr;
+               runtime->appl_ptr += count1;
+               runtime->appl_ptr %= runtime->buffer_size;
+               runtime->avail -= count1;
+
                if (kernelbuf)
-                       memcpy(runtime->buffer + runtime->appl_ptr,
+                       memcpy(runtime->buffer + appl_ptr,
                               kernelbuf + result, count1);
                else if (userbuf) {
                        spin_unlock_irqrestore(&runtime->lock, flags);
-                       if (copy_from_user(runtime->buffer + runtime->appl_ptr,
+                       if (copy_from_user(runtime->buffer + appl_ptr,
                                           userbuf + result, count1)) {
                                spin_lock_irqsave(&runtime->lock, flags);
                                result = result > 0 ? result : -EFAULT;
@@ -1210,9 +1269,6 @@ static long snd_rawmidi_kernel_write1(struct snd_rawmidi_substream *substream,
                        }
                        spin_lock_irqsave(&runtime->lock, flags);
                }
-               runtime->appl_ptr += count1;
-               runtime->appl_ptr %= runtime->buffer_size;
-               runtime->avail -= count1;
                result += count1;
                count -= count1;
        }
index 5268c1f..f69764d 100644 (file)
@@ -85,8 +85,7 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile,
        if (err < 0)
                return err;
 
-       if (put_user(status.tstamp.tv_sec, &src->tstamp.tv_sec) ||
-           put_user(status.tstamp.tv_nsec, &src->tstamp.tv_nsec) ||
+       if (compat_put_timespec(&status.tstamp, &src->tstamp) ||
            put_user(status.avail, &src->avail) ||
            put_user(status.xruns, &src->xruns))
                return -EFAULT;
@@ -94,9 +93,58 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile,
        return 0;
 }
 
+#ifdef CONFIG_X86_X32
+/* X32 ABI has 64bit timespec and 64bit alignment */
+struct snd_rawmidi_status_x32 {
+       s32 stream;
+       u32 rsvd; /* alignment */
+       struct timespec tstamp;
+       u32 avail;
+       u32 xruns;
+       unsigned char reserved[16];
+} __attribute__((packed));
+
+#define put_timespec(src, dst) copy_to_user(dst, src, sizeof(*dst))
+
+static int snd_rawmidi_ioctl_status_x32(struct snd_rawmidi_file *rfile,
+                                       struct snd_rawmidi_status_x32 __user *src)
+{
+       int err;
+       struct snd_rawmidi_status status;
+
+       if (rfile->output == NULL)
+               return -EINVAL;
+       if (get_user(status.stream, &src->stream))
+               return -EFAULT;
+
+       switch (status.stream) {
+       case SNDRV_RAWMIDI_STREAM_OUTPUT:
+               err = snd_rawmidi_output_status(rfile->output, &status);
+               break;
+       case SNDRV_RAWMIDI_STREAM_INPUT:
+               err = snd_rawmidi_input_status(rfile->input, &status);
+               break;
+       default:
+               return -EINVAL;
+       }
+       if (err < 0)
+               return err;
+
+       if (put_timespec(&status.tstamp, &src->tstamp) ||
+           put_user(status.avail, &src->avail) ||
+           put_user(status.xruns, &src->xruns))
+               return -EFAULT;
+
+       return 0;
+}
+#endif /* CONFIG_X86_X32 */
+
 enum {
        SNDRV_RAWMIDI_IOCTL_PARAMS32 = _IOWR('W', 0x10, struct snd_rawmidi_params32),
        SNDRV_RAWMIDI_IOCTL_STATUS32 = _IOWR('W', 0x20, struct snd_rawmidi_status32),
+#ifdef CONFIG_X86_X32
+       SNDRV_RAWMIDI_IOCTL_STATUS_X32 = _IOWR('W', 0x20, struct snd_rawmidi_status_x32),
+#endif /* CONFIG_X86_X32 */
 };
 
 static long snd_rawmidi_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg)
@@ -115,6 +163,10 @@ static long snd_rawmidi_ioctl_compat(struct file *file, unsigned int cmd, unsign
                return snd_rawmidi_ioctl_params_compat(rfile, argp);
        case SNDRV_RAWMIDI_IOCTL_STATUS32:
                return snd_rawmidi_ioctl_status_compat(rfile, argp);
+#ifdef CONFIG_X86_X32
+       case SNDRV_RAWMIDI_IOCTL_STATUS_X32:
+               return snd_rawmidi_ioctl_status_x32(rfile, argp);
+#endif /* CONFIG_X86_X32 */
        }
        return -ENOIOCTLCMD;
 }
index 8db156b..8cdf489 100644 (file)
@@ -149,8 +149,6 @@ odev_release(struct inode *inode, struct file *file)
        if ((dp = file->private_data) == NULL)
                return 0;
 
-       snd_seq_oss_drain_write(dp);
-
        mutex_lock(&register_mutex);
        snd_seq_oss_release(dp);
        mutex_unlock(&register_mutex);
index b439243..d7b4d01 100644 (file)
@@ -127,7 +127,6 @@ int snd_seq_oss_write(struct seq_oss_devinfo *dp, const char __user *buf, int co
 unsigned int snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wait);
 
 void snd_seq_oss_reset(struct seq_oss_devinfo *dp);
-void snd_seq_oss_drain_write(struct seq_oss_devinfo *dp);
 
 /* */
 void snd_seq_oss_process_queue(struct seq_oss_devinfo *dp, abstime_t time);
index b1221b2..92c96a9 100644 (file)
@@ -202,7 +202,7 @@ snd_seq_oss_open(struct file *file, int level)
 
        dp->index = i;
        if (i >= SNDRV_SEQ_OSS_MAX_CLIENTS) {
-               pr_err("ALSA: seq_oss: too many applications\n");
+               pr_debug("ALSA: seq_oss: too many applications\n");
                rc = -ENOMEM;
                goto _error;
        }
@@ -435,22 +435,6 @@ snd_seq_oss_release(struct seq_oss_devinfo *dp)
 }
 
 
-/*
- * Wait until the queue is empty (if we don't have nonblock)
- */
-void
-snd_seq_oss_drain_write(struct seq_oss_devinfo *dp)
-{
-       if (! dp->timer->running)
-               return;
-       if (is_write_mode(dp->file_mode) && !is_nonblock_mode(dp->file_mode) &&
-           dp->writeq) {
-               while (snd_seq_oss_writeq_sync(dp->writeq))
-                       ;
-       }
-}
-
-
 /*
  * reset sequencer devices
  */
index 0f3b381..b16dbef 100644 (file)
@@ -308,7 +308,7 @@ snd_seq_oss_synth_cleanup(struct seq_oss_devinfo *dp)
        struct seq_oss_synth *rec;
        struct seq_oss_synthinfo *info;
 
-       if (snd_BUG_ON(dp->max_synthdev >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS))
+       if (snd_BUG_ON(dp->max_synthdev > SNDRV_SEQ_OSS_MAX_SYNTH_DEVS))
                return;
        for (i = 0; i < dp->max_synthdev; i++) {
                info = &dp->synths[i];
index 13cfa81..58e79e0 100644 (file)
@@ -678,6 +678,9 @@ static int deliver_to_subscribers(struct snd_seq_client *client,
        else
                down_read(&grp->list_mutex);
        list_for_each_entry(subs, &grp->list_head, src_list) {
+               /* both ports ready? */
+               if (atomic_read(&subs->ref_count) != 2)
+                       continue;
                event->dest = subs->info.dest;
                if (subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP)
                        /* convert time according to flag with subscription */
index 8010766..c850345 100644 (file)
@@ -383,15 +383,20 @@ int snd_seq_pool_init(struct snd_seq_pool *pool)
 
        if (snd_BUG_ON(!pool))
                return -EINVAL;
-       if (pool->ptr)                  /* should be atomic? */
-               return 0;
 
-       pool->ptr = vmalloc(sizeof(struct snd_seq_event_cell) * pool->size);
-       if (!pool->ptr)
+       cellptr = vmalloc(sizeof(struct snd_seq_event_cell) * pool->size);
+       if (!cellptr)
                return -ENOMEM;
 
        /* add new cells to the free cell list */
        spin_lock_irqsave(&pool->lock, flags);
+       if (pool->ptr) {
+               spin_unlock_irqrestore(&pool->lock, flags);
+               vfree(cellptr);
+               return 0;
+       }
+
+       pool->ptr = cellptr;
        pool->free = NULL;
 
        for (cell = 0; cell < pool->size; cell++) {
index 55170a2..fe686ee 100644 (file)
@@ -173,10 +173,6 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client,
 }
 
 /* */
-enum group_type {
-       SRC_LIST, DEST_LIST
-};
-
 static int subscribe_port(struct snd_seq_client *client,
                          struct snd_seq_client_port *port,
                          struct snd_seq_port_subs_info *grp,
@@ -203,6 +199,20 @@ static struct snd_seq_client_port *get_client_port(struct snd_seq_addr *addr,
        return NULL;
 }
 
+static void delete_and_unsubscribe_port(struct snd_seq_client *client,
+                                       struct snd_seq_client_port *port,
+                                       struct snd_seq_subscribers *subs,
+                                       bool is_src, bool ack);
+
+static inline struct snd_seq_subscribers *
+get_subscriber(struct list_head *p, bool is_src)
+{
+       if (is_src)
+               return list_entry(p, struct snd_seq_subscribers, src_list);
+       else
+               return list_entry(p, struct snd_seq_subscribers, dest_list);
+}
+
 /*
  * remove all subscribers on the list
  * this is called from port_delete, for each src and dest list.
@@ -210,7 +220,7 @@ static struct snd_seq_client_port *get_client_port(struct snd_seq_addr *addr,
 static void clear_subscriber_list(struct snd_seq_client *client,
                                  struct snd_seq_client_port *port,
                                  struct snd_seq_port_subs_info *grp,
-                                 int grptype)
+                                 int is_src)
 {
        struct list_head *p, *n;
 
@@ -219,15 +229,13 @@ static void clear_subscriber_list(struct snd_seq_client *client,
                struct snd_seq_client *c;
                struct snd_seq_client_port *aport;
 
-               if (grptype == SRC_LIST) {
-                       subs = list_entry(p, struct snd_seq_subscribers, src_list);
+               subs = get_subscriber(p, is_src);
+               if (is_src)
                        aport = get_client_port(&subs->info.dest, &c);
-               } else {
-                       subs = list_entry(p, struct snd_seq_subscribers, dest_list);
+               else
                        aport = get_client_port(&subs->info.sender, &c);
-               }
-               list_del(p);
-               unsubscribe_port(client, port, grp, &subs->info, 0);
+               delete_and_unsubscribe_port(client, port, subs, is_src, false);
+
                if (!aport) {
                        /* looks like the connected port is being deleted.
                         * we decrease the counter, and when both ports are deleted
@@ -235,21 +243,14 @@ static void clear_subscriber_list(struct snd_seq_client *client,
                         */
                        if (atomic_dec_and_test(&subs->ref_count))
                                kfree(subs);
-               } else {
-                       /* ok we got the connected port */
-                       struct snd_seq_port_subs_info *agrp;
-                       agrp = (grptype == SRC_LIST) ? &aport->c_dest : &aport->c_src;
-                       down_write(&agrp->list_mutex);
-                       if (grptype == SRC_LIST)
-                               list_del(&subs->dest_list);
-                       else
-                               list_del(&subs->src_list);
-                       up_write(&agrp->list_mutex);
-                       unsubscribe_port(c, aport, agrp, &subs->info, 1);
-                       kfree(subs);
-                       snd_seq_port_unlock(aport);
-                       snd_seq_client_unlock(c);
+                       continue;
                }
+
+               /* ok we got the connected port */
+               delete_and_unsubscribe_port(c, aport, subs, !is_src, true);
+               kfree(subs);
+               snd_seq_port_unlock(aport);
+               snd_seq_client_unlock(c);
        }
 }
 
@@ -262,8 +263,8 @@ static int port_delete(struct snd_seq_client *client,
        snd_use_lock_sync(&port->use_lock); 
 
        /* clear subscribers info */
-       clear_subscriber_list(client, port, &port->c_src, SRC_LIST);
-       clear_subscriber_list(client, port, &port->c_dest, DEST_LIST);
+       clear_subscriber_list(client, port, &port->c_src, true);
+       clear_subscriber_list(client, port, &port->c_dest, false);
 
        if (port->private_free)
                port->private_free(port->private_data);
@@ -479,85 +480,123 @@ static int match_subs_info(struct snd_seq_port_subscribe *r,
        return 0;
 }
 
-
-/* connect two ports */
-int snd_seq_port_connect(struct snd_seq_client *connector,
-                        struct snd_seq_client *src_client,
-                        struct snd_seq_client_port *src_port,
-                        struct snd_seq_client *dest_client,
-                        struct snd_seq_client_port *dest_port,
-                        struct snd_seq_port_subscribe *info)
+static int check_and_subscribe_port(struct snd_seq_client *client,
+                                   struct snd_seq_client_port *port,
+                                   struct snd_seq_subscribers *subs,
+                                   bool is_src, bool exclusive, bool ack)
 {
-       struct snd_seq_port_subs_info *src = &src_port->c_src;
-       struct snd_seq_port_subs_info *dest = &dest_port->c_dest;
-       struct snd_seq_subscribers *subs, *s;
-       int err, src_called = 0;
-       unsigned long flags;
-       int exclusive;
-
-       subs = kzalloc(sizeof(*subs), GFP_KERNEL);
-       if (! subs)
-               return -ENOMEM;
-
-       subs->info = *info;
-       atomic_set(&subs->ref_count, 2);
+       struct snd_seq_port_subs_info *grp;
+       struct list_head *p;
+       struct snd_seq_subscribers *s;
+       int err;
 
-       down_write(&src->list_mutex);
-       down_write_nested(&dest->list_mutex, SINGLE_DEPTH_NESTING);
-
-       exclusive = info->flags & SNDRV_SEQ_PORT_SUBS_EXCLUSIVE ? 1 : 0;
+       grp = is_src ? &port->c_src : &port->c_dest;
        err = -EBUSY;
+       down_write(&grp->list_mutex);
        if (exclusive) {
-               if (! list_empty(&src->list_head) || ! list_empty(&dest->list_head))
+               if (!list_empty(&grp->list_head))
                        goto __error;
        } else {
-               if (src->exclusive || dest->exclusive)
+               if (grp->exclusive)
                        goto __error;
                /* check whether already exists */
-               list_for_each_entry(s, &src->list_head, src_list) {
-                       if (match_subs_info(info, &s->info))
-                               goto __error;
-               }
-               list_for_each_entry(s, &dest->list_head, dest_list) {
-                       if (match_subs_info(info, &s->info))
+               list_for_each(p, &grp->list_head) {
+                       s = get_subscriber(p, is_src);
+                       if (match_subs_info(&subs->info, &s->info))
                                goto __error;
                }
        }
 
-       if ((err = subscribe_port(src_client, src_port, src, info,
-                                 connector->number != src_client->number)) < 0)
-               goto __error;
-       src_called = 1;
-
-       if ((err = subscribe_port(dest_client, dest_port, dest, info,
-                                 connector->number != dest_client->number)) < 0)
+       err = subscribe_port(client, port, grp, &subs->info, ack);
+       if (err < 0) {
+               grp->exclusive = 0;
                goto __error;
+       }
 
        /* add to list */
-       write_lock_irqsave(&src->list_lock, flags);
-       // write_lock(&dest->list_lock); // no other lock yet
-       list_add_tail(&subs->src_list, &src->list_head);
-       list_add_tail(&subs->dest_list, &dest->list_head);
-       // write_unlock(&dest->list_lock); // no other lock yet
-       write_unlock_irqrestore(&src->list_lock, flags);
+       write_lock_irq(&grp->list_lock);
+       if (is_src)
+               list_add_tail(&subs->src_list, &grp->list_head);
+       else
+               list_add_tail(&subs->dest_list, &grp->list_head);
+       grp->exclusive = exclusive;
+       atomic_inc(&subs->ref_count);
+       write_unlock_irq(&grp->list_lock);
+       err = 0;
 
-       src->exclusive = dest->exclusive = exclusive;
+ __error:
+       up_write(&grp->list_mutex);
+       return err;
+}
+
+static void delete_and_unsubscribe_port(struct snd_seq_client *client,
+                                       struct snd_seq_client_port *port,
+                                       struct snd_seq_subscribers *subs,
+                                       bool is_src, bool ack)
+{
+       struct snd_seq_port_subs_info *grp;
+       struct list_head *list;
+       bool empty;
+
+       grp = is_src ? &port->c_src : &port->c_dest;
+       list = is_src ? &subs->src_list : &subs->dest_list;
+       down_write(&grp->list_mutex);
+       write_lock_irq(&grp->list_lock);
+       empty = list_empty(list);
+       if (!empty)
+               list_del_init(list);
+       grp->exclusive = 0;
+       write_unlock_irq(&grp->list_lock);
+       up_write(&grp->list_mutex);
+
+       if (!empty)
+               unsubscribe_port(client, port, grp, &subs->info, ack);
+}
+
+/* connect two ports */
+int snd_seq_port_connect(struct snd_seq_client *connector,
+                        struct snd_seq_client *src_client,
+                        struct snd_seq_client_port *src_port,
+                        struct snd_seq_client *dest_client,
+                        struct snd_seq_client_port *dest_port,
+                        struct snd_seq_port_subscribe *info)
+{
+       struct snd_seq_subscribers *subs;
+       bool exclusive;
+       int err;
+
+       subs = kzalloc(sizeof(*subs), GFP_KERNEL);
+       if (!subs)
+               return -ENOMEM;
+
+       subs->info = *info;
+       atomic_set(&subs->ref_count, 0);
+       INIT_LIST_HEAD(&subs->src_list);
+       INIT_LIST_HEAD(&subs->dest_list);
+
+       exclusive = !!(info->flags & SNDRV_SEQ_PORT_SUBS_EXCLUSIVE);
+
+       err = check_and_subscribe_port(src_client, src_port, subs, true,
+                                      exclusive,
+                                      connector->number != src_client->number);
+       if (err < 0)
+               goto error;
+       err = check_and_subscribe_port(dest_client, dest_port, subs, false,
+                                      exclusive,
+                                      connector->number != dest_client->number);
+       if (err < 0)
+               goto error_dest;
 
-       up_write(&dest->list_mutex);
-       up_write(&src->list_mutex);
        return 0;
 
__error:
-       if (src_called)
-               unsubscribe_port(src_client, src_port, src, info,
-                                connector->number != src_client->number);
error_dest:
+       delete_and_unsubscribe_port(src_client, src_port, subs, true,
+                                   connector->number != src_client->number);
+ error:
        kfree(subs);
-       up_write(&dest->list_mutex);
-       up_write(&src->list_mutex);
        return err;
 }
 
-
 /* remove the connection */
 int snd_seq_port_disconnect(struct snd_seq_client *connector,
                            struct snd_seq_client *src_client,
@@ -567,37 +606,28 @@ int snd_seq_port_disconnect(struct snd_seq_client *connector,
                            struct snd_seq_port_subscribe *info)
 {
        struct snd_seq_port_subs_info *src = &src_port->c_src;
-       struct snd_seq_port_subs_info *dest = &dest_port->c_dest;
        struct snd_seq_subscribers *subs;
        int err = -ENOENT;
-       unsigned long flags;
 
        down_write(&src->list_mutex);
-       down_write_nested(&dest->list_mutex, SINGLE_DEPTH_NESTING);
-
        /* look for the connection */
        list_for_each_entry(subs, &src->list_head, src_list) {
                if (match_subs_info(info, &subs->info)) {
-                       write_lock_irqsave(&src->list_lock, flags);
-                       // write_lock(&dest->list_lock);  // no lock yet
-                       list_del(&subs->src_list);
-                       list_del(&subs->dest_list);
-                       // write_unlock(&dest->list_lock);
-                       write_unlock_irqrestore(&src->list_lock, flags);
-                       src->exclusive = dest->exclusive = 0;
-                       unsubscribe_port(src_client, src_port, src, info,
-                                        connector->number != src_client->number);
-                       unsubscribe_port(dest_client, dest_port, dest, info,
-                                        connector->number != dest_client->number);
-                       kfree(subs);
+                       atomic_dec(&subs->ref_count); /* mark as not ready */
                        err = 0;
                        break;
                }
        }
-
-       up_write(&dest->list_mutex);
        up_write(&src->list_mutex);
-       return err;
+       if (err < 0)
+               return err;
+
+       delete_and_unsubscribe_port(src_client, src_port, subs, true,
+                                   connector->number != src_client->number);
+       delete_and_unsubscribe_port(dest_client, dest_port, subs, false,
+                                   connector->number != dest_client->number);
+       kfree(subs);
+       return 0;
 }
 
 
index 82b220c..2931049 100644 (file)
@@ -90,6 +90,9 @@ void snd_seq_timer_delete(struct snd_seq_timer **tmr)
 
 void snd_seq_timer_defaults(struct snd_seq_timer * tmr)
 {
+       unsigned long flags;
+
+       spin_lock_irqsave(&tmr->lock, flags);
        /* setup defaults */
        tmr->ppq = 96;          /* 96 PPQ */
        tmr->tempo = 500000;    /* 120 BPM */
@@ -105,21 +108,25 @@ void snd_seq_timer_defaults(struct snd_seq_timer * tmr)
        tmr->preferred_resolution = seq_default_timer_resolution;
 
        tmr->skew = tmr->skew_base = SKEW_BASE;
+       spin_unlock_irqrestore(&tmr->lock, flags);
 }
 
-void snd_seq_timer_reset(struct snd_seq_timer * tmr)
+static void seq_timer_reset(struct snd_seq_timer *tmr)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&tmr->lock, flags);
-
        /* reset time & songposition */
        tmr->cur_time.tv_sec = 0;
        tmr->cur_time.tv_nsec = 0;
 
        tmr->tick.cur_tick = 0;
        tmr->tick.fraction = 0;
+}
+
+void snd_seq_timer_reset(struct snd_seq_timer *tmr)
+{
+       unsigned long flags;
 
+       spin_lock_irqsave(&tmr->lock, flags);
+       seq_timer_reset(tmr);
        spin_unlock_irqrestore(&tmr->lock, flags);
 }
 
@@ -138,8 +145,11 @@ static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri,
        tmr = q->timer;
        if (tmr == NULL)
                return;
-       if (!tmr->running)
+       spin_lock_irqsave(&tmr->lock, flags);
+       if (!tmr->running) {
+               spin_unlock_irqrestore(&tmr->lock, flags);
                return;
+       }
 
        resolution *= ticks;
        if (tmr->skew != tmr->skew_base) {
@@ -148,8 +158,6 @@ static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri,
                        (((resolution & 0xffff) * tmr->skew) >> 16);
        }
 
-       spin_lock_irqsave(&tmr->lock, flags);
-
        /* update timer */
        snd_seq_inc_time_nsec(&tmr->cur_time, resolution);
 
@@ -296,26 +304,30 @@ int snd_seq_timer_open(struct snd_seq_queue *q)
        t->callback = snd_seq_timer_interrupt;
        t->callback_data = q;
        t->flags |= SNDRV_TIMER_IFLG_AUTO;
+       spin_lock_irq(&tmr->lock);
        tmr->timeri = t;
+       spin_unlock_irq(&tmr->lock);
        return 0;
 }
 
 int snd_seq_timer_close(struct snd_seq_queue *q)
 {
        struct snd_seq_timer *tmr;
+       struct snd_timer_instance *t;
        
        tmr = q->timer;
        if (snd_BUG_ON(!tmr))
                return -EINVAL;
-       if (tmr->timeri) {
-               snd_timer_stop(tmr->timeri);
-               snd_timer_close(tmr->timeri);
-               tmr->timeri = NULL;
-       }
+       spin_lock_irq(&tmr->lock);
+       t = tmr->timeri;
+       tmr->timeri = NULL;
+       spin_unlock_irq(&tmr->lock);
+       if (t)
+               snd_timer_close(t);
        return 0;
 }
 
-int snd_seq_timer_stop(struct snd_seq_timer * tmr)
+static int seq_timer_stop(struct snd_seq_timer *tmr)
 {
        if (! tmr->timeri)
                return -EINVAL;
@@ -326,6 +338,17 @@ int snd_seq_timer_stop(struct snd_seq_timer * tmr)
        return 0;
 }
 
+int snd_seq_timer_stop(struct snd_seq_timer *tmr)
+{
+       unsigned long flags;
+       int err;
+
+       spin_lock_irqsave(&tmr->lock, flags);
+       err = seq_timer_stop(tmr);
+       spin_unlock_irqrestore(&tmr->lock, flags);
+       return err;
+}
+
 static int initialize_timer(struct snd_seq_timer *tmr)
 {
        struct snd_timer *t;
@@ -358,13 +381,13 @@ static int initialize_timer(struct snd_seq_timer *tmr)
        return 0;
 }
 
-int snd_seq_timer_start(struct snd_seq_timer * tmr)
+static int seq_timer_start(struct snd_seq_timer *tmr)
 {
        if (! tmr->timeri)
                return -EINVAL;
        if (tmr->running)
-               snd_seq_timer_stop(tmr);
-       snd_seq_timer_reset(tmr);
+               seq_timer_stop(tmr);
+       seq_timer_reset(tmr);
        if (initialize_timer(tmr) < 0)
                return -EINVAL;
        snd_timer_start(tmr->timeri, tmr->ticks);
@@ -373,14 +396,25 @@ int snd_seq_timer_start(struct snd_seq_timer * tmr)
        return 0;
 }
 
-int snd_seq_timer_continue(struct snd_seq_timer * tmr)
+int snd_seq_timer_start(struct snd_seq_timer *tmr)
+{
+       unsigned long flags;
+       int err;
+
+       spin_lock_irqsave(&tmr->lock, flags);
+       err = seq_timer_start(tmr);
+       spin_unlock_irqrestore(&tmr->lock, flags);
+       return err;
+}
+
+static int seq_timer_continue(struct snd_seq_timer *tmr)
 {
        if (! tmr->timeri)
                return -EINVAL;
        if (tmr->running)
                return -EBUSY;
        if (! tmr->initialized) {
-               snd_seq_timer_reset(tmr);
+               seq_timer_reset(tmr);
                if (initialize_timer(tmr) < 0)
                        return -EINVAL;
        }
@@ -390,11 +424,24 @@ int snd_seq_timer_continue(struct snd_seq_timer * tmr)
        return 0;
 }
 
+int snd_seq_timer_continue(struct snd_seq_timer *tmr)
+{
+       unsigned long flags;
+       int err;
+
+       spin_lock_irqsave(&tmr->lock, flags);
+       err = seq_timer_continue(tmr);
+       spin_unlock_irqrestore(&tmr->lock, flags);
+       return err;
+}
+
 /* return current 'real' time. use timeofday() to get better granularity. */
 snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr)
 {
        snd_seq_real_time_t cur_time;
+       unsigned long flags;
 
+       spin_lock_irqsave(&tmr->lock, flags);
        cur_time = tmr->cur_time;
        if (tmr->running) { 
                struct timeval tm;
@@ -410,7 +457,7 @@ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr)
                }
                snd_seq_sanity_real_time(&cur_time);
        }
-                
+       spin_unlock_irqrestore(&tmr->lock, flags);
        return cur_time;        
 }
 
index 3da2d48..c82ed3e 100644 (file)
@@ -155,21 +155,26 @@ static void snd_virmidi_output_trigger(struct snd_rawmidi_substream *substream,
        struct snd_virmidi *vmidi = substream->runtime->private_data;
        int count, res;
        unsigned char buf[32], *pbuf;
+       unsigned long flags;
 
        if (up) {
                vmidi->trigger = 1;
                if (vmidi->seq_mode == SNDRV_VIRMIDI_SEQ_DISPATCH &&
                    !(vmidi->rdev->flags & SNDRV_VIRMIDI_SUBSCRIBE)) {
-                       snd_rawmidi_transmit_ack(substream, substream->runtime->buffer_size - substream->runtime->avail);
-                       return;         /* ignored */
+                       while (snd_rawmidi_transmit(substream, buf,
+                                                   sizeof(buf)) > 0) {
+                               /* ignored */
+                       }
+                       return;
                }
                if (vmidi->event.type != SNDRV_SEQ_EVENT_NONE) {
                        if (snd_seq_kernel_client_dispatch(vmidi->client, &vmidi->event, in_atomic(), 0) < 0)
                                return;
                        vmidi->event.type = SNDRV_SEQ_EVENT_NONE;
                }
+               spin_lock_irqsave(&substream->runtime->lock, flags);
                while (1) {
-                       count = snd_rawmidi_transmit_peek(substream, buf, sizeof(buf));
+                       count = __snd_rawmidi_transmit_peek(substream, buf, sizeof(buf));
                        if (count <= 0)
                                break;
                        pbuf = buf;
@@ -179,16 +184,18 @@ static void snd_virmidi_output_trigger(struct snd_rawmidi_substream *substream,
                                        snd_midi_event_reset_encode(vmidi->parser);
                                        continue;
                                }
-                               snd_rawmidi_transmit_ack(substream, res);
+                               __snd_rawmidi_transmit_ack(substream, res);
                                pbuf += res;
                                count -= res;
                                if (vmidi->event.type != SNDRV_SEQ_EVENT_NONE) {
                                        if (snd_seq_kernel_client_dispatch(vmidi->client, &vmidi->event, in_atomic(), 0) < 0)
-                                               return;
+                                               goto out;
                                        vmidi->event.type = SNDRV_SEQ_EVENT_NONE;
                                }
                        }
                }
+       out:
+               spin_unlock_irqrestore(&substream->runtime->lock, flags);
        } else {
                vmidi->trigger = 0;
        }
@@ -254,9 +261,13 @@ static int snd_virmidi_output_open(struct snd_rawmidi_substream *substream)
  */
 static int snd_virmidi_input_close(struct snd_rawmidi_substream *substream)
 {
+       struct snd_virmidi_dev *rdev = substream->rmidi->private_data;
        struct snd_virmidi *vmidi = substream->runtime->private_data;
-       snd_midi_event_free(vmidi->parser);
+
+       write_lock_irq(&rdev->filelist_lock);
        list_del(&vmidi->list);
+       write_unlock_irq(&rdev->filelist_lock);
+       snd_midi_event_free(vmidi->parser);
        substream->runtime->private_data = NULL;
        kfree(vmidi);
        return 0;
index af1f68f..dca817f 100644 (file)
@@ -422,7 +422,7 @@ static void snd_timer_notify1(struct snd_timer_instance *ti, int event)
        spin_lock_irqsave(&timer->lock, flags);
        list_for_each_entry(ts, &ti->slave_active_head, active_list)
                if (ts->ccallback)
-                       ts->ccallback(ti, event + 100, &tstamp, resolution);
+                       ts->ccallback(ts, event + 100, &tstamp, resolution);
        spin_unlock_irqrestore(&timer->lock, flags);
 }
 
@@ -451,6 +451,10 @@ static int snd_timer_start_slave(struct snd_timer_instance *timeri)
        unsigned long flags;
 
        spin_lock_irqsave(&slave_active_lock, flags);
+       if (timeri->flags & SNDRV_TIMER_IFLG_RUNNING) {
+               spin_unlock_irqrestore(&slave_active_lock, flags);
+               return -EBUSY;
+       }
        timeri->flags |= SNDRV_TIMER_IFLG_RUNNING;
        if (timeri->master && timeri->timer) {
                spin_lock(&timeri->timer->lock);
@@ -475,7 +479,8 @@ int snd_timer_start(struct snd_timer_instance *timeri, unsigned int ticks)
                return -EINVAL;
        if (timeri->flags & SNDRV_TIMER_IFLG_SLAVE) {
                result = snd_timer_start_slave(timeri);
-               snd_timer_notify1(timeri, SNDRV_TIMER_EVENT_START);
+               if (result >= 0)
+                       snd_timer_notify1(timeri, SNDRV_TIMER_EVENT_START);
                return result;
        }
        timer = timeri->timer;
@@ -484,11 +489,18 @@ int snd_timer_start(struct snd_timer_instance *timeri, unsigned int ticks)
        if (timer->card && timer->card->shutdown)
                return -ENODEV;
        spin_lock_irqsave(&timer->lock, flags);
+       if (timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
+                            SNDRV_TIMER_IFLG_START)) {
+               result = -EBUSY;
+               goto unlock;
+       }
        timeri->ticks = timeri->cticks = ticks;
        timeri->pticks = 0;
        result = snd_timer_start1(timer, timeri, ticks);
+ unlock:
        spin_unlock_irqrestore(&timer->lock, flags);
-       snd_timer_notify1(timeri, SNDRV_TIMER_EVENT_START);
+       if (result >= 0)
+               snd_timer_notify1(timeri, SNDRV_TIMER_EVENT_START);
        return result;
 }
 
@@ -502,9 +514,17 @@ static int _snd_timer_stop(struct snd_timer_instance *timeri, int event)
 
        if (timeri->flags & SNDRV_TIMER_IFLG_SLAVE) {
                spin_lock_irqsave(&slave_active_lock, flags);
+               if (!(timeri->flags & SNDRV_TIMER_IFLG_RUNNING)) {
+                       spin_unlock_irqrestore(&slave_active_lock, flags);
+                       return -EBUSY;
+               }
+               if (timeri->timer)
+                       spin_lock(&timeri->timer->lock);
                timeri->flags &= ~SNDRV_TIMER_IFLG_RUNNING;
                list_del_init(&timeri->ack_list);
                list_del_init(&timeri->active_list);
+               if (timeri->timer)
+                       spin_unlock(&timeri->timer->lock);
                spin_unlock_irqrestore(&slave_active_lock, flags);
                goto __end;
        }
@@ -512,6 +532,11 @@ static int _snd_timer_stop(struct snd_timer_instance *timeri, int event)
        if (!timer)
                return -EINVAL;
        spin_lock_irqsave(&timer->lock, flags);
+       if (!(timeri->flags & (SNDRV_TIMER_IFLG_RUNNING |
+                              SNDRV_TIMER_IFLG_START))) {
+               spin_unlock_irqrestore(&timer->lock, flags);
+               return -EBUSY;
+       }
        list_del_init(&timeri->ack_list);
        list_del_init(&timeri->active_list);
        if (timer->card && timer->card->shutdown) {
@@ -581,10 +606,15 @@ int snd_timer_continue(struct snd_timer_instance *timeri)
        if (timer->card && timer->card->shutdown)
                return -ENODEV;
        spin_lock_irqsave(&timer->lock, flags);
+       if (timeri->flags & SNDRV_TIMER_IFLG_RUNNING) {
+               result = -EBUSY;
+               goto unlock;
+       }
        if (!timeri->cticks)
                timeri->cticks = 1;
        timeri->pticks = 0;
        result = snd_timer_start1(timer, timeri, timer->sticks);
+ unlock:
        spin_unlock_irqrestore(&timer->lock, flags);
        snd_timer_notify1(timeri, SNDRV_TIMER_EVENT_CONTINUE);
        return result;
@@ -718,8 +748,8 @@ void snd_timer_interrupt(struct snd_timer * timer, unsigned long ticks_left)
                        ti->cticks = ti->ticks;
                } else {
                        ti->flags &= ~SNDRV_TIMER_IFLG_RUNNING;
-                       if (--timer->running)
-                               list_del_init(&ti->active_list);
+                       --timer->running;
+                       list_del_init(&ti->active_list);
                }
                if ((timer->hw.flags & SNDRV_TIMER_HW_TASKLET) ||
                    (ti->flags & SNDRV_TIMER_IFLG_FAST))
@@ -1032,11 +1062,21 @@ static int snd_timer_s_stop(struct snd_timer * timer)
        return 0;
 }
 
+static int snd_timer_s_close(struct snd_timer *timer)
+{
+       struct snd_timer_system_private *priv;
+
+       priv = (struct snd_timer_system_private *)timer->private_data;
+       del_timer_sync(&priv->tlist);
+       return 0;
+}
+
 static struct snd_timer_hardware snd_timer_system =
 {
        .flags =        SNDRV_TIMER_HW_FIRST | SNDRV_TIMER_HW_TASKLET,
        .resolution =   1000000000L / HZ,
        .ticks =        10000000L,
+       .close =        snd_timer_s_close,
        .start =        snd_timer_s_start,
        .stop =         snd_timer_s_stop
 };
@@ -1893,6 +1933,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
 {
        struct snd_timer_user *tu;
        long result = 0, unit;
+       int qhead;
        int err = 0;
 
        tu = file->private_data;
@@ -1904,7 +1945,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
 
                        if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
                                err = -EAGAIN;
-                               break;
+                               goto _error;
                        }
 
                        set_current_state(TASK_INTERRUPTIBLE);
@@ -1919,42 +1960,37 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
 
                        if (tu->disconnected) {
                                err = -ENODEV;
-                               break;
+                               goto _error;
                        }
                        if (signal_pending(current)) {
                                err = -ERESTARTSYS;
-                               break;
+                               goto _error;
                        }
                }
 
+               qhead = tu->qhead++;
+               tu->qhead %= tu->queue_size;
                spin_unlock_irq(&tu->qlock);
-               if (err < 0)
-                       goto _error;
 
                if (tu->tread) {
-                       if (copy_to_user(buffer, &tu->tqueue[tu->qhead++],
-                                        sizeof(struct snd_timer_tread))) {
+                       if (copy_to_user(buffer, &tu->tqueue[qhead],
+                                        sizeof(struct snd_timer_tread)))
                                err = -EFAULT;
-                               goto _error;
-                       }
                } else {
-                       if (copy_to_user(buffer, &tu->queue[tu->qhead++],
-                                        sizeof(struct snd_timer_read))) {
+                       if (copy_to_user(buffer, &tu->queue[qhead],
+                                        sizeof(struct snd_timer_read)))
                                err = -EFAULT;
-                               goto _error;
-                       }
                }
 
-               tu->qhead %= tu->queue_size;
-
-               result += unit;
-               buffer += unit;
-
                spin_lock_irq(&tu->qlock);
                tu->qused--;
+               if (err < 0)
+                       goto _error;
+               result += unit;
+               buffer += unit;
        }
-       spin_unlock_irq(&tu->qlock);
  _error:
+       spin_unlock_irq(&tu->qlock);
        return result > 0 ? result : err;
 }
 
index e05802a..2e90822 100644 (file)
@@ -70,13 +70,14 @@ static int snd_timer_user_status_compat(struct file *file,
                                        struct snd_timer_status32 __user *_status)
 {
        struct snd_timer_user *tu;
-       struct snd_timer_status status;
+       struct snd_timer_status32 status;
        
        tu = file->private_data;
        if (snd_BUG_ON(!tu->timeri))
                return -ENXIO;
        memset(&status, 0, sizeof(status));
-       status.tstamp = tu->tstamp;
+       status.tstamp.tv_sec = tu->tstamp.tv_sec;
+       status.tstamp.tv_nsec = tu->tstamp.tv_nsec;
        status.resolution = snd_timer_resolution(tu->timeri);
        status.lost = tu->timeri->lost;
        status.overrun = tu->overrun;
@@ -88,12 +89,21 @@ static int snd_timer_user_status_compat(struct file *file,
        return 0;
 }
 
+#ifdef CONFIG_X86_X32
+/* X32 ABI has the same struct as x86-64 */
+#define snd_timer_user_status_x32(file, s) \
+       snd_timer_user_status(file, s)
+#endif /* CONFIG_X86_X32 */
+
 /*
  */
 
 enum {
        SNDRV_TIMER_IOCTL_INFO32 = _IOR('T', 0x11, struct snd_timer_info32),
        SNDRV_TIMER_IOCTL_STATUS32 = _IOW('T', 0x14, struct snd_timer_status32),
+#ifdef CONFIG_X86_X32
+       SNDRV_TIMER_IOCTL_STATUS_X32 = _IOW('T', 0x14, struct snd_timer_status),
+#endif /* CONFIG_X86_X32 */
 };
 
 static long snd_timer_user_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg)
@@ -122,6 +132,10 @@ static long snd_timer_user_ioctl_compat(struct file *file, unsigned int cmd, uns
                return snd_timer_user_info_compat(file, argp);
        case SNDRV_TIMER_IOCTL_STATUS32:
                return snd_timer_user_status_compat(file, argp);
+#ifdef CONFIG_X86_X32
+       case SNDRV_TIMER_IOCTL_STATUS_X32:
+               return snd_timer_user_status_x32(file, argp);
+#endif /* CONFIG_X86_X32 */
        }
        return -ENOIOCTLCMD;
 }
index 75b7485..c0f8f61 100644 (file)
@@ -109,6 +109,9 @@ struct dummy_timer_ops {
        snd_pcm_uframes_t (*pointer)(struct snd_pcm_substream *);
 };
 
+#define get_dummy_ops(substream) \
+       (*(const struct dummy_timer_ops **)(substream)->runtime->private_data)
+
 struct dummy_model {
        const char *name;
        int (*playback_constraints)(struct snd_pcm_runtime *runtime);
@@ -137,7 +140,6 @@ struct snd_dummy {
        int iobox;
        struct snd_kcontrol *cd_volume_ctl;
        struct snd_kcontrol *cd_switch_ctl;
-       const struct dummy_timer_ops *timer_ops;
 };
 
 /*
@@ -231,6 +233,8 @@ static struct dummy_model *dummy_models[] = {
  */
 
 struct dummy_systimer_pcm {
+       /* ops must be the first item */
+       const struct dummy_timer_ops *timer_ops;
        spinlock_t lock;
        struct timer_list timer;
        unsigned long base_time;
@@ -366,6 +370,8 @@ static const struct dummy_timer_ops dummy_systimer_ops = {
  */
 
 struct dummy_hrtimer_pcm {
+       /* ops must be the first item */
+       const struct dummy_timer_ops *timer_ops;
        ktime_t base_time;
        ktime_t period_time;
        atomic_t running;
@@ -492,31 +498,25 @@ static const struct dummy_timer_ops dummy_hrtimer_ops = {
 
 static int dummy_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
 {
-       struct snd_dummy *dummy = snd_pcm_substream_chip(substream);
-
        switch (cmd) {
        case SNDRV_PCM_TRIGGER_START:
        case SNDRV_PCM_TRIGGER_RESUME:
-               return dummy->timer_ops->start(substream);
+               return get_dummy_ops(substream)->start(substream);
        case SNDRV_PCM_TRIGGER_STOP:
        case SNDRV_PCM_TRIGGER_SUSPEND:
-               return dummy->timer_ops->stop(substream);
+               return get_dummy_ops(substream)->stop(substream);
        }
        return -EINVAL;
 }
 
 static int dummy_pcm_prepare(struct snd_pcm_substream *substream)
 {
-       struct snd_dummy *dummy = snd_pcm_substream_chip(substream);
-
-       return dummy->timer_ops->prepare(substream);
+       return get_dummy_ops(substream)->prepare(substream);
 }
 
 static snd_pcm_uframes_t dummy_pcm_pointer(struct snd_pcm_substream *substream)
 {
-       struct snd_dummy *dummy = snd_pcm_substream_chip(substream);
-
-       return dummy->timer_ops->pointer(substream);
+       return get_dummy_ops(substream)->pointer(substream);
 }
 
 static struct snd_pcm_hardware dummy_pcm_hardware = {
@@ -562,17 +562,19 @@ static int dummy_pcm_open(struct snd_pcm_substream *substream)
        struct snd_dummy *dummy = snd_pcm_substream_chip(substream);
        struct dummy_model *model = dummy->model;
        struct snd_pcm_runtime *runtime = substream->runtime;
+       const struct dummy_timer_ops *ops;
        int err;
 
-       dummy->timer_ops = &dummy_systimer_ops;
+       ops = &dummy_systimer_ops;
 #ifdef CONFIG_HIGH_RES_TIMERS
        if (hrtimer)
-               dummy->timer_ops = &dummy_hrtimer_ops;
+               ops = &dummy_hrtimer_ops;
 #endif
 
-       err = dummy->timer_ops->create(substream);
+       err = ops->create(substream);
        if (err < 0)
                return err;
+       get_dummy_ops(substream) = ops;
 
        runtime->hw = dummy->pcm_hw;
        if (substream->pcm->device & 1) {
@@ -594,7 +596,7 @@ static int dummy_pcm_open(struct snd_pcm_substream *substream)
                        err = model->capture_constraints(substream->runtime);
        }
        if (err < 0) {
-               dummy->timer_ops->free(substream);
+               get_dummy_ops(substream)->free(substream);
                return err;
        }
        return 0;
@@ -602,8 +604,7 @@ static int dummy_pcm_open(struct snd_pcm_substream *substream)
 
 static int dummy_pcm_close(struct snd_pcm_substream *substream)
 {
-       struct snd_dummy *dummy = snd_pcm_substream_chip(substream);
-       dummy->timer_ops->free(substream);
+       get_dummy_ops(substream)->free(substream);
        return 0;
 }
 
index 926e5dc..5022c9b 100644 (file)
@@ -47,14 +47,16 @@ static const unsigned int bridgeco_freq_table[] = {
        [6] = 0x07,
 };
 
-static unsigned int
-get_formation_index(unsigned int rate)
+static int
+get_formation_index(unsigned int rate, unsigned int *index)
 {
        unsigned int i;
 
        for (i = 0; i < ARRAY_SIZE(snd_bebob_rate_table); i++) {
-               if (snd_bebob_rate_table[i] == rate)
-                       return i;
+               if (snd_bebob_rate_table[i] == rate) {
+                       *index = i;
+                       return 0;
+               }
        }
        return -EINVAL;
 }
@@ -425,7 +427,9 @@ make_both_connections(struct snd_bebob *bebob, unsigned int rate)
                goto end;
 
        /* confirm params for both streams */
-       index = get_formation_index(rate);
+       err = get_formation_index(rate, &index);
+       if (err < 0)
+               goto end;
        pcm_channels = bebob->tx_stream_formations[index].pcm;
        midi_channels = bebob->tx_stream_formations[index].midi;
        err = amdtp_am824_set_parameters(&bebob->tx_stream, rate,
index b02a5e8..0ac92ab 100644 (file)
@@ -63,7 +63,7 @@ struct amdtp_dot {
 #define BYTE_PER_SAMPLE (4)
 #define MAGIC_DOT_BYTE (2)
 #define MAGIC_BYTE_OFF(x) (((x) * BYTE_PER_SAMPLE) + MAGIC_DOT_BYTE)
-static const u8 dot_scrt(const u8 idx, const unsigned int off)
+static u8 dot_scrt(const u8 idx, const unsigned int off)
 {
        /*
         * the length of the added pattern only depends on the lower nibble
index 904ce03..040a96d 100644 (file)
@@ -230,6 +230,7 @@ int snd_tscm_transaction_register(struct snd_tscm *tscm)
        return err;
 error:
        fw_core_remove_address_handler(&tscm->async_handler);
+       tscm->async_handler.callback_data = NULL;
        return err;
 }
 
@@ -276,6 +277,9 @@ void snd_tscm_transaction_unregister(struct snd_tscm *tscm)
        __be32 reg;
        unsigned int i;
 
+       if (tscm->async_handler.callback_data == NULL)
+               return;
+
        /* Turn off FireWire LED. */
        reg = cpu_to_be32(0x0000008e);
        snd_fw_transaction(tscm->unit, TCODE_WRITE_QUADLET_REQUEST,
@@ -297,6 +301,8 @@ void snd_tscm_transaction_unregister(struct snd_tscm *tscm)
                           &reg, sizeof(reg), 0);
 
        fw_core_remove_address_handler(&tscm->async_handler);
+       tscm->async_handler.callback_data = NULL;
+
        for (i = 0; i < TSCM_MIDI_OUT_PORT_MAX; i++)
                snd_fw_async_midi_port_destroy(&tscm->out_ports[i]);
 }
index ee0bc18..e281c33 100644 (file)
@@ -21,7 +21,6 @@ static struct snd_tscm_spec model_specs[] = {
                .pcm_playback_analog_channels = 8,
                .midi_capture_ports = 4,
                .midi_playback_ports = 4,
-               .is_controller = true,
        },
        {
                .name = "FW-1082",
@@ -31,9 +30,16 @@ static struct snd_tscm_spec model_specs[] = {
                .pcm_playback_analog_channels = 2,
                .midi_capture_ports = 2,
                .midi_playback_ports = 2,
-               .is_controller = true,
        },
-       /* FW-1804 may be supported. */
+       {
+               .name = "FW-1804",
+               .has_adat = true,
+               .has_spdif = true,
+               .pcm_capture_analog_channels = 8,
+               .pcm_playback_analog_channels = 2,
+               .midi_capture_ports = 2,
+               .midi_playback_ports = 4,
+       },
 };
 
 static int identify_model(struct snd_tscm *tscm)
index 2d028d2..30ab77e 100644 (file)
@@ -39,7 +39,6 @@ struct snd_tscm_spec {
        unsigned int pcm_playback_analog_channels;
        unsigned int midi_capture_ports;
        unsigned int midi_playback_ports;
-       bool is_controller;
 };
 
 #define TSCM_MIDI_IN_PORT_MAX  4
@@ -72,9 +71,6 @@ struct snd_tscm {
        struct snd_fw_async_midi_port out_ports[TSCM_MIDI_OUT_PORT_MAX];
        u8 running_status[TSCM_MIDI_OUT_PORT_MAX];
        bool on_sysex[TSCM_MIDI_OUT_PORT_MAX];
-
-       /* For control messages. */
-       struct snd_firewire_tascam_status *status;
 };
 
 #define TSCM_ADDR_BASE                 0xffff00000000ull
index b5a17cb..8c48623 100644 (file)
@@ -426,18 +426,22 @@ EXPORT_SYMBOL_GPL(snd_hdac_bus_stop_chip);
  * @bus: HD-audio core bus
  * @status: INTSTS register value
  * @ask: callback to be called for woken streams
+ *
+ * Returns the bits of handled streams, or zero if no stream is handled.
  */
-void snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
+int snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
                                    void (*ack)(struct hdac_bus *,
                                                struct hdac_stream *))
 {
        struct hdac_stream *azx_dev;
        u8 sd_status;
+       int handled = 0;
 
        list_for_each_entry(azx_dev, &bus->stream_list, list) {
                if (status & azx_dev->sd_int_sta_mask) {
                        sd_status = snd_hdac_stream_readb(azx_dev, SD_STS);
                        snd_hdac_stream_writeb(azx_dev, SD_STS, SD_INT_MASK);
+                       handled |= 1 << azx_dev->index;
                        if (!azx_dev->substream || !azx_dev->running ||
                            !(sd_status & SD_INT_COMPLETE))
                                continue;
@@ -445,6 +449,7 @@ void snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
                                ack(bus, azx_dev);
                }
        }
+       return handled;
 }
 EXPORT_SYMBOL_GPL(snd_hdac_bus_handle_stream_irq);
 
index 0216475..37adcc6 100644 (file)
@@ -3,6 +3,7 @@
 config SND_WSS_LIB
         tristate
         select SND_PCM
+       select SND_TIMER
 
 config SND_SB_COMMON
         tristate
@@ -42,6 +43,7 @@ config SND_AD1816A
        select SND_OPL3_LIB
        select SND_MPU401_UART
        select SND_PCM
+       select SND_TIMER
        help
          Say Y here to include support for Analog Devices SoundPort
          AD1816A or compatible sound chips.
@@ -209,6 +211,7 @@ config SND_GUSCLASSIC
        tristate "Gravis UltraSound Classic"
        select SND_RAWMIDI
        select SND_PCM
+       select SND_TIMER
        help
          Say Y here to include support for Gravis UltraSound Classic
          soundcards.
@@ -221,6 +224,7 @@ config SND_GUSEXTREME
        select SND_OPL3_LIB
        select SND_MPU401_UART
        select SND_PCM
+       select SND_TIMER
        help
          Say Y here to include support for Gravis UltraSound Extreme
          soundcards.
index 656ce39..8f6594a 100644 (file)
@@ -155,6 +155,7 @@ config SND_AZT3328
        select SND_PCM
        select SND_RAWMIDI
        select SND_AC97_CODEC
+       select SND_TIMER
        depends on ZONE_DMA
        help
          Say Y here to include support for Aztech AZF3328 (PCI168)
@@ -463,6 +464,7 @@ config SND_EMU10K1
        select SND_HWDEP
        select SND_RAWMIDI
        select SND_AC97_CODEC
+       select SND_TIMER
        depends on ZONE_DMA
        help
          Say Y to include support for Sound Blaster PCI 512, Live!,
@@ -889,6 +891,7 @@ config SND_YMFPCI
        select SND_OPL3_LIB
        select SND_MPU401_UART
        select SND_AC97_CODEC
+       select SND_TIMER
        help
          Say Y here to include support for Yamaha PCI audio chips -
          YMF724, YMF724F, YMF740, YMF740C, YMF744, YMF754.
index 28e2f8b..8914534 100644 (file)
@@ -1141,6 +1141,14 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
                emu->emu1010.firmware_thread =
                        kthread_create(emu1010_firmware_thread, emu,
                                       "emu1010_firmware");
+               if (IS_ERR(emu->emu1010.firmware_thread)) {
+                       err = PTR_ERR(emu->emu1010.firmware_thread);
+                       emu->emu1010.firmware_thread = NULL;
+                       dev_info(emu->card->dev,
+                                       "emu1010: Creating thread failed\n");
+                       return err;
+               }
+
                wake_up_process(emu->emu1010.firmware_thread);
        }
 
index 37cf9ce..27de801 100644 (file)
@@ -930,6 +930,8 @@ irqreturn_t azx_interrupt(int irq, void *dev_id)
        struct azx *chip = dev_id;
        struct hdac_bus *bus = azx_bus(chip);
        u32 status;
+       bool active, handled = false;
+       int repeat = 0; /* count for avoiding endless loop */
 
 #ifdef CONFIG_PM
        if (azx_has_pm_runtime(chip))
@@ -939,33 +941,36 @@ irqreturn_t azx_interrupt(int irq, void *dev_id)
 
        spin_lock(&bus->reg_lock);
 
-       if (chip->disabled) {
-               spin_unlock(&bus->reg_lock);
-               return IRQ_NONE;
-       }
-
-       status = azx_readl(chip, INTSTS);
-       if (status == 0 || status == 0xffffffff) {
-               spin_unlock(&bus->reg_lock);
-               return IRQ_NONE;
-       }
+       if (chip->disabled)
+               goto unlock;
 
-       snd_hdac_bus_handle_stream_irq(bus, status, stream_update);
+       do {
+               status = azx_readl(chip, INTSTS);
+               if (status == 0 || status == 0xffffffff)
+                       break;
 
-       /* clear rirb int */
-       status = azx_readb(chip, RIRBSTS);
-       if (status & RIRB_INT_MASK) {
-               if (status & RIRB_INT_RESPONSE) {
-                       if (chip->driver_caps & AZX_DCAPS_CTX_WORKAROUND)
-                               udelay(80);
-                       snd_hdac_bus_update_rirb(bus);
+               handled = true;
+               active = false;
+               if (snd_hdac_bus_handle_stream_irq(bus, status, stream_update))
+                       active = true;
+
+               /* clear rirb int */
+               status = azx_readb(chip, RIRBSTS);
+               if (status & RIRB_INT_MASK) {
+                       active = true;
+                       if (status & RIRB_INT_RESPONSE) {
+                               if (chip->driver_caps & AZX_DCAPS_CTX_WORKAROUND)
+                                       udelay(80);
+                               snd_hdac_bus_update_rirb(bus);
+                       }
+                       azx_writeb(chip, RIRBSTS, RIRB_INT_MASK);
                }
-               azx_writeb(chip, RIRBSTS, RIRB_INT_MASK);
-       }
+       } while (active && ++repeat < 10);
 
+ unlock:
        spin_unlock(&bus->reg_lock);
 
-       return IRQ_HANDLED;
+       return IRQ_RETVAL(handled);
 }
 EXPORT_SYMBOL_GPL(azx_interrupt);
 
index 30c8efe..7ca5b89 100644 (file)
@@ -4028,9 +4028,9 @@ static void pin_power_callback(struct hda_codec *codec,
                               struct hda_jack_callback *jack,
                               bool on)
 {
-       if (jack && jack->tbl->nid)
+       if (jack && jack->nid)
                sync_power_state_change(codec,
-                                       set_pin_power_jack(codec, jack->tbl->nid, on));
+                                       set_pin_power_jack(codec, jack->nid, on));
 }
 
 /* callback only doing power up -- called at first */
index 256e6cd..e5240cb 100644 (file)
@@ -90,6 +90,8 @@ enum {
 #define NVIDIA_HDA_ENABLE_COHBIT      0x01
 
 /* Defines for Intel SCH HDA snoop control */
+#define INTEL_HDA_CGCTL         0x48
+#define INTEL_HDA_CGCTL_MISCBDCGE        (0x1 << 6)
 #define INTEL_SCH_HDA_DEVC      0x78
 #define INTEL_SCH_HDA_DEVC_NOSNOOP       (0x1<<11)
 
@@ -361,7 +363,10 @@ enum {
                                        ((pci)->device == 0x0d0c) || \
                                        ((pci)->device == 0x160c))
 
-#define IS_BROXTON(pci)        ((pci)->device == 0x5a98)
+#define IS_SKL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa170)
+#define IS_SKL_LP(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x9d70)
+#define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98)
+#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci))
 
 static char *driver_short_names[] = {
        [AZX_DRIVER_ICH] = "HDA Intel",
@@ -534,15 +539,26 @@ static void hda_intel_init_chip(struct azx *chip, bool full_reset)
 {
        struct hdac_bus *bus = azx_bus(chip);
        struct pci_dev *pci = chip->pci;
+       u32 val;
 
        if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL)
                snd_hdac_set_codec_wakeup(bus, true);
+       if (IS_SKL_PLUS(pci)) {
+               pci_read_config_dword(pci, INTEL_HDA_CGCTL, &val);
+               val = val & ~INTEL_HDA_CGCTL_MISCBDCGE;
+               pci_write_config_dword(pci, INTEL_HDA_CGCTL, val);
+       }
        azx_init_chip(chip, full_reset);
+       if (IS_SKL_PLUS(pci)) {
+               pci_read_config_dword(pci, INTEL_HDA_CGCTL, &val);
+               val = val | INTEL_HDA_CGCTL_MISCBDCGE;
+               pci_write_config_dword(pci, INTEL_HDA_CGCTL, val);
+       }
        if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL)
                snd_hdac_set_codec_wakeup(bus, false);
 
        /* reduce dma latency to avoid noise */
-       if (IS_BROXTON(pci))
+       if (IS_BXT(pci))
                bxt_reduce_dma_latency(chip);
 }
 
@@ -964,11 +980,6 @@ static int azx_resume(struct device *dev)
 /* put codec down to D3 at hibernation for Intel SKL+;
  * otherwise BIOS may still access the codec and screw up the driver
  */
-#define IS_SKL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa170)
-#define IS_SKL_LP(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x9d70)
-#define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98)
-#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci))
-
 static int azx_freeze_noirq(struct device *dev)
 {
        struct pci_dev *pci = to_pci_dev(dev);
@@ -2155,10 +2166,10 @@ static void azx_remove(struct pci_dev *pci)
        struct hda_intel *hda;
 
        if (card) {
-               /* flush the pending probing work */
+               /* cancel the pending probing work */
                chip = card->private_data;
                hda = container_of(chip, struct hda_intel, chip);
-               flush_work(&hda->probe_work);
+               cancel_work_sync(&hda->probe_work);
 
                snd_card_free(card);
        }
index c945e25..a33234e 100644 (file)
@@ -259,7 +259,7 @@ snd_hda_jack_detect_enable_callback(struct hda_codec *codec, hda_nid_t nid,
                if (!callback)
                        return ERR_PTR(-ENOMEM);
                callback->func = func;
-               callback->tbl = jack;
+               callback->nid = jack->nid;
                callback->next = jack->callback;
                jack->callback = callback;
        }
index 858708a..e9814c0 100644 (file)
@@ -21,7 +21,7 @@ struct hda_jack_callback;
 typedef void (*hda_jack_callback_fn) (struct hda_codec *, struct hda_jack_callback *);
 
 struct hda_jack_callback {
-       struct hda_jack_tbl *tbl;
+       hda_nid_t nid;
        hda_jack_callback_fn func;
        unsigned int private_data;      /* arbitrary data */
        struct hda_jack_callback *next;
index 4ef2259..9ceb2bc 100644 (file)
@@ -4427,13 +4427,16 @@ static void ca0132_process_dsp_response(struct hda_codec *codec,
 static void hp_callback(struct hda_codec *codec, struct hda_jack_callback *cb)
 {
        struct ca0132_spec *spec = codec->spec;
+       struct hda_jack_tbl *tbl;
 
        /* Delay enabling the HP amp, to let the mic-detection
         * state machine run.
         */
        cancel_delayed_work_sync(&spec->unsol_hp_work);
        schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500));
-       cb->tbl->block_report = 1;
+       tbl = snd_hda_jack_tbl_get(codec, cb->nid);
+       if (tbl)
+               tbl->block_report = 1;
 }
 
 static void amic_callback(struct hda_codec *codec, struct hda_jack_callback *cb)
index a12ae8a..c1c855a 100644 (file)
@@ -614,6 +614,7 @@ enum {
        CS4208_MAC_AUTO,
        CS4208_MBA6,
        CS4208_MBP11,
+       CS4208_MACMINI,
        CS4208_GPIO0,
 };
 
@@ -621,6 +622,7 @@ static const struct hda_model_fixup cs4208_models[] = {
        { .id = CS4208_GPIO0, .name = "gpio0" },
        { .id = CS4208_MBA6, .name = "mba6" },
        { .id = CS4208_MBP11, .name = "mbp11" },
+       { .id = CS4208_MACMINI, .name = "macmini" },
        {}
 };
 
@@ -632,6 +634,7 @@ static const struct snd_pci_quirk cs4208_fixup_tbl[] = {
 /* codec SSID matching */
 static const struct snd_pci_quirk cs4208_mac_fixup_tbl[] = {
        SND_PCI_QUIRK(0x106b, 0x5e00, "MacBookPro 11,2", CS4208_MBP11),
+       SND_PCI_QUIRK(0x106b, 0x6c00, "MacMini 7,1", CS4208_MACMINI),
        SND_PCI_QUIRK(0x106b, 0x7100, "MacBookAir 6,1", CS4208_MBA6),
        SND_PCI_QUIRK(0x106b, 0x7200, "MacBookAir 6,2", CS4208_MBA6),
        SND_PCI_QUIRK(0x106b, 0x7b00, "MacBookPro 12,1", CS4208_MBP11),
@@ -666,6 +669,24 @@ static void cs4208_fixup_mac(struct hda_codec *codec,
        snd_hda_apply_fixup(codec, action);
 }
 
+/* MacMini 7,1 has the inverted jack detection */
+static void cs4208_fixup_macmini(struct hda_codec *codec,
+                                const struct hda_fixup *fix, int action)
+{
+       static const struct hda_pintbl pincfgs[] = {
+               { 0x18, 0x00ab9150 }, /* mic (audio-in) jack: disable detect */
+               { 0x21, 0x004be140 }, /* SPDIF: disable detect */
+               { }
+       };
+
+       if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+               /* HP pin (0x10) has an inverted detection */
+               codec->inv_jack_detect = 1;
+               /* disable the bogus Mic and SPDIF jack detections */
+               snd_hda_apply_pincfgs(codec, pincfgs);
+       }
+}
+
 static int cs4208_spdif_sw_put(struct snd_kcontrol *kcontrol,
                               struct snd_ctl_elem_value *ucontrol)
 {
@@ -709,6 +730,12 @@ static const struct hda_fixup cs4208_fixups[] = {
                .chained = true,
                .chain_id = CS4208_GPIO0,
        },
+       [CS4208_MACMINI] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = cs4208_fixup_macmini,
+               .chained = true,
+               .chain_id = CS4208_GPIO0,
+       },
        [CS4208_GPIO0] = {
                .type = HDA_FIXUP_FUNC,
                .v.func = cs4208_fixup_gpio0,
index 426a29a..bcbc4ee 100644 (file)
@@ -448,7 +448,8 @@ static int hdmi_eld_ctl_get(struct snd_kcontrol *kcontrol,
        eld = &per_pin->sink_eld;
 
        mutex_lock(&per_pin->lock);
-       if (eld->eld_size > ARRAY_SIZE(ucontrol->value.bytes.data)) {
+       if (eld->eld_size > ARRAY_SIZE(ucontrol->value.bytes.data) ||
+           eld->eld_size > ELD_MAX_SIZE) {
                mutex_unlock(&per_pin->lock);
                snd_BUG();
                return -EINVAL;
@@ -1193,7 +1194,7 @@ static void check_presence_and_report(struct hda_codec *codec, hda_nid_t nid)
 static void jack_callback(struct hda_codec *codec,
                          struct hda_jack_callback *jack)
 {
-       check_presence_and_report(codec, jack->tbl->nid);
+       check_presence_and_report(codec, jack->nid);
 }
 
 static void hdmi_intrinsic_event(struct hda_codec *codec, unsigned int res)
@@ -2476,13 +2477,6 @@ static int patch_generic_hdmi(struct hda_codec *codec)
                        is_broxton(codec))
                codec->core.link_power_control = 1;
 
-       if (codec_has_acomp(codec)) {
-               codec->depop_delay = 0;
-               spec->i915_audio_ops.audio_ptr = codec;
-               spec->i915_audio_ops.pin_eld_notify = intel_pin_eld_notify;
-               snd_hdac_i915_register_notifier(&spec->i915_audio_ops);
-       }
-
        if (hdmi_parse_codec(codec) < 0) {
                if (spec->i915_bound)
                        snd_hdac_i915_exit(&codec->bus->core);
@@ -2504,6 +2498,18 @@ static int patch_generic_hdmi(struct hda_codec *codec)
 
        init_channel_allocations();
 
+       if (codec_has_acomp(codec)) {
+               codec->depop_delay = 0;
+               spec->i915_audio_ops.audio_ptr = codec;
+               /* intel_audio_codec_enable() or intel_audio_codec_disable()
+                * will call pin_eld_notify with using audio_ptr pointer
+                * We need make sure audio_ptr is really setup
+                */
+               wmb();
+               spec->i915_audio_ops.pin_eld_notify = intel_pin_eld_notify;
+               snd_hdac_i915_register_notifier(&spec->i915_audio_ops);
+       }
+
        return 0;
 }
 
@@ -3653,6 +3659,7 @@ HDA_CODEC_ENTRY(0x10de0070, "GPU 70 HDMI/DP",     patch_nvhdmi),
 HDA_CODEC_ENTRY(0x10de0071, "GPU 71 HDMI/DP",  patch_nvhdmi),
 HDA_CODEC_ENTRY(0x10de0072, "GPU 72 HDMI/DP",  patch_nvhdmi),
 HDA_CODEC_ENTRY(0x10de007d, "GPU 7d HDMI/DP",  patch_nvhdmi),
+HDA_CODEC_ENTRY(0x10de0083, "GPU 83 HDMI/DP",  patch_nvhdmi),
 HDA_CODEC_ENTRY(0x10de8001, "MCP73 HDMI",      patch_nvhdmi_2ch),
 HDA_CODEC_ENTRY(0x11069f80, "VX900 HDMI/DP",   patch_via_hdmi),
 HDA_CODEC_ENTRY(0x11069f81, "VX900 HDMI/DP",   patch_via_hdmi),
index 3375324..93d2156 100644 (file)
@@ -282,7 +282,7 @@ static void alc_update_knob_master(struct hda_codec *codec,
        uctl = kzalloc(sizeof(*uctl), GFP_KERNEL);
        if (!uctl)
                return;
-       val = snd_hda_codec_read(codec, jack->tbl->nid, 0,
+       val = snd_hda_codec_read(codec, jack->nid, 0,
                                 AC_VERB_GET_VOLUME_KNOB_CONTROL, 0);
        val &= HDA_AMP_VOLMASK;
        uctl->value.integer.value[0] = val;
@@ -327,6 +327,7 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
        case 0x10ec0292:
                alc_update_coef_idx(codec, 0x4, 1<<15, 0);
                break;
+       case 0x10ec0225:
        case 0x10ec0233:
        case 0x10ec0255:
        case 0x10ec0256:
@@ -900,6 +901,7 @@ static struct alc_codec_rename_pci_table rename_pci_tbl[] = {
        { 0x10ec0899, 0x1028, 0, "ALC3861" },
        { 0x10ec0298, 0x1028, 0, "ALC3266" },
        { 0x10ec0256, 0x1028, 0, "ALC3246" },
+       { 0x10ec0225, 0x1028, 0, "ALC3253" },
        { 0x10ec0670, 0x1025, 0, "ALC669X" },
        { 0x10ec0676, 0x1025, 0, "ALC679X" },
        { 0x10ec0282, 0x1043, 0, "ALC3229" },
@@ -1785,7 +1787,6 @@ enum {
        ALC882_FIXUP_NO_PRIMARY_HP,
        ALC887_FIXUP_ASUS_BASS,
        ALC887_FIXUP_BASS_CHMAP,
-       ALC882_FIXUP_DISABLE_AAMIX,
 };
 
 static void alc889_fixup_coef(struct hda_codec *codec,
@@ -1947,8 +1948,6 @@ static void alc882_fixup_no_primary_hp(struct hda_codec *codec,
 
 static void alc_fixup_bass_chmap(struct hda_codec *codec,
                                 const struct hda_fixup *fix, int action);
-static void alc_fixup_disable_aamix(struct hda_codec *codec,
-                                   const struct hda_fixup *fix, int action);
 
 static const struct hda_fixup alc882_fixups[] = {
        [ALC882_FIXUP_ABIT_AW9D_MAX] = {
@@ -2186,10 +2185,6 @@ static const struct hda_fixup alc882_fixups[] = {
                .type = HDA_FIXUP_FUNC,
                .v.func = alc_fixup_bass_chmap,
        },
-       [ALC882_FIXUP_DISABLE_AAMIX] = {
-               .type = HDA_FIXUP_FUNC,
-               .v.func = alc_fixup_disable_aamix,
-       },
 };
 
 static const struct snd_pci_quirk alc882_fixup_tbl[] = {
@@ -2228,6 +2223,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = {
        SND_PCI_QUIRK(0x104d, 0x9047, "Sony Vaio TT", ALC889_FIXUP_VAIO_TT),
        SND_PCI_QUIRK(0x104d, 0x905a, "Sony Vaio Z", ALC882_FIXUP_NO_PRIMARY_HP),
        SND_PCI_QUIRK(0x104d, 0x9043, "Sony Vaio VGC-LN51JGB", ALC882_FIXUP_NO_PRIMARY_HP),
+       SND_PCI_QUIRK(0x104d, 0x9044, "Sony VAIO AiO", ALC882_FIXUP_NO_PRIMARY_HP),
 
        /* All Apple entries are in codec SSIDs */
        SND_PCI_QUIRK(0x106b, 0x00a0, "MacBookPro 3,1", ALC889_FIXUP_MBP_VREF),
@@ -2257,7 +2253,6 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD),
        SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3),
        SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE),
-       SND_PCI_QUIRK(0x1458, 0xa182, "Gigabyte Z170X-UD3", ALC882_FIXUP_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x147b, 0x107a, "Abit AW9D-MAX", ALC882_FIXUP_ABIT_AW9D_MAX),
        SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD),
        SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD),
@@ -2651,6 +2646,7 @@ enum {
        ALC269_TYPE_ALC298,
        ALC269_TYPE_ALC255,
        ALC269_TYPE_ALC256,
+       ALC269_TYPE_ALC225,
 };
 
 /*
@@ -2680,6 +2676,7 @@ static int alc269_parse_auto_config(struct hda_codec *codec)
        case ALC269_TYPE_ALC298:
        case ALC269_TYPE_ALC255:
        case ALC269_TYPE_ALC256:
+       case ALC269_TYPE_ALC225:
                ssids = alc269_ssids;
                break;
        default:
@@ -3658,6 +3655,16 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec)
                WRITE_COEF(0xb7, 0x802b),
                {}
        };
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEF(0x4a, 1<<8, 0),
+               UPDATE_COEFEX(0x57, 0x05, 1<<14, 0),
+               UPDATE_COEF(0x63, 3<<14, 3<<14),
+               UPDATE_COEF(0x4a, 3<<4, 2<<4),
+               UPDATE_COEF(0x4a, 3<<10, 3<<10),
+               UPDATE_COEF(0x45, 0x3f<<10, 0x34<<10),
+               UPDATE_COEF(0x4a, 3<<10, 0),
+               {}
+       };
 
        switch (codec->core.vendor_id) {
        case 0x10ec0255:
@@ -3682,6 +3689,9 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec)
        case 0x10ec0668:
                alc_process_coef_fw(codec, coef0668);
                break;
+       case 0x10ec0225:
+               alc_process_coef_fw(codec, coef0225);
+               break;
        }
        codec_dbg(codec, "Headset jack set to unplugged mode.\n");
 }
@@ -3727,6 +3737,13 @@ static void alc_headset_mode_mic_in(struct hda_codec *codec, hda_nid_t hp_pin,
                UPDATE_COEF(0xc3, 0, 1<<12),
                {}
        };
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEFEX(0x57, 0x05, 1<<14, 1<<14),
+               UPDATE_COEF(0x4a, 3<<4, 2<<4),
+               UPDATE_COEF(0x63, 3<<14, 0),
+               {}
+       };
+
 
        switch (codec->core.vendor_id) {
        case 0x10ec0255:
@@ -3772,12 +3789,22 @@ static void alc_headset_mode_mic_in(struct hda_codec *codec, hda_nid_t hp_pin,
                alc_process_coef_fw(codec, coef0688);
                snd_hda_set_pin_ctl_cache(codec, mic_pin, PIN_VREF50);
                break;
+       case 0x10ec0225:
+               alc_update_coef_idx(codec, 0x45, 0x3f<<10, 0x31<<10);
+               snd_hda_set_pin_ctl_cache(codec, hp_pin, 0);
+               alc_process_coef_fw(codec, coef0225);
+               snd_hda_set_pin_ctl_cache(codec, mic_pin, PIN_VREF50);
+               break;
        }
        codec_dbg(codec, "Headset jack set to mic-in mode.\n");
 }
 
 static void alc_headset_mode_default(struct hda_codec *codec)
 {
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEF(0x45, 0x3f<<10, 0x34<<10),
+               {}
+       };
        static struct coef_fw coef0255[] = {
                WRITE_COEF(0x45, 0xc089),
                WRITE_COEF(0x45, 0xc489),
@@ -3819,6 +3846,9 @@ static void alc_headset_mode_default(struct hda_codec *codec)
        };
 
        switch (codec->core.vendor_id) {
+       case 0x10ec0225:
+               alc_process_coef_fw(codec, coef0225);
+               break;
        case 0x10ec0255:
        case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
@@ -3884,6 +3914,13 @@ static void alc_headset_mode_ctia(struct hda_codec *codec)
                WRITE_COEF(0xc3, 0x0000),
                {}
        };
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEF(0x45, 0x3f<<10, 0x35<<10),
+               UPDATE_COEF(0x49, 1<<8, 1<<8),
+               UPDATE_COEF(0x4a, 7<<6, 7<<6),
+               UPDATE_COEF(0x4a, 3<<4, 3<<4),
+               {}
+       };
 
        switch (codec->core.vendor_id) {
        case 0x10ec0255:
@@ -3912,6 +3949,9 @@ static void alc_headset_mode_ctia(struct hda_codec *codec)
        case 0x10ec0668:
                alc_process_coef_fw(codec, coef0688);
                break;
+       case 0x10ec0225:
+               alc_process_coef_fw(codec, coef0225);
+               break;
        }
        codec_dbg(codec, "Headset jack set to iPhone-style headset mode.\n");
 }
@@ -3955,6 +3995,13 @@ static void alc_headset_mode_omtp(struct hda_codec *codec)
                WRITE_COEF(0xc3, 0x0000),
                {}
        };
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEF(0x45, 0x3f<<10, 0x39<<10),
+               UPDATE_COEF(0x49, 1<<8, 1<<8),
+               UPDATE_COEF(0x4a, 7<<6, 7<<6),
+               UPDATE_COEF(0x4a, 3<<4, 3<<4),
+               {}
+       };
 
        switch (codec->core.vendor_id) {
        case 0x10ec0255:
@@ -3983,6 +4030,9 @@ static void alc_headset_mode_omtp(struct hda_codec *codec)
        case 0x10ec0668:
                alc_process_coef_fw(codec, coef0688);
                break;
+       case 0x10ec0225:
+               alc_process_coef_fw(codec, coef0225);
+               break;
        }
        codec_dbg(codec, "Headset jack set to Nokia-style headset mode.\n");
 }
@@ -4014,6 +4064,11 @@ static void alc_determine_headset_type(struct hda_codec *codec)
                WRITE_COEF(0xc3, 0x0c00),
                {}
        };
+       static struct coef_fw coef0225[] = {
+               UPDATE_COEF(0x45, 0x3f<<10, 0x34<<10),
+               UPDATE_COEF(0x49, 1<<8, 1<<8),
+               {}
+       };
 
        switch (codec->core.vendor_id) {
        case 0x10ec0255:
@@ -4058,6 +4113,12 @@ static void alc_determine_headset_type(struct hda_codec *codec)
                val = alc_read_coef_idx(codec, 0xbe);
                is_ctia = (val & 0x1c02) == 0x1c02;
                break;
+       case 0x10ec0225:
+               alc_process_coef_fw(codec, coef0225);
+               msleep(800);
+               val = alc_read_coef_idx(codec, 0x46);
+               is_ctia = (val & 0x00f0) == 0x00f0;
+               break;
        }
 
        codec_dbg(codec, "Headset jack detected iPhone-style headset: %s\n",
@@ -4695,6 +4756,9 @@ enum {
        ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE,
        ALC293_FIXUP_LENOVO_SPK_NOISE,
        ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY,
+       ALC255_FIXUP_DELL_SPK_NOISE,
+       ALC225_FIXUP_DELL1_MIC_NO_PRESENCE,
+       ALC280_FIXUP_HP_HEADSET_MIC,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -5314,6 +5378,29 @@ static const struct hda_fixup alc269_fixups[] = {
                .type = HDA_FIXUP_FUNC,
                .v.func = alc233_fixup_lenovo_line2_mic_hotkey,
        },
+       [ALC255_FIXUP_DELL_SPK_NOISE] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc_fixup_disable_aamix,
+               .chained = true,
+               .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE
+       },
+       [ALC225_FIXUP_DELL1_MIC_NO_PRESENCE] = {
+               .type = HDA_FIXUP_VERBS,
+               .v.verbs = (const struct hda_verb[]) {
+                       /* Disable pass-through path for FRONT 14h */
+                       { 0x20, AC_VERB_SET_COEF_INDEX, 0x36 },
+                       { 0x20, AC_VERB_SET_PROC_COEF, 0x57d7 },
+                       {}
+               },
+               .chained = true,
+               .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE
+       },
+       [ALC280_FIXUP_HP_HEADSET_MIC] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc_fixup_disable_aamix,
+               .chained = true,
+               .chain_id = ALC269_FIXUP_HEADSET_MIC,
+       },
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -5325,6 +5412,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1025, 0x080d, "Acer Aspire V5-122P", ALC269_FIXUP_ASPIRE_HEADSET_MIC),
        SND_PCI_QUIRK(0x1025, 0x0740, "Acer AO725", ALC271_FIXUP_HP_GATE_MIC_JACK),
        SND_PCI_QUIRK(0x1025, 0x0742, "Acer AO756", ALC271_FIXUP_HP_GATE_MIC_JACK),
+       SND_PCI_QUIRK(0x1025, 0x0762, "Acer Aspire E1-472", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572),
        SND_PCI_QUIRK(0x1025, 0x0775, "Acer Aspire E1-572", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572),
        SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS),
        SND_PCI_QUIRK(0x1025, 0x106d, "Acer Cloudbook 14", ALC283_FIXUP_CHROME_BOOK),
@@ -5356,6 +5444,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
        SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
        SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
+       SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE),
        SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),
@@ -5416,6 +5505,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x2335, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1),
        SND_PCI_QUIRK(0x103c, 0x2336, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1),
        SND_PCI_QUIRK(0x103c, 0x2337, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1),
+       SND_PCI_QUIRK(0x103c, 0x221c, "HP EliteBook 755 G2", ALC280_FIXUP_HP_HEADSET_MIC),
        SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300),
        SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
        SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
@@ -5560,6 +5650,9 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
        {.id = ALC292_FIXUP_TPT440, .name = "tpt440"},
        {}
 };
+#define ALC225_STANDARD_PINS \
+       {0x12, 0xb7a60130}, \
+       {0x21, 0x04211020}
 
 #define ALC256_STANDARD_PINS \
        {0x12, 0x90a60140}, \
@@ -5581,6 +5674,12 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
        {0x21, 0x03211020}
 
 static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
+       SND_HDA_PIN_QUIRK(0x10ec0225, 0x1028, "Dell", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE,
+               ALC225_STANDARD_PINS,
+               {0x14, 0x901701a0}),
+       SND_HDA_PIN_QUIRK(0x10ec0225, 0x1028, "Dell", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE,
+               ALC225_STANDARD_PINS,
+               {0x14, 0x901701b0}),
        SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL2_MIC_NO_PRESENCE,
                {0x14, 0x90170110},
                {0x21, 0x02211020}),
@@ -5906,6 +6005,9 @@ static int patch_alc269(struct hda_codec *codec)
                spec->gen.mixer_nid = 0; /* ALC256 does not have any loopback mixer path */
                alc_update_coef_idx(codec, 0x36, 1 << 13, 1 << 5); /* Switch pcbeep path to Line in path*/
                break;
+       case 0x10ec0225:
+               spec->codec_variant = ALC269_TYPE_ALC225;
+               break;
        }
 
        if (snd_hda_codec_read(codec, 0x51, 0, AC_VERB_PARAMETERS, 0) == 0x10ec5505) {
@@ -6796,6 +6898,7 @@ static int patch_alc680(struct hda_codec *codec)
  */
 static const struct hda_device_id snd_hda_id_realtek[] = {
        HDA_CODEC_ENTRY(0x10ec0221, "ALC221", patch_alc269),
+       HDA_CODEC_ENTRY(0x10ec0225, "ALC225", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0231, "ALC231", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0233, "ALC233", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0235, "ALC233", patch_alc269),
index 2c7c5eb..37b70f8 100644 (file)
@@ -493,9 +493,9 @@ static void jack_update_power(struct hda_codec *codec,
        if (!spec->num_pwrs)
                return;
 
-       if (jack && jack->tbl->nid) {
-               stac_toggle_power_map(codec, jack->tbl->nid,
-                                     snd_hda_jack_detect(codec, jack->tbl->nid),
+       if (jack && jack->nid) {
+               stac_toggle_power_map(codec, jack->nid,
+                                     snd_hda_jack_detect(codec, jack->nid),
                                      true);
                return;
        }
index 2875b4f..7c8941b 100644 (file)
@@ -2879,7 +2879,7 @@ static int snd_hdsp_get_dds_offset(struct snd_kcontrol *kcontrol, struct snd_ctl
 {
        struct hdsp *hdsp = snd_kcontrol_chip(kcontrol);
 
-       ucontrol->value.enumerated.item[0] = hdsp_dds_offset(hdsp);
+       ucontrol->value.integer.value[0] = hdsp_dds_offset(hdsp);
        return 0;
 }
 
@@ -2891,7 +2891,7 @@ static int snd_hdsp_put_dds_offset(struct snd_kcontrol *kcontrol, struct snd_ctl
 
        if (!snd_hdsp_use_is_exclusive(hdsp))
                return -EBUSY;
-       val = ucontrol->value.enumerated.item[0];
+       val = ucontrol->value.integer.value[0];
        spin_lock_irq(&hdsp->lock);
        if (val != hdsp_dds_offset(hdsp))
                change = (hdsp_set_dds_offset(hdsp, val) == 0) ? 1 : 0;
index 8bc8016..a4a999a 100644 (file)
@@ -1601,6 +1601,9 @@ static void hdspm_set_dds_value(struct hdspm *hdspm, int rate)
 {
        u64 n;
 
+       if (snd_BUG_ON(rate <= 0))
+               return;
+
        if (rate >= 112000)
                rate /= 4;
        else if (rate >= 56000)
@@ -2215,6 +2218,8 @@ static int hdspm_get_system_sample_rate(struct hdspm *hdspm)
                } else {
                        /* slave mode, return external sample rate */
                        rate = hdspm_external_sample_rate(hdspm);
+                       if (!rate)
+                               rate = hdspm->system_sample_rate;
                }
        }
 
@@ -2260,8 +2265,11 @@ static int snd_hdspm_put_system_sample_rate(struct snd_kcontrol *kcontrol,
                                            ucontrol)
 {
        struct hdspm *hdspm = snd_kcontrol_chip(kcontrol);
+       int rate = ucontrol->value.integer.value[0];
 
-       hdspm_set_dds_value(hdspm, ucontrol->value.enumerated.item[0]);
+       if (rate < 27000 || rate > 207000)
+               return -EINVAL;
+       hdspm_set_dds_value(hdspm, ucontrol->value.integer.value[0]);
        return 0;
 }
 
@@ -4449,7 +4457,7 @@ static int snd_hdspm_get_tco_word_term(struct snd_kcontrol *kcontrol,
 {
        struct hdspm *hdspm = snd_kcontrol_chip(kcontrol);
 
-       ucontrol->value.enumerated.item[0] = hdspm->tco->term;
+       ucontrol->value.integer.value[0] = hdspm->tco->term;
 
        return 0;
 }
@@ -4460,8 +4468,8 @@ static int snd_hdspm_put_tco_word_term(struct snd_kcontrol *kcontrol,
 {
        struct hdspm *hdspm = snd_kcontrol_chip(kcontrol);
 
-       if (hdspm->tco->term != ucontrol->value.enumerated.item[0]) {
-               hdspm->tco->term = ucontrol->value.enumerated.item[0];
+       if (hdspm->tco->term != ucontrol->value.integer.value[0]) {
+               hdspm->tco->term = ucontrol->value.integer.value[0];
 
                hdspm_tco_write(hdspm);
 
index 3191e0a..d1fb035 100644 (file)
@@ -635,6 +635,7 @@ static int acp_dma_open(struct snd_pcm_substream *substream)
                                            SNDRV_PCM_HW_PARAM_PERIODS);
        if (ret < 0) {
                dev_err(prtd->platform->dev, "set integer constraint failed\n");
+               kfree(adata);
                return ret;
        }
 
index affb192..faae693 100644 (file)
@@ -1130,7 +1130,7 @@ static int sid_status_control_get(struct snd_kcontrol *kcontrol,
        struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
 
        mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->sid_status;
+       ucontrol->value.enumerated.item[0] = drvdata->sid_status;
        mutex_unlock(&drvdata->ctrl_lock);
 
        return 0;
@@ -1147,7 +1147,7 @@ static int sid_status_control_put(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: Enter\n", __func__);
 
-       if (ucontrol->value.integer.value[0] != SID_APPLY_FIR) {
+       if (ucontrol->value.enumerated.item[0] != SID_APPLY_FIR) {
                dev_err(codec->dev,
                        "%s: ERROR: This control supports '%s' only!\n",
                        __func__, enum_sid_state[SID_APPLY_FIR]);
@@ -1199,7 +1199,7 @@ static int anc_status_control_get(struct snd_kcontrol *kcontrol,
        struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev);
 
        mutex_lock(&drvdata->ctrl_lock);
-       ucontrol->value.integer.value[0] = drvdata->anc_status;
+       ucontrol->value.enumerated.item[0] = drvdata->anc_status;
        mutex_unlock(&drvdata->ctrl_lock);
 
        return 0;
@@ -1220,7 +1220,7 @@ static int anc_status_control_put(struct snd_kcontrol *kcontrol,
 
        mutex_lock(&drvdata->ctrl_lock);
 
-       req = ucontrol->value.integer.value[0];
+       req = ucontrol->value.enumerated.item[0];
        if (req >= ARRAY_SIZE(enum_anc_state)) {
                status = -EINVAL;
                goto cleanup;
index e13583e..5ae87a0 100644 (file)
@@ -103,9 +103,9 @@ bool adau17x1_has_dsp(struct adau *adau);
 #define ADAU17X1_CLOCK_CONTROL_CORECLK_SRC_PLL BIT(3)
 #define ADAU17X1_CLOCK_CONTROL_SYSCLK_EN       BIT(0)
 
-#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x0 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x1 << 5)
-#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x2 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK64           (0x0 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK32           (0x1 << 5)
+#define ADAU17X1_SERIAL_PORT1_BCLK48           (0x2 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK128          (0x3 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK256          (0x4 << 5)
 #define ADAU17X1_SERIAL_PORT1_BCLK_MASK                (0x7 << 5)
index 33143fe..9178531 100644 (file)
@@ -1929,6 +1929,25 @@ static struct {
        { 1000000, 13500000, 0,  1 },
 };
 
+static const unsigned int pseudo_fref_max[ARIZONA_FLL_MAX_FRATIO] = {
+       13500000,
+        6144000,
+        6144000,
+        3072000,
+        3072000,
+        2822400,
+        2822400,
+        1536000,
+        1536000,
+        1536000,
+        1536000,
+        1536000,
+        1536000,
+        1536000,
+        1536000,
+         768000,
+};
+
 static struct {
        unsigned int min;
        unsigned int max;
@@ -2042,16 +2061,32 @@ static int arizona_calc_fratio(struct arizona_fll *fll,
        /* Adjust FRATIO/refdiv to avoid integer mode if possible */
        refdiv = cfg->refdiv;
 
+       arizona_fll_dbg(fll, "pseudo: initial ratio=%u fref=%u refdiv=%u\n",
+                       init_ratio, Fref, refdiv);
+
        while (div <= ARIZONA_FLL_MAX_REFDIV) {
                for (ratio = init_ratio; ratio <= ARIZONA_FLL_MAX_FRATIO;
                     ratio++) {
                        if ((ARIZONA_FLL_VCO_CORNER / 2) /
-                           (fll->vco_mult * ratio) < Fref)
+                           (fll->vco_mult * ratio) < Fref) {
+                               arizona_fll_dbg(fll, "pseudo: hit VCO corner\n");
                                break;
+                       }
+
+                       if (Fref > pseudo_fref_max[ratio - 1]) {
+                               arizona_fll_dbg(fll,
+                                       "pseudo: exceeded max fref(%u) for ratio=%u\n",
+                                       pseudo_fref_max[ratio - 1],
+                                       ratio);
+                               break;
+                       }
 
                        if (target % (ratio * Fref)) {
                                cfg->refdiv = refdiv;
                                cfg->fratio = ratio - 1;
+                               arizona_fll_dbg(fll,
+                                       "pseudo: found fref=%u refdiv=%d(%d) ratio=%d\n",
+                                       Fref, refdiv, div, ratio);
                                return ratio;
                        }
                }
@@ -2060,6 +2095,9 @@ static int arizona_calc_fratio(struct arizona_fll *fll,
                        if (target % (ratio * Fref)) {
                                cfg->refdiv = refdiv;
                                cfg->fratio = ratio - 1;
+                               arizona_fll_dbg(fll,
+                                       "pseudo: found fref=%u refdiv=%d(%d) ratio=%d\n",
+                                       Fref, refdiv, div, ratio);
                                return ratio;
                        }
                }
@@ -2068,6 +2106,9 @@ static int arizona_calc_fratio(struct arizona_fll *fll,
                Fref /= 2;
                refdiv++;
                init_ratio = arizona_find_fratio(Fref, NULL);
+               arizona_fll_dbg(fll,
+                               "pseudo: change fref=%u refdiv=%d(%d) ratio=%u\n",
+                               Fref, refdiv, div, init_ratio);
        }
 
        arizona_fll_warn(fll, "Falling back to integer mode operation\n");
index b395152..35488f1 100644 (file)
@@ -60,15 +60,15 @@ static int cs42l51_get_chan_mix(struct snd_kcontrol *kcontrol,
        switch (value) {
        default:
        case 0:
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
                break;
        /* same value : (L+R)/2 and (R+L)/2 */
        case 1:
        case 2:
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
                break;
        case 3:
-               ucontrol->value.integer.value[0] = 2;
+               ucontrol->value.enumerated.item[0] = 2;
                break;
        }
 
@@ -85,7 +85,7 @@ static int cs42l51_set_chan_mix(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        unsigned char val;
 
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
        default:
        case 0:
                val = CHAN_MIX_NORMAL;
index 1d5a89c..461506a 100644 (file)
@@ -334,7 +334,7 @@ static int da732x_hpf_set(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct soc_enum *enum_ctrl = (struct soc_enum *)kcontrol->private_value;
        unsigned int reg = enum_ctrl->reg;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        unsigned int bits;
 
        switch (sel) {
@@ -368,13 +368,13 @@ static int da732x_hpf_get(struct snd_kcontrol *kcontrol,
 
        switch (val) {
        case DA732X_HPF_VOICE_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_VOICE;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_VOICE;
                break;
        case DA732X_HPF_MUSIC_EN:
-               ucontrol->value.integer.value[0] = DA732X_HPF_MUSIC;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_MUSIC;
                break;
        default:
-               ucontrol->value.integer.value[0] = DA732X_HPF_DISABLED;
+               ucontrol->value.enumerated.item[0] = DA732X_HPF_DISABLED;
                break;
        }
 
index 20dcc49..fc22804 100644 (file)
@@ -1496,7 +1496,7 @@ static int max98088_put_eq_enum(struct snd_kcontrol *kcontrol,
        struct max98088_pdata *pdata = max98088->pdata;
        int channel = max98088_get_channel(codec, kcontrol->id.name);
        struct max98088_cdata *cdata;
-       int sel = ucontrol->value.integer.value[0];
+       int sel = ucontrol->value.enumerated.item[0];
 
        if (channel < 0)
               return channel;
index 1fedac5..3577003 100644 (file)
@@ -1499,7 +1499,7 @@ static int max98095_put_eq_enum(struct snd_kcontrol *kcontrol,
        struct max98095_pdata *pdata = max98095->pdata;
        int channel = max98095_get_eq_channel(kcontrol->id.name);
        struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        struct max98095_eq_cfg *coef_set;
        int fs, best, best_val, i;
        int regmask, regsave;
@@ -1653,7 +1653,7 @@ static int max98095_put_bq_enum(struct snd_kcontrol *kcontrol,
        struct max98095_pdata *pdata = max98095->pdata;
        int channel = max98095_get_bq_channel(codec, kcontrol->id.name);
        struct max98095_cdata *cdata;
-       unsigned int sel = ucontrol->value.integer.value[0];
+       unsigned int sel = ucontrol->value.enumerated.item[0];
        struct max98095_biquad_cfg *coef_set;
        int fs, best, best_val, i;
        int regmask, regsave;
index bc08f0c..1bd3164 100644 (file)
@@ -266,6 +266,8 @@ static int rt286_jack_detect(struct rt286_priv *rt286, bool *hp, bool *mic)
                } else {
                        *mic = false;
                        regmap_write(rt286->regmap, RT286_SET_MIC1, 0x20);
+                       regmap_update_bits(rt286->regmap,
+                               RT286_CBJ_CTRL1, 0x0400, 0x0000);
                }
        } else {
                regmap_read(rt286->regmap, RT286_GET_HP_SENSE, &buf);
@@ -470,24 +472,6 @@ static int rt286_set_dmic1_event(struct snd_soc_dapm_widget *w,
        return 0;
 }
 
-static int rt286_vref_event(struct snd_soc_dapm_widget *w,
-                            struct snd_kcontrol *kcontrol, int event)
-{
-       struct snd_soc_codec *codec = snd_soc_dapm_to_codec(w->dapm);
-
-       switch (event) {
-       case SND_SOC_DAPM_PRE_PMU:
-               snd_soc_update_bits(codec,
-                       RT286_CBJ_CTRL1, 0x0400, 0x0000);
-               mdelay(50);
-               break;
-       default:
-               return 0;
-       }
-
-       return 0;
-}
-
 static int rt286_ldo2_event(struct snd_soc_dapm_widget *w,
                             struct snd_kcontrol *kcontrol, int event)
 {
@@ -536,7 +520,7 @@ static const struct snd_soc_dapm_widget rt286_dapm_widgets[] = {
        SND_SOC_DAPM_SUPPLY_S("HV", 1, RT286_POWER_CTRL1,
                12, 1, NULL, 0),
        SND_SOC_DAPM_SUPPLY("VREF", RT286_POWER_CTRL1,
-               0, 1, rt286_vref_event, SND_SOC_DAPM_PRE_PMU),
+               0, 1, NULL, 0),
        SND_SOC_DAPM_SUPPLY_S("LDO1", 1, RT286_POWER_CTRL2,
                2, 0, NULL, 0),
        SND_SOC_DAPM_SUPPLY_S("LDO2", 2, RT286_POWER_CTRL1,
@@ -910,8 +894,6 @@ static int rt286_set_bias_level(struct snd_soc_codec *codec,
 
        case SND_SOC_BIAS_ON:
                mdelay(10);
-               snd_soc_update_bits(codec,
-                       RT286_CBJ_CTRL1, 0x0400, 0x0400);
                snd_soc_update_bits(codec,
                        RT286_DC_GAIN, 0x200, 0x0);
 
@@ -920,8 +902,6 @@ static int rt286_set_bias_level(struct snd_soc_codec *codec,
        case SND_SOC_BIAS_STANDBY:
                snd_soc_write(codec,
                        RT286_SET_AUDIO_POWER, AC_PWRST_D3);
-               snd_soc_update_bits(codec,
-                       RT286_CBJ_CTRL1, 0x0400, 0x0000);
                break;
 
        default:
index c61d38b..93e8c90 100644 (file)
@@ -776,7 +776,7 @@ static const struct snd_kcontrol_new rt5645_snd_controls[] = {
 
        /* IN1/IN2 Control */
        SOC_SINGLE_TLV("IN1 Boost", RT5645_IN1_CTRL1,
-               RT5645_BST_SFT1, 8, 0, bst_tlv),
+               RT5645_BST_SFT1, 12, 0, bst_tlv),
        SOC_SINGLE_TLV("IN2 Boost", RT5645_IN2_CTRL,
                RT5645_BST_SFT2, 8, 0, bst_tlv),
 
index 820d8fa..fb8ea05 100644 (file)
@@ -3985,7 +3985,6 @@ static int rt5659_i2c_probe(struct i2c_client *i2c,
        if (rt5659 == NULL)
                return -ENOMEM;
 
-       rt5659->i2c = i2c;
        i2c_set_clientdata(i2c, rt5659);
 
        if (pdata)
@@ -4157,24 +4156,17 @@ static int rt5659_i2c_probe(struct i2c_client *i2c,
 
        INIT_DELAYED_WORK(&rt5659->jack_detect_work, rt5659_jack_detect_work);
 
-       if (rt5659->i2c->irq) {
-               ret = request_threaded_irq(rt5659->i2c->irq, NULL, rt5659_irq,
-                       IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING
+       if (i2c->irq) {
+               ret = devm_request_threaded_irq(&i2c->dev, i2c->irq, NULL,
+                       rt5659_irq, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING
                        | IRQF_ONESHOT, "rt5659", rt5659);
                if (ret)
                        dev_err(&i2c->dev, "Failed to reguest IRQ: %d\n", ret);
 
        }
 
-       ret = snd_soc_register_codec(&i2c->dev, &soc_codec_dev_rt5659,
+       return snd_soc_register_codec(&i2c->dev, &soc_codec_dev_rt5659,
                        rt5659_dai, ARRAY_SIZE(rt5659_dai));
-
-       if (ret) {
-               if (rt5659->i2c->irq)
-                       free_irq(rt5659->i2c->irq, rt5659);
-       }
-
-       return 0;
 }
 
 static int rt5659_i2c_remove(struct i2c_client *i2c)
@@ -4191,24 +4183,29 @@ void rt5659_i2c_shutdown(struct i2c_client *client)
        regmap_write(rt5659->regmap, RT5659_RESET, 0);
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id rt5659_of_match[] = {
        { .compatible = "realtek,rt5658", },
        { .compatible = "realtek,rt5659", },
-       {},
+       { },
 };
+MODULE_DEVICE_TABLE(of, rt5659_of_match);
+#endif
 
+#ifdef CONFIG_ACPI
 static struct acpi_device_id rt5659_acpi_match[] = {
-               { "10EC5658", 0},
-               { "10EC5659", 0},
-               { },
+       { "10EC5658", 0, },
+       { "10EC5659", 0, },
+       { },
 };
 MODULE_DEVICE_TABLE(acpi, rt5659_acpi_match);
+#endif
 
 struct i2c_driver rt5659_i2c_driver = {
        .driver = {
                .name = "rt5659",
                .owner = THIS_MODULE,
-               .of_match_table = rt5659_of_match,
+               .of_match_table = of_match_ptr(rt5659_of_match),
                .acpi_match_table = ACPI_PTR(rt5659_acpi_match),
        },
        .probe = rt5659_i2c_probe,
index 8f07ee9..d31c9e5 100644 (file)
@@ -1792,7 +1792,6 @@ struct rt5659_priv {
        struct snd_soc_codec *codec;
        struct rt5659_platform_data pdata;
        struct regmap *regmap;
-       struct i2c_client *i2c;
        struct gpio_desc *gpiod_ldo1_en;
        struct gpio_desc *gpiod_reset;
        struct snd_soc_jack *hs_jack;
index 21ca3a5..d374c18 100644 (file)
@@ -31,7 +31,10 @@ static int sigmadsp_write_i2c(void *control_data,
 
        kfree(buf);
 
-       return ret;
+       if (ret < 0)
+               return ret;
+
+       return 0;
 }
 
 static int sigmadsp_read_i2c(void *control_data,
index 781398f..f7a6ce7 100644 (file)
@@ -446,7 +446,7 @@ static int dac33_get_fifo_mode(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = dac33->fifo_mode;
+       ucontrol->value.enumerated.item[0] = dac33->fifo_mode;
 
        return 0;
 }
@@ -458,17 +458,16 @@ static int dac33_set_fifo_mode(struct snd_kcontrol *kcontrol,
        struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec);
        int ret = 0;
 
-       if (dac33->fifo_mode == ucontrol->value.integer.value[0])
+       if (dac33->fifo_mode == ucontrol->value.enumerated.item[0])
                return 0;
        /* Do not allow changes while stream is running*/
        if (snd_soc_codec_is_active(codec))
                return -EPERM;
 
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >= DAC33_FIFO_LAST_MODE)
+       if (ucontrol->value.enumerated.item[0] >= DAC33_FIFO_LAST_MODE)
                ret = -EINVAL;
        else
-               dac33->fifo_mode = ucontrol->value.integer.value[0];
+               dac33->fifo_mode = ucontrol->value.enumerated.item[0];
 
        return ret;
 }
index 7693c11..1b79778 100644 (file)
@@ -175,7 +175,7 @@ static int snd_wl1273_get_audio_route(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = wl1273->mode;
+       ucontrol->value.enumerated.item[0] = wl1273->mode;
 
        return 0;
 }
@@ -193,18 +193,17 @@ static int snd_wl1273_set_audio_route(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec);
 
-       if (wl1273->mode == ucontrol->value.integer.value[0])
+       if (wl1273->mode == ucontrol->value.enumerated.item[0])
                return 0;
 
        /* Do not allow changes while stream is running */
        if (snd_soc_codec_is_active(codec))
                return -EPERM;
 
-       if (ucontrol->value.integer.value[0] < 0 ||
-           ucontrol->value.integer.value[0] >=  ARRAY_SIZE(wl1273_audio_route))
+       if (ucontrol->value.enumerated.item[0] >=  ARRAY_SIZE(wl1273_audio_route))
                return -EINVAL;
 
-       wl1273->mode = ucontrol->value.integer.value[0];
+       wl1273->mode = ucontrol->value.enumerated.item[0];
 
        return 1;
 }
@@ -219,7 +218,7 @@ static int snd_wl1273_fm_audio_get(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: enter.\n", __func__);
 
-       ucontrol->value.integer.value[0] = wl1273->core->audio_mode;
+       ucontrol->value.enumerated.item[0] = wl1273->core->audio_mode;
 
        return 0;
 }
@@ -233,7 +232,7 @@ static int snd_wl1273_fm_audio_put(struct snd_kcontrol *kcontrol,
 
        dev_dbg(codec->dev, "%s: enter.\n", __func__);
 
-       val = ucontrol->value.integer.value[0];
+       val = ucontrol->value.enumerated.item[0];
        if (wl1273->core->audio_mode == val)
                return 0;
 
index 6088d30..97c0f1e 100644 (file)
@@ -2382,6 +2382,7 @@ error:
 
 static int wm5110_remove(struct platform_device *pdev)
 {
+       snd_soc_unregister_platform(&pdev->dev);
        snd_soc_unregister_codec(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
 
index 61299ca..6f1024f 100644 (file)
@@ -233,7 +233,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = wm8753->dai_func;
+       ucontrol->value.enumerated.item[0] = wm8753->dai_func;
        return 0;
 }
 
@@ -244,7 +244,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
        struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
        u16 ioctl;
 
-       if (wm8753->dai_func == ucontrol->value.integer.value[0])
+       if (wm8753->dai_func == ucontrol->value.enumerated.item[0])
                return 0;
 
        if (snd_soc_codec_is_active(codec))
@@ -252,7 +252,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
 
        ioctl = snd_soc_read(codec, WM8753_IOCTL);
 
-       wm8753->dai_func = ucontrol->value.integer.value[0];
+       wm8753->dai_func = ucontrol->value.enumerated.item[0];
 
        if (((ioctl >> 2) & 0x3) == wm8753->dai_func)
                return 1;
index 8172e49..edd7a77 100644 (file)
@@ -396,7 +396,7 @@ static int wm8904_put_drc_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
        struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (value >= pdata->num_drc_cfgs)
                return -EINVAL;
@@ -467,7 +467,7 @@ static int wm8904_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
        struct wm8904_pdata *pdata = wm8904->pdata;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (value >= pdata->num_retune_mobile_cfgs)
                return -EINVAL;
index c799cca..6b864c0 100644 (file)
@@ -459,7 +459,7 @@ static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -549,7 +549,7 @@ static int wm8958_put_vss_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -582,7 +582,7 @@ static int wm8958_put_vss_hpf_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
@@ -749,7 +749,7 @@ static int wm8958_put_enh_eq_enum(struct snd_kcontrol *kcontrol,
        struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol);
        struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec);
        struct wm8994 *control = wm8994->wm8994;
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
        int reg;
 
        /* Don't allow on the fly reconfiguration */
index ff23772..d7f444f 100644 (file)
@@ -240,13 +240,13 @@ SOC_DOUBLE_R("Capture Volume ZC Switch", WM8960_LINVOL, WM8960_RINVOL,
 SOC_DOUBLE_R("Capture Switch", WM8960_LINVOL, WM8960_RINVOL,
        7, 1, 1),
 
-SOC_SINGLE_TLV("Right Input Boost Mixer RINPUT3 Volume",
+SOC_SINGLE_TLV("Left Input Boost Mixer LINPUT3 Volume",
               WM8960_INBMIX1, 4, 7, 0, lineinboost_tlv),
-SOC_SINGLE_TLV("Right Input Boost Mixer RINPUT2 Volume",
+SOC_SINGLE_TLV("Left Input Boost Mixer LINPUT2 Volume",
               WM8960_INBMIX1, 1, 7, 0, lineinboost_tlv),
-SOC_SINGLE_TLV("Left Input Boost Mixer LINPUT3 Volume",
+SOC_SINGLE_TLV("Right Input Boost Mixer RINPUT3 Volume",
               WM8960_INBMIX2, 4, 7, 0, lineinboost_tlv),
-SOC_SINGLE_TLV("Left Input Boost Mixer LINPUT2 Volume",
+SOC_SINGLE_TLV("Right Input Boost Mixer RINPUT2 Volume",
               WM8960_INBMIX2, 1, 7, 0, lineinboost_tlv),
 SOC_SINGLE_TLV("Right Input Boost Mixer RINPUT1 Volume",
                WM8960_RINPATH, 4, 3, 0, micboost_tlv),
@@ -643,29 +643,31 @@ static int wm8960_configure_clocking(struct snd_soc_codec *codec)
                return -EINVAL;
        }
 
-       /* check if the sysclk frequency is available. */
-       for (i = 0; i < ARRAY_SIZE(sysclk_divs); ++i) {
-               if (sysclk_divs[i] == -1)
-                       continue;
-               sysclk = freq_out / sysclk_divs[i];
-               for (j = 0; j < ARRAY_SIZE(dac_divs); ++j) {
-                       if (sysclk == dac_divs[j] * lrclk) {
+       if (wm8960->clk_id != WM8960_SYSCLK_PLL) {
+               /* check if the sysclk frequency is available. */
+               for (i = 0; i < ARRAY_SIZE(sysclk_divs); ++i) {
+                       if (sysclk_divs[i] == -1)
+                               continue;
+                       sysclk = freq_out / sysclk_divs[i];
+                       for (j = 0; j < ARRAY_SIZE(dac_divs); ++j) {
+                               if (sysclk != dac_divs[j] * lrclk)
+                                       continue;
                                for (k = 0; k < ARRAY_SIZE(bclk_divs); ++k)
                                        if (sysclk == bclk * bclk_divs[k] / 10)
                                                break;
                                if (k != ARRAY_SIZE(bclk_divs))
                                        break;
                        }
+                       if (j != ARRAY_SIZE(dac_divs))
+                               break;
                }
-               if (j != ARRAY_SIZE(dac_divs))
-                       break;
-       }
 
-       if (i != ARRAY_SIZE(sysclk_divs)) {
-               goto configure_clock;
-       } else if (wm8960->clk_id != WM8960_SYSCLK_AUTO) {
-               dev_err(codec->dev, "failed to configure clock\n");
-               return -EINVAL;
+               if (i != ARRAY_SIZE(sysclk_divs)) {
+                       goto configure_clock;
+               } else if (wm8960->clk_id != WM8960_SYSCLK_AUTO) {
+                       dev_err(codec->dev, "failed to configure clock\n");
+                       return -EINVAL;
+               }
        }
        /* get a available pll out frequency and set pll */
        for (i = 0; i < ARRAY_SIZE(sysclk_divs); ++i) {
index 7350ff6..0c002a5 100644 (file)
@@ -497,9 +497,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
        if (reg & WM8983_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -511,18 +511,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        unsigned int regpwr2, regpwr3;
        unsigned int reg_eq;
 
-       if (ucontrol->value.integer.value[0] != 0
-           && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+           && ucontrol->value.enumerated.item[0] != 1)
                return -EINVAL;
 
        reg_eq = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF);
        switch ((reg_eq & WM8983_EQ3DMODE) >> WM8983_EQ3DMODE_SHIFT) {
        case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        }
@@ -537,7 +537,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        /* set the desired eqmode */
        snd_soc_update_bits(codec, WM8983_EQ1_LOW_SHELF,
                            WM8983_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                            << WM8983_EQ3DMODE_SHIFT);
        /* restore DAC/ADC configuration */
        snd_soc_write(codec, WM8983_POWER_MANAGEMENT_2, regpwr2);
index 9918152..6ac76fe 100644 (file)
@@ -531,9 +531,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
        if (reg & WM8985_EQ3DMODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -545,18 +545,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        unsigned int regpwr2, regpwr3;
        unsigned int reg_eq;
 
-       if (ucontrol->value.integer.value[0] != 0
-                       && ucontrol->value.integer.value[0] != 1)
+       if (ucontrol->value.enumerated.item[0] != 0
+                       && ucontrol->value.enumerated.item[0] != 1)
                return -EINVAL;
 
        reg_eq = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF);
        switch ((reg_eq & WM8985_EQ3DMODE) >> WM8985_EQ3DMODE_SHIFT) {
        case 0:
-               if (!ucontrol->value.integer.value[0])
+               if (!ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        case 1:
-               if (ucontrol->value.integer.value[0])
+               if (ucontrol->value.enumerated.item[0])
                        return 0;
                break;
        }
@@ -573,7 +573,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol,
        /* set the desired eqmode */
        snd_soc_update_bits(codec, WM8985_EQ1_LOW_SHELF,
                            WM8985_EQ3DMODE_MASK,
-                           ucontrol->value.integer.value[0]
+                           ucontrol->value.enumerated.item[0]
                            << WM8985_EQ3DMODE_SHIFT);
        /* restore DAC/ADC configuration */
        snd_soc_write(codec, WM8985_POWER_MANAGEMENT_2, regpwr2);
index 2ccbb32..a18aecb 100644 (file)
@@ -362,7 +362,7 @@ static int wm8994_put_drc_enum(struct snd_kcontrol *kcontrol,
        struct wm8994 *control = wm8994->wm8994;
        struct wm8994_pdata *pdata = &control->pdata;
        int drc = wm8994_get_drc(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (drc < 0)
                return drc;
@@ -469,7 +469,7 @@ static int wm8994_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct wm8994 *control = wm8994->wm8994;
        struct wm8994_pdata *pdata = &control->pdata;
        int block = wm8994_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (block < 0)
                return block;
index 8d7d6c0..f99b34f 100644 (file)
@@ -416,7 +416,7 @@ static int wm8996_put_retune_mobile_enum(struct snd_kcontrol *kcontrol,
        struct wm8996_priv *wm8996 = snd_soc_codec_get_drvdata(codec);
        struct wm8996_pdata *pdata = &wm8996->pdata;
        int block = wm8996_get_retune_mobile_block(kcontrol->id.name);
-       int value = ucontrol->value.integer.value[0];
+       int value = ucontrol->value.enumerated.item[0];
 
        if (block < 0)
                return block;
index ccb3b15..363b3b6 100644 (file)
@@ -344,9 +344,9 @@ static int speaker_mode_get(struct snd_kcontrol *kcontrol,
 
        reg = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
        if (reg & WM9081_SPK_MODE)
-               ucontrol->value.integer.value[0] = 1;
+               ucontrol->value.enumerated.item[0] = 1;
        else
-               ucontrol->value.integer.value[0] = 0;
+               ucontrol->value.enumerated.item[0] = 0;
 
        return 0;
 }
@@ -365,7 +365,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
        unsigned int reg2 = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2);
 
        /* Are we changing anything? */
-       if (ucontrol->value.integer.value[0] ==
+       if (ucontrol->value.enumerated.item[0] ==
            ((reg2 & WM9081_SPK_MODE) != 0))
                return 0;
 
@@ -373,7 +373,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol,
        if (reg_pwr & WM9081_SPK_ENA)
                return -EINVAL;
 
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                /* Class AB */
                reg2 &= ~(WM9081_SPK_INV_MUTE | WM9081_OUT_SPK_CTRL);
                reg2 |= WM9081_SPK_MODE;
index 79e1436..9849643 100644 (file)
@@ -1212,7 +1212,7 @@ static int wm9713_soc_probe(struct snd_soc_codec *codec)
        if (IS_ERR(wm9713->ac97))
                return PTR_ERR(wm9713->ac97);
 
-       regmap = devm_regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
+       regmap = regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config);
        if (IS_ERR(regmap)) {
                snd_soc_free_ac97_codec(wm9713->ac97);
                return PTR_ERR(regmap);
index 33806d4..b9195b9 100644 (file)
@@ -586,7 +586,7 @@ static int wm_adsp_fw_get(struct snd_kcontrol *kcontrol,
        struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
        struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
 
-       ucontrol->value.integer.value[0] = dsp[e->shift_l].fw;
+       ucontrol->value.enumerated.item[0] = dsp[e->shift_l].fw;
 
        return 0;
 }
@@ -599,10 +599,10 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
        struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec);
        int ret = 0;
 
-       if (ucontrol->value.integer.value[0] == dsp[e->shift_l].fw)
+       if (ucontrol->value.enumerated.item[0] == dsp[e->shift_l].fw)
                return 0;
 
-       if (ucontrol->value.integer.value[0] >= WM_ADSP_NUM_FW)
+       if (ucontrol->value.enumerated.item[0] >= WM_ADSP_NUM_FW)
                return -EINVAL;
 
        mutex_lock(&dsp[e->shift_l].pwr_lock);
@@ -610,7 +610,7 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol,
        if (dsp[e->shift_l].running || dsp[e->shift_l].compr)
                ret = -EBUSY;
        else
-               dsp[e->shift_l].fw = ucontrol->value.integer.value[0];
+               dsp[e->shift_l].fw = ucontrol->value.enumerated.item[0];
 
        mutex_unlock(&dsp[e->shift_l].pwr_lock);
 
index ce664c2..bff258d 100644 (file)
@@ -645,6 +645,8 @@ static int dw_i2s_probe(struct platform_device *pdev)
 
        dev->dev = &pdev->dev;
 
+       dev->i2s_reg_comp1 = I2S_COMP_PARAM_1;
+       dev->i2s_reg_comp2 = I2S_COMP_PARAM_2;
        if (pdata) {
                dev->capability = pdata->cap;
                clk_id = NULL;
@@ -652,9 +654,6 @@ static int dw_i2s_probe(struct platform_device *pdev)
                if (dev->quirks & DW_I2S_QUIRK_COMP_REG_OFFSET) {
                        dev->i2s_reg_comp1 = pdata->i2s_reg_comp1;
                        dev->i2s_reg_comp2 = pdata->i2s_reg_comp2;
-               } else {
-                       dev->i2s_reg_comp1 = I2S_COMP_PARAM_1;
-                       dev->i2s_reg_comp2 = I2S_COMP_PARAM_2;
                }
                ret = dw_configure_dai_by_pd(dev, dw_i2s_dai, res, pdata);
        } else {
index 49d7513..e63cd5e 100644 (file)
@@ -217,8 +217,8 @@ static int snd_imx_pcm_mmap(struct snd_pcm_substream *substream,
        struct snd_pcm_runtime *runtime = substream->runtime;
        int ret;
 
-       ret = dma_mmap_writecombine(substream->pcm->card->dev, vma,
-               runtime->dma_area, runtime->dma_addr, runtime->dma_bytes);
+       ret = dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                         runtime->dma_addr, runtime->dma_bytes);
 
        pr_debug("%s: ret: %d %p %pad 0x%08x\n", __func__, ret,
                        runtime->dma_area,
@@ -247,8 +247,7 @@ static int imx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
        buf->bytes = size;
@@ -330,8 +329,7 @@ static void imx_pcm_free(struct snd_pcm *pcm)
                if (!buf->area)
                        continue;
 
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
index a407e83..fb896b2 100644 (file)
@@ -72,8 +72,6 @@ static int imx_spdif_audio_probe(struct platform_device *pdev)
                goto end;
        }
 
-       platform_set_drvdata(pdev, data);
-
 end:
        of_node_put(spdif_np);
 
index 1ded881..2389ab4 100644 (file)
@@ -99,7 +99,7 @@ static int asoc_simple_card_hw_params(struct snd_pcm_substream *substream,
                if (ret && ret != -ENOTSUPP)
                        goto err;
        }
-
+       return 0;
 err:
        return ret;
 }
index 803f95e..7d7c872 100644 (file)
@@ -30,11 +30,15 @@ config SND_SST_IPC_ACPI
 config SND_SOC_INTEL_SST
        tristate
        select SND_SOC_INTEL_SST_ACPI if ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
        depends on (X86 || COMPILE_TEST)
 
 config SND_SOC_INTEL_SST_ACPI
        tristate
 
+config SND_SOC_INTEL_SST_MATCH
+       tristate
+
 config SND_SOC_INTEL_HASWELL
        tristate
 
@@ -57,7 +61,7 @@ config SND_SOC_INTEL_HASWELL_MACH
 config SND_SOC_INTEL_BYT_RT5640_MACH
        tristate "ASoC Audio driver for Intel Baytrail with RT5640 codec"
        depends on X86_INTEL_LPSS && I2C
-       depends on DW_DMAC_CORE=y && (SND_SOC_INTEL_BYTCR_RT5640_MACH = n)
+       depends on DW_DMAC_CORE=y && (SND_SST_IPC_ACPI = n)
        select SND_SOC_INTEL_SST
        select SND_SOC_INTEL_BAYTRAIL
        select SND_SOC_RT5640
@@ -69,7 +73,7 @@ config SND_SOC_INTEL_BYT_RT5640_MACH
 config SND_SOC_INTEL_BYT_MAX98090_MACH
        tristate "ASoC Audio driver for Intel Baytrail with MAX98090 codec"
        depends on X86_INTEL_LPSS && I2C
-       depends on DW_DMAC_CORE=y
+       depends on DW_DMAC_CORE=y && (SND_SST_IPC_ACPI = n)
        select SND_SOC_INTEL_SST
        select SND_SOC_INTEL_BAYTRAIL
        select SND_SOC_MAX98090
@@ -97,6 +101,7 @@ config SND_SOC_INTEL_BYTCR_RT5640_MACH
        select SND_SOC_RT5640
        select SND_SST_MFLD_PLATFORM
        select SND_SST_IPC_ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
        help
           This adds support for ASoC machine driver for Intel(R) Baytrail and Baytrail-CR
           platforms with RT5640 audio codec.
@@ -109,6 +114,7 @@ config SND_SOC_INTEL_BYTCR_RT5651_MACH
        select SND_SOC_RT5651
        select SND_SST_MFLD_PLATFORM
        select SND_SST_IPC_ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
        help
           This adds support for ASoC machine driver for Intel(R) Baytrail and Baytrail-CR
           platforms with RT5651 audio codec.
@@ -121,6 +127,7 @@ config SND_SOC_INTEL_CHT_BSW_RT5672_MACH
         select SND_SOC_RT5670
         select SND_SST_MFLD_PLATFORM
         select SND_SST_IPC_ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
         help
           This adds support for ASoC machine driver for Intel(R) Cherrytrail & Braswell
           platforms with RT5672 audio codec.
@@ -133,6 +140,7 @@ config SND_SOC_INTEL_CHT_BSW_RT5645_MACH
        select SND_SOC_RT5645
        select SND_SST_MFLD_PLATFORM
        select SND_SST_IPC_ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
        help
          This adds support for ASoC machine driver for Intel(R) Cherrytrail & Braswell
          platforms with RT5645/5650 audio codec.
@@ -145,6 +153,7 @@ config SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH
        select SND_SOC_TS3A227E
        select SND_SST_MFLD_PLATFORM
        select SND_SST_IPC_ACPI
+       select SND_SOC_INTEL_SST_MATCH if ACPI
        help
       This adds support for ASoC machine driver for Intel(R) Cherrytrail & Braswell
       platforms with MAX98090 audio codec it also can support TI jack chip as aux device.
index 55c33dc..52ed434 100644 (file)
@@ -528,6 +528,7 @@ static struct snd_soc_dai_driver sst_platform_dai[] = {
        .ops = &sst_compr_dai_ops,
        .playback = {
                .stream_name = "Compress Playback",
+               .channels_min = 1,
        },
 },
 /* BE CPU  Dais */
index 2d3afdd..a7b96a9 100644 (file)
@@ -367,8 +367,12 @@ static int snd_cht_mc_probe(struct platform_device *pdev)
        }
        card->dev = &pdev->dev;
        sprintf(codec_name, "i2c-%s:00", drv->acpi_card->codec_id);
+
        /* set correct codec name */
-       strcpy((char *)card->dai_link[2].codec_name, codec_name);
+       for (i = 0; i < ARRAY_SIZE(cht_dailink); i++)
+               if (!strcmp(card->dai_link[i].codec_name, "i2c-10EC5645:00"))
+                       card->dai_link[i].codec_name = kstrdup(codec_name, GFP_KERNEL);
+
        snd_soc_card_set_drvdata(card, drv);
        ret_val = devm_snd_soc_register_card(&pdev->dev, card);
        if (ret_val) {
index 49c09a0..34f46c7 100644 (file)
@@ -94,7 +94,7 @@ static const struct soc_enum lo_enum =
 static int headset_get_switch(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = hs_switch;
+       ucontrol->value.enumerated.item[0] = hs_switch;
        return 0;
 }
 
@@ -104,12 +104,12 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
        struct snd_soc_dapm_context *dapm = &card->dapm;
 
-       if (ucontrol->value.integer.value[0] == hs_switch)
+       if (ucontrol->value.enumerated.item[0] == hs_switch)
                return 0;
 
        snd_soc_dapm_mutex_lock(dapm);
 
-       if (ucontrol->value.integer.value[0]) {
+       if (ucontrol->value.enumerated.item[0]) {
                pr_debug("hs_set HS path\n");
                snd_soc_dapm_enable_pin_unlocked(dapm, "Headphones");
                snd_soc_dapm_disable_pin_unlocked(dapm, "EPOUT");
@@ -123,7 +123,7 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol,
 
        snd_soc_dapm_mutex_unlock(dapm);
 
-       hs_switch = ucontrol->value.integer.value[0];
+       hs_switch = ucontrol->value.enumerated.item[0];
 
        return 0;
 }
@@ -148,7 +148,7 @@ static void lo_enable_out_pins(struct snd_soc_dapm_context *dapm)
 static int lo_get_switch(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = lo_dac;
+       ucontrol->value.enumerated.item[0] = lo_dac;
        return 0;
 }
 
@@ -158,7 +158,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
        struct snd_soc_dapm_context *dapm = &card->dapm;
 
-       if (ucontrol->value.integer.value[0] == lo_dac)
+       if (ucontrol->value.enumerated.item[0] == lo_dac)
                return 0;
 
        snd_soc_dapm_mutex_lock(dapm);
@@ -168,7 +168,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
         */
        lo_enable_out_pins(dapm);
 
-       switch (ucontrol->value.integer.value[0]) {
+       switch (ucontrol->value.enumerated.item[0]) {
        case 0:
                pr_debug("set vibra path\n");
                snd_soc_dapm_disable_pin_unlocked(dapm, "VIB1OUT");
@@ -202,7 +202,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol,
 
        snd_soc_dapm_mutex_unlock(dapm);
 
-       lo_dac = ucontrol->value.integer.value[0];
+       lo_dac = ucontrol->value.enumerated.item[0];
        return 0;
 }
 
index 7396ddb..2cbcbe4 100644 (file)
@@ -212,7 +212,10 @@ static int skylake_dmic_fixup(struct snd_soc_pcm_runtime *rtd,
 {
        struct snd_interval *channels = hw_param_interval(params,
                                                SNDRV_PCM_HW_PARAM_CHANNELS);
-       channels->min = channels->max = 4;
+       if (params_channels(params) == 2)
+               channels->min = channels->max = 2;
+       else
+               channels->min = channels->max = 4;
 
        return 0;
 }
index 668fdee..fbbb25c 100644 (file)
@@ -1,13 +1,10 @@
 snd-soc-sst-dsp-objs := sst-dsp.o
-ifneq ($(CONFIG_SND_SST_IPC_ACPI),)
-snd-soc-sst-acpi-objs := sst-match-acpi.o
-else
-snd-soc-sst-acpi-objs := sst-acpi.o sst-match-acpi.o
-endif
-
+snd-soc-sst-acpi-objs := sst-acpi.o
+snd-soc-sst-match-objs := sst-match-acpi.o
 snd-soc-sst-ipc-objs := sst-ipc.o
 
 snd-soc-sst-dsp-$(CONFIG_DW_DMAC_CORE) += sst-firmware.o
 
 obj-$(CONFIG_SND_SOC_INTEL_SST) += snd-soc-sst-dsp.o snd-soc-sst-ipc.o
 obj-$(CONFIG_SND_SOC_INTEL_SST_ACPI) += snd-soc-sst-acpi.o
+obj-$(CONFIG_SND_SOC_INTEL_SST_MATCH) += snd-soc-sst-match.o
index 7a85c57..2c5eda1 100644 (file)
@@ -215,6 +215,7 @@ static struct sst_acpi_desc sst_acpi_broadwell_desc = {
        .dma_size = SST_LPT_DSP_DMA_SIZE,
 };
 
+#if !IS_ENABLED(CONFIG_SND_SST_IPC_ACPI)
 static struct sst_acpi_mach baytrail_machines[] = {
        { "10EC5640", "byt-rt5640", "intel/fw_sst_0f28.bin-48kHz_i2s_master", NULL, NULL, NULL },
        { "193C9890", "byt-max98090", "intel/fw_sst_0f28.bin-48kHz_i2s_master", NULL, NULL, NULL },
@@ -231,11 +232,14 @@ static struct sst_acpi_desc sst_acpi_baytrail_desc = {
        .sst_id = SST_DEV_ID_BYT,
        .resindex_dma_base = -1,
 };
+#endif
 
 static const struct acpi_device_id sst_acpi_match[] = {
        { "INT33C8", (unsigned long)&sst_acpi_haswell_desc },
        { "INT3438", (unsigned long)&sst_acpi_broadwell_desc },
+#if !IS_ENABLED(CONFIG_SND_SST_IPC_ACPI)
        { "80860F28", (unsigned long)&sst_acpi_baytrail_desc },
+#endif
        { }
 };
 MODULE_DEVICE_TABLE(acpi, sst_acpi_match);
index dd077e1..3b4539d 100644 (file)
@@ -41,3 +41,6 @@ struct sst_acpi_mach *sst_acpi_find_machine(struct sst_acpi_mach *machines)
        return NULL;
 }
 EXPORT_SYMBOL_GPL(sst_acpi_find_machine);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel Common ACPI Match module");
index de6dac4..4629372 100644 (file)
@@ -688,14 +688,14 @@ int skl_unbind_modules(struct skl_sst *ctx,
        /* get src queue index */
        src_index = skl_get_queue_index(src_mcfg->m_out_pin, dst_id, out_max);
        if (src_index < 0)
-               return -EINVAL;
+               return 0;
 
        msg.src_queue = src_index;
 
        /* get dst queue index */
        dst_index  = skl_get_queue_index(dst_mcfg->m_in_pin, src_id, in_max);
        if (dst_index < 0)
-               return -EINVAL;
+               return 0;
 
        msg.dst_queue = dst_index;
 
@@ -747,7 +747,7 @@ int skl_bind_modules(struct skl_sst *ctx,
 
        skl_dump_bind_info(ctx, src_mcfg, dst_mcfg);
 
-       if (src_mcfg->m_state < SKL_MODULE_INIT_DONE &&
+       if (src_mcfg->m_state < SKL_MODULE_INIT_DONE ||
                dst_mcfg->m_state < SKL_MODULE_INIT_DONE)
                return 0;
 
index f355325..b6e6b61 100644 (file)
@@ -863,6 +863,7 @@ static int skl_get_delay_from_lpib(struct hdac_ext_bus *ebus,
                else
                        delay += hstream->bufsize;
        }
+       delay = (hstream->bufsize == delay) ? 0 : delay;
 
        if (delay >= hstream->period_bytes) {
                dev_info(bus->dev,
index 4624556..5a4837d 100644 (file)
@@ -54,12 +54,9 @@ static int is_skl_dsp_widget_type(struct snd_soc_dapm_widget *w)
 
 /*
  * Each pipelines needs memory to be allocated. Check if we have free memory
- * from available pool. Then only add this to pool
- * This is freed when pipe is deleted
- * Note: DSP does actual memory management we only keep track for complete
- * pool
+ * from available pool.
  */
-static bool skl_tplg_alloc_pipe_mem(struct skl *skl,
+static bool skl_is_pipe_mem_avail(struct skl *skl,
                                struct skl_module_cfg *mconfig)
 {
        struct skl_sst *ctx = skl->skl_sst;
@@ -74,10 +71,20 @@ static bool skl_tplg_alloc_pipe_mem(struct skl *skl,
                                "exceeds ppl memory available %d mem %d\n",
                                skl->resource.max_mem, skl->resource.mem);
                return false;
+       } else {
+               return true;
        }
+}
 
+/*
+ * Add the mem to the mem pool. This is freed when pipe is deleted.
+ * Note: DSP does actual memory management we only keep track for complete
+ * pool
+ */
+static void skl_tplg_alloc_pipe_mem(struct skl *skl,
+                               struct skl_module_cfg *mconfig)
+{
        skl->resource.mem += mconfig->pipe->memory_pages;
-       return true;
 }
 
 /*
@@ -85,10 +92,10 @@ static bool skl_tplg_alloc_pipe_mem(struct skl *skl,
  * quantified in MCPS (Million Clocks Per Second) required for module/pipe
  *
  * Each pipelines needs mcps to be allocated. Check if we have mcps for this
- * pipe. This adds the mcps to driver counter
- * This is removed on pipeline delete
+ * pipe.
  */
-static bool skl_tplg_alloc_pipe_mcps(struct skl *skl,
+
+static bool skl_is_pipe_mcps_avail(struct skl *skl,
                                struct skl_module_cfg *mconfig)
 {
        struct skl_sst *ctx = skl->skl_sst;
@@ -98,13 +105,18 @@ static bool skl_tplg_alloc_pipe_mcps(struct skl *skl,
                        "%s: module_id %d instance %d\n", __func__,
                        mconfig->id.module_id, mconfig->id.instance_id);
                dev_err(ctx->dev,
-                       "exceeds ppl memory available %d > mem %d\n",
+                       "exceeds ppl mcps available %d > mem %d\n",
                        skl->resource.max_mcps, skl->resource.mcps);
                return false;
+       } else {
+               return true;
        }
+}
 
+static void skl_tplg_alloc_pipe_mcps(struct skl *skl,
+                               struct skl_module_cfg *mconfig)
+{
        skl->resource.mcps += mconfig->mcps;
-       return true;
 }
 
 /*
@@ -411,7 +423,7 @@ skl_tplg_init_pipe_modules(struct skl *skl, struct skl_pipe *pipe)
                mconfig = w->priv;
 
                /* check resource available */
-               if (!skl_tplg_alloc_pipe_mcps(skl, mconfig))
+               if (!skl_is_pipe_mcps_avail(skl, mconfig))
                        return -ENOMEM;
 
                if (mconfig->is_loadable && ctx->dsp->fw_ops.load_mod) {
@@ -435,6 +447,7 @@ skl_tplg_init_pipe_modules(struct skl *skl, struct skl_pipe *pipe)
                ret = skl_tplg_set_module_params(w, ctx);
                if (ret < 0)
                        return ret;
+               skl_tplg_alloc_pipe_mcps(skl, mconfig);
        }
 
        return 0;
@@ -477,10 +490,10 @@ static int skl_tplg_mixer_dapm_pre_pmu_event(struct snd_soc_dapm_widget *w,
        struct skl_sst *ctx = skl->skl_sst;
 
        /* check resource available */
-       if (!skl_tplg_alloc_pipe_mcps(skl, mconfig))
+       if (!skl_is_pipe_mcps_avail(skl, mconfig))
                return -EBUSY;
 
-       if (!skl_tplg_alloc_pipe_mem(skl, mconfig))
+       if (!skl_is_pipe_mem_avail(skl, mconfig))
                return -ENOMEM;
 
        /*
@@ -526,11 +539,15 @@ static int skl_tplg_mixer_dapm_pre_pmu_event(struct snd_soc_dapm_widget *w,
                src_module = dst_module;
        }
 
+       skl_tplg_alloc_pipe_mem(skl, mconfig);
+       skl_tplg_alloc_pipe_mcps(skl, mconfig);
+
        return 0;
 }
 
 static int skl_tplg_bind_sinks(struct snd_soc_dapm_widget *w,
                                struct skl *skl,
+                               struct snd_soc_dapm_widget *src_w,
                                struct skl_module_cfg *src_mconfig)
 {
        struct snd_soc_dapm_path *p;
@@ -547,6 +564,10 @@ static int skl_tplg_bind_sinks(struct snd_soc_dapm_widget *w,
                dev_dbg(ctx->dev, "%s: sink widget=%s\n", __func__, p->sink->name);
 
                next_sink = p->sink;
+
+               if (!is_skl_dsp_widget_type(p->sink))
+                       return skl_tplg_bind_sinks(p->sink, skl, src_w, src_mconfig);
+
                /*
                 * here we will check widgets in sink pipelines, so that
                 * can be any widgets type and we are only interested if
@@ -576,7 +597,7 @@ static int skl_tplg_bind_sinks(struct snd_soc_dapm_widget *w,
        }
 
        if (!sink)
-               return skl_tplg_bind_sinks(next_sink, skl, src_mconfig);
+               return skl_tplg_bind_sinks(next_sink, skl, src_w, src_mconfig);
 
        return 0;
 }
@@ -605,7 +626,7 @@ static int skl_tplg_pga_dapm_pre_pmu_event(struct snd_soc_dapm_widget *w,
         * if sink is not started, start sink pipe first, then start
         * this pipe
         */
-       ret = skl_tplg_bind_sinks(w, skl, src_mconfig);
+       ret = skl_tplg_bind_sinks(w, skl, w, src_mconfig);
        if (ret)
                return ret;
 
@@ -773,10 +794,7 @@ static int skl_tplg_mixer_dapm_post_pmd_event(struct snd_soc_dapm_widget *w,
                        continue;
                }
 
-               ret = skl_unbind_modules(ctx, src_module, dst_module);
-               if (ret < 0)
-                       return ret;
-
+               skl_unbind_modules(ctx, src_module, dst_module);
                src_module = dst_module;
        }
 
@@ -814,9 +832,6 @@ static int skl_tplg_pga_dapm_post_pmd_event(struct snd_soc_dapm_widget *w,
                         * This is a connecter and if path is found that means
                         * unbind between source and sink has not happened yet
                         */
-                       ret = skl_stop_pipe(ctx, sink_mconfig->pipe);
-                       if (ret < 0)
-                               return ret;
                        ret = skl_unbind_modules(ctx, src_mconfig,
                                                        sink_mconfig);
                }
@@ -842,6 +857,12 @@ static int skl_tplg_vmixer_event(struct snd_soc_dapm_widget *w,
        case SND_SOC_DAPM_PRE_PMU:
                return skl_tplg_mixer_dapm_pre_pmu_event(w, skl);
 
+       case SND_SOC_DAPM_POST_PMU:
+               return skl_tplg_mixer_dapm_post_pmu_event(w, skl);
+
+       case SND_SOC_DAPM_PRE_PMD:
+               return skl_tplg_mixer_dapm_pre_pmd_event(w, skl);
+
        case SND_SOC_DAPM_POST_PMD:
                return skl_tplg_mixer_dapm_post_pmd_event(w, skl);
        }
@@ -916,6 +937,13 @@ static int skl_tplg_tlv_control_get(struct snd_kcontrol *kcontrol,
                skl_get_module_params(skl->skl_sst, (u32 *)bc->params,
                                      bc->max, bc->param_id, mconfig);
 
+       /* decrement size for TLV header */
+       size -= 2 * sizeof(u32);
+
+       /* check size as we don't want to send kernel data */
+       if (size > bc->max)
+               size = bc->max;
+
        if (bc->params) {
                if (copy_to_user(data, &bc->param_id, sizeof(u32)))
                        return -EFAULT;
@@ -950,7 +978,7 @@ static int skl_tplg_tlv_control_set(struct snd_kcontrol *kcontrol,
                                return -EFAULT;
                } else {
                        if (copy_from_user(ac->params,
-                                          data + 2 * sizeof(u32), size))
+                                          data + 2, size))
                                return -EFAULT;
                }
 
@@ -1510,6 +1538,7 @@ int skl_tplg_init(struct snd_soc_platform *platform, struct hdac_ext_bus *ebus)
                                        &skl_tplg_ops, fw, 0);
        if (ret < 0) {
                dev_err(bus->dev, "tplg component load failed%d\n", ret);
+               release_firmware(fw);
                return -EINVAL;
        }
 
index 443a15d..092705e 100644 (file)
@@ -614,8 +614,6 @@ static int skl_probe(struct pci_dev *pci,
                goto out_unregister;
 
        /*configure PM */
-       pm_runtime_set_autosuspend_delay(bus->dev, SKL_SUSPEND_DELAY);
-       pm_runtime_use_autosuspend(bus->dev);
        pm_runtime_put_noidle(bus->dev);
        pm_runtime_allow(bus->dev);
 
index 15c04e2..9769676 100644 (file)
@@ -9,7 +9,7 @@ config SND_SOC_MEDIATEK
 
 config SND_SOC_MT8173_MAX98090
        tristate "ASoC Audio driver for MT8173 with MAX98090 codec"
-       depends on SND_SOC_MEDIATEK
+       depends on SND_SOC_MEDIATEK && I2C
        select SND_SOC_MAX98090
        help
          This adds ASoC driver for Mediatek MT8173 boards
@@ -19,7 +19,7 @@ config SND_SOC_MT8173_MAX98090
 
 config SND_SOC_MT8173_RT5650_RT5676
        tristate "ASoC Audio driver for MT8173 with RT5650 RT5676 codecs"
-       depends on SND_SOC_MEDIATEK
+       depends on SND_SOC_MEDIATEK && I2C
        select SND_SOC_RT5645
        select SND_SOC_RT5677
        help
index c866ade..a6c7b8d 100644 (file)
@@ -381,9 +381,19 @@ static int mxs_saif_startup(struct snd_pcm_substream *substream,
        __raw_writel(BM_SAIF_CTRL_CLKGATE,
                saif->base + SAIF_CTRL + MXS_CLR_ADDR);
 
+       clk_prepare(saif->clk);
+
        return 0;
 }
 
+static void mxs_saif_shutdown(struct snd_pcm_substream *substream,
+                             struct snd_soc_dai *cpu_dai)
+{
+       struct mxs_saif *saif = snd_soc_dai_get_drvdata(cpu_dai);
+
+       clk_unprepare(saif->clk);
+}
+
 /*
  * Should only be called when port is inactive.
  * although can be called multiple times by upper layers.
@@ -424,8 +434,6 @@ static int mxs_saif_hw_params(struct snd_pcm_substream *substream,
                return ret;
        }
 
-       /* prepare clk in hw_param, enable in trigger */
-       clk_prepare(saif->clk);
        if (saif != master_saif) {
                /*
                * Set an initial clock rate for the saif internal logic to work
@@ -611,6 +619,7 @@ static int mxs_saif_trigger(struct snd_pcm_substream *substream, int cmd,
 
 static const struct snd_soc_dai_ops mxs_saif_dai_ops = {
        .startup = mxs_saif_startup,
+       .shutdown = mxs_saif_shutdown,
        .trigger = mxs_saif_trigger,
        .prepare = mxs_saif_prepare,
        .hw_params = mxs_saif_hw_params,
index e093261..2cca055 100644 (file)
@@ -267,10 +267,8 @@ static int nuc900_dma_mmap(struct snd_pcm_substream *substream,
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
 
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                       runtime->dma_area,
-                                       runtime->dma_addr,
-                                       runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 
 static struct snd_pcm_ops nuc900_dma_ops = {
index 190f868..fdecb70 100644 (file)
@@ -133,7 +133,7 @@ static struct snd_soc_ops n810_ops = {
 static int n810_get_spk(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_spk_func;
+       ucontrol->value.enumerated.item[0] = n810_spk_func;
 
        return 0;
 }
@@ -143,10 +143,10 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_spk_func == ucontrol->value.integer.value[0])
+       if (n810_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_spk_func = ucontrol->value.integer.value[0];
+       n810_spk_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
@@ -155,7 +155,7 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol,
 static int n810_get_jack(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_jack_func;
+       ucontrol->value.enumerated.item[0] = n810_jack_func;
 
        return 0;
 }
@@ -165,10 +165,10 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_jack_func == ucontrol->value.integer.value[0])
+       if (n810_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_jack_func = ucontrol->value.integer.value[0];
+       n810_jack_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
@@ -177,7 +177,7 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol,
 static int n810_get_input(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = n810_dmic_func;
+       ucontrol->value.enumerated.item[0] = n810_dmic_func;
 
        return 0;
 }
@@ -187,10 +187,10 @@ static int n810_set_input(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (n810_dmic_func == ucontrol->value.integer.value[0])
+       if (n810_dmic_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       n810_dmic_func = ucontrol->value.integer.value[0];
+       n810_dmic_func = ucontrol->value.enumerated.item[0];
        n810_ext_control(&card->dapm);
 
        return 1;
index 6bb623a..99381a2 100644 (file)
@@ -156,10 +156,8 @@ static int omap_pcm_mmap(struct snd_pcm_substream *substream,
 {
        struct snd_pcm_runtime *runtime = substream->runtime;
 
-       return dma_mmap_writecombine(substream->pcm->card->dev, vma,
-                                    runtime->dma_area,
-                                    runtime->dma_addr,
-                                    runtime->dma_bytes);
+       return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area,
+                          runtime->dma_addr, runtime->dma_bytes);
 }
 
 static struct snd_pcm_ops omap_pcm_ops = {
@@ -183,8 +181,7 @@ static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm,
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
        buf->dev.dev = pcm->card->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_writecombine(pcm->card->dev, size,
-                                          &buf->addr, GFP_KERNEL);
+       buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL);
        if (!buf->area)
                return -ENOMEM;
 
@@ -207,8 +204,7 @@ static void omap_pcm_free_dma_buffers(struct snd_pcm *pcm)
                if (!buf->area)
                        continue;
 
-               dma_free_writecombine(pcm->card->dev, buf->bytes,
-                                     buf->area, buf->addr);
+               dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr);
                buf->area = NULL;
        }
 }
index 5e21f08..5494924 100644 (file)
@@ -132,7 +132,7 @@ static struct snd_soc_ops rx51_ops = {
 static int rx51_get_spk(struct snd_kcontrol *kcontrol,
                        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_spk_func;
+       ucontrol->value.enumerated.item[0] = rx51_spk_func;
 
        return 0;
 }
@@ -142,10 +142,10 @@ static int rx51_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_spk_func == ucontrol->value.integer.value[0])
+       if (rx51_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_spk_func = ucontrol->value.integer.value[0];
+       rx51_spk_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
@@ -180,7 +180,7 @@ static int rx51_hp_event(struct snd_soc_dapm_widget *w,
 static int rx51_get_input(struct snd_kcontrol *kcontrol,
                          struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_dmic_func;
+       ucontrol->value.enumerated.item[0] = rx51_dmic_func;
 
        return 0;
 }
@@ -190,10 +190,10 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_dmic_func == ucontrol->value.integer.value[0])
+       if (rx51_dmic_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_dmic_func = ucontrol->value.integer.value[0];
+       rx51_dmic_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
@@ -202,7 +202,7 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol,
 static int rx51_get_jack(struct snd_kcontrol *kcontrol,
                         struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = rx51_jack_func;
+       ucontrol->value.enumerated.item[0] = rx51_jack_func;
 
        return 0;
 }
@@ -212,10 +212,10 @@ static int rx51_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (rx51_jack_func == ucontrol->value.integer.value[0])
+       if (rx51_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       rx51_jack_func = ucontrol->value.integer.value[0];
+       rx51_jack_func = ucontrol->value.enumerated.item[0];
        rx51_ext_control(&card->dapm);
 
        return 1;
index c97dc13..dcbb7aa 100644 (file)
@@ -163,7 +163,7 @@ static struct snd_soc_ops corgi_ops = {
 static int corgi_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = corgi_jack_func;
+       ucontrol->value.enumerated.item[0] = corgi_jack_func;
        return 0;
 }
 
@@ -172,10 +172,10 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (corgi_jack_func == ucontrol->value.integer.value[0])
+       if (corgi_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       corgi_jack_func = ucontrol->value.integer.value[0];
+       corgi_jack_func = ucontrol->value.enumerated.item[0];
        corgi_ext_control(&card->dapm);
        return 1;
 }
@@ -183,7 +183,7 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol,
 static int corgi_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = corgi_spk_func;
+       ucontrol->value.enumerated.item[0] = corgi_spk_func;
        return 0;
 }
 
@@ -192,10 +192,10 @@ static int corgi_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (corgi_spk_func == ucontrol->value.integer.value[0])
+       if (corgi_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       corgi_spk_func = ucontrol->value.integer.value[0];
+       corgi_spk_func = ucontrol->value.enumerated.item[0];
        corgi_ext_control(&card->dapm);
        return 1;
 }
index 241d0be..62b8377 100644 (file)
@@ -308,17 +308,17 @@ static int magician_set_spk(struct snd_kcontrol *kcontrol,
 static int magician_get_input(struct snd_kcontrol *kcontrol,
                              struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = magician_in_sel;
+       ucontrol->value.enumerated.item[0] = magician_in_sel;
        return 0;
 }
 
 static int magician_set_input(struct snd_kcontrol *kcontrol,
                              struct snd_ctl_elem_value *ucontrol)
 {
-       if (magician_in_sel == ucontrol->value.integer.value[0])
+       if (magician_in_sel == ucontrol->value.enumerated.item[0])
                return 0;
 
-       magician_in_sel = ucontrol->value.integer.value[0];
+       magician_in_sel = ucontrol->value.enumerated.item[0];
 
        switch (magician_in_sel) {
        case MAGICIAN_MIC:
index 84d0e2e..4b3b714 100644 (file)
@@ -138,7 +138,7 @@ static struct snd_soc_ops poodle_ops = {
 static int poodle_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = poodle_jack_func;
+       ucontrol->value.enumerated.item[0] = poodle_jack_func;
        return 0;
 }
 
@@ -147,10 +147,10 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (poodle_jack_func == ucontrol->value.integer.value[0])
+       if (poodle_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       poodle_jack_func = ucontrol->value.integer.value[0];
+       poodle_jack_func = ucontrol->value.enumerated.item[0];
        poodle_ext_control(&card->dapm);
        return 1;
 }
@@ -158,7 +158,7 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol,
 static int poodle_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = poodle_spk_func;
+       ucontrol->value.enumerated.item[0] = poodle_spk_func;
        return 0;
 }
 
@@ -167,10 +167,10 @@ static int poodle_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card =  snd_kcontrol_chip(kcontrol);
 
-       if (poodle_spk_func == ucontrol->value.integer.value[0])
+       if (poodle_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       poodle_spk_func = ucontrol->value.integer.value[0];
+       poodle_spk_func = ucontrol->value.enumerated.item[0];
        poodle_ext_control(&card->dapm);
        return 1;
 }
index b002226..0e02634 100644 (file)
@@ -164,7 +164,7 @@ static struct snd_soc_ops spitz_ops = {
 static int spitz_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = spitz_jack_func;
+       ucontrol->value.enumerated.item[0] = spitz_jack_func;
        return 0;
 }
 
@@ -173,10 +173,10 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (spitz_jack_func == ucontrol->value.integer.value[0])
+       if (spitz_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       spitz_jack_func = ucontrol->value.integer.value[0];
+       spitz_jack_func = ucontrol->value.enumerated.item[0];
        spitz_ext_control(&card->dapm);
        return 1;
 }
@@ -184,7 +184,7 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol,
 static int spitz_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = spitz_spk_func;
+       ucontrol->value.enumerated.item[0] = spitz_spk_func;
        return 0;
 }
 
@@ -193,10 +193,10 @@ static int spitz_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (spitz_spk_func == ucontrol->value.integer.value[0])
+       if (spitz_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       spitz_spk_func = ucontrol->value.integer.value[0];
+       spitz_spk_func = ucontrol->value.enumerated.item[0];
        spitz_ext_control(&card->dapm);
        return 1;
 }
index 49518dd..c508f02 100644 (file)
@@ -95,7 +95,7 @@ static struct snd_soc_ops tosa_ops = {
 static int tosa_get_jack(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = tosa_jack_func;
+       ucontrol->value.enumerated.item[0] = tosa_jack_func;
        return 0;
 }
 
@@ -104,10 +104,10 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (tosa_jack_func == ucontrol->value.integer.value[0])
+       if (tosa_jack_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       tosa_jack_func = ucontrol->value.integer.value[0];
+       tosa_jack_func = ucontrol->value.enumerated.item[0];
        tosa_ext_control(&card->dapm);
        return 1;
 }
@@ -115,7 +115,7 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol,
 static int tosa_get_spk(struct snd_kcontrol *kcontrol,
        struct snd_ctl_elem_value *ucontrol)
 {
-       ucontrol->value.integer.value[0] = tosa_spk_func;
+       ucontrol->value.enumerated.item[0] = tosa_spk_func;
        return 0;
 }
 
@@ -124,10 +124,10 @@ static int tosa_set_spk(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_card *card = snd_kcontrol_chip(kcontrol);
 
-       if (tosa_spk_func == ucontrol->value.integer.value[0])
+       if (tosa_spk_func == ucontrol->value.enumerated.item[0])
                return 0;
 
-       tosa_spk_func = ucontrol->value.integer.value[0];
+       tosa_spk_func = ucontrol->value.enumerated.item[0];
        tosa_ext_control(&card->dapm);
        return 1;
 }
index 00b6c9d..e5101e0 100644 (file)
@@ -355,7 +355,6 @@ static struct regmap_config lpass_cpu_regmap_config = {
        .readable_reg = lpass_cpu_regmap_readable,
        .volatile_reg = lpass_cpu_regmap_volatile,
        .cache_type = REGCACHE_FLAT,
-       .val_format_endian = REGMAP_ENDIAN_LITTLE,
 };
 
 int asoc_qcom_lpass_cpu_platform_probe(struct platform_device *pdev)
index 79688aa..4aeb8e1 100644 (file)
@@ -440,18 +440,18 @@ static irqreturn_t lpass_platform_lpaif_irq(int irq, void *data)
 }
 
 static int lpass_platform_alloc_buffer(struct snd_pcm_substream *substream,
-               struct snd_soc_pcm_runtime *soc_runtime)
+               struct snd_soc_pcm_runtime *rt)
 {
        struct snd_dma_buffer *buf = &substream->dma_buffer;
        size_t size = lpass_platform_pcm_hardware.buffer_bytes_max;
 
        buf->dev.type = SNDRV_DMA_TYPE_DEV;
-       buf->dev.dev = soc_runtime->dev;
+       buf->dev.dev = rt->platform->dev;
        buf->private_data = NULL;
-       buf->area = dma_alloc_coherent(soc_runtime->dev, size, &buf->addr,
+       buf->area = dma_alloc_coherent(rt->platform->dev, size, &buf->addr,
                        GFP_KERNEL);
        if (!buf->area) {
-               dev_err(soc_runtime->dev, "%s: Could not allocate DMA buffer\n",
+               dev_err(rt->platform->dev, "%s: Could not allocate DMA buffer\n",
                                __func__);
                return -ENOMEM;
        }
@@ -461,12 +461,12 @@ static int lpass_platform_alloc_buffer(struct snd_pcm_substream *substream,
 }
 
 static void lpass_platform_free_buffer(struct snd_pcm_substream *substream,
-               struct snd_soc_pcm_runtime *soc_runtime)
+               struct snd_soc_pcm_runtime *rt)
 {
        struct snd_dma_buffer *buf = &substream->dma_buffer;
 
        if (buf->area) {
-               dma_free_coherent(soc_runtime->dev, buf->bytes, buf->area,
+               dma_free_coherent(rt->dev, buf->bytes, buf->area,
                                buf->addr);
        }
        buf->area = NULL;
@@ -499,9 +499,6 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
 
        snd_soc_pcm_set_drvdata(soc_runtime, data);
 
-       soc_runtime->dev->coherent_dma_mask = DMA_BIT_MASK(32);
-       soc_runtime->dev->dma_mask = &soc_runtime->dev->coherent_dma_mask;
-
        ret = lpass_platform_alloc_buffer(substream, soc_runtime);
        if (ret)
                return ret;
index 84d9e77..70a2559 100644 (file)
@@ -481,10 +481,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
        unsigned int cdcon_mask = 1 << i2s_regs->cdclkcon_off;
        unsigned int rsrc_mask = 1 << i2s_regs->rclksrc_off;
        u32 mod, mask, val = 0;
+       unsigned long flags;
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        switch (clk_id) {
        case SAMSUNG_I2S_OPCLK:
@@ -575,11 +576,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        mod = (mod & ~mask) | val;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        return 0;
 }
@@ -590,6 +591,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
        struct i2s_dai *i2s = to_info(dai);
        int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave;
        u32 mod, tmp = 0;
+       unsigned long flags;
 
        lrp_shift = i2s->variant_regs->lrp_off;
        sdf_shift = i2s->variant_regs->sdf_off;
@@ -649,7 +651,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        /*
         * Don't change the I2S mode if any controller is active on this
@@ -657,7 +659,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
         */
        if (any_active(i2s) &&
                ((mod & (sdf_mask | lrp_rlow | mod_slave)) != tmp)) {
-               spin_unlock(i2s->lock);
+               spin_unlock_irqrestore(i2s->lock, flags);
                dev_err(&i2s->pdev->dev,
                                "%s:%d Other DAI busy\n", __func__, __LINE__);
                return -EAGAIN;
@@ -666,7 +668,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai,
        mod &= ~(sdf_mask | lrp_rlow | mod_slave);
        mod |= tmp;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        return 0;
 }
@@ -676,6 +678,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
 {
        struct i2s_dai *i2s = to_info(dai);
        u32 mod, mask = 0, val = 0;
+       unsigned long flags;
 
        if (!is_secondary(i2s))
                mask |= (MOD_DC2_EN | MOD_DC1_EN);
@@ -744,11 +747,11 @@ static int i2s_hw_params(struct snd_pcm_substream *substream,
                return -EINVAL;
        }
 
-       spin_lock(i2s->lock);
+       spin_lock_irqsave(i2s->lock, flags);
        mod = readl(i2s->addr + I2SMOD);
        mod = (mod & ~mask) | val;
        writel(mod, i2s->addr + I2SMOD);
-       spin_unlock(i2s->lock);
+       spin_unlock_irqrestore(i2s->lock, flags);
 
        samsung_asoc_init_dma_data(dai, &i2s->dma_playback, &i2s->dma_capture);
 
index 5a2812f..581175a 100644 (file)
@@ -310,7 +310,7 @@ struct dapm_kcontrol_data {
 };
 
 static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget,
-       struct snd_kcontrol *kcontrol)
+       struct snd_kcontrol *kcontrol, const char *ctrl_name)
 {
        struct dapm_kcontrol_data *data;
        struct soc_mixer_control *mc;
@@ -333,7 +333,7 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget,
                if (mc->autodisable) {
                        struct snd_soc_dapm_widget template;
 
-                       name = kasprintf(GFP_KERNEL, "%s %s", kcontrol->id.name,
+                       name = kasprintf(GFP_KERNEL, "%s %s", ctrl_name,
                                         "Autodisable");
                        if (!name) {
                                ret = -ENOMEM;
@@ -371,7 +371,7 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget,
                if (e->autodisable) {
                        struct snd_soc_dapm_widget template;
 
-                       name = kasprintf(GFP_KERNEL, "%s %s", kcontrol->id.name,
+                       name = kasprintf(GFP_KERNEL, "%s %s", ctrl_name,
                                         "Autodisable");
                        if (!name) {
                                ret = -ENOMEM;
@@ -871,7 +871,7 @@ static int dapm_create_or_share_kcontrol(struct snd_soc_dapm_widget *w,
 
                kcontrol->private_free = dapm_kcontrol_free;
 
-               ret = dapm_kcontrol_data_alloc(w, kcontrol);
+               ret = dapm_kcontrol_data_alloc(w, kcontrol, name);
                if (ret) {
                        snd_ctl_free_one(kcontrol);
                        goto exit_free;
@@ -3573,7 +3573,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol,
 {
        struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol);
 
-       ucontrol->value.integer.value[0] = w->params_select;
+       ucontrol->value.enumerated.item[0] = w->params_select;
 
        return 0;
 }
@@ -3587,13 +3587,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol,
        if (w->power)
                return -EBUSY;
 
-       if (ucontrol->value.integer.value[0] == w->params_select)
+       if (ucontrol->value.enumerated.item[0] == w->params_select)
                return 0;
 
-       if (ucontrol->value.integer.value[0] >= w->num_params)
+       if (ucontrol->value.enumerated.item[0] >= w->num_params)
                return -EINVAL;
 
-       w->params_select = ucontrol->value.integer.value[0];
+       w->params_select = ucontrol->value.enumerated.item[0];
 
        return 0;
 }
index e898b42..1af4f23 100644 (file)
@@ -1810,7 +1810,8 @@ int dpcm_be_dai_hw_free(struct snd_soc_pcm_runtime *fe, int stream)
                    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PREPARE) &&
                    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_HW_FREE) &&
                    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED) &&
-                   (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP))
+                   (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP) &&
+                   (be->dpcm[stream].state != SND_SOC_DPCM_STATE_SUSPEND))
                        continue;
 
                dev_dbg(be->dev, "ASoC: hw_free BE %s\n",
index d75deba..dfcd386 100644 (file)
@@ -22,6 +22,7 @@ config SND_SUN_AMD7930
 config SND_SUN_CS4231
        tristate "Sun CS4231"
        select SND_PCM
+       select SND_TIMER
        help
          Say Y here to include support for CS4231 sound device on Sun.
 
index cc39f63..007cf58 100644 (file)
@@ -2455,7 +2455,6 @@ int snd_usbmidi_create(struct snd_card *card,
        else
                err = snd_usbmidi_create_endpoints(umidi, endpoints);
        if (err < 0) {
-               snd_usbmidi_free(umidi);
                return err;
        }
 
index 23ea6d8..c458d60 100644 (file)
@@ -1121,8 +1121,10 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
        switch (chip->usb_id) {
        case USB_ID(0x045E, 0x075D): /* MS Lifecam Cinema  */
        case USB_ID(0x045E, 0x076D): /* MS Lifecam HD-5000 */
+       case USB_ID(0x045E, 0x076F): /* MS Lifecam HD-6000 */
        case USB_ID(0x045E, 0x0772): /* MS Lifecam Studio */
        case USB_ID(0x045E, 0x0779): /* MS Lifecam HD-3000 */
+       case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */
        case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
        case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */
        case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */
@@ -1205,8 +1207,12 @@ void snd_usb_set_interface_quirk(struct usb_device *dev)
         * "Playback Design" products need a 50ms delay after setting the
         * USB interface.
         */
-       if (le16_to_cpu(dev->descriptor.idVendor) == 0x23ba)
+       switch (le16_to_cpu(dev->descriptor.idVendor)) {
+       case 0x23ba: /* Playback Design */
+       case 0x0644: /* TEAC Corp. */
                mdelay(50);
+               break;
+       }
 }
 
 void snd_usb_ctl_msg_quirk(struct usb_device *dev, unsigned int pipe,
@@ -1221,6 +1227,14 @@ void snd_usb_ctl_msg_quirk(struct usb_device *dev, unsigned int pipe,
            (requesttype & USB_TYPE_MASK) == USB_TYPE_CLASS)
                mdelay(20);
 
+       /*
+        * "TEAC Corp." products need a 20ms delay after each
+        * class compliant request
+        */
+       if ((le16_to_cpu(dev->descriptor.idVendor) == 0x0644) &&
+           (requesttype & USB_TYPE_MASK) == USB_TYPE_CLASS)
+               mdelay(20);
+
        /* Marantz/Denon devices with USB DAC functionality need a delay
         * after each class compliant request
         */
@@ -1269,7 +1283,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
        case USB_ID(0x20b1, 0x3008): /* iFi Audio micro/nano iDSD */
        case USB_ID(0x20b1, 0x2008): /* Matrix Audio X-Sabre */
        case USB_ID(0x20b1, 0x300a): /* Matrix Audio Mini-i Pro */
-       case USB_ID(0x22d8, 0x0416): /* OPPO HA-1*/
+       case USB_ID(0x22d9, 0x0416): /* OPPO HA-1 */
                if (fp->altsetting == 2)
                        return SNDRV_PCM_FMTBIT_DSD_U32_BE;
                break;
@@ -1278,6 +1292,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
        case USB_ID(0x20b1, 0x2009): /* DIYINHK DSD DXD 384kHz USB to I2S/DSD */
        case USB_ID(0x20b1, 0x2023): /* JLsounds I2SoverUSB */
        case USB_ID(0x20b1, 0x3023): /* Aune X1S 32BIT/384 DSD DAC */
+       case USB_ID(0x2616, 0x0106): /* PS Audio NuWave DAC */
                if (fp->altsetting == 3)
                        return SNDRV_PCM_FMTBIT_DSD_U32_BE;
                break;
index 4a96473..ee566e8 100644 (file)
@@ -85,7 +85,7 @@ $(OUTPUT)%.i: %.c FORCE
        $(call rule_mkdir)
        $(call if_changed_dep,cc_i_c)
 
-$(OUTPUT)%.i: %.S FORCE
+$(OUTPUT)%.s: %.S FORCE
        $(call rule_mkdir)
        $(call if_changed_dep,cc_i_c)
 
index 02db3cd..6b77072 100644 (file)
@@ -27,7 +27,7 @@ endef
 #   the rule that uses them - an example for that is the 'bionic'
 #   feature check. ]
 #
-FEATURE_TESTS ?=                       \
+FEATURE_TESTS_BASIC :=                 \
        backtrace                       \
        dwarf                           \
        fortify-source                  \
@@ -46,6 +46,7 @@ FEATURE_TESTS ?=                      \
        libpython                       \
        libpython-version               \
        libslang                        \
+       libcrypto                       \
        libunwind                       \
        pthread-attr-setaffinity-np     \
        stackprotector-all              \
@@ -56,6 +57,25 @@ FEATURE_TESTS ?=                     \
        get_cpuid                       \
        bpf
 
+# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
+# of all feature tests
+FEATURE_TESTS_EXTRA :=                 \
+       bionic                          \
+       compile-32                      \
+       compile-x32                     \
+       cplus-demangle                  \
+       hello                           \
+       libbabeltrace                   \
+       liberty                         \
+       liberty-z                       \
+       libunwind-debug-frame
+
+FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC)
+
+ifeq ($(FEATURE_TESTS),all)
+  FEATURE_TESTS := $(FEATURE_TESTS_BASIC) $(FEATURE_TESTS_EXTRA)
+endif
+
 FEATURE_DISPLAY ?=                     \
        dwarf                           \
        glibc                           \
@@ -68,6 +88,7 @@ FEATURE_DISPLAY ?=                    \
        libperl                         \
        libpython                       \
        libslang                        \
+       libcrypto                       \
        libunwind                       \
        libdw-dwarf-unwind              \
        zlib                            \
@@ -100,6 +121,14 @@ ifeq ($(feature-all), 1)
   # test-all.c passed - just set all the core feature flags to 1:
   #
   $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat)))
+  #
+  # test-all.c does not comprise these tests, so we need to
+  # for this case to get features proper values
+  #
+  $(call feature_check,compile-32)
+  $(call feature_check,compile-x32)
+  $(call feature_check,bionic)
+  $(call feature_check,libbabeltrace)
 else
   $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat)))
 endif
index bf8f035..c5f4c41 100644 (file)
@@ -23,6 +23,7 @@ FILES=                                        \
        test-libpython.bin              \
        test-libpython-version.bin      \
        test-libslang.bin               \
+       test-libcrypto.bin              \
        test-libunwind.bin              \
        test-libunwind-debug-frame.bin  \
        test-pthread-attr-setaffinity-np.bin    \
@@ -105,6 +106,9 @@ $(OUTPUT)test-libaudit.bin:
 $(OUTPUT)test-libslang.bin:
        $(BUILD) -I/usr/include/slang -lslang
 
+$(OUTPUT)test-libcrypto.bin:
+       $(BUILD) -lcrypto
+
 $(OUTPUT)test-gtk2.bin:
        $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
 
index 81025ca..e499a36 100644 (file)
 # include "test-bpf.c"
 #undef main
 
+#define main main_test_libcrypto
+# include "test-libcrypto.c"
+#undef main
+
 int main(int argc, char *argv[])
 {
        main_test_libpython();
@@ -158,6 +162,7 @@ int main(int argc, char *argv[])
        main_test_lzma();
        main_test_get_cpuid();
        main_test_bpf();
+       main_test_libcrypto();
 
        return 0;
 }
index 31dbf45..c54e655 100644 (file)
@@ -1,4 +1,6 @@
+#include <stdio.h>
 int main(void)
 {
+       printf("Hello World!\n");
        return 0;
 }
diff --git a/tools/build/feature/test-libcrypto.c b/tools/build/feature/test-libcrypto.c
new file mode 100644 (file)
index 0000000..bd79dc7
--- /dev/null
@@ -0,0 +1,17 @@
+#include <openssl/sha.h>
+#include <openssl/md5.h>
+
+int main(void)
+{
+       MD5_CTX context;
+       unsigned char md[MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH];
+       unsigned char dat[] = "12345";
+
+       MD5_Init(&context);
+       MD5_Update(&context, &dat[0], sizeof(dat));
+       MD5_Final(&md[0], &context);
+
+       SHA1(&dat[0], sizeof(dat), &md[0]);
+
+       return 0;
+}
index e8b8a23..954c644 100644 (file)
@@ -1,3 +1,4 @@
 libapi-y += fd/
 libapi-y += fs/
 libapi-y += cpu.o
+libapi-y += debug.o
index d85904d..bbc82c6 100644 (file)
@@ -18,6 +18,7 @@ LIBFILE = $(OUTPUT)libapi.a
 CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
 CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+CFLAGS += -I$(srctree)/tools/lib/api
 
 RM = rm -f
 
diff --git a/tools/lib/api/debug-internal.h b/tools/lib/api/debug-internal.h
new file mode 100644 (file)
index 0000000..188f788
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __API_DEBUG_INTERNAL_H__
+#define __API_DEBUG_INTERNAL_H__
+
+#include "debug.h"
+
+#define __pr(func, fmt, ...)   \
+do {                           \
+       if ((func))             \
+               (func)("libapi: " fmt, ##__VA_ARGS__); \
+} while (0)
+
+extern libapi_print_fn_t __pr_warning;
+extern libapi_print_fn_t __pr_info;
+extern libapi_print_fn_t __pr_debug;
+
+#define pr_warning(fmt, ...)   __pr(__pr_warning, fmt, ##__VA_ARGS__)
+#define pr_info(fmt, ...)      __pr(__pr_info, fmt, ##__VA_ARGS__)
+#define pr_debug(fmt, ...)     __pr(__pr_debug, fmt, ##__VA_ARGS__)
+
+#endif /* __API_DEBUG_INTERNAL_H__ */
diff --git a/tools/lib/api/debug.c b/tools/lib/api/debug.c
new file mode 100644 (file)
index 0000000..5fa5cf5
--- /dev/null
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include "debug.h"
+#include "debug-internal.h"
+
+static int __base_pr(const char *format, ...)
+{
+       va_list args;
+       int err;
+
+       va_start(args, format);
+       err = vfprintf(stderr, format, args);
+       va_end(args);
+       return err;
+}
+
+libapi_print_fn_t __pr_warning = __base_pr;
+libapi_print_fn_t __pr_info    = __base_pr;
+libapi_print_fn_t __pr_debug;
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug)
+{
+       __pr_warning = warn;
+       __pr_info    = info;
+       __pr_debug   = debug;
+}
diff --git a/tools/lib/api/debug.h b/tools/lib/api/debug.h
new file mode 100644 (file)
index 0000000..a0872f6
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __API_DEBUG_H__
+#define __API_DEBUG_H__
+
+typedef int (*libapi_print_fn_t)(const char *, ...);
+
+void libapi_set_print(libapi_print_fn_t warn,
+                     libapi_print_fn_t info,
+                     libapi_print_fn_t debug);
+
+#endif /* __API_DEBUG_H__ */
index 459599d..ef78c22 100644 (file)
@@ -13,6 +13,7 @@
 #include <sys/mount.h>
 
 #include "fs.h"
+#include "debug-internal.h"
 
 #define _STR(x) #x
 #define STR(x) _STR(x)
@@ -300,6 +301,56 @@ int filename__read_ull(const char *filename, unsigned long long *value)
        return err;
 }
 
+#define STRERR_BUFSIZE  128     /* For the buffer size of strerror_r */
+
+int filename__read_str(const char *filename, char **buf, size_t *sizep)
+{
+       size_t size = 0, alloc_size = 0;
+       void *bf = NULL, *nbf;
+       int fd, n, err = 0;
+       char sbuf[STRERR_BUFSIZE];
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -errno;
+
+       do {
+               if (size == alloc_size) {
+                       alloc_size += BUFSIZ;
+                       nbf = realloc(bf, alloc_size);
+                       if (!nbf) {
+                               err = -ENOMEM;
+                               break;
+                       }
+
+                       bf = nbf;
+               }
+
+               n = read(fd, bf + size, alloc_size - size);
+               if (n < 0) {
+                       if (size) {
+                               pr_warning("read failed %d: %s\n", errno,
+                                        strerror_r(errno, sbuf, sizeof(sbuf)));
+                               err = 0;
+                       } else
+                               err = -errno;
+
+                       break;
+               }
+
+               size += n;
+       } while (n > 0);
+
+       if (!err) {
+               *sizep = size;
+               *buf   = bf;
+       } else
+               free(bf);
+
+       close(fd);
+       return err;
+}
+
 int sysfs__read_ull(const char *entry, unsigned long long *value)
 {
        char path[PATH_MAX];
@@ -326,6 +377,19 @@ int sysfs__read_int(const char *entry, int *value)
        return filename__read_int(path, value);
 }
 
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep)
+{
+       char path[PATH_MAX];
+       const char *sysfs = sysfs__mountpoint();
+
+       if (!sysfs)
+               return -1;
+
+       snprintf(path, sizeof(path), "%s/%s", sysfs, entry);
+
+       return filename__read_str(path, buf, sizep);
+}
+
 int sysctl__read_int(const char *sysctl, int *value)
 {
        char path[PATH_MAX];
index d024a7f..9f65980 100644 (file)
@@ -2,6 +2,7 @@
 #define __API_FS__
 
 #include <stdbool.h>
+#include <unistd.h>
 
 /*
  * On most systems <limits.h> would have given us this, but  not on some systems
@@ -26,8 +27,10 @@ FS(tracefs)
 
 int filename__read_int(const char *filename, int *value);
 int filename__read_ull(const char *filename, unsigned long long *value);
+int filename__read_str(const char *filename, char **buf, size_t *sizep);
 
 int sysctl__read_int(const char *sysctl, int *value);
 int sysfs__read_int(const char *entry, int *value);
 int sysfs__read_ull(const char *entry, unsigned long long *value);
+int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
 #endif /* __API_FS__ */
index 8334a5a..7e543c3 100644 (file)
@@ -201,6 +201,7 @@ struct bpf_object {
                        Elf_Data *data;
                } *reloc;
                int nr_reloc;
+               int maps_shndx;
        } efile;
        /*
         * All loaded bpf_object is linked in a list, which is
@@ -350,6 +351,7 @@ static struct bpf_object *bpf_object__new(const char *path,
         */
        obj->efile.obj_buf = obj_buf;
        obj->efile.obj_buf_sz = obj_buf_sz;
+       obj->efile.maps_shndx = -1;
 
        obj->loaded = false;
 
@@ -529,12 +531,12 @@ bpf_object__init_maps(struct bpf_object *obj, void *data,
 }
 
 static int
-bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
+bpf_object__init_maps_name(struct bpf_object *obj)
 {
        int i;
        Elf_Data *symbols = obj->efile.symbols;
 
-       if (!symbols || maps_shndx < 0)
+       if (!symbols || obj->efile.maps_shndx < 0)
                return -EINVAL;
 
        for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
@@ -544,7 +546,7 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
 
                if (!gelf_getsym(symbols, i, &sym))
                        continue;
-               if (sym.st_shndx != maps_shndx)
+               if (sym.st_shndx != obj->efile.maps_shndx)
                        continue;
 
                map_name = elf_strptr(obj->efile.elf,
@@ -572,7 +574,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
        Elf *elf = obj->efile.elf;
        GElf_Ehdr *ep = &obj->efile.ehdr;
        Elf_Scn *scn = NULL;
-       int idx = 0, err = 0, maps_shndx = -1;
+       int idx = 0, err = 0;
 
        /* Elf is corrupted/truncated, avoid calling elf_strptr. */
        if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) {
@@ -625,7 +627,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                else if (strcmp(name, "maps") == 0) {
                        err = bpf_object__init_maps(obj, data->d_buf,
                                                    data->d_size);
-                       maps_shndx = idx;
+                       obj->efile.maps_shndx = idx;
                } else if (sh.sh_type == SHT_SYMTAB) {
                        if (obj->efile.symbols) {
                                pr_warning("bpf: multiple SYMTAB in %s\n",
@@ -674,8 +676,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                pr_warning("Corrupted ELF file: index of strtab invalid\n");
                return LIBBPF_ERRNO__FORMAT;
        }
-       if (maps_shndx >= 0)
-               err = bpf_object__init_maps_name(obj, maps_shndx);
+       if (obj->efile.maps_shndx >= 0)
+               err = bpf_object__init_maps_name(obj);
 out:
        return err;
 }
@@ -697,7 +699,8 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx)
 static int
 bpf_program__collect_reloc(struct bpf_program *prog,
                           size_t nr_maps, GElf_Shdr *shdr,
-                          Elf_Data *data, Elf_Data *symbols)
+                          Elf_Data *data, Elf_Data *symbols,
+                          int maps_shndx)
 {
        int i, nrels;
 
@@ -724,9 +727,6 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                        return -LIBBPF_ERRNO__FORMAT;
                }
 
-               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
-               pr_debug("relocation: insn_idx=%u\n", insn_idx);
-
                if (!gelf_getsym(symbols,
                                 GELF_R_SYM(rel.r_info),
                                 &sym)) {
@@ -735,6 +735,15 @@ bpf_program__collect_reloc(struct bpf_program *prog,
                        return -LIBBPF_ERRNO__FORMAT;
                }
 
+               if (sym.st_shndx != maps_shndx) {
+                       pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n",
+                                  prog->section_name, sym.st_shndx);
+                       return -LIBBPF_ERRNO__RELOC;
+               }
+
+               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+               pr_debug("relocation: insn_idx=%u\n", insn_idx);
+
                if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
                        pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n",
                                   insn_idx, insns[insn_idx].code);
@@ -863,7 +872,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 
                err = bpf_program__collect_reloc(prog, nr_maps,
                                                 shdr, data,
-                                                obj->efile.symbols);
+                                                obj->efile.symbols,
+                                                obj->efile.maps_shndx);
                if (err)
                        return err;
        }
index 90d2bae..1d57af5 100644 (file)
@@ -100,7 +100,7 @@ include $(srctree)/tools/build/Makefile.include
 
 do_compile_shared_library =                    \
        ($(print_shared_lib_compile)            \
-       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so))
+       $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so))
 
 do_build_static_lib =                          \
        ($(print_static_lib_build)              \
index 9be6633..d1c89cc 100644 (file)
@@ -11,11 +11,6 @@ static __thread struct task_struct current_obj;
 bool debug_locks = true;
 bool debug_locks_silent;
 
-__attribute__((constructor)) static void liblockdep_init(void)
-{
-       lockdep_init();
-}
-
 __attribute__((destructor)) static void liblockdep_exit(void)
 {
        debug_check_no_locks_held();
index a60c14b..6e66277 100644 (file)
@@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 void lock_release(struct lockdep_map *lock, int nested,
                        unsigned long ip);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void lockdep_init(void);
 
 #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }
index f42b7e9..a0a2e3a 100644 (file)
@@ -1,2 +1,8 @@
 #include <linux/lockdep.h>
+
+/* Trivial API wrappers, we don't (yet) have RCU in user-space: */
+#define hlist_for_each_entry_rcu       hlist_for_each_entry
+#define hlist_add_head_rcu             hlist_add_head
+#define hlist_del_rcu                  hlist_del
+
 #include "../../../kernel/locking/lockdep.c"
index 21cdf86..5284484 100644 (file)
@@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void)
        ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock");
 #endif
 
-       lockdep_init();
-
        __init_state = done;
 }
index 0f782ff..18211a5 100644 (file)
@@ -1,13 +1,13 @@
 #include <liblockdep/mutex.h>
 
-void main(void)
+int main(void)
 {
-       pthread_mutex_t a, b;
+       pthread_mutex_t a;
 
        pthread_mutex_init(&a, NULL);
-       pthread_mutex_init(&b, NULL);
 
        pthread_mutex_lock(&a);
-       pthread_mutex_lock(&b);
        pthread_mutex_lock(&a);
+
+       return 0;
 }
diff --git a/tools/lib/lockdep/tests/ABA.c b/tools/lib/lockdep/tests/ABA.c
new file mode 100644 (file)
index 0000000..0f782ff
--- /dev/null
@@ -0,0 +1,13 @@
+#include <liblockdep/mutex.h>
+
+void main(void)
+{
+       pthread_mutex_t a, b;
+
+       pthread_mutex_init(&a, NULL);
+       pthread_mutex_init(&b, NULL);
+
+       pthread_mutex_lock(&a);
+       pthread_mutex_lock(&b);
+       pthread_mutex_lock(&a);
+}
diff --git a/tools/lib/lockdep/tests/ABBA_2threads.c b/tools/lib/lockdep/tests/ABBA_2threads.c
new file mode 100644 (file)
index 0000000..cd807d7
--- /dev/null
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <pthread.h>
+
+pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
+pthread_barrier_t bar;
+
+void *ba_lock(void *arg)
+{
+       int ret, i;
+
+       pthread_mutex_lock(&b);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&a);
+
+       pthread_mutex_unlock(&a);
+       pthread_mutex_unlock(&b);
+}
+
+int main(void)
+{
+       pthread_t t;
+
+       pthread_barrier_init(&bar, NULL, 2);
+
+       if (pthread_create(&t, NULL, ba_lock, NULL)) {
+               fprintf(stderr, "pthread_create() failed\n");
+               return 1;
+       }
+       pthread_mutex_lock(&a);
+
+       if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD)
+               pthread_barrier_destroy(&bar);
+
+       pthread_mutex_lock(&b);
+
+       pthread_mutex_unlock(&b);
+       pthread_mutex_unlock(&a);
+
+       pthread_join(t, NULL);
+
+       return 0;
+}
index 6386dc3..fd3e56a 100644 (file)
@@ -3,6 +3,7 @@
 
 #define __used         __attribute__((__unused__))
 #define unlikely
+#define READ_ONCE(x) (x)
 #define WRITE_ONCE(x, val) x=(val)
 #define RCU_INIT_POINTER(p, v) p=(v)
 
index c3bd294..190cc88 100644 (file)
@@ -1951,6 +1951,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok)
                   strcmp(token, "*") == 0 ||
                   strcmp(token, "^") == 0 ||
                   strcmp(token, "/") == 0 ||
+                  strcmp(token, "%") == 0 ||
                   strcmp(token, "<") == 0 ||
                   strcmp(token, ">") == 0 ||
                   strcmp(token, "<=") == 0 ||
@@ -2397,6 +2398,12 @@ static int arg_num_eval(struct print_arg *arg, long long *val)
                                break;
                        *val = left + right;
                        break;
+               case '~':
+                       ret = arg_num_eval(arg->op.right, &right);
+                       if (!ret)
+                               break;
+                       *val = ~right;
+                       break;
                default:
                        do_warning("unknown op '%s'", arg->op.op);
                        ret = 0;
@@ -2634,6 +2641,7 @@ process_hex(struct event_format *event, struct print_arg *arg, char **tok)
 
 free_field:
        free_arg(arg->hex.field);
+       arg->hex.field = NULL;
 out:
        *tok = NULL;
        return EVENT_ERROR;
@@ -2658,8 +2666,10 @@ process_int_array(struct event_format *event, struct print_arg *arg, char **tok)
 
 free_size:
        free_arg(arg->int_array.count);
+       arg->int_array.count = NULL;
 free_field:
        free_arg(arg->int_array.field);
+       arg->int_array.field = NULL;
 out:
        *tok = NULL;
        return EVENT_ERROR;
@@ -3689,6 +3699,9 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg
                case '/':
                        val = left / right;
                        break;
+               case '%':
+                       val = left % right;
+                       break;
                case '*':
                        val = left * right;
                        break;
@@ -4971,7 +4984,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
                                                break;
                                        }
                                }
-                               if (pevent->long_size == 8 && ls &&
+                               if (pevent->long_size == 8 && ls == 1 &&
                                    sizeof(long) != 8) {
                                        char *p;
 
@@ -5335,41 +5348,45 @@ static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock)
        return false;
 }
 
-void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
-                       struct pevent_record *record, bool use_trace_clock)
+/**
+ * pevent_find_event_by_record - return the event from a given record
+ * @pevent: a handle to the pevent
+ * @record: The record to get the event from
+ *
+ * Returns the associated event for a given record, or NULL if non is
+ * is found.
+ */
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record)
 {
-       static const char *spaces = "                    "; /* 20 spaces */
-       struct event_format *event;
-       unsigned long secs;
-       unsigned long usecs;
-       unsigned long nsecs;
-       const char *comm;
-       void *data = record->data;
        int type;
-       int pid;
-       int len;
-       int p;
-       bool use_usec_format;
-
-       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
-                                                       use_trace_clock);
-       if (use_usec_format) {
-               secs = record->ts / NSECS_PER_SEC;
-               nsecs = record->ts - secs * NSECS_PER_SEC;
-       }
 
        if (record->size < 0) {
                do_warning("ug! negative record size %d", record->size);
-               return;
+               return NULL;
        }
 
-       type = trace_parse_common_type(pevent, data);
+       type = trace_parse_common_type(pevent, record->data);
 
-       event = pevent_find_event(pevent, type);
-       if (!event) {
-               do_warning("ug! no event found for type %d", type);
-               return;
-       }
+       return pevent_find_event(pevent, type);
+}
+
+/**
+ * pevent_print_event_task - Write the event task comm, pid and CPU
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the tasks comm, pid and CPU to @s.
+ */
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       void *data = record->data;
+       const char *comm;
+       int pid;
 
        pid = parse_common_pid(pevent, data);
        comm = find_cmdline(pevent, pid);
@@ -5377,9 +5394,43 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
        if (pevent->latency_format) {
                trace_seq_printf(s, "%8.8s-%-5d %3d",
                       comm, pid, record->cpu);
-               pevent_data_lat_fmt(pevent, s, record);
        } else
                trace_seq_printf(s, "%16s-%-5d [%03d]", comm, pid, record->cpu);
+}
+
+/**
+ * pevent_print_event_time - Write the event timestamp
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ * @use_trace_clock: Set to parse according to the @pevent->trace_clock
+ *
+ * Writes the timestamp of the record into @s.
+ */
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock)
+{
+       unsigned long secs;
+       unsigned long usecs;
+       unsigned long nsecs;
+       int p;
+       bool use_usec_format;
+
+       use_usec_format = is_timestamp_in_us(pevent->trace_clock,
+                                                       use_trace_clock);
+       if (use_usec_format) {
+               secs = record->ts / NSECS_PER_SEC;
+               nsecs = record->ts - secs * NSECS_PER_SEC;
+       }
+
+       if (pevent->latency_format) {
+               trace_seq_printf(s, " %3d", record->cpu);
+               pevent_data_lat_fmt(pevent, s, record);
+       } else
+               trace_seq_printf(s, " [%03d]", record->cpu);
 
        if (use_usec_format) {
                if (pevent->flags & PEVENT_NSEC_OUTPUT) {
@@ -5387,14 +5438,36 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                        p = 9;
                } else {
                        usecs = (nsecs + 500) / NSECS_PER_USEC;
+                       /* To avoid usecs larger than 1 sec */
+                       if (usecs >= 1000000) {
+                               usecs -= 1000000;
+                               secs++;
+                       }
                        p = 6;
                }
 
-               trace_seq_printf(s, " %5lu.%0*lu: %s: ",
-                                       secs, p, usecs, event->name);
+               trace_seq_printf(s, " %5lu.%0*lu:", secs, p, usecs);
        } else
-               trace_seq_printf(s, " %12llu: %s: ",
-                                       record->ts, event->name);
+               trace_seq_printf(s, " %12llu:", record->ts);
+}
+
+/**
+ * pevent_print_event_data - Write the event data section
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @event: the handle to the record's event
+ * @record: The record to get the event from
+ *
+ * Writes the parsing of the record's data to @s.
+ */
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record)
+{
+       static const char *spaces = "                    "; /* 20 spaces */
+       int len;
+
+       trace_seq_printf(s, " %s: ", event->name);
 
        /* Space out the event names evenly. */
        len = strlen(event->name);
@@ -5404,6 +5477,23 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
        pevent_event_info(s, event, record);
 }
 
+void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
+                       struct pevent_record *record, bool use_trace_clock)
+{
+       struct event_format *event;
+
+       event = pevent_find_event_by_record(pevent, record);
+       if (!event) {
+               do_warning("ug! no event found for type %d",
+                          trace_parse_common_type(pevent, record->data));
+               return;
+       }
+
+       pevent_print_event_task(pevent, s, event, record);
+       pevent_print_event_time(pevent, s, event, record, use_trace_clock);
+       pevent_print_event_data(pevent, s, event, record);
+}
+
 static int events_id_cmp(const void *a, const void *b)
 {
        struct event_format * const * ea = a;
index 706d9bc..9ffde37 100644 (file)
@@ -628,6 +628,16 @@ int pevent_register_print_string(struct pevent *pevent, const char *fmt,
                                 unsigned long long addr);
 int pevent_pid_is_registered(struct pevent *pevent, int pid);
 
+void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
+void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record,
+                            bool use_trace_clock);
+void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s,
+                            struct event_format *event,
+                            struct pevent_record *record);
 void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
                        struct pevent_record *record, bool use_trace_clock);
 
@@ -694,6 +704,9 @@ struct event_format *pevent_find_event(struct pevent *pevent, int id);
 struct event_format *
 pevent_find_event_by_name(struct pevent *pevent, const char *sys, const char *name);
 
+struct event_format *
+pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record);
+
 void pevent_data_lat_fmt(struct pevent *pevent,
                         struct trace_seq *s, struct pevent_record *record);
 int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
index b9ca1e3..15949e2 100644 (file)
@@ -8,7 +8,7 @@ perf-config - Get and set variables in a configuration file.
 SYNOPSIS
 --------
 [verse]
-'perf config' -l | --list
+'perf config' [<file-option>] -l | --list
 
 DESCRIPTION
 -----------
@@ -21,6 +21,14 @@ OPTIONS
 --list::
        Show current config variables, name and value, for all sections.
 
+--user::
+       For writing and reading options: write to user
+       '$HOME/.perfconfig' file or read it.
+
+--system::
+       For writing and reading options: write to system-wide
+       '$(sysconfdir)/perfconfig' or read it.
+
 CONFIGURATION FILE
 ------------------
 
@@ -30,6 +38,10 @@ The '$HOME/.perfconfig' file is used to store a per-user configuration.
 The file '$(sysconfdir)/perfconfig' can be used to
 store a system-wide default configuration.
 
+When reading or writing, the values are read from the system and user
+configuration files by default, and options '--system' and '--user'
+can be used to tell the command to read from or write to only that location.
+
 Syntax
 ~~~~~~
 
@@ -62,7 +74,7 @@ Given a $HOME/.perfconfig like this:
                medium = green, default
                normal = lightgray, default
                selected = white, lightgray
-               code = blue, default
+               jump_arrows = blue, default
                addr = magenta, default
                root = white, blue
 
@@ -98,6 +110,347 @@ Given a $HOME/.perfconfig like this:
                order = caller
                sort-key = function
 
+Variables
+~~~~~~~~~
+
+colors.*::
+       The variables for customizing the colors used in the output for the
+       'report', 'top' and 'annotate' in the TUI. They should specify the
+       foreground and background colors, separated by a comma, for example:
+
+               medium = green, lightgray
+
+       If you want to use the color configured for you terminal, just leave it
+       as 'default', for example:
+
+               medium = default, lightgray
+
+       Available colors:
+       red, yellow, green, cyan, gray, black, blue,
+       white, default, magenta, lightgray
+
+       colors.top::
+               'top' means a overhead percentage which is more than 5%.
+               And values of this variable specify percentage colors.
+               Basic key values are foreground-color 'red' and
+               background-color 'default'.
+       colors.medium::
+               'medium' means a overhead percentage which has more than 0.5%.
+               Default values are 'green' and 'default'.
+       colors.normal::
+               'normal' means the rest of overhead percentages
+               except 'top', 'medium', 'selected'.
+               Default values are 'lightgray' and 'default'.
+       colors.selected::
+               This selects the colors for the current entry in a list of entries
+               from sub-commands (top, report, annotate).
+               Default values are 'black' and 'lightgray'.
+       colors.jump_arrows::
+               Colors for jump arrows on assembly code listings
+               such as 'jns', 'jmp', 'jane', etc.
+               Default values are 'blue', 'default'.
+       colors.addr::
+               This selects colors for addresses from 'annotate'.
+               Default values are 'magenta', 'default'.
+       colors.root::
+               Colors for headers in the output of a sub-commands (top, report).
+               Default values are 'white', 'blue'.
+
+tui.*, gtk.*::
+       Subcommands that can be configured here are 'top', 'report' and 'annotate'.
+       These values are booleans, for example:
+
+       [tui]
+               top = true
+
+       will make the TUI be the default for the 'top' subcommand. Those will be
+       available if the required libs were detected at tool build time.
+
+buildid.*::
+       buildid.dir::
+               Each executable and shared library in modern distributions comes with a
+               content based identifier that, if available, will be inserted in a
+               'perf.data' file header to, at analysis time find what is needed to do
+               symbol resolution, code annotation, etc.
+
+               The recording tools also stores a hard link or copy in a per-user
+               directory, $HOME/.debug/, of binaries, shared libraries, /proc/kallsyms
+               and /proc/kcore files to be used at analysis time.
+
+               The buildid.dir variable can be used to either change this directory
+               cache location, or to disable it altogether. If you want to disable it,
+               set buildid.dir to /dev/null. The default is $HOME/.debug
+
+annotate.*::
+       These options work only for TUI.
+       These are in control of addresses, jump function, source code
+       in lines of assembly code from a specific program.
+
+       annotate.hide_src_code::
+               If a program which is analyzed has source code,
+               this option lets 'annotate' print a list of assembly code with the source code.
+               For example, let's see a part of a program. There're four lines.
+               If this option is 'true', they can be printed
+               without source code from a program as below.
+
+               â”‚        push   %rbp
+               â”‚        mov    %rsp,%rbp
+               â”‚        sub    $0x10,%rsp
+               â”‚        mov    (%rdi),%rdx
+
+               But if this option is 'false', source code of the part
+               can be also printed as below. Default is 'false'.
+
+               â”‚      struct rb_node *rb_next(const struct rb_node *node)
+               â”‚      {
+               â”‚        push   %rbp
+               â”‚        mov    %rsp,%rbp
+               â”‚        sub    $0x10,%rsp
+               â”‚              struct rb_node *parent;
+               â”‚
+               â”‚              if (RB_EMPTY_NODE(node))
+               â”‚        mov    (%rdi),%rdx
+               â”‚              return n;
+
+        annotate.use_offset::
+               Basing on a first address of a loaded function, offset can be used.
+               Instead of using original addresses of assembly code,
+               addresses subtracted from a base address can be printed.
+               Let's illustrate an example.
+               If a base address is 0XFFFFFFFF81624d50 as below,
+
+               ffffffff81624d50 <load0>
+
+               an address on assembly code has a specific absolute address as below
+
+               ffffffff816250b8:│  mov    0x8(%r14),%rdi
+
+               but if use_offset is 'true', an address subtracted from a base address is printed.
+               Default is true. This option is only applied to TUI.
+
+                            368:│  mov    0x8(%r14),%rdi
+
+       annotate.jump_arrows::
+               There can be jump instruction among assembly code.
+               Depending on a boolean value of jump_arrows,
+               arrows can be printed or not which represent
+               where do the instruction jump into as below.
+
+               â”‚     â”Œâ”€â”€jmp    1333
+               â”‚     â”‚  xchg   %ax,%ax
+               â”‚1330:│  mov    %r15,%r10
+               â”‚1333:└─→cmp    %r15,%r14
+
+               If jump_arrow is 'false', the arrows isn't printed as below.
+               Default is 'false'.
+
+               â”‚      â†“ jmp    1333
+               â”‚        xchg   %ax,%ax
+               â”‚1330:   mov    %r15,%r10
+               â”‚1333:   cmp    %r15,%r14
+
+        annotate.show_linenr::
+               When showing source code if this option is 'true',
+               line numbers are printed as below.
+
+               â”‚1628         if (type & PERF_SAMPLE_IDENTIFIER) {
+               â”‚     â†“ jne    508
+               â”‚1628                 data->id = *array;
+               â”‚1629                 array++;
+               â”‚1630         }
+
+               However if this option is 'false', they aren't printed as below.
+               Default is 'false'.
+
+               â”‚             if (type & PERF_SAMPLE_IDENTIFIER) {
+               â”‚     â†“ jne    508
+               â”‚                     data->id = *array;
+               â”‚                     array++;
+               â”‚             }
+
+        annotate.show_nr_jumps::
+               Let's see a part of assembly code.
+
+               â”‚1382:   movb   $0x1,-0x270(%rbp)
+
+               If use this, the number of branches jumping to that address can be printed as below.
+               Default is 'false'.
+
+               â”‚1 1382:   movb   $0x1,-0x270(%rbp)
+
+        annotate.show_total_period::
+               To compare two records on an instruction base, with this option
+               provided, display total number of samples that belong to a line
+               in assembly code. If this option is 'true', total periods are printed
+               instead of percent values as below.
+
+                 302 â”‚      mov    %eax,%eax
+
+               But if this option is 'false', percent values for overhead are printed i.e.
+               Default is 'false'.
+
+               99.93 â”‚      mov    %eax,%eax
+
+hist.*::
+       hist.percentage::
+               This option control the way to calculate overhead of filtered entries -
+               that means the value of this option is effective only if there's a
+               filter (by comm, dso or symbol name). Suppose a following example:
+
+                      Overhead  Symbols
+                      ........  .......
+                       33.33%     foo
+                       33.33%     bar
+                       33.33%     baz
+
+              This is an original overhead and we'll filter out the first 'foo'
+              entry. The value of 'relative' would increase the overhead of 'bar'
+              and 'baz' to 50.00% for each, while 'absolute' would show their
+              current overhead (33.33%).
+
+ui.*::
+       ui.show-headers::
+               This option controls display of column headers (like 'Overhead' and 'Symbol')
+               in 'report' and 'top'. If this option is false, they are hidden.
+               This option is only applied to TUI.
+
+call-graph.*::
+       When sub-commands 'top' and 'report' work with -g/—-children
+       there're options in control of call-graph.
+
+       call-graph.record-mode::
+               The record-mode can be 'fp' (frame pointer), 'dwarf' and 'lbr'.
+               The value of 'dwarf' is effective only if perf detect needed library
+               (libunwind or a recent version of libdw).
+               'lbr' only work for cpus that support it.
+
+       call-graph.dump-size::
+               The size of stack to dump in order to do post-unwinding. Default is 8192 (byte).
+               When using dwarf into record-mode, the default size will be used if omitted.
+
+       call-graph.print-type::
+               The print-types can be graph (graph absolute), fractal (graph relative),
+               flat and folded. This option controls a way to show overhead for each callchain
+               entry. Suppose a following example.
+
+                Overhead  Symbols
+                ........  .......
+                  40.00%  foo
+                          |
+                          ---foo
+                             |
+                             |--50.00%--bar
+                             |          main
+                             |
+                              --50.00%--baz
+                                        main
+
+               This output is a 'fractal' format. The 'foo' came from 'bar' and 'baz' exactly
+               half and half so 'fractal' shows 50.00% for each
+               (meaning that it assumes 100% total overhead of 'foo').
+
+               The 'graph' uses absolute overhead value of 'foo' as total so each of
+               'bar' and 'baz' callchain will have 20.00% of overhead.
+               If 'flat' is used, single column and linear exposure of call chains.
+               'folded' mean call chains are displayed in a line, separated by semicolons.
+
+       call-graph.order::
+               This option controls print order of callchains. The default is
+               'callee' which means callee is printed at top and then followed by its
+               caller and so on. The 'caller' prints it in reverse order.
+
+               If this option is not set and report.children or top.children is
+               set to true (or the equivalent command line option is given),
+               the default value of this option is changed to 'caller' for the
+               execution of 'perf report' or 'perf top'. Other commands will
+               still default to 'callee'.
+
+       call-graph.sort-key::
+               The callchains are merged if they contain same information.
+               The sort-key option determines a way to compare the callchains.
+               A value of 'sort-key' can be 'function' or 'address'.
+               The default is 'function'.
+
+       call-graph.threshold::
+               When there're many callchains it'd print tons of lines. So perf omits
+               small callchains under a certain overhead (threshold) and this option
+               control the threshold. Default is 0.5 (%). The overhead is calculated
+               by value depends on call-graph.print-type.
+
+       call-graph.print-limit::
+               This is a maximum number of lines of callchain printed for a single
+               histogram entry. Default is 0 which means no limitation.
+
+report.*::
+       report.percent-limit::
+               This one is mostly the same as call-graph.threshold but works for
+               histogram entries. Entries having an overhead lower than this
+               percentage will not be printed. Default is '0'. If percent-limit
+               is '10', only entries which have more than 10% of overhead will be
+               printed.
+
+       report.queue-size::
+               This option sets up the maximum allocation size of the internal
+               event queue for ordering events. Default is 0, meaning no limit.
+
+       report.children::
+               'Children' means functions called from another function.
+               If this option is true, 'perf report' cumulates callchains of children
+               and show (accumulated) total overhead as well as 'Self' overhead.
+               Please refer to the 'perf report' manual. The default is 'true'.
+
+       report.group::
+               This option is to show event group information together.
+               Example output with this turned on, notice that there is one column
+               per event in the group, ref-cycles and cycles:
+
+               # group: {ref-cycles,cycles}
+               # ========
+               #
+               # Samples: 7K of event 'anon group { ref-cycles, cycles }'
+               # Event count (approx.): 6876107743
+               #
+               #         Overhead  Command      Shared Object               Symbol
+               # ................  .......  .................  ...................
+               #
+                   99.84%  99.76%  noploop  noploop            [.] main
+                    0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
+                    0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
+
+top.*::
+       top.children::
+               Same as 'report.children'. So if it is enabled, the output of 'top'
+               command will have 'Children' overhead column as well as 'Self' overhead
+               column by default.
+               The default is 'true'.
+
+man.*::
+       man.viewer::
+               This option can assign a tool to view manual pages when 'help'
+               subcommand was invoked. Supported tools are 'man', 'woman'
+               (with emacs client) and 'konqueror'. Default is 'man'.
+
+               New man viewer tool can be also added using 'man.<tool>.cmd'
+               or use different path using 'man.<tool>.path' config option.
+
+pager.*::
+       pager.<subcommand>::
+               When the subcommand is run on stdio, determine whether it uses
+               pager or not based on this value. Default is 'unspecified'.
+
+kmem.*::
+       kmem.default::
+               This option decides which allocator is to be analyzed if neither
+               '--slab' nor '--page' option is used. Default is 'slab'.
+
+record.*::
+       record.build-id::
+               This option can be 'cache', 'no-cache' or 'skip'.
+               'cache' is to post-process data and save/update the binaries into
+               the build-id cache (in ~/.debug). This is the default.
+               But if this option is 'no-cache', it will not update the build-id cache.
+               'skip' skips post-processing and does not update the cache.
+
 SEE ALSO
 --------
 linkperf:perf[1]
index 0b1cede..87b2588 100644 (file)
@@ -53,6 +53,13 @@ include::itrace.txt[]
 --strip::
        Use with --itrace to strip out non-synthesized events.
 
+-j::
+--jit::
+       Process jitdump files by injecting the mmap records corresponding to jitted
+       functions. This option also generates the ELF images for each jitted function
+       found in the jitdumps files captured in the input perf.data file. Use this option
+       if you are monitoring environment using JIT runtimes, such as Java, DART or V8.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
index fbceb63..19aa175 100644 (file)
@@ -341,6 +341,12 @@ Specify vmlinux path which has debuginfo.
 --buildid-all::
 Record build-id of all DSOs regardless whether it's actually hit or not.
 
+--all-kernel::
+Configure all used events to run in kernel space.
+
+--all-user::
+Configure all used events to run in user space.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
index 8a301f6..1211399 100644 (file)
@@ -117,6 +117,22 @@ OPTIONS
        And default sort keys are changed to comm, dso_from, symbol_from, dso_to
        and symbol_to, see '--branch-stack'.
 
+       If the --mem-mode option is used, the following sort keys are also available
+       (incompatible with --branch-stack):
+       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
+
+       - symbol_daddr: name of data symbol being executed on at the time of sample
+       - dso_daddr: name of library or module containing the data being executed
+       on at the time of the sample
+       - locked: whether the bus was locked at the time of the sample
+       - tlb: type of tlb access for the data at the time of the sample
+       - mem: type of memory access for the data at the time of the sample
+       - snoop: type of snoop (if any) for the data at the time of the sample
+       - dcacheline: the cacheline the data address is on at the time of the sample
+
+       And the default sort keys are changed to local_weight, mem, sym, dso,
+       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+
        If the data file has tracepoint event(s), following (dynamic) sort keys
        are also available:
        trace, trace_fields, [<event>.]<field>[/raw]
@@ -151,22 +167,6 @@ OPTIONS
        By default, every sort keys not specified in -F will be appended
        automatically.
 
-       If --mem-mode option is used, following sort keys are also available
-       (incompatible with --branch-stack):
-       symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
-
-       - symbol_daddr: name of data symbol being executed on at the time of sample
-       - dso_daddr: name of library or module containing the data being executed
-       on at the time of sample
-       - locked: whether the bus was locked at the time of sample
-       - tlb: type of tlb access for the data at the time of sample
-       - mem: type of memory access for the data at the time of sample
-       - snoop: type of snoop (if any) for the data at the time of sample
-       - dcacheline: the cacheline the data address is on at the time of sample
-
-       And default sort keys are changed to local_weight, mem, sym, dso,
-       symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
-
 -p::
 --parent=<regex>::
         A regex filter to identify parent. The parent is a caller of this
@@ -351,7 +351,10 @@ OPTIONS
 
 --percent-limit::
        Do not show entries which have an overhead under that percent.
-       (Default: 0).
+       (Default: 0).  Note that this option also sets the percent limit (threshold)
+       of callchains.  However the default value of callchain threshold is
+       different than the default value of hist entries.  Please see the
+       --call-graph option for details.
 
 --percentage::
        Determine how to display the overhead percentage of filtered entries.
@@ -398,6 +401,9 @@ include::itrace.txt[]
 --raw-trace::
        When displaying traceevent output, do not use print fmt or plugins.
 
+--hierarchy::
+       Enable hierarchical output.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
index 52ef7a9..04f23b4 100644 (file)
@@ -69,6 +69,14 @@ report::
 --scale::
        scale/normalize counter values
 
+-d::
+--detailed::
+       print more detailed statistics, can be specified up to 3 times
+
+          -d:          detailed events, L1 and LLC data cache
+        -d -d:     more detailed events, dTLB and iTLB events
+     -d -d -d:     very detailed events, adding prefetch events
+
 -r::
 --repeat=<n>::
        repeat command and print average + stddev (max: 100). 0 means forever.
@@ -139,6 +147,10 @@ Print count deltas every N milliseconds (minimum: 10ms)
 The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals.  Use with caution.
        example: 'perf stat -I 1000 -e cycles -a sleep 5'
 
+--metric-only::
+Only print computed metrics. Print them in a single line.
+Don't show any raw values. Not supported with --per-thread.
+
 --per-socket::
 Aggregate counts per processor socket for system-wide mode measurements.  This
 is a useful mode to detect imbalance between sockets.  To enable this mode,
@@ -211,6 +223,29 @@ $ perf stat -- make -j
 
  Wall-clock time elapsed:   719.554352 msecs
 
+CSV FORMAT
+----------
+
+With -x, perf stat is able to output a not-quite-CSV format output
+Commas in the output are not put into "". To make it easy to parse
+it is recommended to use a different character like -x \;
+
+The fields are in this order:
+
+       - optional usec time stamp in fractions of second (with -I xxx)
+       - optional CPU, core, or socket identifier
+       - optional number of logical CPUs aggregated
+       - counter value
+       - unit of the counter value or empty
+       - event name
+       - run time of counter
+       - percentage of measurement time the counter was running
+       - optional variance if multiple values are collected with -r
+       - optional metric value
+       - optional unit of metric
+
+Additional metrics may be printed with all earlier fields being empty.
+
 SEE ALSO
 --------
 linkperf:perf-top[1], linkperf:perf-list[1]
index b0e60e1..19f046f 100644 (file)
@@ -233,6 +233,9 @@ Default is to monitor all CPUS.
 --raw-trace::
        When displaying traceevent output, do not use print fmt or plugins.
 
+--hierarchy::
+       Enable hierarchy output.
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
index 767ea24..1d8d5bc 100644 (file)
@@ -5,7 +5,7 @@
        medium = green, lightgray
        normal = black, lightgray
        selected = lightgray, magenta
-       code = blue, lightgray
+       jump_arrows = blue, lightgray
        addr = magenta, lightgray
 
 [tui]
index e0ce957..5950b5a 100644 (file)
@@ -27,3 +27,4 @@ Skip collecing build-id when recording: perf record -B
 To change sampling frequency to 100 Hz: perf record -F 100
 See assembly instructions with percentage: perf annotate <symbol>
 If you prefer Intel style assembly, try: perf annotate -M intel
+For hierarchical output, try: perf report --hierarchy
index dcd9a70..32a64e6 100644 (file)
@@ -68,6 +68,20 @@ all tags TAGS:
        $(print_msg)
        $(make)
 
+ifdef MAKECMDGOALS
+has_clean := 0
+ifneq ($(filter clean,$(MAKECMDGOALS)),)
+  has_clean := 1
+endif # clean
+
+ifeq ($(has_clean),1)
+  rest := $(filter-out clean,$(MAKECMDGOALS))
+  ifneq ($(rest),)
+$(rest): clean
+  endif # rest
+endif # has_clean
+endif # MAKECMDGOALS
+
 #
 # The clean target is not really parallel, don't print the jobs info:
 #
@@ -75,10 +89,17 @@ clean:
        $(make)
 
 #
-# The build-test target is not really parallel, don't print the jobs info:
+# The build-test target is not really parallel, don't print the jobs info,
+# it also uses only the tests/make targets that don't pollute the source
+# repository, i.e. that uses O= or builds the tarpkg outside the source
+# repo directories.
+#
+# For a full test, use:
+#
+# make -C tools/perf -f tests/make
 #
 build-test:
-       @$(MAKE) SHUF=1 -f tests/make --no-print-directory
+       @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out
 
 #
 # All other targets get passed through:
index 0a22407..4a4fad4 100644 (file)
@@ -58,6 +58,9 @@ include config/utilities.mak
 #
 # Define NO_LIBBIONIC if you do not want bionic support
 #
+# Define NO_LIBCRYPTO if you do not want libcrypto (openssl) support
+# used for generating build-ids for ELFs generated by jitdump.
+#
 # Define NO_LIBDW_DWARF_UNWIND if you do not want libdw support
 # for dwarf backtrace post unwind.
 #
@@ -77,6 +80,9 @@ include config/utilities.mak
 # Define NO_AUXTRACE if you do not want AUX area tracing support
 #
 # Define NO_LIBBPF if you do not want BPF support
+#
+# Define FEATURES_DUMP to provide features detection dump file
+# and bypass the feature detection
 
 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
@@ -133,6 +139,8 @@ $(call allow-override,CC,$(CROSS_COMPILE)gcc)
 $(call allow-override,AR,$(CROSS_COMPILE)ar)
 $(call allow-override,LD,$(CROSS_COMPILE)ld)
 
+LD += $(EXTRA_LDFLAGS)
+
 PKG_CONFIG = $(CROSS_COMPILE)pkg-config
 
 RM      = rm -f
@@ -162,10 +170,28 @@ ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
 endif
 endif
 
+# Set FEATURE_TESTS to 'all' so all possible feature checkers are executed.
+# Without this setting the output feature dump file misses some features, for
+# example, liberty. Select all checkers so we won't get an incomplete feature
+# dump file.
 ifeq ($(config),1)
+ifdef MAKECMDGOALS
+ifeq ($(filter feature-dump,$(MAKECMDGOALS)),feature-dump)
+FEATURE_TESTS := all
+endif
+endif
 include config/Makefile
 endif
 
+# The FEATURE_DUMP_EXPORT holds location of the actual
+# FEATURE_DUMP file to be used to bypass feature detection
+# (for bpf or any other subproject)
+ifeq ($(FEATURES_DUMP),)
+FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP)
+else
+FEATURE_DUMP_EXPORT := $(FEATURES_DUMP)
+endif
+
 export prefix bindir sharedir sysconfdir DESTDIR
 
 # sparse is architecture-neutral, which means that we need to tell it
@@ -436,7 +462,7 @@ $(LIBAPI)-clean:
        $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
 
 $(LIBBPF): fixdep FORCE
-       $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP)
+       $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT)
 
 $(LIBBPF)-clean:
        $(call QUIET_CLEAN, libbpf)
@@ -606,10 +632,21 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean
        $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32
        $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
                $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
-               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c
+               $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c
        $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
        $(python-clean)
 
+#
+# To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY)
+# file if defined, with no further action.
+feature-dump:
+ifdef FEATURE_DUMP_COPY
+       @cp $(OUTPUT)FEATURE-DUMP $(FEATURE_DUMP_COPY)
+       @echo "FEATURE-DUMP file copied into $(FEATURE_DUMP_COPY)"
+else
+       @echo "FEATURE-DUMP file available in $(OUTPUT)FEATURE-DUMP"
+endif
+
 #
 # Trick: if ../../.git does not exist - we are building out of tree for example,
 # then force version regeneration:
index 7fbca17..18b1351 100644 (file)
@@ -1,3 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+PERF_HAVE_JITDUMP := 1
index 7fbca17..18b1351 100644 (file)
@@ -1,3 +1,4 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+PERF_HAVE_JITDUMP := 1
index 7fbca17..56e05f1 100644 (file)
@@ -1,3 +1,6 @@
 ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 endif
+
+HAVE_KVM_STAT_SUPPORT := 1
+PERF_HAVE_JITDUMP := 1
index 7b8b0d1..c8fe207 100644 (file)
@@ -1,5 +1,6 @@
 libperf-y += header.o
 libperf-y += sym-handling.o
+libperf-y += kvm-stat.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/arch/powerpc/util/book3s_hcalls.h
new file mode 100644 (file)
index 0000000..0dd6b7f
--- /dev/null
@@ -0,0 +1,123 @@
+#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H
+#define ARCH_PERF_BOOK3S_HV_HCALLS_H
+
+/*
+ * PowerPC HCALL codes : hcall code to name mapping
+ */
+#define kvm_trace_symbol_hcall \
+       {0x4, "H_REMOVE"},                                      \
+       {0x8, "H_ENTER"},                                       \
+       {0xc, "H_READ"},                                        \
+       {0x10, "H_CLEAR_MOD"},                                  \
+       {0x14, "H_CLEAR_REF"},                                  \
+       {0x18, "H_PROTECT"},                                    \
+       {0x1c, "H_GET_TCE"},                                    \
+       {0x20, "H_PUT_TCE"},                                    \
+       {0x24, "H_SET_SPRG0"},                                  \
+       {0x28, "H_SET_DABR"},                                   \
+       {0x2c, "H_PAGE_INIT"},                                  \
+       {0x30, "H_SET_ASR"},                                    \
+       {0x34, "H_ASR_ON"},                                     \
+       {0x38, "H_ASR_OFF"},                                    \
+       {0x3c, "H_LOGICAL_CI_LOAD"},                            \
+       {0x40, "H_LOGICAL_CI_STORE"},                           \
+       {0x44, "H_LOGICAL_CACHE_LOAD"},                         \
+       {0x48, "H_LOGICAL_CACHE_STORE"},                        \
+       {0x4c, "H_LOGICAL_ICBI"},                               \
+       {0x50, "H_LOGICAL_DCBF"},                               \
+       {0x54, "H_GET_TERM_CHAR"},                              \
+       {0x58, "H_PUT_TERM_CHAR"},                              \
+       {0x5c, "H_REAL_TO_LOGICAL"},                            \
+       {0x60, "H_HYPERVISOR_DATA"},                            \
+       {0x64, "H_EOI"},                                        \
+       {0x68, "H_CPPR"},                                       \
+       {0x6c, "H_IPI"},                                        \
+       {0x70, "H_IPOLL"},                                      \
+       {0x74, "H_XIRR"},                                       \
+       {0x78, "H_MIGRATE_DMA"},                                \
+       {0x7c, "H_PERFMON"},                                    \
+       {0xdc, "H_REGISTER_VPA"},                               \
+       {0xe0, "H_CEDE"},                                       \
+       {0xe4, "H_CONFER"},                                     \
+       {0xe8, "H_PROD"},                                       \
+       {0xec, "H_GET_PPP"},                                    \
+       {0xf0, "H_SET_PPP"},                                    \
+       {0xf4, "H_PURR"},                                       \
+       {0xf8, "H_PIC"},                                        \
+       {0xfc, "H_REG_CRQ"},                                    \
+       {0x100, "H_FREE_CRQ"},                                  \
+       {0x104, "H_VIO_SIGNAL"},                                \
+       {0x108, "H_SEND_CRQ"},                                  \
+       {0x110, "H_COPY_RDMA"},                                 \
+       {0x114, "H_REGISTER_LOGICAL_LAN"},                      \
+       {0x118, "H_FREE_LOGICAL_LAN"},                          \
+       {0x11c, "H_ADD_LOGICAL_LAN_BUFFER"},                    \
+       {0x120, "H_SEND_LOGICAL_LAN"},                          \
+       {0x124, "H_BULK_REMOVE"},                               \
+       {0x130, "H_MULTICAST_CTRL"},                            \
+       {0x134, "H_SET_XDABR"},                                 \
+       {0x138, "H_STUFF_TCE"},                                 \
+       {0x13c, "H_PUT_TCE_INDIRECT"},                          \
+       {0x14c, "H_CHANGE_LOGICAL_LAN_MAC"},                    \
+       {0x150, "H_VTERM_PARTNER_INFO"},                        \
+       {0x154, "H_REGISTER_VTERM"},                            \
+       {0x158, "H_FREE_VTERM"},                                \
+       {0x15c, "H_RESET_EVENTS"},                              \
+       {0x160, "H_ALLOC_RESOURCE"},                            \
+       {0x164, "H_FREE_RESOURCE"},                             \
+       {0x168, "H_MODIFY_QP"},                                 \
+       {0x16c, "H_QUERY_QP"},                                  \
+       {0x170, "H_REREGISTER_PMR"},                            \
+       {0x174, "H_REGISTER_SMR"},                              \
+       {0x178, "H_QUERY_MR"},                                  \
+       {0x17c, "H_QUERY_MW"},                                  \
+       {0x180, "H_QUERY_HCA"},                                 \
+       {0x184, "H_QUERY_PORT"},                                \
+       {0x188, "H_MODIFY_PORT"},                               \
+       {0x18c, "H_DEFINE_AQP1"},                               \
+       {0x190, "H_GET_TRACE_BUFFER"},                          \
+       {0x194, "H_DEFINE_AQP0"},                               \
+       {0x198, "H_RESIZE_MR"},                                 \
+       {0x19c, "H_ATTACH_MCQP"},                               \
+       {0x1a0, "H_DETACH_MCQP"},                               \
+       {0x1a4, "H_CREATE_RPT"},                                \
+       {0x1a8, "H_REMOVE_RPT"},                                \
+       {0x1ac, "H_REGISTER_RPAGES"},                           \
+       {0x1b0, "H_DISABLE_AND_GETC"},                          \
+       {0x1b4, "H_ERROR_DATA"},                                \
+       {0x1b8, "H_GET_HCA_INFO"},                              \
+       {0x1bc, "H_GET_PERF_COUNT"},                            \
+       {0x1c0, "H_MANAGE_TRACE"},                              \
+       {0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"},                   \
+       {0x1d8, "H_POLL_PENDING"},                              \
+       {0x1e4, "H_QUERY_INT_STATE"},                           \
+       {0x244, "H_ILLAN_ATTRIBUTES"},                          \
+       {0x250, "H_MODIFY_HEA_QP"},                             \
+       {0x254, "H_QUERY_HEA_QP"},                              \
+       {0x258, "H_QUERY_HEA"},                                 \
+       {0x25c, "H_QUERY_HEA_PORT"},                            \
+       {0x260, "H_MODIFY_HEA_PORT"},                           \
+       {0x264, "H_REG_BCMC"},                                  \
+       {0x268, "H_DEREG_BCMC"},                                \
+       {0x26c, "H_REGISTER_HEA_RPAGES"},                       \
+       {0x270, "H_DISABLE_AND_GET_HEA"},                       \
+       {0x274, "H_GET_HEA_INFO"},                              \
+       {0x278, "H_ALLOC_HEA_RESOURCE"},                        \
+       {0x284, "H_ADD_CONN"},                                  \
+       {0x288, "H_DEL_CONN"},                                  \
+       {0x298, "H_JOIN"},                                      \
+       {0x2a4, "H_VASI_STATE"},                                \
+       {0x2b0, "H_ENABLE_CRQ"},                                \
+       {0x2b8, "H_GET_EM_PARMS"},                              \
+       {0x2d0, "H_SET_MPP"},                                   \
+       {0x2d4, "H_GET_MPP"},                                   \
+       {0x2ec, "H_HOME_NODE_ASSOCIATIVITY"},                   \
+       {0x2f4, "H_BEST_ENERGY"},                               \
+       {0x2fc, "H_XIRR_X"},                                    \
+       {0x300, "H_RANDOM"},                                    \
+       {0x304, "H_COP"},                                       \
+       {0x314, "H_GET_MPP_X"},                                 \
+       {0x31c, "H_SET_MODE"},                                  \
+       {0xf000, "H_RTAS"}                                      \
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
new file mode 100644 (file)
index 0000000..e68ba2d
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H
+#define ARCH_PERF_BOOK3S_HV_EXITS_H
+
+/*
+ * PowerPC Interrupt vectors : exit code to name mapping
+ */
+
+#define kvm_trace_symbol_exit \
+       {0x0,   "RETURN_TO_HOST"}, \
+       {0x100, "SYSTEM_RESET"}, \
+       {0x200, "MACHINE_CHECK"}, \
+       {0x300, "DATA_STORAGE"}, \
+       {0x380, "DATA_SEGMENT"}, \
+       {0x400, "INST_STORAGE"}, \
+       {0x480, "INST_SEGMENT"}, \
+       {0x500, "EXTERNAL"}, \
+       {0x501, "EXTERNAL_LEVEL"}, \
+       {0x502, "EXTERNAL_HV"}, \
+       {0x600, "ALIGNMENT"}, \
+       {0x700, "PROGRAM"}, \
+       {0x800, "FP_UNAVAIL"}, \
+       {0x900, "DECREMENTER"}, \
+       {0x980, "HV_DECREMENTER"}, \
+       {0xc00, "SYSCALL"}, \
+       {0xd00, "TRACE"}, \
+       {0xe00, "H_DATA_STORAGE"}, \
+       {0xe20, "H_INST_STORAGE"}, \
+       {0xe40, "H_EMUL_ASSIST"}, \
+       {0xf00, "PERFMON"}, \
+       {0xf20, "ALTIVEC"}, \
+       {0xf40, "VSX"}
+
+#endif
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
new file mode 100644 (file)
index 0000000..74eee30
--- /dev/null
@@ -0,0 +1,170 @@
+#include "util/kvm-stat.h"
+#include "util/parse-events.h"
+#include "util/debug.h"
+
+#include "book3s_hv_exits.h"
+#include "book3s_hcalls.h"
+
+#define NR_TPS 4
+
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 40;
+const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter";
+const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit";
+
+define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
+define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
+
+/* Tracepoints specific to ppc_book3s_hv */
+const char *ppc_book3s_hv_kvm_tp[] = {
+       "kvm_hv:kvm_guest_enter",
+       "kvm_hv:kvm_guest_exit",
+       "kvm_hv:kvm_hcall_enter",
+       "kvm_hv:kvm_hcall_exit",
+       NULL,
+};
+
+/* 1 extra placeholder for NULL */
+const char *kvm_events_tp[NR_TPS + 1];
+const char *kvm_exit_reason;
+
+static void hcall_event_get_key(struct perf_evsel *evsel,
+                               struct perf_sample *sample,
+                               struct event_key *key)
+{
+       key->info = 0;
+       key->key = perf_evsel__intval(evsel, sample, "req");
+}
+
+static const char *get_hcall_exit_reason(u64 exit_code)
+{
+       struct exit_reasons_table *tbl = hcall_reasons;
+
+       while (tbl->reason != NULL) {
+               if (tbl->exit_code == exit_code)
+                       return tbl->reason;
+               tbl++;
+       }
+
+       pr_debug("Unknown hcall code: %lld\n",
+              (unsigned long long)exit_code);
+       return "UNKNOWN";
+}
+
+static bool hcall_event_end(struct perf_evsel *evsel,
+                           struct perf_sample *sample __maybe_unused,
+                           struct event_key *key __maybe_unused)
+{
+       return (!strcmp(evsel->name, kvm_events_tp[3]));
+}
+
+static bool hcall_event_begin(struct perf_evsel *evsel,
+                             struct perf_sample *sample, struct event_key *key)
+{
+       if (!strcmp(evsel->name, kvm_events_tp[2])) {
+               hcall_event_get_key(evsel, sample, key);
+               return true;
+       }
+
+       return false;
+}
+static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+                                  struct event_key *key,
+                                  char *decode)
+{
+       const char *hcall_reason = get_hcall_exit_reason(key->key);
+
+       scnprintf(decode, decode_str_len, "%s", hcall_reason);
+}
+
+static struct kvm_events_ops hcall_events = {
+       .is_begin_event = hcall_event_begin,
+       .is_end_event = hcall_event_end,
+       .decode_key = hcall_event_decode_key,
+       .name = "HCALL-EVENT",
+};
+
+static struct kvm_events_ops exit_events = {
+       .is_begin_event = exit_event_begin,
+       .is_end_event = exit_event_end,
+       .decode_key = exit_event_decode_key,
+       .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+       { .name = "vmexit", .ops = &exit_events },
+       { .name = "hcall", .ops = &hcall_events },
+       { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+       NULL,
+};
+
+
+static int is_tracepoint_available(const char *str, struct perf_evlist *evlist)
+{
+       struct parse_events_error err;
+       int ret;
+
+       err.str = NULL;
+       ret = parse_events(evlist, str, &err);
+       if (err.str)
+               pr_err("%s : %s\n", str, err.str);
+       return ret;
+}
+
+static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
+                               struct perf_evlist *evlist)
+{
+       const char **events_ptr;
+       int i, nr_tp = 0, err = -1;
+
+       /* Check for book3s_hv tracepoints */
+       for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) {
+               err = is_tracepoint_available(*events_ptr, evlist);
+               if (err)
+                       return -1;
+               nr_tp++;
+       }
+
+       for (i = 0; i < nr_tp; i++)
+               kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
+
+       kvm_events_tp[i] = NULL;
+       kvm_exit_reason = "trap";
+       kvm->exit_reasons = hv_exit_reasons;
+       kvm->exit_reasons_isa = "HV";
+
+       return 0;
+}
+
+/* Wrapper to setup kvm tracepoints */
+static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
+{
+       struct perf_evlist *evlist = perf_evlist__new();
+
+       if (evlist == NULL)
+               return -ENOMEM;
+
+       /* Right now, only supported on book3s_hv */
+       return ppc__setup_book3s_hv(kvm, evlist);
+}
+
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+{
+       return ppc__setup_kvm_tp(kvm);
+}
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+       int ret;
+
+       ret = ppc__setup_kvm_tp(kvm);
+       if (ret) {
+               kvm->exit_reasons = NULL;
+               kvm->exit_reasons_isa = NULL;
+       }
+
+       return ret;
+}
index a5dbc07..ed57df2 100644 (file)
@@ -10,7 +10,7 @@
  */
 
 #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/sie.h>
 
 define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
 define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
@@ -18,6 +18,12 @@ define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
 define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
 define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
 
+const char *vcpu_id_str = "id";
+const int decode_str_len = 40;
+const char *kvm_exit_reason = "icptcode";
+const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter";
+const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit";
+
 static void event_icpt_insn_get_key(struct perf_evsel *evsel,
                                    struct perf_sample *sample,
                                    struct event_key *key)
@@ -73,7 +79,7 @@ static struct kvm_events_ops exit_events = {
        .name = "VM-EXIT"
 };
 
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
        "kvm:kvm_s390_sie_enter",
        "kvm:kvm_s390_sie_exit",
        "kvm:kvm_s390_intercept_instruction",
index 09ba923..269af21 100644 (file)
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
 endif
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+PERF_HAVE_JITDUMP := 1
index 3e89ba8..7f064eb 100644 (file)
@@ -17,7 +17,7 @@ static pid_t spawn(void)
        if (pid)
                return pid;
 
-       while(1);
+       while(1)
                sleep(5);
        return 0;
 }
index 7bb0d13..72193f1 100644 (file)
@@ -59,7 +59,7 @@ static u64 mmap_read_self(void *addr)
                u64 quot, rem;
 
                quot = (cyc >> time_shift);
-               rem = cyc & ((1 << time_shift) - 1);
+               rem = cyc & (((u64)1 << time_shift) - 1);
                delta = time_offset + quot * time_mult +
                        ((rem * time_mult) >> time_shift);
 
@@ -103,6 +103,7 @@ static int __test__rdpmc(void)
 
        sigfillset(&sa.sa_mask);
        sa.sa_sigaction = segfault_handler;
+       sa.sa_flags = 0;
        sigaction(SIGSEGV, &sa, NULL);
 
        fd = sys_perf_event_open(&attr, 0, -1, -1,
index 8d8150f..d66f9ad 100644 (file)
@@ -60,7 +60,9 @@ struct branch {
        u64 misc;
 };
 
-static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused)
 {
        return INTEL_BTS_AUXTRACE_PRIV_SIZE;
 }
index f05daac..a339517 100644 (file)
@@ -89,7 +89,7 @@ static int intel_pt_parse_terms_with_default(struct list_head *formats,
 
        *config = attr.config;
 out_free:
-       parse_events__free_terms(terms);
+       parse_events_terms__delete(terms);
        return err;
 }
 
@@ -273,7 +273,9 @@ intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu)
        return attr;
 }
 
-static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                       struct perf_evlist *evlist __maybe_unused)
 {
        return INTEL_PT_AUXTRACE_PRIV_SIZE;
 }
index 14e4e66..b63d4be 100644 (file)
@@ -1,5 +1,7 @@
 #include "../../util/kvm-stat.h"
-#include <asm/kvm_perf.h>
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
 
 define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
 define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
@@ -11,6 +13,12 @@ static struct kvm_events_ops exit_events = {
        .name = "VM-EXIT"
 };
 
+const char *vcpu_id_str = "vcpu_id";
+const int decode_str_len = 20;
+const char *kvm_exit_reason = "exit_reason";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
 /*
  * For the mmio events, we treat:
  * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
@@ -65,7 +73,7 @@ static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                  struct event_key *key,
                                  char *decode)
 {
-       scnprintf(decode, DECODE_STR_LEN, "%#lx:%s",
+       scnprintf(decode, decode_str_len, "%#lx:%s",
                  (unsigned long)key->key,
                  key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
 }
@@ -109,7 +117,7 @@ static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
                                    struct event_key *key,
                                    char *decode)
 {
-       scnprintf(decode, DECODE_STR_LEN, "%#llx:%s",
+       scnprintf(decode, decode_str_len, "%#llx:%s",
                  (unsigned long long)key->key,
                  key->info ? "POUT" : "PIN");
 }
@@ -121,7 +129,7 @@ static struct kvm_events_ops ioport_events = {
        .name = "IO Port Access"
 };
 
-const char * const kvm_events_tp[] = {
+const char *kvm_events_tp[] = {
        "kvm:kvm_entry",
        "kvm:kvm_exit",
        "kvm:kvm_mmio",
index e4c2c30..5c3cce0 100644 (file)
@@ -1,6 +1,11 @@
+
+/* Various wrappers to make the kernel .S file build in user-space: */
+
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
+#define _ASM_EXTABLE_FAULT(x, y)
+
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
index cc5c126..cfe3663 100644 (file)
@@ -245,7 +245,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
                        hists__collapse_resort(hists, NULL);
                        /* Don't sort callchain */
                        perf_evsel__reset_sample_bit(pos, CALLCHAIN);
-                       hists__output_resort(hists, NULL);
+                       perf_evsel__output_resort(pos, NULL);
 
                        if (symbol_conf.event_group &&
                            !perf_evsel__is_group_leader(pos))
index d93bff7..632efc6 100644 (file)
@@ -38,19 +38,7 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
 
 static int build_id_cache__kcore_dir(char *dir, size_t sz)
 {
-       struct timeval tv;
-       struct tm tm;
-       char dt[32];
-
-       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
-               return -1;
-
-       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
-               return -1;
-
-       scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
-
-       return 0;
+       return fetch_current_timestamp(dir, sz);
 }
 
 static bool same_kallsyms_reloc(const char *from_dir, char *to_dir)
index f04e804..c42448e 100644 (file)
 #include "util/util.h"
 #include "util/debug.h"
 
+static bool use_system_config, use_user_config;
+
 static const char * const config_usage[] = {
-       "perf config [options]",
+       "perf config [<file-option>] [options]",
        NULL
 };
 
@@ -25,6 +27,8 @@ enum actions {
 static struct option config_options[] = {
        OPT_SET_UINT('l', "list", &actions,
                     "show current config variables", ACTION_LIST),
+       OPT_BOOLEAN(0, "system", &use_system_config, "use system config file"),
+       OPT_BOOLEAN(0, "user", &use_user_config, "use user config file"),
        OPT_END()
 };
 
@@ -42,10 +46,23 @@ static int show_config(const char *key, const char *value,
 int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
 {
        int ret = 0;
+       char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
 
        argc = parse_options(argc, argv, config_options, config_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
 
+       if (use_system_config && use_user_config) {
+               pr_err("Error: only one config file at a time\n");
+               parse_options_usage(config_usage, config_options, "user", 0);
+               parse_options_usage(NULL, config_options, "system", 0);
+               return -1;
+       }
+
+       if (use_system_config)
+               config_exclusive_filename = perf_etc_perfconfig();
+       else if (use_user_config)
+               config_exclusive_filename = user_config;
+
        switch (actions) {
        case ACTION_LIST:
                if (argc) {
@@ -53,9 +70,13 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
                        parse_options_usage(config_usage, config_options, "l", 1);
                } else {
                        ret = perf_config(show_config, NULL);
-                       if (ret < 0)
+                       if (ret < 0) {
+                               const char * config_filename = config_exclusive_filename;
+                               if (!config_exclusive_filename)
+                                       config_filename = user_config;
                                pr_err("Nothing configured, "
-                                      "please check your ~/.perfconfig file\n");
+                                      "please check your %s \n", config_filename);
+                       }
                }
                break;
        default:
index 36ccc2b..4d72359 100644 (file)
@@ -1264,8 +1264,6 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
        if (ret < 0)
                return ret;
 
-       perf_config(perf_default_config, NULL);
-
        argc = parse_options(argc, argv, options, diff_usage, 0);
 
        if (symbol__init(NULL) < 0)
index 96c1a4c..49d55e2 100644 (file)
@@ -86,8 +86,7 @@ static int check_emacsclient_version(void)
                return -1;
        }
 
-       strbuf_remove(&buffer, 0, strlen("emacsclient"));
-       version = atoi(buffer.buf);
+       version = atoi(buffer.buf + strlen("emacsclient"));
 
        if (version < 22) {
                fprintf(stderr,
@@ -273,7 +272,7 @@ static int perf_help_config(const char *var, const char *value, void *cb)
        if (!prefixcmp(var, "man."))
                return add_man_viewer_info(var, value);
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static struct cmdnames main_cmds, other_cmds;
index 0022e02..7fa6866 100644 (file)
@@ -17,6 +17,7 @@
 #include "util/build-id.h"
 #include "util/data.h"
 #include "util/auxtrace.h"
+#include "util/jit.h"
 
 #include <subcmd/parse-options.h>
 
@@ -29,6 +30,7 @@ struct perf_inject {
        bool                    sched_stat;
        bool                    have_auxtrace;
        bool                    strip;
+       bool                    jit_mode;
        const char              *input_name;
        struct perf_data_file   output;
        u64                     bytes_written;
@@ -71,6 +73,15 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool,
        return perf_event__repipe_synth(tool, event);
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused,
+                              union perf_event *event __maybe_unused,
+                              struct ordered_events *oe __maybe_unused)
+{
+       return 0;
+}
+#endif
+
 static int perf_event__repipe_op2_synth(struct perf_tool *tool,
                                        union perf_event *event,
                                        struct perf_session *session
@@ -234,6 +245,31 @@ static int perf_event__repipe_mmap(struct perf_tool *tool,
        return err;
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_sample *sample,
+                                      struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap(tool, event, sample, machine);
+}
+#endif
+
 static int perf_event__repipe_mmap2(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct perf_sample *sample,
@@ -247,6 +283,31 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool,
        return err;
 }
 
+#ifdef HAVE_JITDUMP
+static int perf_event__jit_repipe_mmap2(struct perf_tool *tool,
+                                       union perf_event *event,
+                                       struct perf_sample *sample,
+                                       struct machine *machine)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+       u64 n = 0;
+       int ret;
+
+       /*
+        * if jit marker, then inject jit mmaps and generate ELF images
+        */
+       ret = jit_process(inject->session, &inject->output, machine,
+                         event->mmap2.filename, sample->pid, &n);
+       if (ret < 0)
+               return ret;
+       if (ret) {
+               inject->bytes_written += n;
+               return 0;
+       }
+       return perf_event__repipe_mmap2(tool, event, sample, machine);
+}
+#endif
+
 static int perf_event__repipe_fork(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct perf_sample *sample,
@@ -626,12 +687,16 @@ static int __cmd_inject(struct perf_inject *inject)
        ret = perf_session__process_events(session);
 
        if (!file_out->is_pipe) {
-               if (inject->build_ids) {
+               if (inject->build_ids)
                        perf_header__set_feat(&session->header,
                                              HEADER_BUILD_ID);
-                       if (inject->have_auxtrace)
-                               dsos__hit_all(session);
-               }
+               /*
+                * Keep all buildids when there is unprocessed AUX data because
+                * it is not known which ones the AUX trace hits.
+                */
+               if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) &&
+                   inject->have_auxtrace && !inject->itrace_synth_opts.set)
+                       dsos__hit_all(session);
                /*
                 * The AUX areas have been removed and replaced with
                 * synthesized hardware events, so clear the feature flag and
@@ -703,7 +768,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
        };
        int ret;
 
-       const struct option options[] = {
+       struct option options[] = {
                OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
                            "Inject build-ids into the output stream"),
                OPT_STRING('i', "input", &inject.input_name, "file",
@@ -713,6 +778,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
                            "Merge sched-stat and sched-switch for getting events "
                            "where and how long tasks slept"),
+#ifdef HAVE_JITDUMP
+               OPT_BOOLEAN('j', "jit", &inject.jit_mode, "merge jitdump files into perf.data file"),
+#endif
                OPT_INCR('v', "verbose", &verbose,
                         "be more verbose (show build ids, etc)"),
                OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
@@ -729,7 +797,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                "perf inject [<options>]",
                NULL
        };
-
+#ifndef HAVE_JITDUMP
+       set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true);
+#endif
        argc = parse_options(argc, argv, options, inject_usage, 0);
 
        /*
@@ -755,6 +825,29 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
        if (inject.session == NULL)
                return -1;
 
+       if (inject.build_ids) {
+               /*
+                * to make sure the mmap records are ordered correctly
+                * and so that the correct especially due to jitted code
+                * mmaps. We cannot generate the buildid hit list and
+                * inject the jit mmaps at the same time for now.
+                */
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+       }
+#ifdef HAVE_JITDUMP
+       if (inject.jit_mode) {
+               inject.tool.mmap2          = perf_event__jit_repipe_mmap2;
+               inject.tool.mmap           = perf_event__jit_repipe_mmap;
+               inject.tool.ordered_events = true;
+               inject.tool.ordering_requires_timestamps = true;
+               /*
+                * JIT MMAP injection injects all MMAP events in one go, so it
+                * does not obey finished_round semantics.
+                */
+               inject.tool.finished_round = perf_event__drop_oe;
+       }
+#endif
        ret = symbol__init(&inject.session->header.env);
        if (ret < 0)
                goto out_delete;
index 1180105..4d3340c 100644 (file)
@@ -1834,7 +1834,7 @@ static int __cmd_record(int argc, const char **argv)
        return cmd_record(i, rec_argv, NULL);
 }
 
-static int kmem_config(const char *var, const char *value, void *cb)
+static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
 {
        if (!strcmp(var, "kmem.default")) {
                if (!strcmp(value, "slab"))
@@ -1847,7 +1847,7 @@ static int kmem_config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
index 4418d92..bff6664 100644 (file)
@@ -30,7 +30,6 @@
 #include <math.h>
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include <asm/kvm_perf.h>
 #include "util/kvm-stat.h"
 
 void exit_event_get_key(struct perf_evsel *evsel,
@@ -38,12 +37,12 @@ void exit_event_get_key(struct perf_evsel *evsel,
                        struct event_key *key)
 {
        key->info = 0;
-       key->key = perf_evsel__intval(evsel, sample, KVM_EXIT_REASON);
+       key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason);
 }
 
 bool kvm_exit_event(struct perf_evsel *evsel)
 {
-       return !strcmp(evsel->name, KVM_EXIT_TRACE);
+       return !strcmp(evsel->name, kvm_exit_trace);
 }
 
 bool exit_event_begin(struct perf_evsel *evsel,
@@ -59,7 +58,7 @@ bool exit_event_begin(struct perf_evsel *evsel,
 
 bool kvm_entry_event(struct perf_evsel *evsel)
 {
-       return !strcmp(evsel->name, KVM_ENTRY_TRACE);
+       return !strcmp(evsel->name, kvm_entry_trace);
 }
 
 bool exit_event_end(struct perf_evsel *evsel,
@@ -91,7 +90,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
        const char *exit_reason = get_exit_reason(kvm, key->exit_reasons,
                                                  key->key);
 
-       scnprintf(decode, DECODE_STR_LEN, "%s", exit_reason);
+       scnprintf(decode, decode_str_len, "%s", exit_reason);
 }
 
 static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
@@ -357,7 +356,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
        time_diff = sample->time - time_begin;
 
        if (kvm->duration && time_diff > kvm->duration) {
-               char decode[DECODE_STR_LEN];
+               char decode[decode_str_len];
 
                kvm->events_ops->decode_key(kvm, &event->key, decode);
                if (!skip_event(decode)) {
@@ -385,7 +384,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
                        return NULL;
                }
 
-               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, VCPU_ID);
+               vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample,
+                                                         vcpu_id_str);
                thread__set_priv(thread, vcpu_record);
        }
 
@@ -574,7 +574,7 @@ static void show_timeofday(void)
 
 static void print_result(struct perf_kvm_stat *kvm)
 {
-       char decode[DECODE_STR_LEN];
+       char decode[decode_str_len];
        struct kvm_event *event;
        int vcpu = kvm->trace_vcpu;
 
@@ -585,7 +585,7 @@ static void print_result(struct perf_kvm_stat *kvm)
 
        pr_info("\n\n");
        print_vcpu_info(kvm);
-       pr_info("%*s ", DECODE_STR_LEN, kvm->events_ops->name);
+       pr_info("%*s ", decode_str_len, kvm->events_ops->name);
        pr_info("%10s ", "Samples");
        pr_info("%9s ", "Samples%");
 
@@ -604,7 +604,7 @@ static void print_result(struct perf_kvm_stat *kvm)
                min = get_event_min(event, vcpu);
 
                kvm->events_ops->decode_key(kvm, &event->key, decode);
-               pr_info("%*s ", DECODE_STR_LEN, decode);
+               pr_info("%*s ", decode_str_len, decode);
                pr_info("%10llu ", (unsigned long long)ecount);
                pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
                pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
@@ -1132,6 +1132,11 @@ exit:
                _p;                     \
        })
 
+int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused)
+{
+       return 0;
+}
+
 static int
 kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 {
@@ -1148,7 +1153,14 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
                NULL
        };
        const char * const *events_tp;
+       int ret;
+
        events_tp_size = 0;
+       ret = setup_kvm_events_tp(kvm);
+       if (ret < 0) {
+               pr_err("Unable to setup the kvm tracepoints\n");
+               return ret;
+       }
 
        for (events_tp = kvm_events_tp; *events_tp; events_tp++)
                events_tp_size++;
@@ -1377,6 +1389,12 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
        /*
         * generate the event list
         */
+       err = setup_kvm_events_tp(kvm);
+       if (err < 0) {
+               pr_err("Unable to setup the kvm tracepoints\n");
+               return err;
+       }
+
        kvm->evlist = kvm_live_event_list();
        if (kvm->evlist == NULL) {
                err = -1;
index 3901700..88aeac9 100644 (file)
@@ -6,6 +6,8 @@
 #include "util/tool.h"
 #include "util/session.h"
 #include "util/data.h"
+#include "util/mem-events.h"
+#include "util/debug.h"
 
 #define MEM_OPERATION_LOAD     0x1
 #define MEM_OPERATION_STORE    0x2
@@ -21,11 +23,56 @@ struct perf_mem {
        DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
+static int parse_record_events(const struct option *opt,
+                              const char *str, int unset __maybe_unused)
+{
+       struct perf_mem *mem = *(struct perf_mem **)opt->value;
+       int j;
+
+       if (strcmp(str, "list")) {
+               if (!perf_mem_events__parse(str)) {
+                       mem->operation = 0;
+                       return 0;
+               }
+               exit(-1);
+       }
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               struct perf_mem_event *e = &perf_mem_events[j];
+
+               fprintf(stderr, "%-13s%-*s%s\n",
+                       e->tag,
+                       verbose ? 25 : 0,
+                       verbose ? perf_mem_events__name(j) : "",
+                       e->supported ? ": available" : "");
+       }
+       exit(0);
+}
+
+static const char * const __usage[] = {
+       "perf mem record [<options>] [<command>]",
+       "perf mem record [<options>] -- <command> [<options>]",
+       NULL
+};
+
+static const char * const *record_mem_usage = __usage;
+
 static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 {
        int rec_argc, i = 0, j;
        const char **rec_argv;
        int ret;
+       struct option options[] = {
+       OPT_CALLBACK('e', "event", &mem, "event",
+                    "event selector. use 'perf mem record -e list' to list available events",
+                    parse_record_events),
+       OPT_INCR('v', "verbose", &verbose,
+                "be more verbose (show counter open errors, etc)"),
+       OPT_END()
+       };
+
+       argc = parse_options(argc, argv, options, record_mem_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
 
        rec_argc = argc + 7; /* max number of arguments */
        rec_argv = calloc(rec_argc + 1, sizeof(char *));
@@ -35,23 +82,40 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
        rec_argv[i++] = "record";
 
        if (mem->operation & MEM_OPERATION_LOAD)
+               perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true;
+
+       if (perf_mem_events[PERF_MEM_EVENTS__LOAD].record)
                rec_argv[i++] = "-W";
 
        rec_argv[i++] = "-d";
 
-       if (mem->operation & MEM_OPERATION_LOAD) {
-               rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-loads/pp";
-       }
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               if (!perf_mem_events[j].record)
+                       continue;
+
+               if (!perf_mem_events[j].supported) {
+                       pr_err("failed: event '%s' not supported\n",
+                              perf_mem_events__name(j));
+                       return -1;
+               }
 
-       if (mem->operation & MEM_OPERATION_STORE) {
                rec_argv[i++] = "-e";
-               rec_argv[i++] = "cpu/mem-stores/pp";
-       }
+               rec_argv[i++] = perf_mem_events__name(j);
+       };
 
-       for (j = 1; j < argc; j++, i++)
+       for (j = 0; j < argc; j++, i++)
                rec_argv[i] = argv[j];
 
+       if (verbose > 0) {
+               pr_debug("calling: record ");
+
+               while (rec_argv[j]) {
+                       pr_debug("%s ", rec_argv[j]);
+                       j++;
+               }
+               pr_debug("\n");
+       }
+
        ret = cmd_record(i, rec_argv, NULL);
        free(rec_argv);
        return ret;
@@ -298,6 +362,10 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
                NULL
        };
 
+       if (perf_mem_events__init()) {
+               pr_err("failed: memory events not supported\n");
+               return -1;
+       }
 
        argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands,
                                        mem_usage, PARSE_OPT_STOP_AT_NON_OPTION);
index 319712a..515510e 100644 (file)
@@ -32,6 +32,8 @@
 #include "util/parse-branch-options.h"
 #include "util/parse-regs-options.h"
 #include "util/llvm-utils.h"
+#include "util/bpf-loader.h"
+#include "asm/bug.h"
 
 #include <unistd.h>
 #include <sched.h>
@@ -49,7 +51,9 @@ struct record {
        const char              *progname;
        int                     realtime_prio;
        bool                    no_buildid;
+       bool                    no_buildid_set;
        bool                    no_buildid_cache;
+       bool                    no_buildid_cache_set;
        bool                    buildid_all;
        unsigned long long      samples;
 };
@@ -320,7 +324,10 @@ try_again:
                } else {
                        pr_err("failed to mmap with %d (%s)\n", errno,
                                strerror_r(errno, msg, sizeof(msg)));
-                       rc = -errno;
+                       if (errno)
+                               rc = -errno;
+                       else
+                               rc = -EINVAL;
                }
                goto out;
        }
@@ -464,6 +471,29 @@ static void record__init_features(struct record *rec)
        perf_header__clear_feat(&session->header, HEADER_STAT);
 }
 
+static void
+record__finish_output(struct record *rec)
+{
+       struct perf_data_file *file = &rec->file;
+       int fd = perf_data_file__fd(file);
+
+       if (file->is_pipe)
+               return;
+
+       rec->session->header.data_size += rec->bytes_written;
+       file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
+
+       if (!rec->no_buildid) {
+               process_buildids(rec);
+
+               if (rec->buildid_all)
+                       dsos__hit_all(rec->session);
+       }
+       perf_session__write_header(rec->session, rec->evlist, fd, true);
+
+       return;
+}
+
 static volatile int workload_exec_errno;
 
 /*
@@ -482,6 +512,74 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
 
 static void snapshot_sig_handler(int sig);
 
+static int record__synthesize(struct record *rec)
+{
+       struct perf_session *session = rec->session;
+       struct machine *machine = &session->machines.host;
+       struct perf_data_file *file = &rec->file;
+       struct record_opts *opts = &rec->opts;
+       struct perf_tool *tool = &rec->tool;
+       int fd = perf_data_file__fd(file);
+       int err = 0;
+
+       if (file->is_pipe) {
+               err = perf_event__synthesize_attrs(tool, session,
+                                                  process_synthesized_event);
+               if (err < 0) {
+                       pr_err("Couldn't synthesize attrs.\n");
+                       goto out;
+               }
+
+               if (have_tracepoints(&rec->evlist->entries)) {
+                       /*
+                        * FIXME err <= 0 here actually means that
+                        * there were no tracepoints so its not really
+                        * an error, just that we don't need to
+                        * synthesize anything.  We really have to
+                        * return this more properly and also
+                        * propagate errors that now are calling die()
+                        */
+                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
+                                                                 process_synthesized_event);
+                       if (err <= 0) {
+                               pr_err("Couldn't record tracing data.\n");
+                               goto out;
+                       }
+                       rec->bytes_written += err;
+               }
+       }
+
+       if (rec->opts.full_auxtrace) {
+               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+                                       session, process_synthesized_event);
+               if (err)
+                       goto out;
+       }
+
+       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/kallsyms permission or run as root.\n");
+
+       err = perf_event__synthesize_modules(tool, process_synthesized_event,
+                                            machine);
+       WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
+                          "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+                          "Check /proc/modules permission or run as root.\n");
+
+       if (perf_guest) {
+               machines__process_guests(&session->machines,
+                                        perf_event__synthesize_guest_os, tool);
+       }
+
+       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
+                                           process_synthesized_event, opts->sample_address,
+                                           opts->proc_map_timeout);
+out:
+       return err;
+}
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
        int err;
@@ -534,6 +632,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                goto out_child;
        }
 
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_child;
+       }
+
        /*
         * Normally perf_session__new would do this, but it doesn't have the
         * evlist.
@@ -566,63 +674,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 
        machine = &session->machines.host;
 
-       if (file->is_pipe) {
-               err = perf_event__synthesize_attrs(tool, session,
-                                                  process_synthesized_event);
-               if (err < 0) {
-                       pr_err("Couldn't synthesize attrs.\n");
-                       goto out_child;
-               }
-
-               if (have_tracepoints(&rec->evlist->entries)) {
-                       /*
-                        * FIXME err <= 0 here actually means that
-                        * there were no tracepoints so its not really
-                        * an error, just that we don't need to
-                        * synthesize anything.  We really have to
-                        * return this more properly and also
-                        * propagate errors that now are calling die()
-                        */
-                       err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
-                                                                 process_synthesized_event);
-                       if (err <= 0) {
-                               pr_err("Couldn't record tracing data.\n");
-                               goto out_child;
-                       }
-                       rec->bytes_written += err;
-               }
-       }
-
-       if (rec->opts.full_auxtrace) {
-               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
-                                       session, process_synthesized_event);
-               if (err)
-                       goto out_delete_session;
-       }
-
-       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
-                                                machine);
-       if (err < 0)
-               pr_err("Couldn't record kernel reference relocation symbol\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/kallsyms permission or run as root.\n");
-
-       err = perf_event__synthesize_modules(tool, process_synthesized_event,
-                                            machine);
+       err = record__synthesize(rec);
        if (err < 0)
-               pr_err("Couldn't record kernel module information.\n"
-                      "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
-                      "Check /proc/modules permission or run as root.\n");
-
-       if (perf_guest) {
-               machines__process_guests(&session->machines,
-                                        perf_event__synthesize_guest_os, tool);
-       }
-
-       err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
-                                           process_synthesized_event, opts->sample_address,
-                                           opts->proc_map_timeout);
-       if (err != 0)
                goto out_child;
 
        if (rec->realtime_prio) {
@@ -758,18 +811,8 @@ out_child:
        /* this will be recalculated during process_buildids() */
        rec->samples = 0;
 
-       if (!err && !file->is_pipe) {
-               rec->session->header.data_size += rec->bytes_written;
-               file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
-
-               if (!rec->no_buildid) {
-                       process_buildids(rec);
-
-                       if (rec->buildid_all)
-                               dsos__hit_all(rec->session);
-               }
-               perf_session__write_header(rec->session, rec->evlist, fd, true);
-       }
+       if (!err)
+               record__finish_output(rec);
 
        if (!err && !quiet) {
                char samples[128];
@@ -1097,10 +1140,12 @@ struct option __record_options[] = {
        OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
        OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                    "don't sample"),
-       OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
-                   "do not update the buildid cache"),
-       OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
-                   "do not collect buildids in perf.data"),
+       OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
+                       &record.no_buildid_cache_set,
+                       "do not update the buildid cache"),
+       OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
+                       &record.no_buildid_set,
+                       "do not collect buildids in perf.data"),
        OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                     "monitor event in cgroup name only",
                     parse_cgroups),
@@ -1136,6 +1181,12 @@ struct option __record_options[] = {
                        "per thread proc mmap processing timeout in ms"),
        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                    "Record context switch events"),
+       OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
+                        "Configure all used events to run in kernel space.",
+                        PARSE_OPT_EXCLUSIVE),
+       OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
+                        "Configure all used events to run in user space.",
+                        PARSE_OPT_EXCLUSIVE),
        OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
                   "clang binary to use for compiling BPF scriptlets"),
        OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
index 2bf537f..7eea49f 100644 (file)
@@ -75,7 +75,10 @@ static int report__config(const char *var, const char *value, void *cb)
                return 0;
        }
        if (!strcmp(var, "report.percent-limit")) {
-               rep->min_percent = strtof(value, NULL);
+               double pcnt = strtof(value, NULL);
+
+               rep->min_percent = pcnt;
+               callchain_param.min_percent = pcnt;
                return 0;
        }
        if (!strcmp(var, "report.children")) {
@@ -87,7 +90,7 @@ static int report__config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int hist_iter__report_callback(struct hist_entry_iter *iter,
@@ -466,10 +469,11 @@ static int report__browse_hists(struct report *rep)
        return ret;
 }
 
-static void report__collapse_hists(struct report *rep)
+static int report__collapse_hists(struct report *rep)
 {
        struct ui_progress prog;
        struct perf_evsel *pos;
+       int ret = 0;
 
        ui_progress__init(&prog, rep->nr_entries, "Merging related events...");
 
@@ -481,7 +485,9 @@ static void report__collapse_hists(struct report *rep)
 
                hists->socket_filter = rep->socket_filter;
 
-               hists__collapse_resort(hists, &prog);
+               ret = hists__collapse_resort(hists, &prog);
+               if (ret < 0)
+                       break;
 
                /* Non-group events are considered as leader */
                if (symbol_conf.event_group &&
@@ -494,6 +500,7 @@ static void report__collapse_hists(struct report *rep)
        }
 
        ui_progress__finish();
+       return ret;
 }
 
 static void report__output_resort(struct report *rep)
@@ -504,7 +511,7 @@ static void report__output_resort(struct report *rep)
        ui_progress__init(&prog, rep->nr_entries, "Sorting events for output...");
 
        evlist__for_each(rep->session->evlist, pos)
-               hists__output_resort(evsel__hists(pos), &prog);
+               perf_evsel__output_resort(pos, &prog);
 
        ui_progress__finish();
 }
@@ -561,7 +568,11 @@ static int __cmd_report(struct report *rep)
                }
        }
 
-       report__collapse_hists(rep);
+       ret = report__collapse_hists(rep);
+       if (ret) {
+               ui__error("failed to process hist entry\n");
+               return ret;
+       }
 
        if (session_done())
                return 0;
@@ -633,8 +644,10 @@ parse_percent_limit(const struct option *opt, const char *str,
                    int unset __maybe_unused)
 {
        struct report *rep = opt->value;
+       double pcnt = strtof(str, NULL);
 
-       rep->min_percent = strtof(str, NULL);
+       rep->min_percent = pcnt;
+       callchain_param.min_percent = pcnt;
        return 0;
 }
 
@@ -798,6 +811,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                    "only show processor socket that match with this filter"),
        OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                    "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
        OPT_END()
        };
        struct perf_data_file file = {
@@ -907,13 +922,19 @@ repeat:
                symbol_conf.cumulate_callchain = false;
        }
 
-       if (setup_sorting(session->evlist) < 0) {
-               if (sort_order)
-                       parse_options_usage(report_usage, options, "s", 1);
-               if (field_order)
-                       parse_options_usage(sort_order ? NULL : report_usage,
-                                           options, "F", 1);
-               goto error;
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(report_usage, options, "F", 1);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto error;
+               }
+
+               sort__need_collapse = true;
        }
 
        /* Force tty output for header output and per-thread stat. */
@@ -925,6 +946,15 @@ repeat:
        else
                use_browser = 0;
 
+       if (setup_sorting(session->evlist) < 0) {
+               if (sort_order)
+                       parse_options_usage(report_usage, options, "s", 1);
+               if (field_order)
+                       parse_options_usage(sort_order ? NULL : report_usage,
+                                           options, "F", 1);
+               goto error;
+       }
+
        if (report.header || report.header_only) {
                perf_session__fprintf_info(session, stdout,
                                           report.show_full_info);
index c691214..57f9a7e 100644 (file)
@@ -23,6 +23,7 @@
 #include "util/stat.h"
 #include <linux/bitmap.h>
 #include "asm/bug.h"
+#include "util/mem-events.h"
 
 static char const              *script_name;
 static char const              *generate_script_lang;
@@ -58,6 +59,9 @@ enum perf_output_field {
        PERF_OUTPUT_IREGS           = 1U << 14,
        PERF_OUTPUT_BRSTACK         = 1U << 15,
        PERF_OUTPUT_BRSTACKSYM      = 1U << 16,
+       PERF_OUTPUT_DATA_SRC        = 1U << 17,
+       PERF_OUTPUT_WEIGHT          = 1U << 18,
+       PERF_OUTPUT_BPF_OUTPUT      = 1U << 19,
 };
 
 struct output_option {
@@ -81,6 +85,9 @@ struct output_option {
        {.str = "iregs", .field = PERF_OUTPUT_IREGS},
        {.str = "brstack", .field = PERF_OUTPUT_BRSTACK},
        {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM},
+       {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC},
+       {.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
+       {.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 };
 
 /* default set to maintain compatibility with current format */
@@ -101,7 +108,7 @@ static struct {
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                              PERF_OUTPUT_PERIOD,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 
        [PERF_TYPE_SOFTWARE] = {
@@ -111,7 +118,7 @@ static struct {
                              PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                              PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD | PERF_OUTPUT_BPF_OUTPUT,
 
                .invalid_fields = PERF_OUTPUT_TRACE,
        },
@@ -121,7 +128,7 @@ static struct {
 
                .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
                                  PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
-                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+                                 PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE
        },
 
        [PERF_TYPE_RAW] = {
@@ -131,9 +138,10 @@ static struct {
                              PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
                              PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
-                             PERF_OUTPUT_PERIOD,
+                             PERF_OUTPUT_PERIOD |  PERF_OUTPUT_ADDR |
+                             PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 
        [PERF_TYPE_BREAKPOINT] = {
@@ -145,7 +153,7 @@ static struct {
                              PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                              PERF_OUTPUT_PERIOD,
 
-               .invalid_fields = PERF_OUTPUT_TRACE,
+               .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
        },
 };
 
@@ -242,6 +250,16 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                                           PERF_OUTPUT_ADDR, allow_user_set))
                return -EINVAL;
 
+       if (PRINT_FIELD(DATA_SRC) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC",
+                                       PERF_OUTPUT_DATA_SRC))
+               return -EINVAL;
+
+       if (PRINT_FIELD(WEIGHT) &&
+               perf_evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT",
+                                       PERF_OUTPUT_WEIGHT))
+               return -EINVAL;
+
        if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
                pr_err("Display of symbols requested but neither sample IP nor "
                           "sample address\nis selected. Hence, no addresses to convert "
@@ -608,6 +626,84 @@ static void print_sample_flags(u32 flags)
        printf("  %-4s ", str);
 }
 
+struct printer_data {
+       int line_no;
+       bool hit_nul;
+       bool is_printable;
+};
+
+static void
+print_sample_bpf_output_printer(enum binary_printer_ops op,
+                               unsigned int val,
+                               void *extra)
+{
+       unsigned char ch = (unsigned char)val;
+       struct printer_data *printer_data = extra;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf("\n");
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf("%17s", !printer_data->line_no ? "BPF output:" :
+                                                       "           ");
+               break;
+       case BINARY_PRINT_ADDR:
+               printf(" %04x:", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               printf(" %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               printf("   ");
+               break;
+       case BINARY_PRINT_SEP:
+               printf("  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               if (printer_data->hit_nul && ch)
+                       printer_data->is_printable = false;
+
+               if (!isprint(ch)) {
+                       printf("%c", '.');
+
+                       if (!printer_data->is_printable)
+                               break;
+
+                       if (ch == '\0')
+                               printer_data->hit_nul = true;
+                       else
+                               printer_data->is_printable = false;
+               } else {
+                       printf("%c", ch);
+               }
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               printf(" ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               printf("\n");
+               printer_data->line_no++;
+               break;
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void print_sample_bpf_output(struct perf_sample *sample)
+{
+       unsigned int nr_bytes = sample->raw_size;
+       struct printer_data printer_data = {0, false, true};
+
+       print_binary(sample->raw_data, nr_bytes, 8,
+                    print_sample_bpf_output_printer, &printer_data);
+
+       if (printer_data.is_printable && printer_data.hit_nul)
+               printf("%17s \"%s\"\n", "BPF string:",
+                      (char *)(sample->raw_data));
+}
+
 struct perf_script {
        struct perf_tool        tool;
        struct perf_session     *session;
@@ -634,6 +730,23 @@ static int perf_evlist__max_name_len(struct perf_evlist *evlist)
        return max;
 }
 
+static size_t data_src__printf(u64 data_src)
+{
+       struct mem_info mi = { .data_src.val = data_src };
+       char decode[100];
+       char out[100];
+       static int maxlen;
+       int len;
+
+       perf_script__meminfo_scnprintf(decode, 100, &mi);
+
+       len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode);
+       if (maxlen < len)
+               maxlen = len;
+
+       return printf("%-*s", maxlen, out);
+}
+
 static void process_event(struct perf_script *script, union perf_event *event,
                          struct perf_sample *sample, struct perf_evsel *evsel,
                          struct addr_location *al)
@@ -673,6 +786,12 @@ static void process_event(struct perf_script *script, union perf_event *event,
        if (PRINT_FIELD(ADDR))
                print_sample_addr(event, sample, thread, attr);
 
+       if (PRINT_FIELD(DATA_SRC))
+               data_src__printf(sample->data_src);
+
+       if (PRINT_FIELD(WEIGHT))
+               printf("%16" PRIu64, sample->weight);
+
        if (PRINT_FIELD(IP)) {
                if (!symbol_conf.use_callchain)
                        printf(" ");
@@ -692,6 +811,9 @@ static void process_event(struct perf_script *script, union perf_event *event,
        else if (PRINT_FIELD(BRSTACKSYM))
                print_sample_brstacksym(event, sample, thread, attr);
 
+       if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
+               print_sample_bpf_output(sample);
+
        printf("\n");
 }
 
@@ -1090,23 +1212,6 @@ static struct script_spec *script_spec__find(const char *spec)
        return NULL;
 }
 
-static struct script_spec *script_spec__findnew(const char *spec,
-                                               struct scripting_ops *ops)
-{
-       struct script_spec *s = script_spec__find(spec);
-
-       if (s)
-               return s;
-
-       s = script_spec__new(spec, ops);
-       if (!s)
-               return NULL;
-
-       script_spec__add(s);
-
-       return s;
-}
-
 int script_spec_register(const char *spec, struct scripting_ops *ops)
 {
        struct script_spec *s;
@@ -1115,9 +1220,11 @@ int script_spec_register(const char *spec, struct scripting_ops *ops)
        if (s)
                return -1;
 
-       s = script_spec__findnew(spec, ops);
+       s = script_spec__new(spec, ops);
        if (!s)
                return -1;
+       else
+               script_spec__add(s);
 
        return 0;
 }
index 038e877..1f19f2f 100644 (file)
@@ -122,6 +122,7 @@ static bool                 sync_run                        = false;
 static unsigned int            initial_delay                   = 0;
 static unsigned int            unit_width                      = 4; /* strlen("unit") */
 static bool                    forever                         = false;
+static bool                    metric_only                     = false;
 static struct timespec         ref_time;
 static struct cpu_map          *aggr_map;
 static aggr_get_id_t           aggr_get_id;
@@ -735,6 +736,191 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
        }
 }
 
+struct outstate {
+       FILE *fh;
+       bool newline;
+       const char *prefix;
+       int  nfields;
+       int  id, nr;
+       struct perf_evsel *evsel;
+};
+
+#define METRIC_LEN  35
+
+static void new_line_std(void *ctx)
+{
+       struct outstate *os = ctx;
+
+       os->newline = true;
+}
+
+static void do_new_line_std(struct outstate *os)
+{
+       fputc('\n', os->fh);
+       fputs(os->prefix, os->fh);
+       aggr_printout(os->evsel, os->id, os->nr);
+       if (stat_config.aggr_mode == AGGR_NONE)
+               fprintf(os->fh, "        ");
+       fprintf(os->fh, "                                                 ");
+}
+
+static void print_metric_std(void *ctx, const char *color, const char *fmt,
+                            const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       bool newline = os->newline;
+
+       os->newline = false;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%-*s", METRIC_LEN, "");
+               return;
+       }
+
+       if (newline)
+               do_new_line_std(os);
+
+       n = fprintf(out, " # ");
+       if (color)
+               n += color_fprintf(out, color, fmt, val);
+       else
+               n += fprintf(out, fmt, val);
+       fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
+}
+
+static void new_line_csv(void *ctx)
+{
+       struct outstate *os = ctx;
+       int i;
+
+       fputc('\n', os->fh);
+       if (os->prefix)
+               fprintf(os->fh, "%s%s", os->prefix, csv_sep);
+       aggr_printout(os->evsel, os->id, os->nr);
+       for (i = 0; i < os->nfields; i++)
+               fputs(csv_sep, os->fh);
+}
+
+static void print_metric_csv(void *ctx,
+                            const char *color __maybe_unused,
+                            const char *fmt, const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+
+       if (unit == NULL || fmt == NULL) {
+               fprintf(out, "%s%s%s%s", csv_sep, csv_sep, csv_sep, csv_sep);
+               return;
+       }
+       snprintf(buf, sizeof(buf), fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       while (isspace(*unit))
+               unit++;
+       fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit);
+}
+
+#define METRIC_ONLY_LEN 20
+
+/* Filter out some columns that don't work well in metrics only mode */
+
+static bool valid_only_metric(const char *unit)
+{
+       if (!unit)
+               return false;
+       if (strstr(unit, "/sec") ||
+           strstr(unit, "hz") ||
+           strstr(unit, "Hz") ||
+           strstr(unit, "CPUs utilized"))
+               return false;
+       return true;
+}
+
+static const char *fixunit(char *buf, struct perf_evsel *evsel,
+                          const char *unit)
+{
+       if (!strncmp(unit, "of all", 6)) {
+               snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel),
+                        unit);
+               return buf;
+       }
+       return unit;
+}
+
+static void print_metric_only(void *ctx, const char *color, const char *fmt,
+                             const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       int n;
+       char buf[1024];
+       unsigned mlen = METRIC_ONLY_LEN;
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(buf, os->evsel, unit);
+       if (color)
+               n = color_fprintf(out, color, fmt, val);
+       else
+               n = fprintf(out, fmt, val);
+       if (n > METRIC_ONLY_LEN)
+               n = METRIC_ONLY_LEN;
+       if (mlen < strlen(unit))
+               mlen = strlen(unit) + 1;
+       fprintf(out, "%*s", mlen - n, "");
+}
+
+static void print_metric_only_csv(void *ctx, const char *color __maybe_unused,
+                                 const char *fmt,
+                                 const char *unit, double val)
+{
+       struct outstate *os = ctx;
+       FILE *out = os->fh;
+       char buf[64], *vals, *ends;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       snprintf(buf, sizeof buf, fmt, val);
+       vals = buf;
+       while (isspace(*vals))
+               vals++;
+       ends = vals;
+       while (isdigit(*ends) || *ends == '.')
+               ends++;
+       *ends = 0;
+       fprintf(out, "%s%s", vals, csv_sep);
+}
+
+static void new_line_metric(void *ctx __maybe_unused)
+{
+}
+
+static void print_metric_header(void *ctx, const char *color __maybe_unused,
+                               const char *fmt __maybe_unused,
+                               const char *unit, double val __maybe_unused)
+{
+       struct outstate *os = ctx;
+       char tbuf[1024];
+
+       if (!valid_only_metric(unit))
+               return;
+       unit = fixunit(tbuf, os->evsel, unit);
+       if (csv_output)
+               fprintf(os->fh, "%s%s", unit, csv_sep);
+       else
+               fprintf(os->fh, "%-*s ", METRIC_ONLY_LEN, unit);
+}
+
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
        FILE *output = stat_config.output;
@@ -763,6 +949,28 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 }
 
+static int first_shadow_cpu(struct perf_evsel *evsel, int id)
+{
+       int i;
+
+       if (!aggr_get_id)
+               return 0;
+
+       if (stat_config.aggr_mode == AGGR_NONE)
+               return id;
+
+       if (stat_config.aggr_mode == AGGR_GLOBAL)
+               return 0;
+
+       for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
+               int cpu2 = perf_evsel__cpus(evsel)->map[i];
+
+               if (aggr_get_id(evsel_list->cpus, cpu2) == id)
+                       return cpu2;
+       }
+       return 0;
+}
+
 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
        FILE *output = stat_config.output;
@@ -793,22 +1001,124 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 }
 
-static void printout(int id, int nr, struct perf_evsel *counter, double uval)
+static void printout(int id, int nr, struct perf_evsel *counter, double uval,
+                    char *prefix, u64 run, u64 ena, double noise)
 {
-       int cpu = cpu_map__id_to_cpu(id);
+       struct perf_stat_output_ctx out;
+       struct outstate os = {
+               .fh = stat_config.output,
+               .prefix = prefix ? prefix : "",
+               .id = id,
+               .nr = nr,
+               .evsel = counter,
+       };
+       print_metric_t pm = print_metric_std;
+       void (*nl)(void *);
 
-       if (stat_config.aggr_mode == AGGR_GLOBAL)
-               cpu = 0;
+       if (metric_only) {
+               nl = new_line_metric;
+               if (csv_output)
+                       pm = print_metric_only_csv;
+               else
+                       pm = print_metric_only;
+       } else
+               nl = new_line_std;
+
+       if (csv_output && !metric_only) {
+               static int aggr_fields[] = {
+                       [AGGR_GLOBAL] = 0,
+                       [AGGR_THREAD] = 1,
+                       [AGGR_NONE] = 1,
+                       [AGGR_SOCKET] = 2,
+                       [AGGR_CORE] = 2,
+               };
+
+               pm = print_metric_csv;
+               nl = new_line_csv;
+               os.nfields = 3;
+               os.nfields += aggr_fields[stat_config.aggr_mode];
+               if (counter->cgrp)
+                       os.nfields++;
+       }
+       if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
+               if (metric_only) {
+                       pm(&os, NULL, "", "", 0);
+                       return;
+               }
+               aggr_printout(counter, id, nr);
+
+               fprintf(stat_config.output, "%*s%s",
+                       csv_output ? 0 : 18,
+                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+                       csv_sep);
+
+               fprintf(stat_config.output, "%-*s%s",
+                       csv_output ? 0 : unit_width,
+                       counter->unit, csv_sep);
+
+               fprintf(stat_config.output, "%*s",
+                       csv_output ? 0 : -25,
+                       perf_evsel__name(counter));
+
+               if (counter->cgrp)
+                       fprintf(stat_config.output, "%s%s",
+                               csv_sep, counter->cgrp->name);
 
-       if (nsec_counter(counter))
+               if (!csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               print_noise(counter, noise);
+               print_running(run, ena);
+               if (csv_output)
+                       pm(&os, NULL, NULL, "", 0);
+               return;
+       }
+
+       if (metric_only)
+               /* nothing */;
+       else if (nsec_counter(counter))
                nsec_printout(id, nr, counter, uval);
        else
                abs_printout(id, nr, counter, uval);
 
-       if (!csv_output && !stat_config.interval)
-               perf_stat__print_shadow_stats(stat_config.output, counter,
-                                             uval, cpu,
-                                             stat_config.aggr_mode);
+       out.print_metric = pm;
+       out.new_line = nl;
+       out.ctx = &os;
+
+       if (csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+
+       perf_stat__print_shadow_stats(counter, uval,
+                               first_shadow_cpu(counter, id),
+                               &out);
+       if (!csv_output && !metric_only) {
+               print_noise(counter, noise);
+               print_running(run, ena);
+       }
+}
+
+static void aggr_update_shadow(void)
+{
+       int cpu, s2, id, s;
+       u64 val;
+       struct perf_evsel *counter;
+
+       for (s = 0; s < aggr_map->nr; s++) {
+               id = aggr_map->map[s];
+               evlist__for_each(evsel_list, counter) {
+                       val = 0;
+                       for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+                               s2 = aggr_get_id(evsel_list->cpus, cpu);
+                               if (s2 != id)
+                                       continue;
+                               val += perf_counts(counter->counts, cpu, 0)->val;
+                       }
+                       val = val * counter->scale;
+                       perf_stat__update_shadow_stats(counter, &val,
+                                                      first_shadow_cpu(counter, id));
+               }
+       }
 }
 
 static void print_aggr(char *prefix)
@@ -818,12 +1128,23 @@ static void print_aggr(char *prefix)
        int cpu, s, s2, id, nr;
        double uval;
        u64 ena, run, val;
+       bool first;
 
        if (!(aggr_map || aggr_get_id))
                return;
 
+       aggr_update_shadow();
+
+       /*
+        * With metric_only everything is on a single line.
+        * Without each counter has its own line.
+        */
        for (s = 0; s < aggr_map->nr; s++) {
+               if (prefix && metric_only)
+                       fprintf(output, "%s", prefix);
+
                id = aggr_map->map[s];
+               first = true;
                evlist__for_each(evsel_list, counter) {
                        val = ena = run = 0;
                        nr = 0;
@@ -836,41 +1157,20 @@ static void print_aggr(char *prefix)
                                run += perf_counts(counter->counts, cpu, 0)->run;
                                nr++;
                        }
-                       if (prefix)
-                               fprintf(output, "%s", prefix);
-
-                       if (run == 0 || ena == 0) {
+                       if (first && metric_only) {
+                               first = false;
                                aggr_printout(counter, id, nr);
-
-                               fprintf(output, "%*s%s",
-                                       csv_output ? 0 : 18,
-                                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                                       csv_sep);
-
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
-
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
-
-                               if (counter->cgrp)
-                                       fprintf(output, "%s%s",
-                                               csv_sep, counter->cgrp->name);
-
-                               print_running(run, ena);
-                               fputc('\n', output);
-                               continue;
                        }
-                       uval = val * counter->scale;
-                       printout(id, nr, counter, uval);
-                       if (!csv_output)
-                               print_noise(counter, 1.0);
+                       if (prefix && !metric_only)
+                               fprintf(output, "%s", prefix);
 
-                       print_running(run, ena);
-                       fputc('\n', output);
+                       uval = val * counter->scale;
+                       printout(id, nr, counter, uval, prefix, run, ena, 1.0);
+                       if (!metric_only)
+                               fputc('\n', output);
                }
+               if (metric_only)
+                       fputc('\n', output);
        }
 }
 
@@ -895,12 +1195,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
                        fprintf(output, "%s", prefix);
 
                uval = val * counter->scale;
-               printout(thread, 0, counter, uval);
-
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-
-               print_running(run, ena);
+               printout(thread, 0, counter, uval, prefix, run, ena, 1.0);
                fputc('\n', output);
        }
 }
@@ -914,43 +1209,19 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
        FILE *output = stat_config.output;
        struct perf_stat_evsel *ps = counter->priv;
        double avg = avg_stats(&ps->res_stats[0]);
-       int scaled = counter->counts->scaled;
        double uval;
        double avg_enabled, avg_running;
 
        avg_enabled = avg_stats(&ps->res_stats[1]);
        avg_running = avg_stats(&ps->res_stats[2]);
 
-       if (prefix)
+       if (prefix && !metric_only)
                fprintf(output, "%s", prefix);
 
-       if (scaled == -1 || !counter->supported) {
-               fprintf(output, "%*s%s",
-                       csv_output ? 0 : 18,
-                       counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                       csv_sep);
-               fprintf(output, "%-*s%s",
-                       csv_output ? 0 : unit_width,
-                       counter->unit, csv_sep);
-               fprintf(output, "%*s",
-                       csv_output ? 0 : -25,
-                       perf_evsel__name(counter));
-
-               if (counter->cgrp)
-                       fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
-
-               print_running(avg_running, avg_enabled);
-               fputc('\n', output);
-               return;
-       }
-
        uval = avg * counter->scale;
-       printout(-1, 0, counter, uval);
-
-       print_noise(counter, avg);
-
-       print_running(avg_running, avg_enabled);
-       fprintf(output, "\n");
+       printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
+       if (!metric_only)
+               fprintf(output, "\n");
 }
 
 /*
@@ -972,39 +1243,78 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
                if (prefix)
                        fprintf(output, "%s", prefix);
 
-               if (run == 0 || ena == 0) {
-                       fprintf(output, "CPU%*d%s%*s%s",
-                               csv_output ? 0 : -4,
-                               perf_evsel__cpus(counter)->map[cpu], csv_sep,
-                               csv_output ? 0 : 18,
-                               counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-                               csv_sep);
+               uval = val * counter->scale;
+               printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
 
-                               fprintf(output, "%-*s%s",
-                                       csv_output ? 0 : unit_width,
-                                       counter->unit, csv_sep);
+               fputc('\n', output);
+       }
+}
 
-                               fprintf(output, "%*s",
-                                       csv_output ? 0 : -25,
-                                       perf_evsel__name(counter));
+static void print_no_aggr_metric(char *prefix)
+{
+       int cpu;
+       int nrcpus = 0;
+       struct perf_evsel *counter;
+       u64 ena, run, val;
+       double uval;
 
-                       if (counter->cgrp)
-                               fprintf(output, "%s%s",
-                                       csv_sep, counter->cgrp->name);
+       nrcpus = evsel_list->cpus->nr;
+       for (cpu = 0; cpu < nrcpus; cpu++) {
+               bool first = true;
 
-                       print_running(run, ena);
-                       fputc('\n', output);
-                       continue;
+               if (prefix)
+                       fputs(prefix, stat_config.output);
+               evlist__for_each(evsel_list, counter) {
+                       if (first) {
+                               aggr_printout(counter, cpu, 0);
+                               first = false;
+                       }
+                       val = perf_counts(counter->counts, cpu, 0)->val;
+                       ena = perf_counts(counter->counts, cpu, 0)->ena;
+                       run = perf_counts(counter->counts, cpu, 0)->run;
+
+                       uval = val * counter->scale;
+                       printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
                }
+               fputc('\n', stat_config.output);
+       }
+}
 
-               uval = val * counter->scale;
-               printout(cpu, 0, counter, uval);
-               if (!csv_output)
-                       print_noise(counter, 1.0);
-               print_running(run, ena);
+static int aggr_header_lens[] = {
+       [AGGR_CORE] = 18,
+       [AGGR_SOCKET] = 12,
+       [AGGR_NONE] = 6,
+       [AGGR_THREAD] = 24,
+       [AGGR_GLOBAL] = 0,
+};
 
-               fputc('\n', output);
+static void print_metric_headers(char *prefix)
+{
+       struct perf_stat_output_ctx out;
+       struct perf_evsel *counter;
+       struct outstate os = {
+               .fh = stat_config.output
+       };
+
+       if (prefix)
+               fprintf(stat_config.output, "%s", prefix);
+
+       if (!csv_output)
+               fprintf(stat_config.output, "%*s",
+                       aggr_header_lens[stat_config.aggr_mode], "");
+
+       /* Print metrics headers only */
+       evlist__for_each(evsel_list, counter) {
+               os.evsel = counter;
+               out.ctx = &os;
+               out.print_metric = print_metric_header;
+               out.new_line = new_line_metric;
+               os.evsel = counter;
+               perf_stat__print_shadow_stats(counter, 0,
+                                             0,
+                                             &out);
        }
+       fputc('\n', stat_config.output);
 }
 
 static void print_interval(char *prefix, struct timespec *ts)
@@ -1014,7 +1324,7 @@ static void print_interval(char *prefix, struct timespec *ts)
 
        sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
 
-       if (num_print_interval == 0 && !csv_output) {
+       if (num_print_interval == 0 && !csv_output && !metric_only) {
                switch (stat_config.aggr_mode) {
                case AGGR_SOCKET:
                        fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
@@ -1101,6 +1411,17 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
        else
                print_header(argc, argv);
 
+       if (metric_only) {
+               static int num_print_iv;
+
+               if (num_print_iv == 0)
+                       print_metric_headers(prefix);
+               if (num_print_iv++ == 25)
+                       num_print_iv = 0;
+               if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
+                       fprintf(stat_config.output, "%s", prefix);
+       }
+
        switch (stat_config.aggr_mode) {
        case AGGR_CORE:
        case AGGR_SOCKET:
@@ -1113,10 +1434,16 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
        case AGGR_GLOBAL:
                evlist__for_each(evsel_list, counter)
                        print_counter_aggr(counter, prefix);
+               if (metric_only)
+                       fputc('\n', stat_config.output);
                break;
        case AGGR_NONE:
-               evlist__for_each(evsel_list, counter)
-                       print_counter(counter, prefix);
+               if (metric_only)
+                       print_no_aggr_metric(prefix);
+               else {
+                       evlist__for_each(evsel_list, counter)
+                               print_counter(counter, prefix);
+               }
                break;
        case AGGR_UNSET:
        default:
@@ -1237,6 +1564,8 @@ static const struct option stat_options[] = {
                     "aggregate counts per thread", AGGR_THREAD),
        OPT_UINTEGER('D', "delay", &initial_delay,
                     "ms to wait before starting measurement after program start"),
+       OPT_BOOLEAN(0, "metric-only", &metric_only,
+                       "Only print computed metrics. No raw values"),
        OPT_END()
 };
 
@@ -1435,7 +1764,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
  */
 static int add_default_attributes(void)
 {
-       struct perf_event_attr default_attrs[] = {
+       struct perf_event_attr default_attrs0[] = {
 
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES       },
@@ -1443,8 +1772,14 @@ static int add_default_attributes(void)
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS            },
 
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES             },
+};
+       struct perf_event_attr frontend_attrs[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND        },
+};
+       struct perf_event_attr backend_attrs[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+       struct perf_event_attr default_attrs1[] = {
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS           },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS    },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES          },
@@ -1561,7 +1896,19 @@ static int add_default_attributes(void)
        }
 
        if (!evsel_list->nr_entries) {
-               if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
+                       return -1;
+               if (pmu_have_event("cpu", "stalled-cycles-frontend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               frontend_attrs) < 0)
+                               return -1;
+               }
+               if (pmu_have_event("cpu", "stalled-cycles-backend")) {
+                       if (perf_evlist__add_default_attrs(evsel_list,
+                                               backend_attrs) < 0)
+                               return -1;
+               }
+               if (perf_evlist__add_default_attrs(evsel_list, default_attrs1) < 0)
                        return -1;
        }
 
@@ -1825,9 +2172,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        if (evsel_list == NULL)
                return -ENOMEM;
 
+       parse_events__shrink_config_terms();
        argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
                                        (const char **) stat_usage,
                                        PARSE_OPT_STOP_AT_NON_OPTION);
+       perf_stat__init_shadow_stats();
 
        if (csv_sep) {
                csv_output = true;
@@ -1858,6 +2207,16 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out;
        }
 
+       if (metric_only && stat_config.aggr_mode == AGGR_THREAD) {
+               fprintf(stderr, "--metric-only is not supported with --per-thread\n");
+               goto out;
+       }
+
+       if (metric_only && run_count > 1) {
+               fprintf(stderr, "--metric-only is not supported with -r\n");
+               goto out;
+       }
+
        if (output_fd < 0) {
                fprintf(stderr, "argument to --log-fd must be a > 0\n");
                parse_options_usage(stat_usage, stat_options, "log-fd", 0);
index bf01cbb..94af190 100644 (file)
@@ -252,7 +252,8 @@ static void perf_top__print_sym_table(struct perf_top *top)
        char bf[160];
        int printed = 0;
        const int win_width = top->winsize.ws_col - 1;
-       struct hists *hists = evsel__hists(top->sym_evsel);
+       struct perf_evsel *evsel = top->sym_evsel;
+       struct hists *hists = evsel__hists(evsel);
 
        puts(CONSOLE_CLEAR);
 
@@ -288,7 +289,7 @@ static void perf_top__print_sym_table(struct perf_top *top)
        }
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        hists__output_recalc_col_len(hists, top->print_entries - printed);
        putchar('\n');
@@ -540,6 +541,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c)
 static void perf_top__sort_new_samples(void *arg)
 {
        struct perf_top *t = arg;
+       struct perf_evsel *evsel = t->sym_evsel;
        struct hists *hists;
 
        perf_top__reset_sample_counters(t);
@@ -547,7 +549,7 @@ static void perf_top__sort_new_samples(void *arg)
        if (t->evlist->selected != NULL)
                t->sym_evsel = t->evlist->selected;
 
-       hists = evsel__hists(t->sym_evsel);
+       hists = evsel__hists(evsel);
 
        if (t->evlist->enabled) {
                if (t->zero) {
@@ -559,7 +561,7 @@ static void perf_top__sort_new_samples(void *arg)
        }
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 }
 
 static void *display_thread_tui(void *arg)
@@ -1063,7 +1065,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
        return parse_callchain_top_opt(arg);
 }
 
-static int perf_top_config(const char *var, const char *value, void *cb)
+static int perf_top_config(const char *var, const char *value, void *cb __maybe_unused)
 {
        if (!strcmp(var, "top.call-graph"))
                var = "call-graph.record-mode"; /* fall-through */
@@ -1072,7 +1074,7 @@ static int perf_top_config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int
@@ -1212,6 +1214,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                     parse_branch_stack),
        OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
                    "Show raw trace event output (do not use print fmt or plugins)"),
+       OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+                   "Show entries in a hierarchy"),
        OPT_END()
        };
        const char * const top_usage[] = {
@@ -1239,10 +1243,30 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out_delete_evlist;
        }
 
+       if (symbol_conf.report_hierarchy) {
+               /* disable incompatible options */
+               symbol_conf.event_group = false;
+               symbol_conf.cumulate_callchain = false;
+
+               if (field_order) {
+                       pr_err("Error: --hierarchy and --fields options cannot be used together\n");
+                       parse_options_usage(top_usage, options, "fields", 0);
+                       parse_options_usage(NULL, options, "hierarchy", 0);
+                       goto out_delete_evlist;
+               }
+       }
+
        sort__mode = SORT_MODE__TOP;
        /* display thread wants entries to be collapsed in a different tree */
        sort__need_collapse = 1;
 
+       if (top.use_stdio)
+               use_browser = 0;
+       else if (top.use_tui)
+               use_browser = 1;
+
+       setup_browser(false);
+
        if (setup_sorting(top.evlist) < 0) {
                if (sort_order)
                        parse_options_usage(top_usage, options, "s", 1);
@@ -1252,13 +1276,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                goto out_delete_evlist;
        }
 
-       if (top.use_stdio)
-               use_browser = 0;
-       else if (top.use_tui)
-               use_browser = 1;
-
-       setup_browser(false);
-
        status = target__validate(target);
        if (status) {
                target__strerror(target, status, errbuf, BUFSIZ);
index 20916dd..8dc98c5 100644 (file)
@@ -33,6 +33,7 @@
 #include "util/stat.h"
 #include "trace-event.h"
 #include "util/parse-events.h"
+#include "util/bpf-loader.h"
 
 #include <libaudit.h>
 #include <stdlib.h>
@@ -1724,8 +1725,12 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 
        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
-       /* drop nr field - not relevant here; does not exist on older kernels */
-       if (sc->args && strcmp(sc->args->name, "nr") == 0) {
+       /*
+        * We need to check and discard the first variable '__syscall_nr'
+        * or 'nr' that mean the syscall number. It is needless here.
+        * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
+        */
+       if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }
@@ -2177,6 +2182,37 @@ out_dump:
        return 0;
 }
 
+static void bpf_output__printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       FILE *output = extra;
+       unsigned char ch = (unsigned char)val;
+
+       switch (op) {
+       case BINARY_PRINT_CHAR_DATA:
+               fprintf(output, "%c", isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_DATA_BEGIN:
+       case BINARY_PRINT_LINE_BEGIN:
+       case BINARY_PRINT_ADDR:
+       case BINARY_PRINT_NUM_DATA:
+       case BINARY_PRINT_NUM_PAD:
+       case BINARY_PRINT_SEP:
+       case BINARY_PRINT_CHAR_PAD:
+       case BINARY_PRINT_LINE_END:
+       case BINARY_PRINT_DATA_END:
+       default:
+               break;
+       }
+}
+
+static void bpf_output__fprintf(struct trace *trace,
+                               struct perf_sample *sample)
+{
+       print_binary(sample->raw_data, sample->raw_size, 8,
+                    bpf_output__printer, trace->output);
+}
+
 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                union perf_event *event __maybe_unused,
                                struct perf_sample *sample)
@@ -2189,7 +2225,9 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 
        fprintf(trace->output, "%s:", evsel->name);
 
-       if (evsel->tp_format) {
+       if (perf_evsel__is_bpf_output(evsel)) {
+               bpf_output__fprintf(trace, sample);
+       } else if (evsel->tp_format) {
                event_format__fprintf(evsel->tp_format, sample->cpu,
                                      sample->raw_data, sample->raw_size,
                                      trace->output);
@@ -2586,6 +2624,16 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
        if (err < 0)
                goto out_error_open;
 
+       err = bpf__apply_obj_config();
+       if (err) {
+               char errbuf[BUFSIZ];
+
+               bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
+               pr_err("ERROR: Apply config to BPF failed: %s\n",
+                        errbuf);
+               goto out_error_open;
+       }
+
        /*
         * Better not use !target__has_task() here because we need to cover the
         * case where no threads were specified in the command line, but a
index e5959c1..eca6a91 100644 (file)
@@ -61,50 +61,45 @@ endif
 
 ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
-else
-  #
-  # For linking with debug library, run like:
-  #
-  #   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
-  #
-  ifdef LIBUNWIND_DIR
-    LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
-    LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
-  endif
-  LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
-
-  # Set per-feature check compilation flags
-  FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
-  FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
 endif
+#
+# For linking with debug library, run like:
+#
+#   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+#
+ifdef LIBUNWIND_DIR
+  LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
+  LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
+endif
+LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
+
+# Set per-feature check compilation flags
+FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
+FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS_SUPPORT
 endif
 
-ifndef NO_LIBELF
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBDW_DIR=/opt/libdw/
-  ifdef LIBDW_DIR
-    LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
-    LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
+# for linking with debug library, run like:
+# make DEBUG=1 LIBDW_DIR=/opt/libdw/
+ifdef LIBDW_DIR
+  LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
+  LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
 endif
+FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
 
-ifdef LIBBABELTRACE
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
-  ifdef LIBBABELTRACE_DIR
-    LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
-    LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
-  endif
-  FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
-  FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
+# for linking with debug library, run like:
+# make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
+ifdef LIBBABELTRACE_DIR
+  LIBBABELTRACE_CFLAGS  := -I$(LIBBABELTRACE_DIR)/include
+  LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib
 endif
+FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
 
 FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi
 # include ARCH specific config
@@ -145,28 +140,26 @@ ifdef PARSER_DEBUG
   $(call detected_var,PARSER_DEBUG_FLEX)
 endif
 
-ifndef NO_LIBPYTHON
-  # Try different combinations to accommodate systems that only have
-  # python[2][-config] in weird combinations but always preferring
-  # python2 and python2-config as per pep-0394. If we catch a
-  # python[-config] in version 3, the version check will kill it.
-  PYTHON2 := $(if $(call get-executable,python2),python2,python)
-  override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
-  PYTHON2_CONFIG := \
-    $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
-  override PYTHON_CONFIG := \
-    $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
+# Try different combinations to accommodate systems that only have
+# python[2][-config] in weird combinations but always preferring
+# python2 and python2-config as per pep-0394. If we catch a
+# python[-config] in version 3, the version check will kill it.
+PYTHON2 := $(if $(call get-executable,python2),python2,python)
+override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
+PYTHON2_CONFIG := \
+  $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
+override PYTHON_CONFIG := \
+  $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
 
-  PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
 
-  PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-  PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
 
-  FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
-  FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
-  FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
-endif
+FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
+FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
+FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
 
 CFLAGS += -fno-omit-frame-pointer
 CFLAGS += -ggdb3
@@ -181,7 +174,11 @@ LDFLAGS += -Wl,-z,noexecstack
 
 EXTLIBS = -lpthread -lrt -lm -ldl
 
+ifeq ($(FEATURES_DUMP),)
 include $(srctree)/tools/build/Makefile.feature
+else
+include $(FEATURES_DUMP)
+endif
 
 ifeq ($(feature-stackprotector-all), 1)
   CFLAGS += -fstack-protector-all
@@ -331,6 +328,13 @@ ifndef NO_LIBELF
   endif # NO_LIBBPF
 endif # NO_LIBELF
 
+ifdef PERF_HAVE_JITDUMP
+  ifndef NO_DWARF
+    $(call detected,CONFIG_JITDUMP)
+    CFLAGS += -DHAVE_JITDUMP
+  endif
+endif
+
 ifeq ($(ARCH),powerpc)
   ifndef NO_DWARF
     CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX
@@ -407,6 +411,17 @@ ifndef NO_LIBAUDIT
   endif
 endif
 
+ifndef NO_LIBCRYPTO
+  ifneq ($(feature-libcrypto), 1)
+    msg := $(warning No libcrypto.h found, disables jitted code injection, please install libssl-devel or libssl-dev);
+    NO_LIBCRYPTO := 1
+  else
+    CFLAGS += -DHAVE_LIBCRYPTO_SUPPORT
+    EXTLIBS += -lcrypto
+    $(call detected,CONFIG_CRYPTO)
+  endif
+endif
+
 ifdef NO_NEWT
   NO_SLANG=1
 endif
diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile
new file mode 100644 (file)
index 0000000..5ce61a1
--- /dev/null
@@ -0,0 +1,89 @@
+ARCH=$(shell uname -m)
+
+ifeq ($(ARCH), x86_64)
+JARCH=amd64
+endif
+ifeq ($(ARCH), armv7l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), armv6l)
+JARCH=armhf
+endif
+ifeq ($(ARCH), aarch64)
+JARCH=aarch64
+endif
+ifeq ($(ARCH), ppc64)
+JARCH=powerpc
+endif
+ifeq ($(ARCH), ppc64le)
+JARCH=powerpc
+endif
+
+DESTDIR=/usr/local
+
+VERSION=1
+REVISION=0
+AGE=0
+
+LN=ln -sf
+RM=rm
+
+SLIBJVMTI=libjvmti.so.$(VERSION).$(REVISION).$(AGE)
+VLIBJVMTI=libjvmti.so.$(VERSION)
+SLDFLAGS=-shared -Wl,-soname -Wl,$(VLIBJVMTI)
+SOLIBEXT=so
+
+# The following works at least on fedora 23, you may need the next
+# line for other distros.
+ifneq (,$(wildcard /usr/sbin/update-java-alternatives))
+JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3)
+else
+  ifneq (,$(wildcard /usr/sbin/alternatives))
+    JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g')
+  endif
+endif
+ifndef JDIR
+$(error Could not find alternatives command, you need to set JDIR= to point to the root of your Java directory)
+else
+  ifeq (,$(wildcard $(JDIR)/include/jvmti.h))
+  $(error the openjdk development package appears to me missing, install and try again)
+  endif
+endif
+$(info Using Java from $(JDIR))
+# -lrt required in 32-bit mode for clock_gettime()
+LIBS=-lelf -lrt
+INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux
+
+TARGETS=$(SLIBJVMTI)
+
+SRCS=libjvmti.c jvmti_agent.c
+OBJS=$(SRCS:.c=.o)
+SOBJS=$(OBJS:.o=.lo)
+OPT=-O2 -g -Werror -Wall
+
+CFLAGS=$(INCDIR) $(OPT)
+
+all: $(TARGETS)
+
+.c.o:
+       $(CC) $(CFLAGS) -c $*.c
+.c.lo:
+       $(CC) -fPIC -DPIC $(CFLAGS) -c $*.c -o $*.lo
+
+$(OBJS) $(SOBJS): Makefile jvmti_agent.h ../util/jitdump.h
+
+$(SLIBJVMTI):  $(SOBJS)
+       $(CC) $(CFLAGS) $(SLDFLAGS)  -o $@ $(SOBJS) $(LIBS)
+       $(LN) $@ libjvmti.$(SOLIBEXT)
+
+clean:
+       $(RM) -f *.o *.so.* *.so *.lo
+
+install:
+       -mkdir -p $(DESTDIR)/lib
+       install -m 755 $(SLIBJVMTI) $(DESTDIR)/lib/
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) $(VLIBJVMTI))
+       (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) libjvmti.$(SOLIBEXT))
+       ldconfig
+
+.SUFFIXES: .c .S .o .lo
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
new file mode 100644 (file)
index 0000000..6461e02
--- /dev/null
@@ -0,0 +1,465 @@
+/*
+ * jvmti_agent.c: JVMTI agent interface
+ *
+ * Adapted from the Oprofile code in opagent.c:
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#include <sys/types.h>
+#include <sys/stat.h> /* for mkdir() */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/mman.h>
+#include <syscall.h> /* for gettid() */
+#include <err.h>
+
+#include "jvmti_agent.h"
+#include "../util/jitdump.h"
+
+#define JIT_LANG "java"
+
+static char jit_path[PATH_MAX];
+static void *marker_addr;
+
+/*
+ * padding buffer
+ */
+static const char pad_bytes[7];
+
+static inline pid_t gettid(void)
+{
+       return (pid_t)syscall(__NR_gettid);
+}
+
+static int get_e_machine(struct jitheader *hdr)
+{
+       ssize_t sret;
+       char id[16];
+       int fd, ret = -1;
+       int m = -1;
+       struct {
+               uint16_t e_type;
+               uint16_t e_machine;
+       } info;
+
+       fd = open("/proc/self/exe", O_RDONLY);
+       if (fd == -1)
+               return -1;
+
+       sret = read(fd, id, sizeof(id));
+       if (sret != sizeof(id))
+               goto error;
+
+       /* check ELF signature */
+       if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F')
+               goto error;
+
+       sret = read(fd, &info, sizeof(info));
+       if (sret != sizeof(info))
+               goto error;
+
+       m = info.e_machine;
+       if (m < 0)
+               m = 0; /* ELF EM_NONE */
+
+       hdr->elf_mach = m;
+       ret = 0;
+error:
+       close(fd);
+       return ret;
+}
+
+#define NSEC_PER_SEC   1000000000
+static int perf_clk_id = CLOCK_MONOTONIC;
+
+static inline uint64_t
+timespec_to_ns(const struct timespec *ts)
+{
+        return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
+static inline uint64_t
+perf_get_timestamp(void)
+{
+       struct timespec ts;
+       int ret;
+
+       ret = clock_gettime(perf_clk_id, &ts);
+       if (ret)
+               return 0;
+
+       return timespec_to_ns(&ts);
+}
+
+static int
+debug_cache_init(void)
+{
+       char str[32];
+       char *base, *p;
+       struct tm tm;
+       time_t t;
+       int ret;
+
+       time(&t);
+       localtime_r(&t, &tm);
+
+       base = getenv("JITDUMPDIR");
+       if (!base)
+               base = getenv("HOME");
+       if (!base)
+               base = ".";
+
+       strftime(str, sizeof(str), JIT_LANG"-jit-%Y%m%d", &tm);
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/", base);
+
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("jvmti: cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit", base);
+       ret = mkdir(jit_path, 0755);
+       if (ret == -1) {
+               if (errno != EEXIST) {
+                       warn("cannot create jit cache dir %s", jit_path);
+                       return -1;
+               }
+       }
+
+       snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit/%s.XXXXXXXX", base, str);
+
+       p = mkdtemp(jit_path);
+       if (p != jit_path) {
+               warn("cannot create jit cache dir %s", jit_path);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+perf_open_marker_file(int fd)
+{
+       long pgsz;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return -1;
+
+       /*
+        * we mmap the jitdump to create an MMAP RECORD in perf.data file.
+        * The mmap is captured either live (perf record running when we mmap)
+        * or  in deferred mode, via /proc/PID/maps
+        * the MMAP record is used as a marker of a jitdump file for more meta
+        * data info about the jitted code. Perf report/annotate detect this
+        * special filename and process the jitdump file.
+        *
+        * mapping must be PROT_EXEC to ensure it is captured by perf record
+        * even when not using -d option
+        */
+       marker_addr = mmap(NULL, pgsz, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0);
+       return (marker_addr == MAP_FAILED) ? -1 : 0;
+}
+
+static void
+perf_close_marker_file(void)
+{
+       long pgsz;
+
+       if (!marker_addr)
+               return;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+       if (pgsz == -1)
+               return;
+
+       munmap(marker_addr, pgsz);
+}
+
+void *jvmti_open(void)
+{
+       int pad_cnt;
+       char dump_path[PATH_MAX];
+       struct jitheader header;
+       int fd;
+       FILE *fp;
+
+       /*
+        * check if clockid is supported
+        */
+       if (!perf_get_timestamp())
+               warnx("jvmti: kernel does not support %d clock id", perf_clk_id);
+
+       memset(&header, 0, sizeof(header));
+
+       debug_cache_init();
+
+       /*
+        * jitdump file name
+        */
+       snprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid());
+
+       fd = open(dump_path, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               return NULL;
+
+       /*
+        * create perf.data maker for the jitdump file
+        */
+       if (perf_open_marker_file(fd)) {
+               warnx("jvmti: failed to create marker file");
+               return NULL;
+       }
+
+       fp = fdopen(fd, "w+");
+       if (!fp) {
+               warn("jvmti: cannot create %s", dump_path);
+               close(fd);
+               goto error;
+       }
+
+       warnx("jvmti: jitdump in %s", dump_path);
+
+       if (get_e_machine(&header)) {
+               warn("get_e_machine failed\n");
+               goto error;
+       }
+
+       header.magic      = JITHEADER_MAGIC;
+       header.version    = JITHEADER_VERSION;
+       header.total_size = sizeof(header);
+       header.pid        = getpid();
+
+       /* calculate amount of padding '\0' */
+       pad_cnt = PADDING_8ALIGNED(header.total_size);
+       header.total_size += pad_cnt;
+
+       header.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&header, sizeof(header), 1, fp)) {
+               warn("jvmti: cannot write dumpfile header");
+               goto error;
+       }
+
+       /* write padding '\0' if necessary */
+       if (pad_cnt && !fwrite(pad_bytes, pad_cnt, 1, fp)) {
+               warn("jvmti: cannot write dumpfile header padding");
+               goto error;
+       }
+
+       return fp;
+error:
+       fclose(fp);
+       return NULL;
+}
+
+int
+jvmti_close(void *agent)
+{
+       struct jr_code_close rec;
+       FILE *fp = agent;
+
+       if (!fp) {
+               warnx("jvmti: incalid fd in close_agent");
+               return -1;
+       }
+
+       rec.p.id = JIT_CODE_CLOSE;
+       rec.p.total_size = sizeof(rec);
+
+       rec.p.timestamp = perf_get_timestamp();
+
+       if (!fwrite(&rec, sizeof(rec), 1, fp))
+               return -1;
+
+       fclose(fp);
+
+       fp = NULL;
+
+       perf_close_marker_file();
+
+       return 0;
+}
+
+int
+jvmti_write_code(void *agent, char const *sym,
+       uint64_t vma, void const *code, unsigned int const size)
+{
+       static int code_generation = 1;
+       struct jr_code_load rec;
+       size_t sym_len;
+       size_t padding_count;
+       FILE *fp = agent;
+       int ret = -1;
+
+       /* don't care about 0 length function, no samples */
+       if (size == 0)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_native_code");
+               return -1;
+       }
+
+       sym_len = strlen(sym) + 1;
+
+       rec.p.id           = JIT_CODE_LOAD;
+       rec.p.total_size   = sizeof(rec) + sym_len;
+       padding_count      = PADDING_8ALIGNED(rec.p.total_size);
+       rec.p. total_size += padding_count;
+       rec.p.timestamp    = perf_get_timestamp();
+
+       rec.code_size  = size;
+       rec.vma        = vma;
+       rec.code_addr  = vma;
+       rec.pid        = getpid();
+       rec.tid        = gettid();
+
+       if (code)
+               rec.p.total_size += size;
+
+       /*
+        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       /*
+        * get code index inside lock to avoid race condition
+        */
+       rec.code_index = code_generation++;
+
+       ret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       fwrite_unlocked(sym, sym_len, 1, fp);
+
+       if (padding_count)
+               fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+
+       if (code)
+               fwrite_unlocked(code, size, 1, fp);
+
+       funlockfile(fp);
+
+       ret = 0;
+
+       return ret;
+}
+
+int
+jvmti_write_debug_info(void *agent, uint64_t code, const char *file,
+                      jvmti_line_info_t *li, int nr_lines)
+{
+       struct jr_code_debug_info rec;
+       size_t sret, len, size, flen;
+       size_t padding_count;
+       uint64_t addr;
+       const char *fn = file;
+       FILE *fp = agent;
+       int i;
+
+       /*
+        * no entry to write
+        */
+       if (!nr_lines)
+               return 0;
+
+       if (!fp) {
+               warnx("jvmti: invalid fd in write_debug_info");
+               return -1;
+       }
+
+       flen = strlen(file) + 1;
+
+       rec.p.id        = JIT_CODE_DEBUG_INFO;
+       size            = sizeof(rec);
+       rec.p.timestamp = perf_get_timestamp();
+       rec.code_addr   = (uint64_t)(uintptr_t)code;
+       rec.nr_entry    = nr_lines;
+
+       /*
+        * on disk source line info layout:
+        * uint64_t : addr
+        * int      : line number
+        * int      : column discriminator
+        * file[]   : source file name
+        * padding  : pad to multiple of 8 bytes
+        */
+       size += nr_lines * sizeof(struct debug_entry);
+       size += flen * nr_lines;
+       /*
+        * pad to 8 bytes
+        */
+       padding_count = PADDING_8ALIGNED(size);
+
+       rec.p.total_size = size + padding_count;
+
+       /*
+        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * may be possible, so protect file writes
+        */
+       flockfile(fp);
+
+       sret = fwrite_unlocked(&rec, sizeof(rec), 1, fp);
+       if (sret != 1)
+               goto error;
+
+       for (i = 0; i < nr_lines; i++) {
+
+               addr = (uint64_t)li[i].pc;
+               len  = sizeof(addr);
+               sret = fwrite_unlocked(&addr, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].line_number);
+               sret = fwrite_unlocked(&li[i].line_number, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               len  = sizeof(li[0].discrim);
+               sret = fwrite_unlocked(&li[i].discrim, len, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+               sret = fwrite_unlocked(fn, flen, 1, fp);
+               if (sret != 1)
+                       goto error;
+       }
+       if (padding_count)
+               sret = fwrite_unlocked(pad_bytes, padding_count, 1, fp);
+               if (sret != 1)
+                       goto error;
+
+       funlockfile(fp);
+       return 0;
+error:
+       funlockfile(fp);
+       return -1;
+}
diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h
new file mode 100644 (file)
index 0000000..bedf5d0
--- /dev/null
@@ -0,0 +1,36 @@
+#ifndef __JVMTI_AGENT_H__
+#define __JVMTI_AGENT_H__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <jvmti.h>
+
+#define __unused __attribute__((unused))
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct {
+       unsigned long   pc;
+       int             line_number;
+       int             discrim; /* discriminator -- 0 for now */
+} jvmti_line_info_t;
+
+void *jvmti_open(void);
+int   jvmti_close(void *agent);
+int   jvmti_write_code(void *agent, char const *symbol_name,
+                      uint64_t vma, void const *code,
+                      const unsigned int code_size);
+
+int   jvmti_write_debug_info(void *agent,
+                            uint64_t code,
+                            const char *file,
+                            jvmti_line_info_t *li,
+                            int nr_lines);
+
+#if defined(__cplusplus)
+}
+
+#endif
+#endif /* __JVMTI_H__ */
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
new file mode 100644 (file)
index 0000000..ac12e4b
--- /dev/null
@@ -0,0 +1,304 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <err.h>
+#include <jvmti.h>
+#include <jvmticmlr.h>
+#include <limits.h>
+
+#include "jvmti_agent.h"
+
+static int has_line_numbers;
+void *jvmti_agent;
+
+static jvmtiError
+do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci,
+                   jvmti_line_info_t *tab, jint *nr)
+{
+       jint i, lines = 0;
+       jint nr_lines = 0;
+       jvmtiLineNumberEntry *loc_tab = NULL;
+       jvmtiError ret;
+
+       ret = (*jvmti)->GetLineNumberTable(jvmti, m, &nr_lines, &loc_tab);
+       if (ret != JVMTI_ERROR_NONE)
+               return ret;
+
+       for (i = 0; i < nr_lines; i++) {
+               if (loc_tab[i].start_location < bci) {
+                       tab[lines].pc = (unsigned long)pc;
+                       tab[lines].line_number = loc_tab[i].line_number;
+                       tab[lines].discrim = 0; /* not yet used */
+                       lines++;
+               } else {
+                       break;
+               }
+       }
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)loc_tab);
+       *nr = lines;
+       return JVMTI_ERROR_NONE;
+}
+
+static jvmtiError
+get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **tab, int *nr_lines)
+{
+       const jvmtiCompiledMethodLoadRecordHeader *hdr;
+       jvmtiCompiledMethodLoadInlineRecord *rec;
+       jvmtiLineNumberEntry *lne = NULL;
+       PCStackInfo *c;
+       jint nr, ret;
+       int nr_total = 0;
+       int i, lines_total = 0;
+
+       if (!(tab && nr_lines))
+               return JVMTI_ERROR_NULL_POINTER;
+
+       /*
+        * Phase 1 -- get the number of lines necessary
+        */
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               /*
+                                * unfortunately, need a tab to get the number of lines!
+                                */
+                               ret = (*jvmti)->GetLineNumberTable(jvmti, c->methods[0], &nr, &lne);
+                               if (ret == JVMTI_ERROR_NONE) {
+                                       /* free what was allocated for nothing */
+                                       (*jvmti)->Deallocate(jvmti, (unsigned char *)lne);
+                                       nr_total += (int)nr;
+                               }
+                       }
+               }
+       }
+
+       if (nr_total == 0)
+               return JVMTI_ERROR_NOT_FOUND;
+
+       /*
+        * Phase 2 -- allocate big enough line table
+        */
+       *tab = malloc(nr_total * sizeof(**tab));
+       if (!*tab)
+               return JVMTI_ERROR_OUT_OF_MEMORY;
+
+       for (hdr = compile_info; hdr != NULL; hdr = hdr->next) {
+               if (hdr->kind == JVMTI_CMLR_INLINE_INFO) {
+                       rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr;
+                       for (i = 0; i < rec->numpcs; i++) {
+                               c = rec->pcinfo + i;
+                               nr = 0;
+                               ret = do_get_line_numbers(jvmti, c->pc,
+                                                         c->methods[0],
+                                                         c->bcis[0],
+                                                         *tab + lines_total,
+                                                         &nr);
+                               if (ret == JVMTI_ERROR_NONE)
+                                       lines_total += nr;
+                       }
+               }
+       }
+       *nr_lines = lines_total;
+       return JVMTI_ERROR_NONE;
+}
+
+static void JNICALL
+compiled_method_load_cb(jvmtiEnv *jvmti,
+                       jmethodID method,
+                       jint code_size,
+                       void const *code_addr,
+                       jint map_length,
+                       jvmtiAddrLocationMap const *map,
+                       const void *compile_info)
+{
+       jvmti_line_info_t *line_tab = NULL;
+       jclass decl_class;
+       char *class_sign = NULL;
+       char *func_name = NULL;
+       char *func_sign = NULL;
+       char *file_name= NULL;
+       char fn[PATH_MAX];
+       uint64_t addr = (uint64_t)(uintptr_t)code_addr;
+       jvmtiError ret;
+       int nr_lines = 0; /* in line_tab[] */
+       size_t len;
+
+       ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method,
+                                               &decl_class);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get declaring class");
+               return;
+       }
+
+       if (has_line_numbers && map && map_length) {
+               ret = get_line_numbers(jvmti, compile_info, &line_tab, &nr_lines);
+               if (ret != JVMTI_ERROR_NONE) {
+                       warnx("jvmti: cannot get line table for method");
+                       nr_lines = 0;
+               }
+       }
+
+       ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot get source filename ret=%d", ret);
+               goto error;
+       }
+
+       ret = (*jvmti)->GetClassSignature(jvmti, decl_class,
+                                         &class_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: getclassignature failed");
+               goto error;
+       }
+
+       ret = (*jvmti)->GetMethodName(jvmti, method, &func_name,
+                                     &func_sign, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: failed getmethodname");
+               goto error;
+       }
+
+       /*
+        * Assume path name is class hierarchy, this is a common practice with Java programs
+        */
+       if (*class_sign == 'L') {
+               int j, i = 0;
+               char *p = strrchr(class_sign, '/');
+               if (p) {
+                       /* drop the 'L' prefix and copy up to the final '/' */
+                       for (i = 0; i < (p - class_sign); i++)
+                               fn[i] = class_sign[i+1];
+               }
+               /*
+                * append file name, we use loops and not string ops to avoid modifying
+                * class_sign which is used later for the symbol name
+                */
+               for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++)
+                       fn[i] = file_name[j];
+               fn[i] = '\0';
+       } else {
+               /* fallback case */
+               strcpy(fn, file_name);
+       }
+       /*
+        * write source line info record if we have it
+        */
+       if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines))
+               warnx("jvmti: write_debug_info() failed");
+
+       len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2;
+       {
+               char str[len];
+               snprintf(str, len, "%s%s%s", class_sign, func_name, func_sign);
+
+               if (jvmti_write_code(jvmti_agent, str, addr, code_addr, code_size))
+                       warnx("jvmti: write_code() failed");
+       }
+error:
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_name);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)func_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign);
+       (*jvmti)->Deallocate(jvmti, (unsigned char *)file_name);
+       free(line_tab);
+}
+
+static void JNICALL
+code_generated_cb(jvmtiEnv *jvmti,
+                 char const *name,
+                 void const *code_addr,
+                 jint code_size)
+{
+       uint64_t addr = (uint64_t)(unsigned long)code_addr;
+       int ret;
+
+       ret = jvmti_write_code(jvmti_agent, name, addr, code_addr, code_size);
+       if (ret)
+               warnx("jvmti: write_code() failed for code_generated");
+}
+
+JNIEXPORT jint JNICALL
+Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused)
+{
+       jvmtiEventCallbacks cb;
+       jvmtiCapabilities caps1;
+       jvmtiJlocationFormat format;
+       jvmtiEnv *jvmti = NULL;
+       jint ret;
+
+       jvmti_agent = jvmti_open();
+       if (!jvmti_agent) {
+               warnx("jvmti: open_agent failed");
+               return -1;
+       }
+
+       /*
+        * Request a JVMTI interface version 1 environment
+        */
+       ret = (*jvm)->GetEnv(jvm, (void *)&jvmti, JVMTI_VERSION_1);
+       if (ret != JNI_OK) {
+               warnx("jvmti: jvmti version 1 not supported");
+               return -1;
+       }
+
+       /*
+        * acquire method_load capability, we require it
+        * request line numbers (optional)
+        */
+       memset(&caps1, 0, sizeof(caps1));
+       caps1.can_generate_compiled_method_load_events = 1;
+
+       ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: acquire compiled_method capability failed");
+               return -1;
+       }
+       ret = (*jvmti)->GetJLocationFormat(jvmti, &format);
+        if (ret == JVMTI_ERROR_NONE && format == JVMTI_JLOCATION_JVMBCI) {
+                memset(&caps1, 0, sizeof(caps1));
+                caps1.can_get_line_numbers = 1;
+                caps1.can_get_source_file_name = 1;
+               ret = (*jvmti)->AddCapabilities(jvmti, &caps1);
+                if (ret == JVMTI_ERROR_NONE)
+                        has_line_numbers = 1;
+        }
+
+       memset(&cb, 0, sizeof(cb));
+
+       cb.CompiledMethodLoad   = compiled_method_load_cb;
+       cb.DynamicCodeGenerated = code_generated_cb;
+
+       ret = (*jvmti)->SetEventCallbacks(jvmti, &cb, sizeof(cb));
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: cannot set event callbacks");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_COMPILED_METHOD_LOAD, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed for method_load");
+               return -1;
+       }
+
+       ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE,
+                       JVMTI_EVENT_DYNAMIC_CODE_GENERATED, NULL);
+       if (ret != JVMTI_ERROR_NONE) {
+               warnx("jvmti: setnotification failed on code_generated");
+               return -1;
+       }
+       return 0;
+}
+
+JNIEXPORT void JNICALL
+Agent_OnUnload(JavaVM *jvm __unused)
+{
+       int ret;
+
+       ret = jvmti_close(jvmti_agent);
+       if (ret)
+               errx(1, "Error: op_close_agent()");
+}
index a929618..aaee0a7 100644 (file)
@@ -454,11 +454,12 @@ static void handle_internal_command(int argc, const char **argv)
 
 static void execv_dashed_external(const char **argv)
 {
-       struct strbuf cmd = STRBUF_INIT;
+       char *cmd;
        const char *tmp;
        int status;
 
-       strbuf_addf(&cmd, "perf-%s", argv[0]);
+       if (asprintf(&cmd, "perf-%s", argv[0]) < 0)
+               goto do_die;
 
        /*
         * argv[0] must be the perf command, but the argv array
@@ -467,7 +468,7 @@ static void execv_dashed_external(const char **argv)
         * restore it on error.
         */
        tmp = argv[0];
-       argv[0] = cmd.buf;
+       argv[0] = cmd;
 
        /*
         * if we fail because the command is not found, it is
@@ -475,15 +476,16 @@ static void execv_dashed_external(const char **argv)
         */
        status = run_command_v_opt(argv, 0);
        if (status != -ERR_RUN_COMMAND_EXEC) {
-               if (IS_RUN_COMMAND_ERR(status))
+               if (IS_RUN_COMMAND_ERR(status)) {
+do_die:
                        die("unable to run '%s'", argv[0]);
+               }
                exit(-status);
        }
        errno = ENOENT; /* as if we called execvp */
 
        argv[0] = tmp;
-
-       strbuf_release(&cmd);
+       zfree(&cmd);
 }
 
 static int run_argv(int *argcp, const char ***argv)
@@ -546,6 +548,8 @@ int main(int argc, const char **argv)
 
        srandom(time(NULL));
 
+       perf_config(perf_default_config, NULL);
+
        /* get debugfs/tracefs mount point from /proc/mounts */
        tracing_path_mount();
 
@@ -613,6 +617,8 @@ int main(int argc, const char **argv)
         */
        pthread__block_sigwinch();
 
+       perf_debug_setup();
+
        while (1) {
                static int done_help;
                int was_alias = run_argv(&argc, &argv);
index 90129ac..5381a01 100644 (file)
@@ -58,6 +58,8 @@ struct record_opts {
        bool         full_auxtrace;
        bool         auxtrace_snapshot_mode;
        bool         record_switch_events;
+       bool         all_kernel;
+       bool         all_user;
        unsigned int freq;
        unsigned int mmap_pages;
        unsigned int auxtrace_mmap_pages;
index 15c8400..1d95009 100644 (file)
@@ -71,7 +71,10 @@ try:
 except:
        if not audit_package_warned:
                audit_package_warned = True
-               print "Install the audit-libs-python package to get syscall names"
+               print "Install the audit-libs-python package to get syscall names.\n" \
+                    "For example:\n  # apt-get install python-audit (Ubuntu)" \
+                    "\n  # yum install audit-libs-python (Fedora)" \
+                    "\n  etc.\n"
 
 def syscall_name(id):
        try:
index bf016c4..8cc30e7 100644 (file)
@@ -1,3 +1,4 @@
 llvm-src-base.c
 llvm-src-kbuild.c
 llvm-src-prologue.c
+llvm-src-relocation.c
index 614899b..1ba628e 100644 (file)
@@ -31,7 +31,7 @@ perf-y += sample-parsing.o
 perf-y += parse-no-sample-id-all.o
 perf-y += kmod-path.o
 perf-y += thread-map.o
-perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o
+perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o
 perf-y += bpf.o
 perf-y += topology.o
 perf-y += cpumap.o
@@ -59,6 +59,13 @@ $(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c tests/Build
        $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
        $(Q)echo ';' >> $@
 
+$(OUTPUT)tests/llvm-src-relocation.c: tests/bpf-script-test-relocation.c tests/Build
+       $(call rule_mkdir)
+       $(Q)echo '#include <tests/llvm.h>' > $@
+       $(Q)echo 'const char test_llvm__bpf_test_relocation[] =' >> $@
+       $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@
+       $(Q)echo ';' >> $@
+
 ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64))
 perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
 endif
index fb80c9e..e7664fe 100644 (file)
 
 static int fd1;
 static int fd2;
+static int fd3;
 static int overflows;
+static int overflows_2;
+
+volatile long the_var;
+
+
+/*
+ * Use ASM to ensure watchpoint and breakpoint can be triggered
+ * at one instruction.
+ */
+#if defined (__x86_64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "incq (%rdi)\n"
+       "ret\n");
+#elif defined (__aarch64__)
+extern void __test_function(volatile long *ptr);
+asm (
+       ".globl __test_function\n"
+       "__test_function:\n"
+       "str x30, [x0]\n"
+       "ret\n");
+
+#else
+static void __test_function(volatile long *ptr)
+{
+       *ptr = 0x1234;
+}
+#endif
 
 __attribute__ ((noinline))
 static int test_function(void)
 {
+       __test_function(&the_var);
+       the_var++;
        return time(NULL);
 }
 
+static void sig_handler_2(int signum __maybe_unused,
+                         siginfo_t *oh __maybe_unused,
+                         void *uc __maybe_unused)
+{
+       overflows_2++;
+       if (overflows_2 > 10) {
+               ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
+       }
+}
+
 static void sig_handler(int signum __maybe_unused,
                        siginfo_t *oh __maybe_unused,
                        void *uc __maybe_unused)
@@ -54,10 +99,11 @@ static void sig_handler(int signum __maybe_unused,
                 */
                ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
                ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+               ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
        }
 }
 
-static int bp_event(void *fn, int setup_signal)
+static int __event(bool is_x, void *addr, int sig)
 {
        struct perf_event_attr pe;
        int fd;
@@ -67,8 +113,8 @@ static int bp_event(void *fn, int setup_signal)
        pe.size = sizeof(struct perf_event_attr);
 
        pe.config = 0;
-       pe.bp_type = HW_BREAKPOINT_X;
-       pe.bp_addr = (unsigned long) fn;
+       pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W;
+       pe.bp_addr = (unsigned long) addr;
        pe.bp_len = sizeof(long);
 
        pe.sample_period = 1;
@@ -86,17 +132,25 @@ static int bp_event(void *fn, int setup_signal)
                return TEST_FAIL;
        }
 
-       if (setup_signal) {
-               fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
-               fcntl(fd, F_SETSIG, SIGIO);
-               fcntl(fd, F_SETOWN, getpid());
-       }
+       fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
+       fcntl(fd, F_SETSIG, sig);
+       fcntl(fd, F_SETOWN, getpid());
 
        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 
        return fd;
 }
 
+static int bp_event(void *addr, int sig)
+{
+       return __event(true, addr, sig);
+}
+
+static int wp_event(void *addr, int sig)
+{
+       return __event(false, addr, sig);
+}
+
 static long long bp_count(int fd)
 {
        long long count;
@@ -114,7 +168,7 @@ static long long bp_count(int fd)
 int test__bp_signal(int subtest __maybe_unused)
 {
        struct sigaction sa;
-       long long count1, count2;
+       long long count1, count2, count3;
 
        /* setup SIGIO signal handler */
        memset(&sa, 0, sizeof(struct sigaction));
@@ -126,21 +180,52 @@ int test__bp_signal(int subtest __maybe_unused)
                return TEST_FAIL;
        }
 
+       sa.sa_sigaction = (void *) sig_handler_2;
+       if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+               pr_debug("failed setting up signal handler 2\n");
+               return TEST_FAIL;
+       }
+
        /*
         * We create following events:
         *
-        * fd1 - breakpoint event on test_function with SIGIO
+        * fd1 - breakpoint event on __test_function with SIGIO
         *       signal configured. We should get signal
         *       notification each time the breakpoint is hit
         *
-        * fd2 - breakpoint event on sig_handler without SIGIO
+        * fd2 - breakpoint event on sig_handler with SIGUSR1
+        *       configured. We should get SIGUSR1 each time when
+        *       breakpoint is hit
+        *
+        * fd3 - watchpoint event on __test_function with SIGIO
         *       configured.
         *
         * Following processing should happen:
-        *   - execute test_function
-        *   - fd1 event breakpoint hit -> count1 == 1
-        *   - SIGIO is delivered       -> overflows == 1
-        *   - fd2 event breakpoint hit -> count2 == 1
+        *   Exec:               Action:                       Result:
+        *   incq (%rdi)       - fd1 event breakpoint hit   -> count1 == 1
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 1
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 1  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 1
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   incq (%rdi)       - fd3 event watchpoint hit   -> count3 == 1       (wp and bp in one insn)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 2
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 2  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows = 2
+        *   sys_rt_sigreturn  - return from sig_handler
+        *   the_var++         - fd3 event watchpoint hit   -> count3 == 2       (standalone watchpoint)
+        *                     - SIGIO is delivered
+        *   sig_handler       - fd2 event breakpoint hit   -> count2 == 3
+        *                     - SIGUSR1 is delivered
+        *   sig_handler_2                                  -> overflows_2 == 3  (nested signal)
+        *   sys_rt_sigreturn  - return from sig_handler_2
+        *   overflows++                                    -> overflows == 3
+        *   sys_rt_sigreturn  - return from sig_handler
         *
         * The test case check following error conditions:
         * - we get stuck in signal handler because of debug
@@ -152,11 +237,13 @@ int test__bp_signal(int subtest __maybe_unused)
         *
         */
 
-       fd1 = bp_event(test_function, 1);
-       fd2 = bp_event(sig_handler, 0);
+       fd1 = bp_event(__test_function, SIGIO);
+       fd2 = bp_event(sig_handler, SIGUSR1);
+       fd3 = wp_event((void *)&the_var, SIGIO);
 
        ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0);
        ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
 
        /*
         * Kick off the test by trigering 'fd1'
@@ -166,15 +253,18 @@ int test__bp_signal(int subtest __maybe_unused)
 
        ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0);
        ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0);
+       ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0);
 
        count1 = bp_count(fd1);
        count2 = bp_count(fd2);
+       count3 = bp_count(fd3);
 
        close(fd1);
        close(fd2);
+       close(fd3);
 
-       pr_debug("count1 %lld, count2 %lld, overflow %d\n",
-                count1, count2, overflows);
+       pr_debug("count1 %lld, count2 %lld, count3 %lld, overflow %d, overflows_2 %d\n",
+                count1, count2, count3, overflows, overflows_2);
 
        if (count1 != 1) {
                if (count1 == 11)
@@ -183,12 +273,18 @@ int test__bp_signal(int subtest __maybe_unused)
                        pr_debug("failed: wrong count for bp1%lld\n", count1);
        }
 
-       if (overflows != 1)
+       if (overflows != 3)
                pr_debug("failed: wrong overflow hit\n");
 
-       if (count2 != 1)
+       if (overflows_2 != 3)
+               pr_debug("failed: wrong overflow_2 hit\n");
+
+       if (count2 != 3)
                pr_debug("failed: wrong count for bp2\n");
 
-       return count1 == 1 && overflows == 1 && count2 == 1 ?
+       if (count3 != 2)
+               pr_debug("failed: wrong count for bp3\n");
+
+       return count1 == 1 && overflows == 3 && count2 == 3 && overflows_2 == 3 && count3 == 2 ?
                TEST_OK : TEST_FAIL;
 }
diff --git a/tools/perf/tests/bpf-script-test-relocation.c b/tools/perf/tests/bpf-script-test-relocation.c
new file mode 100644 (file)
index 0000000..93af774
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * bpf-script-test-relocation.c
+ * Test BPF loader checking relocation
+ */
+#ifndef LINUX_VERSION_CODE
+# error Need LINUX_VERSION_CODE
+# error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig'
+#endif
+#define BPF_ANY 0
+#define BPF_MAP_TYPE_ARRAY 2
+#define BPF_FUNC_map_lookup_elem 1
+#define BPF_FUNC_map_update_elem 2
+
+static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+       (void *) BPF_FUNC_map_lookup_elem;
+static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) =
+       (void *) BPF_FUNC_map_update_elem;
+
+struct bpf_map_def {
+       unsigned int type;
+       unsigned int key_size;
+       unsigned int value_size;
+       unsigned int max_entries;
+};
+
+#define SEC(NAME) __attribute__((section(NAME), used))
+struct bpf_map_def SEC("maps") my_table = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1,
+};
+
+int this_is_a_global_val;
+
+SEC("func=sys_write")
+int bpf_func__sys_write(void *ctx)
+{
+       int key = 0;
+       int value = 0;
+
+       /*
+        * Incorrect relocation. Should not allow this program be
+        * loaded into kernel.
+        */
+       bpf_map_update_elem(&this_is_a_global_val, &key, &value, 0);
+       return 0;
+}
+char _license[] SEC("license") = "GPL";
+int _version SEC("version") = LINUX_VERSION_CODE;
index 33689a0..199501c 100644 (file)
@@ -1,7 +1,11 @@
 #include <stdio.h>
 #include <sys/epoll.h>
+#include <util/util.h>
 #include <util/bpf-loader.h>
 #include <util/evlist.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <bpf/bpf.h>
 #include "tests.h"
 #include "llvm.h"
 #include "debug.h"
@@ -71,6 +75,15 @@ static struct {
                (NR_ITERS + 1) / 4,
        },
 #endif
+       {
+               LLVM_TESTCASE_BPF_RELOCATION,
+               "Test BPF relocation checker",
+               "[bpf_relocation_test]",
+               "fix 'perf test LLVM' first",
+               "libbpf error when dealing with relocation",
+               NULL,
+               0,
+       },
 };
 
 static int do_test(struct bpf_object *obj, int (*func)(void),
@@ -99,7 +112,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
        parse_evlist.error = &parse_error;
        INIT_LIST_HEAD(&parse_evlist.list);
 
-       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj);
+       err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj, NULL);
        if (err || list_empty(&parse_evlist.list)) {
                pr_debug("Failed to add events selected by BPF\n");
                return TEST_FAIL;
@@ -190,7 +203,7 @@ static int __test__bpf(int idx)
 
        ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
                                       bpf_testcase_table[idx].prog_id,
-                                      true);
+                                      true, NULL);
        if (ret != TEST_OK || !obj_buf || !obj_buf_sz) {
                pr_debug("Unable to get BPF object, %s\n",
                         bpf_testcase_table[idx].msg_compile_fail);
@@ -202,14 +215,21 @@ static int __test__bpf(int idx)
 
        obj = prepare_bpf(obj_buf, obj_buf_sz,
                          bpf_testcase_table[idx].name);
-       if (!obj) {
+       if ((!!bpf_testcase_table[idx].target_func) != (!!obj)) {
+               if (!obj)
+                       pr_debug("Fail to load BPF object: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
+               else
+                       pr_debug("Success unexpectedly: %s\n",
+                                bpf_testcase_table[idx].msg_load_fail);
                ret = TEST_FAIL;
                goto out;
        }
 
-       ret = do_test(obj,
-                     bpf_testcase_table[idx].target_func,
-                     bpf_testcase_table[idx].expect_result);
+       if (obj)
+               ret = do_test(obj,
+                             bpf_testcase_table[idx].target_func,
+                             bpf_testcase_table[idx].expect_result);
 out:
        bpf__clear();
        return ret;
@@ -227,6 +247,36 @@ const char *test__bpf_subtest_get_desc(int i)
        return bpf_testcase_table[i].desc;
 }
 
+static int check_env(void)
+{
+       int err;
+       unsigned int kver_int;
+       char license[] = "GPL";
+
+       struct bpf_insn insns[] = {
+               BPF_MOV64_IMM(BPF_REG_0, 1),
+               BPF_EXIT_INSN(),
+       };
+
+       err = fetch_kernel_version(&kver_int, NULL, 0);
+       if (err) {
+               pr_debug("Unable to get kernel version\n");
+               return err;
+       }
+
+       err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns,
+                              sizeof(insns) / sizeof(insns[0]),
+                              license, kver_int, NULL, 0);
+       if (err < 0) {
+               pr_err("Missing basic BPF support, skip this test: %s\n",
+                      strerror(errno));
+               return err;
+       }
+       close(err);
+
+       return 0;
+}
+
 int test__bpf(int i)
 {
        int err;
@@ -239,6 +289,9 @@ int test__bpf(int i)
                return TEST_SKIP;
        }
 
+       if (check_env())
+               return TEST_SKIP;
+
        err = __test__bpf(i);
        return err;
 }
index 313a48c..afc9ad0 100644 (file)
@@ -439,7 +439,7 @@ static int do_test_code_reading(bool try_kcore)
                .mmap_pages          = UINT_MAX,
                .user_freq           = UINT_MAX,
                .user_interval       = ULLONG_MAX,
-               .freq                = 4000,
+               .freq                = 500,
                .target              = {
                        .uses_mmap   = true,
                },
@@ -559,7 +559,13 @@ static int do_test_code_reading(bool try_kcore)
                                evlist = NULL;
                                continue;
                        }
-                       pr_debug("perf_evlist__open failed\n");
+
+                       if (verbose) {
+                               char errbuf[512];
+                               perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
+                               pr_debug("perf_evlist__open() failed!\n%s\n", errbuf);
+                       }
+
                        goto out_put;
                }
                break;
index 5e6a86e..ecf136c 100644 (file)
@@ -191,7 +191,7 @@ static int do_test(struct hists *hists, struct result *expected, size_t nr_expec
         * function since TEST_ASSERT_VAL() returns in case of failure.
         */
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(hists_to_evsel(hists), NULL);
 
        if (verbose > 2) {
                pr_info("use callchain: %d, cumulate callchain: %d\n",
index 351a424..34b945a 100644 (file)
@@ -145,7 +145,7 @@ int test__hists_filter(int subtest __maybe_unused)
                struct hists *hists = evsel__hists(evsel);
 
                hists__collapse_resort(hists, NULL);
-               hists__output_resort(hists, NULL);
+               perf_evsel__output_resort(evsel, NULL);
 
                if (verbose > 2) {
                        pr_info("Normal histogram\n");
index b231265..23cce67 100644 (file)
@@ -156,7 +156,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -256,7 +256,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -310,7 +310,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -388,7 +388,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
@@ -491,7 +491,7 @@ static int test5(struct perf_evsel *evsel, struct machine *machine)
                goto out;
 
        hists__collapse_resort(hists, NULL);
-       hists__output_resort(hists, NULL);
+       perf_evsel__output_resort(evsel, NULL);
 
        if (verbose > 2) {
                pr_info("[fields = %s, sort = %s]\n", field_order, sort_order);
index 06f45c1..cff564f 100644 (file)
@@ -6,12 +6,6 @@
 #include "tests.h"
 #include "debug.h"
 
-static int perf_config_cb(const char *var, const char *val,
-                         void *arg __maybe_unused)
-{
-       return perf_default_config(var, val, arg);
-}
-
 #ifdef HAVE_LIBBPF_SUPPORT
 static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
 {
@@ -35,6 +29,7 @@ static int test__bpf_parsing(void *obj_buf __maybe_unused,
 static struct {
        const char *source;
        const char *desc;
+       bool should_load_fail;
 } bpf_source_table[__LLVM_TESTCASE_MAX] = {
        [LLVM_TESTCASE_BASE] = {
                .source = test_llvm__bpf_base_prog,
@@ -48,14 +43,19 @@ static struct {
                .source = test_llvm__bpf_test_prologue_prog,
                .desc = "Compile source for BPF prologue generation test",
        },
+       [LLVM_TESTCASE_BPF_RELOCATION] = {
+               .source = test_llvm__bpf_test_relocation,
+               .desc = "Compile source for BPF relocation test",
+               .should_load_fail = true,
+       },
 };
 
-
 int
 test_llvm__fetch_bpf_obj(void **p_obj_buf,
                         size_t *p_obj_buf_sz,
                         enum test_llvm__testcase idx,
-                        bool force)
+                        bool force,
+                        bool *should_load_fail)
 {
        const char *source;
        const char *desc;
@@ -68,8 +68,8 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf,
 
        source = bpf_source_table[idx].source;
        desc = bpf_source_table[idx].desc;
-
-       perf_config(perf_config_cb, NULL);
+       if (should_load_fail)
+               *should_load_fail = bpf_source_table[idx].should_load_fail;
 
        /*
         * Skip this test if user's .perfconfig doesn't set [llvm] section
@@ -136,14 +136,15 @@ int test__llvm(int subtest)
        int ret;
        void *obj_buf = NULL;
        size_t obj_buf_sz = 0;
+       bool should_load_fail = false;
 
        if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX))
                return TEST_FAIL;
 
        ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz,
-                                      subtest, false);
+                                      subtest, false, &should_load_fail);
 
-       if (ret == TEST_OK) {
+       if (ret == TEST_OK && !should_load_fail) {
                ret = test__bpf_parsing(obj_buf, obj_buf_sz);
                if (ret != TEST_OK) {
                        pr_debug("Failed to parse test case '%s'\n",
index 5150b4d..0eaa604 100644 (file)
@@ -7,14 +7,17 @@
 extern const char test_llvm__bpf_base_prog[];
 extern const char test_llvm__bpf_test_kbuild_prog[];
 extern const char test_llvm__bpf_test_prologue_prog[];
+extern const char test_llvm__bpf_test_relocation[];
 
 enum test_llvm__testcase {
        LLVM_TESTCASE_BASE,
        LLVM_TESTCASE_KBUILD,
        LLVM_TESTCASE_BPF_PROLOGUE,
+       LLVM_TESTCASE_BPF_RELOCATION,
        __LLVM_TESTCASE_MAX,
 };
 
 int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz,
-                            enum test_llvm__testcase index, bool force);
+                            enum test_llvm__testcase index, bool force,
+                            bool *should_load_fail);
 #endif
index df38dec..cac15d9 100644 (file)
@@ -5,7 +5,7 @@ ifeq ($(MAKECMDGOALS),)
 # no target specified, trigger the whole suite
 all:
        @echo "Testing Makefile";      $(MAKE) -sf tests/make MK=Makefile
-       @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf
+       @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 SET_O=1
 else
 # run only specific test over 'Makefile'
 %:
@@ -13,6 +13,27 @@ else
 endif
 else
 PERF := .
+PERF_O := $(PERF)
+O_OPT :=
+FULL_O := $(shell readlink -f $(PERF_O) || echo $(PERF_O))
+
+ifneq ($(O),)
+  FULL_O := $(shell readlink -f $(O) || echo $(O))
+  PERF_O := $(FULL_O)
+  ifeq ($(SET_O),1)
+    O_OPT := 'O=$(FULL_O)'
+  endif
+  K_O_OPT := 'O=$(FULL_O)'
+endif
+
+PARALLEL_OPT=
+ifeq ($(SET_PARALLEL),1)
+  cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
+  ifeq ($(cores),0)
+    cores := 1
+  endif
+  PARALLEL_OPT="-j$(cores)"
+endif
 
 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
@@ -59,6 +80,7 @@ make_no_libaudit    := NO_LIBAUDIT=1
 make_no_libbionic   := NO_LIBBIONIC=1
 make_no_auxtrace    := NO_AUXTRACE=1
 make_no_libbpf     := NO_LIBBPF=1
+make_no_libcrypto   := NO_LIBCRYPTO=1
 make_tags           := tags
 make_cscope         := cscope
 make_help           := help
@@ -82,6 +104,7 @@ make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
 make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1
+make_minimal        += NO_LIBCRYPTO=1
 
 # $(run) contains all available tests
 run := make_pure
@@ -90,6 +113,9 @@ run := make_pure
 # disable features detection
 ifeq ($(MK),Makefile)
 run += make_clean_all
+MAKE_F := $(MAKE)
+else
+MAKE_F := $(MAKE) -f $(MK)
 endif
 run += make_python_perf_so
 run += make_debug
@@ -156,11 +182,11 @@ test_make_doc    := $(test_ok)
 test_make_help_O := $(test_ok)
 test_make_doc_O  := $(test_ok)
 
-test_make_python_perf_so := test -f $(PERF)/python/perf.so
+test_make_python_perf_so := test -f $(PERF_O)/python/perf.so
 
-test_make_perf_o           := test -f $(PERF)/perf.o
-test_make_util_map_o       := test -f $(PERF)/util/map.o
-test_make_util_pmu_bison_o := test -f $(PERF)/util/pmu-bison.o
+test_make_perf_o           := test -f $(PERF_O)/perf.o
+test_make_util_map_o       := test -f $(PERF_O)/util/map.o
+test_make_util_pmu_bison_o := test -f $(PERF_O)/util/pmu-bison.o
 
 define test_dest_files
   for file in $(1); do                         \
@@ -227,7 +253,7 @@ test_make_perf_o_O            := test -f $$TMP_O/perf.o
 test_make_util_map_o_O        := test -f $$TMP_O/util/map.o
 test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o
 
-test_default = test -x $(PERF)/perf
+test_default = test -x $(PERF_O)/perf
 test = $(if $(test_$1),$(test_$1),$(test_default))
 
 test_default_O = test -x $$TMP_O/perf
@@ -240,6 +266,8 @@ run := $(shell shuf -e $(run))
 run_O := $(shell shuf -e $(run_O))
 endif
 
+max_width := $(shell echo $(run_O) | sed 's/ /\n/g' | wc -L)
+
 ifdef DEBUG
 d := $(info run   $(run))
 d := $(info run_O $(run_O))
@@ -247,13 +275,13 @@ endif
 
 MAKEFLAGS := --no-print-directory
 
-clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null)
+clean := @(cd $(PERF); $(MAKE_F) -s $(O_OPT) clean >/dev/null)
 
 $(run):
        $(call clean)
        @TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) DESTDIR=$$TMP_DEST $($@)"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
        ( eval $$cmd ) >> $@ 2>&1; \
        echo "  test: $(call test,$@)" >> $@ 2>&1; \
        $(call test,$@) && \
@@ -263,8 +291,8 @@ $(run_O):
        $(call clean)
        @TMP_O=$$(mktemp -d); \
        TMP_DEST=$$(mktemp -d); \
-       cmd="cd $(PERF) && make -f $(MK) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \
-       echo "- $@: $$cmd" && echo $$cmd > $@ && \
+       cmd="cd $(PERF) && $(MAKE_F) $($(patsubst %_O,%,$@)) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
        ( eval $$cmd ) >> $@ 2>&1 && \
        echo "  test: $(call test_O,$@)" >> $@ 2>&1; \
        $(call test_O,$@) && \
@@ -276,23 +304,60 @@ tarpkg:
        ( eval $$cmd ) >> $@ 2>&1 && \
        rm -f $@
 
+KERNEL_O := ../..
+ifneq ($(O),)
+  KERNEL_O := $(O)
+endif
+
 make_kernelsrc:
-       @echo "- make -C <kernelsrc> tools/perf"
+       @echo "- make -C <kernelsrc> $(PARALLEL_OPT) $(K_O_OPT) tools/perf"
        $(call clean); \
-       (make -C ../.. tools/perf) > $@ 2>&1 && \
-       test -x perf && rm -f $@ || (cat $@ ; false)
+       (make -C ../.. $(PARALLEL_OPT) $(K_O_OPT) tools/perf) > $@ 2>&1 && \
+       test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false)
 
 make_kernelsrc_tools:
-       @echo "- make -C <kernelsrc>/tools perf"
+       @echo "- make -C <kernelsrc>/tools $(PARALLEL_OPT) $(K_O_OPT) perf"
        $(call clean); \
-       (make -C ../../tools perf) > $@ 2>&1 && \
-       test -x perf && rm -f $@ || (cat $@ ; false)
+       (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \
+       test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false)
+
+FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP
+FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC
 
 all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
        @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
 
 out: $(run_O)
        @echo OK
+       @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC)
+
+ifeq ($(REUSE_FEATURES_DUMP),1)
+$(FEATURES_DUMP_FILE):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+$(FEATURES_DUMP_FILE_STATIC):
+       $(call clean)
+       @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) LDFLAGS='-static' feature-dump"; \
+       echo "- $@: $$cmd" && echo $$cmd && \
+       ( eval $$cmd ) > /dev/null 2>&1
+
+# Add feature dump dependency for run/run_O targets
+$(foreach t,$(run) $(run_O),$(eval \
+       $(t): $(if $(findstring make_static,$(t)),\
+               $(FEATURES_DUMP_FILE_STATIC),\
+               $(FEATURES_DUMP_FILE))))
+
+# Append 'FEATURES_DUMP=' option to all test cases. For example:
+# make_no_libbpf: NO_LIBBPF=1  --> NO_LIBBPF=1 FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP
+# make_static: LDFLAGS=-static --> LDFLAGS=-static FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP_STATIC
+$(foreach t,$(run),$(if $(findstring make_static,$(t)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE_STATIC)),\
+                       $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE))))
+endif
 
 .PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools
 endif # ifndef MK
index abe8849..7865f68 100644 (file)
@@ -1271,6 +1271,38 @@ static int test__checkevent_precise_max_modifier(struct perf_evlist *evlist)
        return 0;
 }
 
+static int test__checkevent_config_symbol(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_raw(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_num(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0);
+       return 0;
+}
+
+static int test__checkevent_config_cache(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0);
+       return 0;
+}
+
 static int count_tracepoints(void)
 {
        struct dirent *events_ent;
@@ -1579,6 +1611,26 @@ static struct evlist_test test__events[] = {
                .check = test__checkevent_precise_max_modifier,
                .id    = 47,
        },
+       {
+               .name  = "instructions/name=insn/",
+               .check = test__checkevent_config_symbol,
+               .id    = 48,
+       },
+       {
+               .name  = "r1234/name=rawpmu/",
+               .check = test__checkevent_config_raw,
+               .id    = 49,
+       },
+       {
+               .name  = "4:0x6530160/name=numpmu/",
+               .check = test__checkevent_config_num,
+               .id    = 50,
+       },
+       {
+               .name  = "L1-dcache-misses/name=cachepmu/",
+               .check = test__checkevent_config_cache,
+               .id    = 51,
+       },
 };
 
 static struct evlist_test test__events_pmu[] = {
@@ -1666,7 +1718,7 @@ static int test_term(struct terms_test *t)
        }
 
        ret = t->check(&terms);
-       parse_events__free_terms(&terms);
+       parse_events_terms__purge(&terms);
 
        return ret;
 }
index f0bfc9e..630b0b4 100644 (file)
@@ -110,7 +110,6 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
         */
        for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
                struct symbol *pair, *first_pair;
-               bool backwards = true;
 
                sym  = rb_entry(nd, struct symbol, rb_node);
 
@@ -151,27 +150,14 @@ next_pair:
                                continue;
 
                        } else {
-                               struct rb_node *nnd;
-detour:
-                               nnd = backwards ? rb_prev(&pair->rb_node) :
-                                                 rb_next(&pair->rb_node);
-                               if (nnd) {
-                                       struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
-
-                                       if (UM(next->start) == mem_start) {
-                                               pair = next;
+                               pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL, NULL);
+                               if (pair) {
+                                       if (UM(pair->start) == mem_start)
                                                goto next_pair;
-                                       }
-                               }
 
-                               if (backwards) {
-                                       backwards = false;
-                                       pair = first_pair;
-                                       goto detour;
+                                       pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+                                                mem_start, sym->name, pair->name);
                                }
-
-                               pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
-                                        mem_start, sym->name, pair->name);
                        }
                } else
                        pr_debug("%#" PRIx64 ": %s not on kallsyms\n",
index d372021..af68a9d 100644 (file)
@@ -531,8 +531,8 @@ static struct ui_browser_colorset {
                .bg       = "yellow",
        },
        {
-               .colorset = HE_COLORSET_CODE,
-               .name     = "code",
+               .colorset = HE_COLORSET_JUMP_ARROWS,
+               .name     = "jump_arrows",
                .fg       = "blue",
                .bg       = "default",
        },
index 01781de..be3b70e 100644 (file)
@@ -7,7 +7,7 @@
 #define HE_COLORSET_MEDIUM     51
 #define HE_COLORSET_NORMAL     52
 #define HE_COLORSET_SELECTED   53
-#define HE_COLORSET_CODE       54
+#define HE_COLORSET_JUMP_ARROWS        54
 #define HE_COLORSET_ADDR       55
 #define HE_COLORSET_ROOT       56
 
index d4d7cc2..4fc208e 100644 (file)
@@ -284,7 +284,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser)
                to = (u64)btarget->idx;
        }
 
-       ui_browser__set_color(browser, HE_COLORSET_CODE);
+       ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS);
        __ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width,
                                 from, to);
 }
@@ -755,11 +755,11 @@ static int annotate_browser__run(struct annotate_browser *browser,
                                nd = browser->curr_hot;
                        break;
                case K_UNTAB:
-                       if (nd != NULL)
+                       if (nd != NULL) {
                                nd = rb_next(nd);
                                if (nd == NULL)
                                        nd = rb_first(&browser->entries);
-                       else
+                       else
                                nd = browser->curr_hot;
                        break;
                case K_F1:
index 08c09ad..4b98165 100644 (file)
@@ -32,6 +32,7 @@ struct hist_browser {
        bool                 show_headers;
        float                min_pcnt;
        u64                  nr_non_filtered_entries;
+       u64                  nr_hierarchy_entries;
        u64                  nr_callchain_rows;
 };
 
@@ -58,11 +59,11 @@ static int hist_browser__get_folding(struct hist_browser *browser)
 
        for (nd = rb_first(&hists->entries);
             (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
+            nd = rb_hierarchy_next(nd)) {
                struct hist_entry *he =
                        rb_entry(nd, struct hist_entry, rb_node);
 
-               if (he->unfolded)
+               if (he->leaf && he->unfolded)
                        unfolded_rows += he->nr_rows;
        }
        return unfolded_rows;
@@ -72,7 +73,9 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb)
 {
        u32 nr_entries;
 
-       if (hist_browser__has_filter(hb))
+       if (symbol_conf.report_hierarchy)
+               nr_entries = hb->nr_hierarchy_entries;
+       else if (hist_browser__has_filter(hb))
                nr_entries = hb->nr_non_filtered_entries;
        else
                nr_entries = hb->hists->nr_entries;
@@ -247,6 +250,38 @@ static int callchain__count_rows(struct rb_root *chain)
        return n;
 }
 
+static int hierarchy_count_rows(struct hist_browser *hb, struct hist_entry *he,
+                               bool include_children)
+{
+       int count = 0;
+       struct rb_node *node;
+       struct hist_entry *child;
+
+       if (he->leaf)
+               return callchain__count_rows(&he->sorted_chain);
+
+       if (he->has_no_entry)
+               return 1;
+
+       node = rb_first(&he->hroot_out);
+       while (node) {
+               float percent;
+
+               child = rb_entry(node, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+
+               if (!child->filtered && percent >= hb->min_pcnt) {
+                       count++;
+
+                       if (include_children && child->unfolded)
+                               count += hierarchy_count_rows(hb, child, true);
+               }
+
+               node = rb_next(node);
+       }
+       return count;
+}
+
 static bool hist_entry__toggle_fold(struct hist_entry *he)
 {
        if (!he)
@@ -326,11 +361,17 @@ static void callchain__init_have_children(struct rb_root *root)
 
 static void hist_entry__init_have_children(struct hist_entry *he)
 {
-       if (!he->init_have_children) {
+       if (he->init_have_children)
+               return;
+
+       if (he->leaf) {
                he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
                callchain__init_have_children(&he->sorted_chain);
-               he->init_have_children = true;
+       } else {
+               he->has_children = !RB_EMPTY_ROOT(&he->hroot_out);
        }
+
+       he->init_have_children = true;
 }
 
 static bool hist_browser__toggle_fold(struct hist_browser *browser)
@@ -349,17 +390,49 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser)
                has_children = callchain_list__toggle_fold(cl);
 
        if (has_children) {
+               int child_rows = 0;
+
                hist_entry__init_have_children(he);
                browser->b.nr_entries -= he->nr_rows;
-               browser->nr_callchain_rows -= he->nr_rows;
 
-               if (he->unfolded)
-                       he->nr_rows = callchain__count_rows(&he->sorted_chain);
+               if (he->leaf)
+                       browser->nr_callchain_rows -= he->nr_rows;
                else
+                       browser->nr_hierarchy_entries -= he->nr_rows;
+
+               if (symbol_conf.report_hierarchy)
+                       child_rows = hierarchy_count_rows(browser, he, true);
+
+               if (he->unfolded) {
+                       if (he->leaf)
+                               he->nr_rows = callchain__count_rows(&he->sorted_chain);
+                       else
+                               he->nr_rows = hierarchy_count_rows(browser, he, false);
+
+                       /* account grand children */
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries += child_rows - he->nr_rows;
+
+                       if (!he->leaf && he->nr_rows == 0) {
+                               he->has_no_entry = true;
+                               he->nr_rows = 1;
+                       }
+               } else {
+                       if (symbol_conf.report_hierarchy)
+                               browser->b.nr_entries -= child_rows - he->nr_rows;
+
+                       if (he->has_no_entry)
+                               he->has_no_entry = false;
+
                        he->nr_rows = 0;
+               }
 
                browser->b.nr_entries += he->nr_rows;
-               browser->nr_callchain_rows += he->nr_rows;
+
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else
+                       browser->nr_hierarchy_entries += he->nr_rows;
 
                return true;
        }
@@ -422,13 +495,38 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
        return n;
 }
 
-static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
+static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he,
+                                bool unfold __maybe_unused)
+{
+       float percent;
+       struct rb_node *nd;
+       struct hist_entry *child;
+       int n = 0;
+
+       for (nd = rb_first(&he->hroot_out); nd; nd = rb_next(nd)) {
+               child = rb_entry(nd, struct hist_entry, rb_node);
+               percent = hist_entry__get_percent_limit(child);
+               if (!child->filtered && percent >= hb->min_pcnt)
+                       n++;
+       }
+
+       return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *he,
+                                   struct hist_browser *hb, bool unfold)
 {
        hist_entry__init_have_children(he);
        he->unfolded = unfold ? he->has_children : false;
 
        if (he->has_children) {
-               int n = callchain__set_folding(&he->sorted_chain, unfold);
+               int n;
+
+               if (he->leaf)
+                       n = callchain__set_folding(&he->sorted_chain, unfold);
+               else
+                       n = hierarchy_set_folding(hb, he, unfold);
+
                he->nr_rows = unfold ? n : 0;
        } else
                he->nr_rows = 0;
@@ -438,19 +536,38 @@ static void
 __hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 {
        struct rb_node *nd;
-       struct hists *hists = browser->hists;
+       struct hist_entry *he;
+       double percent;
 
-       for (nd = rb_first(&hists->entries);
-            (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL;
-            nd = rb_next(nd)) {
-               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
-               hist_entry__set_folding(he, unfold);
-               browser->nr_callchain_rows += he->nr_rows;
+       nd = rb_first(&browser->hists->entries);
+       while (nd) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               /* set folding state even if it's currently folded */
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               hist_entry__set_folding(he, browser, unfold);
+
+               percent = hist_entry__get_percent_limit(he);
+               if (he->filtered || percent < browser->min_pcnt)
+                       continue;
+
+               if (!he->depth || unfold)
+                       browser->nr_hierarchy_entries++;
+               if (he->leaf)
+                       browser->nr_callchain_rows += he->nr_rows;
+               else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) {
+                       browser->nr_hierarchy_entries++;
+                       he->has_no_entry = true;
+                       he->nr_rows = 1;
+               } else
+                       he->has_no_entry = false;
        }
 }
 
 static void hist_browser__set_folding(struct hist_browser *browser, bool unfold)
 {
+       browser->nr_hierarchy_entries = 0;
        browser->nr_callchain_rows = 0;
        __hist_browser__set_folding(browser, unfold);
 
@@ -657,9 +774,24 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
        return 1;
 }
 
+static bool check_percent_display(struct rb_node *node, u64 parent_total)
+{
+       struct callchain_node *child;
+
+       if (node == NULL)
+               return false;
+
+       if (rb_next(node))
+               return true;
+
+       child = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(child) != parent_total;
+}
+
 static int hist_browser__show_callchain_flat(struct hist_browser *browser,
                                             struct rb_root *root,
                                             unsigned short row, u64 total,
+                                            u64 parent_total,
                                             print_callchain_entry_fn print,
                                             struct callchain_print_arg *arg,
                                             check_output_full_fn is_output_full)
@@ -669,7 +801,7 @@ static int hist_browser__show_callchain_flat(struct hist_browser *browser,
        bool need_percent;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -763,6 +895,7 @@ static char *hist_browser__folded_callchain_str(struct hist_browser *browser,
 static int hist_browser__show_callchain_folded(struct hist_browser *browser,
                                               struct rb_root *root,
                                               unsigned short row, u64 total,
+                                              u64 parent_total,
                                               print_callchain_entry_fn print,
                                               struct callchain_print_arg *arg,
                                               check_output_full_fn is_output_full)
@@ -772,7 +905,7 @@ static int hist_browser__show_callchain_folded(struct hist_browser *browser,
        bool need_percent;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -844,20 +977,24 @@ next:
        return row - first_row;
 }
 
-static int hist_browser__show_callchain(struct hist_browser *browser,
+static int hist_browser__show_callchain_graph(struct hist_browser *browser,
                                        struct rb_root *root, int level,
                                        unsigned short row, u64 total,
+                                       u64 parent_total,
                                        print_callchain_entry_fn print,
                                        struct callchain_print_arg *arg,
                                        check_output_full_fn is_output_full)
 {
        struct rb_node *node;
        int first_row = row, offset = level * LEVEL_OFFSET_STEP;
-       u64 new_total;
        bool need_percent;
+       u64 percent_total = total;
+
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               percent_total = parent_total;
 
        node = rb_first(root);
-       need_percent = node && rb_next(node);
+       need_percent = check_percent_display(node, parent_total);
 
        while (node) {
                struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
@@ -878,7 +1015,7 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                        folded_sign = callchain_list__folded(chain);
 
                        row += hist_browser__show_callchain_list(browser, child,
-                                                       chain, row, total,
+                                                       chain, row, percent_total,
                                                        was_first && need_percent,
                                                        offset + extra_offset,
                                                        print, arg);
@@ -893,13 +1030,9 @@ static int hist_browser__show_callchain(struct hist_browser *browser,
                if (folded_sign == '-') {
                        const int new_level = level + (extra_offset ? 2 : 1);
 
-                       if (callchain_param.mode == CHAIN_GRAPH_REL)
-                               new_total = child->children_hit;
-                       else
-                               new_total = total;
-
-                       row += hist_browser__show_callchain(browser, &child->rb_root,
-                                                           new_level, row, new_total,
+                       row += hist_browser__show_callchain_graph(browser, &child->rb_root,
+                                                           new_level, row, total,
+                                                           child->children_hit,
                                                            print, arg, is_output_full);
                }
                if (is_output_full(browser, row))
@@ -910,6 +1043,45 @@ out:
        return row - first_row;
 }
 
+static int hist_browser__show_callchain(struct hist_browser *browser,
+                                       struct hist_entry *entry, int level,
+                                       unsigned short row,
+                                       print_callchain_entry_fn print,
+                                       struct callchain_print_arg *arg,
+                                       check_output_full_fn is_output_full)
+{
+       u64 total = hists__total_period(entry->hists);
+       u64 parent_total;
+       int printed;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_total = entry->stat_acc->period;
+       else
+               parent_total = entry->stat.period;
+
+       if (callchain_param.mode == CHAIN_FLAT) {
+               printed = hist_browser__show_callchain_flat(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else if (callchain_param.mode == CHAIN_FOLDED) {
+               printed = hist_browser__show_callchain_folded(browser,
+                                               &entry->sorted_chain, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       } else {
+               printed = hist_browser__show_callchain_graph(browser,
+                                               &entry->sorted_chain, level, row,
+                                               total, parent_total, print, arg,
+                                               is_output_full);
+       }
+
+       if (arg->is_current_entry)
+               browser->he_selection = entry;
+
+       return printed;
+}
+
 struct hpp_arg {
        struct ui_browser *b;
        char folded_sign;
@@ -1006,7 +1178,6 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                                    struct hist_entry *entry,
                                    unsigned short row)
 {
-       char s[256];
        int printed = 0;
        int width = browser->b.width;
        char folded_sign = ' ';
@@ -1031,16 +1202,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        .folded_sign    = folded_sign,
                        .current_entry  = current_entry,
                };
-               struct perf_hpp hpp = {
-                       .buf            = s,
-                       .size           = sizeof(s),
-                       .ptr            = &arg,
-               };
                int column = 0;
 
                hist_browser__gotorc(browser, row, 0);
 
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(browser->hists, fmt) {
+                       char s[2048];
+                       struct perf_hpp hpp = {
+                               .buf    = s,
+                               .size   = sizeof(s),
+                               .ptr    = &arg,
+                       };
+
                        if (perf_hpp__should_skip(fmt, entry->hists) ||
                            column++ < browser->b.horiz_scroll)
                                continue;
@@ -1065,11 +1238,18 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        }
 
                        if (fmt->color) {
-                               width -= fmt->color(fmt, &hpp, entry);
+                               int ret = fmt->color(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                               /*
+                                * fmt->color() already used ui_browser to
+                                * print the non alignment bits, skip it (+ret):
+                                */
+                               ui_browser__printf(&browser->b, "%s", s + ret);
                        } else {
-                               width -= fmt->entry(fmt, &hpp, entry);
+                               hist_entry__snprintf_alignment(entry, &hpp, fmt, fmt->entry(fmt, &hpp, entry));
                                ui_browser__printf(&browser->b, "%s", s);
                        }
+                       width -= hpp.buf - s;
                }
 
                /* The scroll bar isn't being used */
@@ -1084,43 +1264,246 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                --row_offset;
 
        if (folded_sign == '-' && row != browser->b.rows) {
-               u64 total = hists__total_period(entry->hists);
                struct callchain_print_arg arg = {
                        .row_offset = row_offset,
                        .is_current_entry = current_entry,
                };
 
-               if (callchain_param.mode == CHAIN_GRAPH_REL) {
-                       if (symbol_conf.cumulate_callchain)
-                               total = entry->stat_acc->period;
-                       else
-                               total = entry->stat.period;
-               }
-
-               if (callchain_param.mode == CHAIN_FLAT) {
-                       printed += hist_browser__show_callchain_flat(browser,
-                                       &entry->sorted_chain, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
-               } else if (callchain_param.mode == CHAIN_FOLDED) {
-                       printed += hist_browser__show_callchain_folded(browser,
-                                       &entry->sorted_chain, row, total,
+               printed += hist_browser__show_callchain(browser, entry, 1, row,
                                        hist_browser__show_callchain_entry, &arg,
                                        hist_browser__check_output_full);
+       }
+
+       return printed;
+}
+
+static int hist_browser__show_hierarchy_entry(struct hist_browser *browser,
+                                             struct hist_entry *entry,
+                                             unsigned short row,
+                                             int level)
+{
+       int printed = 0;
+       int width = browser->b.width;
+       char folded_sign = ' ';
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       off_t row_offset = entry->row_offset;
+       bool first = true;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       struct hpp_arg arg = {
+               .b              = &browser->b,
+               .current_entry  = current_entry,
+       };
+       int column = 0;
+       int hierarchy_indent = (entry->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       if (current_entry) {
+               browser->he_selection = entry;
+               browser->selection = &entry->ms;
+       }
+
+       hist_entry__init_have_children(entry);
+       folded_sign = hist_entry__folded(entry);
+       arg.folded_sign = folded_sign;
+
+       if (entry->leaf && row_offset) {
+               row_offset--;
+               goto show_callchain;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&entry->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (perf_hpp__should_skip(fmt, entry->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
                } else {
-                       printed += hist_browser__show_callchain(browser,
-                                       &entry->sorted_chain, 1, row, total,
-                                       hist_browser__show_callchain_entry, &arg,
-                                       hist_browser__check_output_full);
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
+               }
+
+               if (first) {
+                       ui_browser__printf(&browser->b, "%c", folded_sign);
+                       width--;
+                       first = false;
+               } else {
+                       ui_browser__printf(&browser->b, "  ");
+                       width -= 2;
+               }
+
+               if (fmt->color) {
+                       int ret = fmt->color(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       /*
+                        * fmt->color() already used ui_browser to
+                        * print the non alignment bits, skip it (+ret):
+                        */
+                       ui_browser__printf(&browser->b, "%s", s + ret);
+               } else {
+                       int ret = fmt->entry(fmt, &hpp, entry);
+                       hist_entry__snprintf_alignment(entry, &hpp, fmt, ret);
+                       ui_browser__printf(&browser->b, "%s", s);
+               }
+               width -= hpp.buf - s;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", hierarchy_indent);
+       width -= hierarchy_indent;
+
+       if (column >= browser->b.horiz_scroll) {
+               char s[2048];
+               struct perf_hpp hpp = {
+                       .buf            = s,
+                       .size           = sizeof(s),
+                       .ptr            = &arg,
+               };
+
+               if (current_entry && browser->b.navkeypressed) {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_SELECTED);
+               } else {
+                       ui_browser__set_color(&browser->b,
+                                             HE_COLORSET_NORMAL);
                }
 
-               if (arg.is_current_entry)
-                       browser->he_selection = entry;
+               perf_hpp_list__for_each_format(entry->hpp_list, fmt) {
+                       ui_browser__write_nstring(&browser->b, "", 2);
+                       width -= 2;
+
+                       /*
+                        * No need to call hist_entry__snprintf_alignment()
+                        * since this fmt is always the last column in the
+                        * hierarchy mode.
+                        */
+                       if (fmt->color) {
+                               width -= fmt->color(fmt, &hpp, entry);
+                       } else {
+                               int i = 0;
+
+                               width -= fmt->entry(fmt, &hpp, entry);
+                               ui_browser__printf(&browser->b, "%s", ltrim(s));
+
+                               while (isspace(s[i++]))
+                                       width++;
+                       }
+               }
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+
+       ++row;
+       ++printed;
+
+show_callchain:
+       if (entry->leaf && folded_sign == '-' && row != browser->b.rows) {
+               struct callchain_print_arg carg = {
+                       .row_offset = row_offset,
+               };
+
+               printed += hist_browser__show_callchain(browser, entry,
+                                       level + 1, row,
+                                       hist_browser__show_callchain_entry, &carg,
+                                       hist_browser__check_output_full);
        }
 
        return printed;
 }
 
+static int hist_browser__show_no_entry(struct hist_browser *browser,
+                                      unsigned short row, int level)
+{
+       int width = browser->b.width;
+       bool current_entry = ui_browser__is_current_entry(&browser->b, row);
+       bool first = true;
+       int column = 0;
+       int ret;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       int indent = browser->hists->nr_hpp_node - 2;
+
+       if (current_entry) {
+               browser->he_selection = NULL;
+               browser->selection = NULL;
+       }
+
+       hist_browser__gotorc(browser, row, 0);
+
+       if (current_entry && browser->b.navkeypressed)
+               ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED);
+       else
+               ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL);
+
+       ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT);
+       width -= level * HIERARCHY_INDENT;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&browser->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (perf_hpp__should_skip(fmt, browser->hists) ||
+                   column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->width(fmt, NULL, hists_to_evsel(browser->hists));
+
+               if (first) {
+                       /* for folded sign */
+                       first = false;
+                       ret++;
+               } else {
+                       /* space between columns */
+                       ret += 2;
+               }
+
+               ui_browser__write_nstring(&browser->b, "", ret);
+               width -= ret;
+       }
+
+       ui_browser__write_nstring(&browser->b, "", indent * HIERARCHY_INDENT);
+       width -= indent * HIERARCHY_INDENT;
+
+       if (column >= browser->b.horiz_scroll) {
+               char buf[32];
+
+               ret = snprintf(buf, sizeof(buf), "no entry >= %.2f%%", browser->min_pcnt);
+               ui_browser__printf(&browser->b, "  %s", buf);
+               width -= ret + 2;
+       }
+
+       /* The scroll bar isn't being used */
+       if (!browser->b.navkeypressed)
+               width += 1;
+
+       ui_browser__write_nstring(&browser->b, "", width);
+       return 1;
+}
+
 static int advance_hpp_check(struct perf_hpp *hpp, int inc)
 {
        advance_hpp(hpp, inc);
@@ -1144,7 +1527,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
                        return ret;
        }
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists)  || column++ < browser->b.horiz_scroll)
                        continue;
 
@@ -1160,11 +1543,96 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char *
        return ret;
 }
 
+static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *browser, char *buf, size_t size)
+{
+       struct hists *hists = browser->hists;
+       struct perf_hpp dummy_hpp = {
+               .buf    = buf,
+               .size   = size,
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       size_t ret = 0;
+       int column = 0;
+       int indent = hists->nr_hpp_node - 2;
+       bool first_node, first_col;
+
+       ret = scnprintf(buf, size, " ");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (column++ < browser->b.horiz_scroll)
+                       continue;
+
+               ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+
+               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "  ");
+               if (advance_hpp_check(&dummy_hpp, ret))
+                       break;
+       }
+
+       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "%*s",
+                       indent * HIERARCHY_INDENT, "");
+       if (advance_hpp_check(&dummy_hpp, ret))
+               return ret;
+
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node) {
+                       ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " / ");
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       char *start;
+
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col) {
+                               ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "+");
+                               if (advance_hpp_check(&dummy_hpp, ret))
+                                       break;
+                       }
+                       first_col = false;
+
+                       ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
+                       dummy_hpp.buf[ret] = '\0';
+                       rtrim(dummy_hpp.buf);
+
+                       start = ltrim(dummy_hpp.buf);
+                       ret = strlen(start);
+
+                       if (start != dummy_hpp.buf)
+                               memmove(dummy_hpp.buf, start, ret + 1);
+
+                       if (advance_hpp_check(&dummy_hpp, ret))
+                               break;
+               }
+       }
+
+       return ret;
+}
+
 static void hist_browser__show_headers(struct hist_browser *browser)
 {
        char headers[1024];
 
-       hists_browser__scnprintf_headers(browser, headers, sizeof(headers));
+       if (symbol_conf.report_hierarchy)
+               hists_browser__scnprintf_hierarchy_headers(browser, headers,
+                                                          sizeof(headers));
+       else
+               hists_browser__scnprintf_headers(browser, headers,
+                                                sizeof(headers));
        ui_browser__gotorc(&browser->b, 0, 0);
        ui_browser__set_color(&browser->b, HE_COLORSET_ROOT);
        ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1);
@@ -1196,18 +1664,34 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser)
        hb->he_selection = NULL;
        hb->selection = NULL;
 
-       for (nd = browser->top; nd; nd = rb_next(nd)) {
+       for (nd = browser->top; nd; nd = rb_hierarchy_next(nd)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                float percent;
 
-               if (h->filtered)
+               if (h->filtered) {
+                       /* let it move to sibling */
+                       h->unfolded = false;
                        continue;
+               }
 
                percent = hist_entry__get_percent_limit(h);
                if (percent < hb->min_pcnt)
                        continue;
 
-               row += hist_browser__show_entry(hb, h, row);
+               if (symbol_conf.report_hierarchy) {
+                       row += hist_browser__show_hierarchy_entry(hb, h, row,
+                                                                 h->depth);
+                       if (row == browser->rows)
+                               break;
+
+                       if (h->has_no_entry) {
+                               hist_browser__show_no_entry(hb, row, h->depth + 1);
+                               row++;
+                       }
+               } else {
+                       row += hist_browser__show_entry(hb, h, row);
+               }
+
                if (row == browser->rows)
                        break;
        }
@@ -1225,7 +1709,14 @@ static struct rb_node *hists__filter_entries(struct rb_node *nd,
                if (!h->filtered && percent >= min_pcnt)
                        return nd;
 
-               nd = rb_next(nd);
+               /*
+                * If it's filtered, its all children also were filtered.
+                * So move to sibling node.
+                */
+               if (rb_next(nd))
+                       nd = rb_next(nd);
+               else
+                       nd = rb_hierarchy_next(nd);
        }
 
        return NULL;
@@ -1241,7 +1732,7 @@ static struct rb_node *hists__filter_prev_entries(struct rb_node *nd,
                if (!h->filtered && percent >= min_pcnt)
                        return nd;
 
-               nd = rb_prev(nd);
+               nd = rb_hierarchy_prev(nd);
        }
 
        return NULL;
@@ -1271,8 +1762,8 @@ static void ui_browser__hists_seek(struct ui_browser *browser,
                nd = browser->top;
                goto do_offset;
        case SEEK_END:
-               nd = hists__filter_prev_entries(rb_last(browser->entries),
-                                               hb->min_pcnt);
+               nd = rb_hierarchy_last(rb_last(browser->entries));
+               nd = hists__filter_prev_entries(nd, hb->min_pcnt);
                first = false;
                break;
        default:
@@ -1306,7 +1797,7 @@ do_offset:
        if (offset > 0) {
                do {
                        h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                u16 remaining = h->nr_rows - h->row_offset;
                                if (offset > remaining) {
                                        offset -= remaining;
@@ -1318,7 +1809,8 @@ do_offset:
                                        break;
                                }
                        }
-                       nd = hists__filter_entries(rb_next(nd), hb->min_pcnt);
+                       nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                                  hb->min_pcnt);
                        if (nd == NULL)
                                break;
                        --offset;
@@ -1327,7 +1819,7 @@ do_offset:
        } else if (offset < 0) {
                while (1) {
                        h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->unfolded) {
+                       if (h->unfolded && h->leaf) {
                                if (first) {
                                        if (-offset > h->row_offset) {
                                                offset += h->row_offset;
@@ -1351,7 +1843,7 @@ do_offset:
                                }
                        }
 
-                       nd = hists__filter_prev_entries(rb_prev(nd),
+                       nd = hists__filter_prev_entries(rb_hierarchy_prev(nd),
                                                        hb->min_pcnt);
                        if (nd == NULL)
                                break;
@@ -1364,7 +1856,7 @@ do_offset:
                                 * row_offset at its last entry.
                                 */
                                h = rb_entry(nd, struct hist_entry, rb_node);
-                               if (h->unfolded)
+                               if (h->unfolded && h->leaf)
                                        h->row_offset = h->nr_rows;
                                break;
                        }
@@ -1378,17 +1870,14 @@ do_offset:
 }
 
 static int hist_browser__fprintf_callchain(struct hist_browser *browser,
-                                          struct hist_entry *he, FILE *fp)
+                                          struct hist_entry *he, FILE *fp,
+                                          int level)
 {
-       u64 total = hists__total_period(he->hists);
        struct callchain_print_arg arg  = {
                .fp = fp,
        };
 
-       if (symbol_conf.cumulate_callchain)
-               total = he->stat_acc->period;
-
-       hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total,
+       hist_browser__show_callchain(browser, he, level, 0,
                                     hist_browser__fprintf_callchain_entry, &arg,
                                     hist_browser__check_dump_full);
        return arg.printed;
@@ -1414,7 +1903,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
        if (symbol_conf.use_callchain)
                printed += fprintf(fp, "%c ", folded_sign);
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                if (perf_hpp__should_skip(fmt, he->hists))
                        continue;
 
@@ -1425,12 +1914,71 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
                        first = false;
 
                ret = fmt->entry(fmt, &hpp, he);
+               ret = hist_entry__snprintf_alignment(he, &hpp, fmt, ret);
                advance_hpp(&hpp, ret);
        }
-       printed += fprintf(fp, "%s\n", rtrim(s));
+       printed += fprintf(fp, "%s\n", s);
 
        if (folded_sign == '-')
-               printed += hist_browser__fprintf_callchain(browser, he, fp);
+               printed += hist_browser__fprintf_callchain(browser, he, fp, 1);
+
+       return printed;
+}
+
+
+static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser,
+                                                struct hist_entry *he,
+                                                FILE *fp, int level)
+{
+       char s[8192];
+       int printed = 0;
+       char folded_sign = ' ';
+       struct perf_hpp hpp = {
+               .buf = s,
+               .size = sizeof(s),
+       };
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       bool first = true;
+       int ret;
+       int hierarchy_indent = (he->hists->nr_hpp_node - 2) * HIERARCHY_INDENT;
+
+       printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, "");
+
+       folded_sign = hist_entry__folded(he);
+       printed += fprintf(fp, "%c", folded_sign);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&he->hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first) {
+                       ret = scnprintf(hpp.buf, hpp.size, "  ");
+                       advance_hpp(&hpp, ret);
+               } else
+                       first = false;
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       ret = scnprintf(hpp.buf, hpp.size, "%*s", hierarchy_indent, "");
+       advance_hpp(&hpp, ret);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               ret = scnprintf(hpp.buf, hpp.size, "  ");
+               advance_hpp(&hpp, ret);
+
+               ret = fmt->entry(fmt, &hpp, he);
+               advance_hpp(&hpp, ret);
+       }
+
+       printed += fprintf(fp, "%s\n", rtrim(s));
+
+       if (he->leaf && folded_sign == '-') {
+               printed += hist_browser__fprintf_callchain(browser, he, fp,
+                                                          he->depth + 1);
+       }
 
        return printed;
 }
@@ -1444,8 +1992,16 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp)
        while (nd) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
-               printed += hist_browser__fprintf_entry(browser, h, fp);
-               nd = hists__filter_entries(rb_next(nd), browser->min_pcnt);
+               if (symbol_conf.report_hierarchy) {
+                       printed += hist_browser__fprintf_hierarchy_entry(browser,
+                                                                        h, fp,
+                                                                        h->depth);
+               } else {
+                       printed += hist_browser__fprintf_entry(browser, h, fp);
+               }
+
+               nd = hists__filter_entries(rb_hierarchy_next(nd),
+                                          browser->min_pcnt);
        }
 
        return printed;
@@ -1580,11 +2136,18 @@ static int hists__browser_title(struct hists *hists,
        if (hists->uid_filter_str)
                printed += snprintf(bf + printed, size - printed,
                                    ", UID: %s", hists->uid_filter_str);
-       if (thread)
-               printed += scnprintf(bf + printed, size - printed,
+       if (thread) {
+               if (sort__has_thread) {
+                       printed += scnprintf(bf + printed, size - printed,
                                    ", Thread: %s(%d)",
                                     (thread->comm_set ? thread__comm_str(thread) : ""),
                                    thread->tid);
+               } else {
+                       printed += scnprintf(bf + printed, size - printed,
+                                   ", Thread: %s",
+                                    (thread->comm_set ? thread__comm_str(thread) : ""));
+               }
+       }
        if (dso)
                printed += scnprintf(bf + printed, size - printed,
                                    ", DSO: %s", dso->short_name);
@@ -1759,15 +2322,24 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
 {
        struct thread *thread = act->thread;
 
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
+               return 0;
+
        if (browser->hists->thread_filter) {
                pstack__remove(browser->pstack, &browser->hists->thread_filter);
                perf_hpp__set_elide(HISTC_THREAD, false);
                thread__zput(browser->hists->thread_filter);
                ui_helpline__pop();
        } else {
-               ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
-                                  thread->comm_set ? thread__comm_str(thread) : "",
-                                  thread->tid);
+               if (sort__has_thread) {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "",
+                                          thread->tid);
+               } else {
+                       ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s thread\"",
+                                          thread->comm_set ? thread__comm_str(thread) : "");
+               }
+
                browser->hists->thread_filter = thread__get(thread);
                perf_hpp__set_elide(HISTC_THREAD, false);
                pstack__push(browser->pstack, &browser->hists->thread_filter);
@@ -1782,13 +2354,22 @@ static int
 add_thread_opt(struct hist_browser *browser, struct popup_action *act,
               char **optstr, struct thread *thread)
 {
-       if (thread == NULL)
+       int ret;
+
+       if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
                return 0;
 
-       if (asprintf(optstr, "Zoom %s %s(%d) thread",
-                    browser->hists->thread_filter ? "out of" : "into",
-                    thread->comm_set ? thread__comm_str(thread) : "",
-                    thread->tid) < 0)
+       if (sort__has_thread) {
+               ret = asprintf(optstr, "Zoom %s %s(%d) thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "",
+                              thread->tid);
+       } else {
+               ret = asprintf(optstr, "Zoom %s %s thread",
+                              browser->hists->thread_filter ? "out of" : "into",
+                              thread->comm_set ? thread__comm_str(thread) : "");
+       }
+       if (ret < 0)
                return 0;
 
        act->thread = thread;
@@ -1801,6 +2382,9 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
 {
        struct map *map = act->ms.map;
 
+       if (!sort__has_dso || map == NULL)
+               return 0;
+
        if (browser->hists->dso_filter) {
                pstack__remove(browser->pstack, &browser->hists->dso_filter);
                perf_hpp__set_elide(HISTC_DSO, false);
@@ -1825,7 +2409,7 @@ static int
 add_dso_opt(struct hist_browser *browser, struct popup_action *act,
            char **optstr, struct map *map)
 {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                return 0;
 
        if (asprintf(optstr, "Zoom %s %s DSO",
@@ -1850,7 +2434,7 @@ static int
 add_map_opt(struct hist_browser *browser __maybe_unused,
            struct popup_action *act, char **optstr, struct map *map)
 {
-       if (map == NULL)
+       if (!sort__has_dso || map == NULL)
                return 0;
 
        if (asprintf(optstr, "Browse map details") < 0)
@@ -1952,6 +2536,9 @@ add_exit_opt(struct hist_browser *browser __maybe_unused,
 static int
 do_zoom_socket(struct hist_browser *browser, struct popup_action *act)
 {
+       if (!sort__has_socket || act->socket < 0)
+               return 0;
+
        if (browser->hists->socket_filter > -1) {
                pstack__remove(browser->pstack, &browser->hists->socket_filter);
                browser->hists->socket_filter = -1;
@@ -1971,7 +2558,7 @@ static int
 add_socket_opt(struct hist_browser *browser, struct popup_action *act,
               char **optstr, int socket_id)
 {
-       if (socket_id < 0)
+       if (!sort__has_socket || socket_id < 0)
                return 0;
 
        if (asprintf(optstr, "Zoom %s Processor Socket %d",
@@ -1989,17 +2576,60 @@ static void hist_browser__update_nr_entries(struct hist_browser *hb)
        u64 nr_entries = 0;
        struct rb_node *nd = rb_first(&hb->hists->entries);
 
-       if (hb->min_pcnt == 0) {
+       if (hb->min_pcnt == 0 && !symbol_conf.report_hierarchy) {
                hb->nr_non_filtered_entries = hb->hists->nr_non_filtered_entries;
                return;
        }
 
        while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
                nr_entries++;
-               nd = rb_next(nd);
+               nd = rb_hierarchy_next(nd);
        }
 
        hb->nr_non_filtered_entries = nr_entries;
+       hb->nr_hierarchy_entries = nr_entries;
+}
+
+static void hist_browser__update_percent_limit(struct hist_browser *hb,
+                                              double percent)
+{
+       struct hist_entry *he;
+       struct rb_node *nd = rb_first(&hb->hists->entries);
+       u64 total = hists__total_period(hb->hists);
+       u64 min_callchain_hits = total * (percent / 100);
+
+       hb->min_pcnt = callchain_param.min_percent = percent;
+
+       while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) {
+               he = rb_entry(nd, struct hist_entry, rb_node);
+
+               if (he->has_no_entry) {
+                       he->has_no_entry = false;
+                       he->nr_rows = 0;
+               }
+
+               if (!he->leaf || !symbol_conf.use_callchain)
+                       goto next;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+
+next:
+               nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD);
+
+               /* force to re-evaluate folding state of callchains */
+               he->init_have_children = false;
+               hist_entry__set_folding(he, hb, false);
+       }
 }
 
 static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
@@ -2037,6 +2667,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
        "E             Expand all callchains\n"                         \
        "F             Toggle percentage of filtered entries\n"         \
        "H             Display column headers\n"                        \
+       "L             Change percent limit\n"                          \
        "m             Display context menu\n"                          \
        "S             Zoom into current Processor Socket\n"            \
 
@@ -2077,7 +2708,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
        memset(options, 0, sizeof(options));
        memset(actions, 0, sizeof(actions));
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(browser->hists, fmt) {
                perf_hpp__reset_width(fmt, hists);
                /*
                 * This is done just once, and activates the horizontal scrolling
@@ -2192,6 +2823,24 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                top->zero = !top->zero;
                        }
                        continue;
+               case 'L':
+                       if (ui_browser__input_window("Percent Limit",
+                                       "Please enter the value you want to hide entries under that percent.",
+                                       buf, "ENTER: OK, ESC: Cancel",
+                                       delay_secs * 2) == K_ENTER) {
+                               char *end;
+                               double new_percent = strtod(buf, &end);
+
+                               if (new_percent < 0 || new_percent > 100) {
+                                       ui_browser__warning(&browser->b, delay_secs * 2,
+                                               "Invalid percent: %.2f", new_percent);
+                                       continue;
+                               }
+
+                               hist_browser__update_percent_limit(browser, new_percent);
+                               hist_browser__reset(browser);
+                       }
+                       continue;
                case K_F1:
                case 'h':
                case '?':
@@ -2263,10 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                        continue;
                }
 
-               if (!sort__has_sym)
-                       goto add_exit_option;
-
-               if (browser->selection == NULL)
+               if (!sort__has_sym || browser->selection == NULL)
                        goto skip_annotation;
 
                if (sort__mode == SORT_MODE__BRANCH) {
@@ -2306,11 +2952,16 @@ skip_annotation:
                                             &options[nr_options],
                                             socked_id);
                /* perf script support */
+               if (!is_report_browser(hbt))
+                       goto skip_scripting;
+
                if (browser->he_selection) {
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    thread, NULL);
+                       if (sort__has_thread && thread) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            thread, NULL);
+                       }
                        /*
                         * Note that browser->selection != NULL
                         * when browser->he_selection is not NULL,
@@ -2320,16 +2971,18 @@ skip_annotation:
                         *
                         * See hist_browser__show_entry.
                         */
-                       nr_options += add_script_opt(browser,
-                                                    &actions[nr_options],
-                                                    &options[nr_options],
-                                                    NULL, browser->selection->sym);
+                       if (sort__has_sym && browser->selection->sym) {
+                               nr_options += add_script_opt(browser,
+                                                            &actions[nr_options],
+                                                            &options[nr_options],
+                                                            NULL, browser->selection->sym);
+                       }
                }
                nr_options += add_script_opt(browser, &actions[nr_options],
                                             &options[nr_options], NULL, NULL);
                nr_options += add_switch_opt(browser, &actions[nr_options],
                                             &options[nr_options]);
-add_exit_option:
+skip_scripting:
                nr_options += add_exit_opt(browser, &actions[nr_options],
                                           &options[nr_options]);
 
index 0f8dcfd..bd9bf7e 100644 (file)
@@ -306,7 +306,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
        nr_cols = 0;
 
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                col_types[nr_cols++] = G_TYPE_STRING;
 
        store = gtk_tree_store_newv(nr_cols, col_types);
@@ -317,7 +317,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
        col_idx = 0;
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -367,7 +367,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
 
                col_idx = 0;
 
-               perf_hpp__for_each_format(fmt) {
+               hists__for_each_format(hists, fmt) {
                        if (perf_hpp__should_skip(fmt, h->hists))
                                continue;
 
@@ -396,6 +396,194 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
        gtk_container_add(GTK_CONTAINER(window), view);
 }
 
+static void perf_gtk__add_hierarchy_entries(struct hists *hists,
+                                           struct rb_root *root,
+                                           GtkTreeStore *store,
+                                           GtkTreeIter *parent,
+                                           struct perf_hpp *hpp,
+                                           float min_pcnt)
+{
+       int col_idx = 0;
+       struct rb_node *node;
+       struct hist_entry *he;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       u64 total = hists__total_period(hists);
+       int size;
+
+       for (node = rb_first(root); node; node = rb_next(node)) {
+               GtkTreeIter iter;
+               float percent;
+               char *bf;
+
+               he = rb_entry(node, struct hist_entry, rb_node);
+               if (he->filtered)
+                       continue;
+
+               percent = hist_entry__get_percent_limit(he);
+               if (percent < min_pcnt)
+                       continue;
+
+               gtk_tree_store_append(store, &iter, parent);
+
+               col_idx = 0;
+
+               /* the first hpp_list_node is for overhead columns */
+               fmt_node = list_first_entry(&hists->hpp_formats,
+                                           struct perf_hpp_list_node, list);
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (fmt->color)
+                               fmt->color(fmt, hpp, he);
+                       else
+                               fmt->entry(fmt, hpp, he);
+
+                       gtk_tree_store_set(store, &iter, col_idx++, hpp->buf, -1);
+               }
+
+               bf = hpp->buf;
+               size = hpp->size;
+               perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+                       int ret;
+
+                       if (fmt->color)
+                               ret = fmt->color(fmt, hpp, he);
+                       else
+                               ret = fmt->entry(fmt, hpp, he);
+
+                       snprintf(hpp->buf + ret, hpp->size - ret, "  ");
+                       advance_hpp(hpp, ret + 2);
+               }
+
+               gtk_tree_store_set(store, &iter, col_idx, ltrim(rtrim(bf)), -1);
+
+               if (!he->leaf) {
+                       hpp->buf = bf;
+                       hpp->size = size;
+
+                       perf_gtk__add_hierarchy_entries(hists, &he->hroot_out,
+                                                       store, &iter, hpp,
+                                                       min_pcnt);
+
+                       if (!hist_entry__has_hierarchy_children(he, min_pcnt)) {
+                               char buf[32];
+                               GtkTreeIter child;
+
+                               snprintf(buf, sizeof(buf), "no entry >= %.2f%%",
+                                        min_pcnt);
+
+                               gtk_tree_store_append(store, &child, &iter);
+                               gtk_tree_store_set(store, &child, col_idx, buf, -1);
+                       }
+               }
+
+               if (symbol_conf.use_callchain && he->leaf) {
+                       if (callchain_param.mode == CHAIN_GRAPH_REL)
+                               total = symbol_conf.cumulate_callchain ?
+                                       he->stat_acc->period : he->stat.period;
+
+                       perf_gtk__add_callchain(&he->sorted_chain, store, &iter,
+                                               col_idx, total);
+               }
+       }
+
+}
+
+static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists,
+                                    float min_pcnt)
+{
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+       GType col_types[MAX_COLUMNS];
+       GtkCellRenderer *renderer;
+       GtkTreeStore *store;
+       GtkWidget *view;
+       int col_idx;
+       int nr_cols = 0;
+       char s[512];
+       char buf[512];
+       bool first_node, first_col;
+       struct perf_hpp hpp = {
+               .buf            = s,
+               .size           = sizeof(s),
+       };
+
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) ||
+                   perf_hpp__is_dynamic_entry(fmt))
+                       break;
+
+               col_types[nr_cols++] = G_TYPE_STRING;
+       }
+       col_types[nr_cols++] = G_TYPE_STRING;
+
+       store = gtk_tree_store_newv(nr_cols, col_types);
+       view = gtk_tree_view_new();
+       renderer = gtk_cell_renderer_text_new();
+
+       col_idx = 0;
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                           -1, fmt->name,
+                                                           renderer, "markup",
+                                                           col_idx++, NULL);
+       }
+
+       /* construct merged column header since sort keys share single column */
+       buf[0] = '\0';
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       strcat(buf, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp ,fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               strcat(buf, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, &hpp, hists_to_evsel(hists));
+                       strcat(buf, ltrim(rtrim(hpp.buf)));
+               }
+       }
+
+       gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
+                                                   -1, buf,
+                                                   renderer, "markup",
+                                                   col_idx++, NULL);
+
+       for (col_idx = 0; col_idx < nr_cols; col_idx++) {
+               GtkTreeViewColumn *column;
+
+               column = gtk_tree_view_get_column(GTK_TREE_VIEW(view), col_idx);
+               gtk_tree_view_column_set_resizable(column, TRUE);
+
+               if (col_idx == 0) {
+                       gtk_tree_view_set_expander_column(GTK_TREE_VIEW(view),
+                                                         column);
+               }
+       }
+
+       gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store));
+       g_object_unref(GTK_TREE_MODEL(store));
+
+       perf_gtk__add_hierarchy_entries(hists, &hists->entries, store,
+                                       NULL, &hpp, min_pcnt);
+
+       gtk_tree_view_set_rules_hint(GTK_TREE_VIEW(view), TRUE);
+
+       g_signal_connect(view, "row-activated",
+                        G_CALLBACK(on_row_activated), NULL);
+       gtk_container_add(GTK_CONTAINER(window), view);
+}
+
 int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                  const char *help,
                                  struct hist_browser_timer *hbt __maybe_unused,
@@ -463,7 +651,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist,
                                                        GTK_POLICY_AUTOMATIC,
                                                        GTK_POLICY_AUTOMATIC);
 
-               perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
+               if (symbol_conf.report_hierarchy)
+                       perf_gtk__show_hierarchy(scrolled_window, hists, min_pcnt);
+               else
+                       perf_gtk__show_hists(scrolled_window, hists, min_pcnt);
 
                tab_label = gtk_label_new(evname);
 
index bf2a66e..3baeaa6 100644 (file)
@@ -5,6 +5,7 @@
 #include "../util/util.h"
 #include "../util/sort.h"
 #include "../util/evsel.h"
+#include "../util/evlist.h"
 
 /* hist period print (hpp) functions */
 
@@ -371,7 +372,20 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
        return 0;
 }
 
-#define HPP__COLOR_PRINT_FNS(_name, _fn)               \
+static bool perf_hpp__is_hpp_entry(struct perf_hpp_fmt *a)
+{
+       return a->header == hpp__header_fn;
+}
+
+static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       if (!perf_hpp__is_hpp_entry(a) || !perf_hpp__is_hpp_entry(b))
+               return false;
+
+       return a->idx == b->idx;
+}
+
+#define HPP__COLOR_PRINT_FNS(_name, _fn, _idx)         \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -381,9 +395,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
-#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn)           \
+#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn, _idx)     \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -393,9 +409,11 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
-#define HPP__PRINT_FNS(_name, _fn)                     \
+#define HPP__PRINT_FNS(_name, _fn, _idx)               \
        {                                               \
                .name   = _name,                        \
                .header = hpp__header_fn,               \
@@ -404,22 +422,25 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
                .cmp    = hpp__nop_cmp,                 \
                .collapse = hpp__nop_cmp,               \
                .sort   = hpp__sort_ ## _fn,            \
+               .idx    = PERF_HPP__ ## _idx,           \
+               .equal  = hpp__equal,                   \
        }
 
 struct perf_hpp_fmt perf_hpp__format[] = {
-       HPP__COLOR_PRINT_FNS("Overhead", overhead),
-       HPP__COLOR_PRINT_FNS("sys", overhead_sys),
-       HPP__COLOR_PRINT_FNS("usr", overhead_us),
-       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys),
-       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us),
-       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc),
-       HPP__PRINT_FNS("Samples", samples),
-       HPP__PRINT_FNS("Period", period)
+       HPP__COLOR_PRINT_FNS("Overhead", overhead, OVERHEAD),
+       HPP__COLOR_PRINT_FNS("sys", overhead_sys, OVERHEAD_SYS),
+       HPP__COLOR_PRINT_FNS("usr", overhead_us, OVERHEAD_US),
+       HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys, OVERHEAD_GUEST_SYS),
+       HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US),
+       HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC),
+       HPP__PRINT_FNS("Samples", samples, SAMPLES),
+       HPP__PRINT_FNS("Period", period, PERIOD)
 };
 
-LIST_HEAD(perf_hpp__list);
-LIST_HEAD(perf_hpp__sort_list);
-
+struct perf_hpp_list perf_hpp_list = {
+       .fields = LIST_HEAD_INIT(perf_hpp_list.fields),
+       .sorts  = LIST_HEAD_INIT(perf_hpp_list.sorts),
+};
 
 #undef HPP__COLOR_PRINT_FNS
 #undef HPP__COLOR_ACC_PRINT_FNS
@@ -485,63 +506,60 @@ void perf_hpp__init(void)
                hpp_dimension__add_output(PERF_HPP__PERIOD);
 }
 
-void perf_hpp__column_register(struct perf_hpp_fmt *format)
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format)
 {
-       list_add_tail(&format->list, &perf_hpp__list);
+       list_add_tail(&format->list, &list->fields);
 }
 
-void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format)
 {
-       list_del(&format->list);
+       list_add_tail(&format->sort_list, &list->sorts);
 }
 
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
-{
-       list_add_tail(&format->sort_list, &perf_hpp__sort_list);
-}
-
-void perf_hpp__column_enable(unsigned col)
-{
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_register(&perf_hpp__format[col]);
-}
-
-void perf_hpp__column_disable(unsigned col)
+void perf_hpp__column_unregister(struct perf_hpp_fmt *format)
 {
-       BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       perf_hpp__column_unregister(&perf_hpp__format[col]);
+       list_del(&format->list);
 }
 
 void perf_hpp__cancel_cumulate(void)
 {
+       struct perf_hpp_fmt *fmt, *acc, *ovh, *tmp;
+
        if (is_strict_order(field_order))
                return;
 
-       perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC);
-       perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead";
+       ovh = &perf_hpp__format[PERF_HPP__OVERHEAD];
+       acc = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC];
+
+       perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) {
+               if (acc->equal(acc, fmt)) {
+                       perf_hpp__column_unregister(fmt);
+                       continue;
+               }
+
+               if (ovh->equal(ovh, fmt))
+                       fmt->name = "Overhead";
+       }
 }
 
-void perf_hpp__setup_output_field(void)
+static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       return a->equal && a->equal(a, b);
+}
+
+void perf_hpp__setup_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
 
        /* append sort keys to output field */
-       perf_hpp__for_each_sort_list(fmt) {
-               if (!list_empty(&fmt->list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_sort_list(list, fmt) {
+               struct perf_hpp_fmt *pos;
 
-                       perf_hpp__for_each_format(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_format(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                }
 
                perf_hpp__column_register(fmt);
@@ -550,27 +568,17 @@ next:
        }
 }
 
-void perf_hpp__append_sort_keys(void)
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt;
 
        /* append output fields to sort keys */
-       perf_hpp__for_each_format(fmt) {
-               if (!list_empty(&fmt->sort_list))
-                       continue;
-
-               /*
-                * sort entry fields are dynamically created,
-                * so they can share a same sort key even though
-                * the list is empty.
-                */
-               if (perf_hpp__is_sort_entry(fmt)) {
-                       struct perf_hpp_fmt *pos;
+       perf_hpp_list__for_each_format(list, fmt) {
+               struct perf_hpp_fmt *pos;
 
-                       perf_hpp__for_each_sort_list(pos) {
-                               if (perf_hpp__same_sort_entry(pos, fmt))
-                                       goto next;
-                       }
+               perf_hpp_list__for_each_sort_list(list, pos) {
+                       if (fmt_equal(fmt, pos))
+                               goto next;
                }
 
                perf_hpp__register_sort_field(fmt);
@@ -579,20 +587,29 @@ next:
        }
 }
 
-void perf_hpp__reset_output_field(void)
+
+static void fmt_free(struct perf_hpp_fmt *fmt)
+{
+       if (fmt->free)
+               fmt->free(fmt);
+}
+
+void perf_hpp__reset_output_field(struct perf_hpp_list *list)
 {
        struct perf_hpp_fmt *fmt, *tmp;
 
        /* reset output fields */
-       perf_hpp__for_each_format_safe(fmt, tmp) {
+       perf_hpp_list__for_each_format_safe(list, fmt, tmp) {
                list_del_init(&fmt->list);
                list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
        }
 
        /* reset sort keys */
-       perf_hpp__for_each_sort_list_safe(fmt, tmp) {
+       perf_hpp_list__for_each_sort_list_safe(list, fmt, tmp) {
                list_del_init(&fmt->list);
                list_del_init(&fmt->sort_list);
+               fmt_free(fmt);
        }
 }
 
@@ -606,7 +623,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
        bool first = true;
        struct perf_hpp dummy_hpp;
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -624,22 +641,39 @@ unsigned int hists__sort_list_width(struct hists *hists)
        return ret;
 }
 
-void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+unsigned int hists__overhead_width(struct hists *hists)
 {
-       int idx;
-
-       if (perf_hpp__is_sort_entry(fmt))
-               return perf_hpp__reset_sort_width(fmt, hists);
+       struct perf_hpp_fmt *fmt;
+       int ret = 0;
+       bool first = true;
+       struct perf_hpp dummy_hpp;
 
-       for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
-               if (fmt == &perf_hpp__format[idx])
+       hists__for_each_format(hists, fmt) {
+               if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
                        break;
+
+               if (first)
+                       first = false;
+               else
+                       ret += 2;
+
+               ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists));
        }
 
-       if (idx == PERF_HPP__MAX_INDEX)
+       return ret;
+}
+
+void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+{
+       if (perf_hpp__is_sort_entry(fmt))
+               return perf_hpp__reset_sort_width(fmt, hists);
+
+       if (perf_hpp__is_dynamic_entry(fmt))
                return;
 
-       switch (idx) {
+       BUG_ON(fmt->idx >= PERF_HPP__MAX_INDEX);
+
+       switch (fmt->idx) {
        case PERF_HPP__OVERHEAD:
        case PERF_HPP__OVERHEAD_SYS:
        case PERF_HPP__OVERHEAD_US:
@@ -667,7 +701,7 @@ void perf_hpp__set_user_width(const char *width_list_str)
        struct perf_hpp_fmt *fmt;
        const char *ptr = width_list_str;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                char *p;
 
                int len = strtol(ptr, &p, 10);
@@ -679,3 +713,71 @@ void perf_hpp__set_user_width(const char *width_list_str)
                        break;
        }
 }
+
+static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_list_node *node = NULL;
+       struct perf_hpp_fmt *fmt_copy;
+       bool found = false;
+       bool skip = perf_hpp__should_skip(fmt, hists);
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               if (node->level == fmt->level) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               node = malloc(sizeof(*node));
+               if (node == NULL)
+                       return -1;
+
+               node->skip = skip;
+               node->level = fmt->level;
+               perf_hpp_list__init(&node->hpp);
+
+               hists->nr_hpp_node++;
+               list_add_tail(&node->list, &hists->hpp_formats);
+       }
+
+       fmt_copy = perf_hpp_fmt__dup(fmt);
+       if (fmt_copy == NULL)
+               return -1;
+
+       if (!skip)
+               node->skip = false;
+
+       list_add_tail(&fmt_copy->list, &node->hpp.fields);
+       list_add_tail(&fmt_copy->sort_list, &node->hpp.sorts);
+
+       return 0;
+}
+
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct perf_hpp_fmt *fmt;
+       struct hists *hists;
+       int ret;
+
+       if (!symbol_conf.report_hierarchy)
+               return 0;
+
+       evlist__for_each(evlist, evsel) {
+               hists = evsel__hists(evsel);
+
+               perf_hpp_list__for_each_sort_list(list, fmt) {
+                       if (perf_hpp__is_dynamic_entry(fmt) &&
+                           !perf_hpp__defined_dynamic_entry(fmt, hists))
+                               continue;
+
+                       ret = add_hierarchy_fmt(hists, fmt);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
index 387110d..7aff5ac 100644 (file)
@@ -165,8 +165,28 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root,
        return ret;
 }
 
+/*
+ * If have one single callchain root, don't bother printing
+ * its percentage (100 % in fractal mode and the same percentage
+ * than the hist in graph mode). This also avoid one level of column.
+ *
+ * However when percent-limit applied, it's possible that single callchain
+ * node have different (non-100% in fractal mode) percentage.
+ */
+static bool need_percent_display(struct rb_node *node, u64 parent_samples)
+{
+       struct callchain_node *cnode;
+
+       if (rb_next(node))
+               return true;
+
+       cnode = rb_entry(node, struct callchain_node, rb_node);
+       return callchain_cumul_hits(cnode) != parent_samples;
+}
+
 static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
-                                      u64 total_samples, int left_margin)
+                                      u64 total_samples, u64 parent_samples,
+                                      int left_margin)
 {
        struct callchain_node *cnode;
        struct callchain_list *chain;
@@ -177,13 +197,8 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
        int ret = 0;
        char bf[1024];
 
-       /*
-        * If have one single callchain root, don't bother printing
-        * its percentage (100 % in fractal mode and the same percentage
-        * than the hist in graph mode). This also avoid one level of column.
-        */
        node = rb_first(root);
-       if (node && !rb_next(node)) {
+       if (node && !need_percent_display(node, parent_samples)) {
                cnode = rb_entry(node, struct callchain_node, rb_node);
                list_for_each_entry(chain, &cnode->val, list) {
                        /*
@@ -213,9 +228,15 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
                root = &cnode->rb_root;
        }
 
+       if (callchain_param.mode == CHAIN_GRAPH_REL)
+               total_samples = parent_samples;
+
        ret += __callchain__fprintf_graph(fp, root, total_samples,
                                          1, 1, left_margin);
-       ret += fprintf(fp, "\n");
+       if (ret) {
+               /* do not add a blank line if it printed nothing */
+               ret += fprintf(fp, "\n");
+       }
 
        return ret;
 }
@@ -323,16 +344,19 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
                                            u64 total_samples, int left_margin,
                                            FILE *fp)
 {
+       u64 parent_samples = he->stat.period;
+
+       if (symbol_conf.cumulate_callchain)
+               parent_samples = he->stat_acc->period;
+
        switch (callchain_param.mode) {
        case CHAIN_GRAPH_REL:
-               return callchain__fprintf_graph(fp, &he->sorted_chain,
-                                               symbol_conf.cumulate_callchain ?
-                                               he->stat_acc->period : he->stat.period,
-                                               left_margin);
+               return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
+                                               parent_samples, left_margin);
                break;
        case CHAIN_GRAPH_ABS:
                return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples,
-                                               left_margin);
+                                               parent_samples, left_margin);
                break;
        case CHAIN_FLAT:
                return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples);
@@ -349,45 +373,66 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he,
        return 0;
 }
 
-static size_t hist_entry__callchain_fprintf(struct hist_entry *he,
-                                           struct hists *hists,
-                                           FILE *fp)
+static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
 {
-       int left_margin = 0;
-       u64 total_period = hists->stats.total_period;
+       const char *sep = symbol_conf.field_sep;
+       struct perf_hpp_fmt *fmt;
+       char *start = hpp->buf;
+       int ret;
+       bool first = true;
 
-       if (field_order == NULL && (sort_order == NULL ||
-                                   !prefixcmp(sort_order, "comm"))) {
-               struct perf_hpp_fmt *fmt;
+       if (symbol_conf.exclude_other && !he->parent)
+               return 0;
 
-               perf_hpp__for_each_format(fmt) {
-                       if (!perf_hpp__is_sort_entry(fmt))
-                               continue;
+       hists__for_each_format(he->hists, fmt) {
+               if (perf_hpp__should_skip(fmt, he->hists))
+                       continue;
 
-                       /* must be 'comm' sort entry */
-                       left_margin = fmt->width(fmt, NULL, hists_to_evsel(hists));
-                       left_margin -= thread__comm_len(he->thread);
-                       break;
-               }
+               /*
+                * If there's no field_sep, we still need
+                * to display initial '  '.
+                */
+               if (!sep || !first) {
+                       ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: "  ");
+                       advance_hpp(hpp, ret);
+               } else
+                       first = false;
+
+               if (perf_hpp__use_color() && fmt->color)
+                       ret = fmt->color(fmt, hpp, he);
+               else
+                       ret = fmt->entry(fmt, hpp, he);
+
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
+               advance_hpp(hpp, ret);
        }
-       return hist_entry_callchain__fprintf(he, total_period, left_margin, fp);
+
+       return hpp->buf - start;
 }
 
-static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
+static int hist_entry__hierarchy_fprintf(struct hist_entry *he,
+                                        struct perf_hpp *hpp,
+                                        struct hists *hists,
+                                        FILE *fp)
 {
        const char *sep = symbol_conf.field_sep;
        struct perf_hpp_fmt *fmt;
-       char *start = hpp->buf;
-       int ret;
+       struct perf_hpp_list_node *fmt_node;
+       char *buf = hpp->buf;
+       size_t size = hpp->size;
+       int ret, printed = 0;
        bool first = true;
 
        if (symbol_conf.exclude_other && !he->parent)
                return 0;
 
-       perf_hpp__for_each_format(fmt) {
-               if (perf_hpp__should_skip(fmt, he->hists))
-                       continue;
+       ret = scnprintf(hpp->buf, hpp->size, "%*s", he->depth * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
 
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
                /*
                 * If there's no field_sep, we still need
                 * to display initial '  '.
@@ -403,10 +448,47 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp)
                else
                        ret = fmt->entry(fmt, hpp, he);
 
+               ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret);
                advance_hpp(hpp, ret);
        }
 
-       return hpp->buf - start;
+       if (!sep)
+               ret = scnprintf(hpp->buf, hpp->size, "%*s",
+                               (hists->nr_hpp_node - 2) * HIERARCHY_INDENT, "");
+       advance_hpp(hpp, ret);
+
+       printed += fprintf(fp, "%s", buf);
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               hpp->buf  = buf;
+               hpp->size = size;
+
+               /*
+                * No need to call hist_entry__snprintf_alignment() since this
+                * fmt is always the last column in the hierarchy mode.
+                */
+               if (perf_hpp__use_color() && fmt->color)
+                       fmt->color(fmt, hpp, he);
+               else
+                       fmt->entry(fmt, hpp, he);
+
+               /*
+                * dynamic entries are right-aligned but we want left-aligned
+                * in the hierarchy mode
+                */
+               printed += fprintf(fp, "%s%s", sep ?: "  ", ltrim(buf));
+       }
+       printed += putc('\n', fp);
+
+       if (symbol_conf.use_callchain && he->leaf) {
+               u64 total = hists__total_period(hists);
+
+               printed += hist_entry_callchain__fprintf(he, total, 0, fp);
+               goto out;
+       }
+
+out:
+       return printed;
 }
 
 static int hist_entry__fprintf(struct hist_entry *he, size_t size,
@@ -418,24 +500,134 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
                .buf            = bf,
                .size           = size,
        };
+       u64 total_period = hists->stats.total_period;
 
        if (size == 0 || size > bfsz)
                size = hpp.size = bfsz;
 
+       if (symbol_conf.report_hierarchy)
+               return hist_entry__hierarchy_fprintf(he, &hpp, hists, fp);
+
        hist_entry__snprintf(he, &hpp);
 
        ret = fprintf(fp, "%s\n", bf);
 
        if (symbol_conf.use_callchain)
-               ret += hist_entry__callchain_fprintf(he, hists, fp);
+               ret += hist_entry_callchain__fprintf(he, total_period, 0, fp);
 
        return ret;
 }
 
+static int print_hierarchy_indent(const char *sep, int indent,
+                                 const char *line, FILE *fp)
+{
+       if (sep != NULL || indent < 2)
+               return 0;
+
+       return fprintf(fp, "%-.*s", (indent - 2) * HIERARCHY_INDENT, line);
+}
+
+static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
+                                 const char *sep, FILE *fp)
+{
+       bool first_node, first_col;
+       int indent;
+       int depth;
+       unsigned width = 0;
+       unsigned header_width = 0;
+       struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
+
+       indent = hists->nr_hpp_node;
+
+       /* preserve max indent depth for column headers */
+       print_hierarchy_indent(sep, indent, spaces, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               fmt->header(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%s%s", hpp->buf, sep ?: "  ");
+       }
+
+       /* combine sort headers with ' / ' */
+       first_node = true;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               if (!first_node)
+                       header_width += fprintf(fp, " / ");
+               first_node = false;
+
+               first_col = true;
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               header_width += fprintf(fp, "+");
+                       first_col = false;
+
+                       fmt->header(fmt, hpp, hists_to_evsel(hists));
+                       rtrim(hpp->buf);
+
+                       header_width += fprintf(fp, "%s", ltrim(hpp->buf));
+               }
+       }
+
+       fprintf(fp, "\n# ");
+
+       /* preserve max indent depth for initial dots */
+       print_hierarchy_indent(sep, indent, dots, fp);
+
+       /* the first hpp_list_node is for overhead columns */
+       fmt_node = list_first_entry(&hists->hpp_formats,
+                                   struct perf_hpp_list_node, list);
+
+       first_col = true;
+       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+               if (!first_col)
+                       fprintf(fp, "%s", sep ?: "..");
+               first_col = false;
+
+               width = fmt->width(fmt, hpp, hists_to_evsel(hists));
+               fprintf(fp, "%.*s", width, dots);
+       }
+
+       depth = 0;
+       list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) {
+               first_col = true;
+               width = depth * HIERARCHY_INDENT;
+
+               perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) {
+                       if (perf_hpp__should_skip(fmt, hists))
+                               continue;
+
+                       if (!first_col)
+                               width++;  /* for '+' sign between column header */
+                       first_col = false;
+
+                       width += fmt->width(fmt, hpp, hists_to_evsel(hists));
+               }
+
+               if (width > header_width)
+                       header_width = width;
+
+               depth++;
+       }
+
+       fprintf(fp, "%s%-.*s", sep ?: "  ", header_width, dots);
+
+       fprintf(fp, "\n#\n");
+
+       return 2;
+}
+
 size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
                      int max_cols, float min_pcnt, FILE *fp)
 {
        struct perf_hpp_fmt *fmt;
+       struct perf_hpp_list_node *fmt_node;
        struct rb_node *nd;
        size_t ret = 0;
        unsigned int width;
@@ -449,10 +641,11 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
        bool first = true;
        size_t linesz;
        char *line = NULL;
+       unsigned indent;
 
        init_rem_hits();
 
-       perf_hpp__for_each_format(fmt)
+       hists__for_each_format(hists, fmt)
                perf_hpp__reset_width(fmt, hists);
 
        if (symbol_conf.col_width_list_str)
@@ -463,7 +656,16 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
 
        fprintf(fp, "# ");
 
-       perf_hpp__for_each_format(fmt) {
+       if (symbol_conf.report_hierarchy) {
+               list_for_each_entry(fmt_node, &hists->hpp_formats, list) {
+                       perf_hpp_list__for_each_format(&fmt_node->hpp, fmt)
+                               perf_hpp__reset_width(fmt, hists);
+               }
+               nr_rows += print_hierarchy_header(hists, &dummy_hpp, sep, fp);
+               goto print_entries;
+       }
+
+       hists__for_each_format(hists, fmt) {
                if (perf_hpp__should_skip(fmt, hists))
                        continue;
 
@@ -487,7 +689,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
 
        fprintf(fp, "# ");
 
-       perf_hpp__for_each_format(fmt) {
+       hists__for_each_format(hists, fmt) {
                unsigned int i;
 
                if (perf_hpp__should_skip(fmt, hists))
@@ -520,7 +722,9 @@ print_entries:
                goto out;
        }
 
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       indent = hists__overhead_width(hists) + 4;
+
+       for (nd = rb_first(&hists->entries); nd; nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
                float percent;
 
@@ -536,6 +740,20 @@ print_entries:
                if (max_rows && ++nr_rows >= max_rows)
                        break;
 
+               /*
+                * If all children are filtered out or percent-limited,
+                * display "no entry >= x.xx%" message.
+                */
+               if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) {
+                       int depth = hists->nr_hpp_node + h->depth + 1;
+
+                       print_hierarchy_indent(sep, depth, spaces, fp);
+                       fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt);
+
+                       if (max_rows && ++nr_rows >= max_rows)
+                               break;
+               }
+
                if (h->ms.map == NULL && verbose > 1) {
                        __map_groups__fprintf_maps(h->thread->mg,
                                                   MAP__FUNCTION, fp);
index 5eec53a..eea25e2 100644 (file)
@@ -82,6 +82,7 @@ libperf-y += parse-branch-options.o
 libperf-y += parse-regs-options.o
 libperf-y += term.o
 libperf-y += help-unknown-cmd.o
+libperf-y += mem-events.o
 
 libperf-$(CONFIG_LIBBPF) += bpf-loader.o
 libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -105,8 +106,17 @@ libperf-y += scripting-engines/
 
 libperf-$(CONFIG_ZLIB) += zlib.o
 libperf-$(CONFIG_LZMA) += lzma.o
+libperf-y += demangle-java.o
+
+ifdef CONFIG_JITDUMP
+libperf-$(CONFIG_LIBELF) += jitdump.o
+libperf-$(CONFIG_LIBELF) += genelf.o
+libperf-$(CONFIG_LIBELF) += genelf_debug.o
+endif
 
 CFLAGS_config.o   += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+# avoid compiler warnings in 32-bit mode
+CFLAGS_genelf_debug.o  += -Wno-packed
 
 $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
        $(call rule_mkdir)
index 360fda0..ec164fe 100644 (file)
@@ -478,10 +478,11 @@ void auxtrace_heap__pop(struct auxtrace_heap *heap)
                         heap_array[last].ordinal);
 }
 
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist)
 {
        if (itr)
-               return itr->info_priv_size(itr);
+               return itr->info_priv_size(itr, evlist);
        return 0;
 }
 
@@ -852,7 +853,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
        int err;
 
        pr_debug2("Synthesizing auxtrace information\n");
-       priv_size = auxtrace_record__info_priv_size(itr);
+       priv_size = auxtrace_record__info_priv_size(itr, session->evlist);
        ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
        if (!ev)
                return -ENOMEM;
index b86f90d..e5a8e2d 100644 (file)
@@ -293,7 +293,8 @@ struct auxtrace_record {
        int (*recording_options)(struct auxtrace_record *itr,
                                 struct perf_evlist *evlist,
                                 struct record_opts *opts);
-       size_t (*info_priv_size)(struct auxtrace_record *itr);
+       size_t (*info_priv_size)(struct auxtrace_record *itr,
+                                struct perf_evlist *evlist);
        int (*info_fill)(struct auxtrace_record *itr,
                         struct perf_session *session,
                         struct auxtrace_info_event *auxtrace_info,
@@ -429,7 +430,8 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
 int auxtrace_record__options(struct auxtrace_record *itr,
                             struct perf_evlist *evlist,
                             struct record_opts *opts);
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist);
 int auxtrace_record__info_fill(struct auxtrace_record *itr,
                               struct perf_session *session,
                               struct auxtrace_info_event *auxtrace_info,
index 540a7ef..0967ce6 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/libbpf.h>
+#include <bpf/bpf.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include "perf.h"
@@ -16,6 +17,7 @@
 #include "llvm-utils.h"
 #include "probe-event.h"
 #include "probe-finder.h" // for MAX_PROBES
+#include "parse-events.h"
 #include "llvm-utils.h"
 
 #define DEFINE_PRINT_FN(name, level) \
@@ -108,8 +110,8 @@ void bpf__clear(void)
 }
 
 static void
-bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused,
-                    void *_priv)
+clear_prog_priv(struct bpf_program *prog __maybe_unused,
+               void *_priv)
 {
        struct bpf_prog_priv *priv = _priv;
 
@@ -337,7 +339,7 @@ config_bpf_program(struct bpf_program *prog)
        }
        pr_debug("bpf: config '%s' is ok\n", config_str);
 
-       err = bpf_program__set_private(prog, priv, bpf_prog_priv__clear);
+       err = bpf_program__set_private(prog, priv, clear_prog_priv);
        if (err) {
                pr_debug("Failed to set priv for program '%s'\n", config_str);
                goto errout;
@@ -739,6 +741,682 @@ int bpf__foreach_tev(struct bpf_object *obj,
        return 0;
 }
 
+enum bpf_map_op_type {
+       BPF_MAP_OP_SET_VALUE,
+       BPF_MAP_OP_SET_EVSEL,
+};
+
+enum bpf_map_key_type {
+       BPF_MAP_KEY_ALL,
+       BPF_MAP_KEY_RANGES,
+};
+
+struct bpf_map_op {
+       struct list_head list;
+       enum bpf_map_op_type op_type;
+       enum bpf_map_key_type key_type;
+       union {
+               struct parse_events_array array;
+       } k;
+       union {
+               u64 value;
+               struct perf_evsel *evsel;
+       } v;
+};
+
+struct bpf_map_priv {
+       struct list_head ops_list;
+};
+
+static void
+bpf_map_op__delete(struct bpf_map_op *op)
+{
+       if (!list_empty(&op->list))
+               list_del(&op->list);
+       if (op->key_type == BPF_MAP_KEY_RANGES)
+               parse_events__clear_array(&op->k.array);
+       free(op);
+}
+
+static void
+bpf_map_priv__purge(struct bpf_map_priv *priv)
+{
+       struct bpf_map_op *pos, *n;
+
+       list_for_each_entry_safe(pos, n, &priv->ops_list, list) {
+               list_del_init(&pos->list);
+               bpf_map_op__delete(pos);
+       }
+}
+
+static void
+bpf_map_priv__clear(struct bpf_map *map __maybe_unused,
+                   void *_priv)
+{
+       struct bpf_map_priv *priv = _priv;
+
+       bpf_map_priv__purge(priv);
+       free(priv);
+}
+
+static int
+bpf_map_op_setkey(struct bpf_map_op *op, struct parse_events_term *term)
+{
+       op->key_type = BPF_MAP_KEY_ALL;
+       if (!term)
+               return 0;
+
+       if (term->array.nr_ranges) {
+               size_t memsz = term->array.nr_ranges *
+                               sizeof(op->k.array.ranges[0]);
+
+               op->k.array.ranges = memdup(term->array.ranges, memsz);
+               if (!op->k.array.ranges) {
+                       pr_debug("No enough memory to alloc indices for map\n");
+                       return -ENOMEM;
+               }
+               op->key_type = BPF_MAP_KEY_RANGES;
+               op->k.array.nr_ranges = term->array.nr_ranges;
+       }
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map_op__new(struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = zalloc(sizeof(*op));
+       if (!op) {
+               pr_debug("Failed to alloc bpf_map_op\n");
+               return ERR_PTR(-ENOMEM);
+       }
+       INIT_LIST_HEAD(&op->list);
+
+       err = bpf_map_op_setkey(op, term);
+       if (err) {
+               free(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
+{
+       struct bpf_map_priv *priv;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("Failed to get private from map %s\n", map_name);
+               return err;
+       }
+
+       if (!priv) {
+               priv = zalloc(sizeof(*priv));
+               if (!priv) {
+                       pr_debug("No enough memory to alloc map private\n");
+                       return -ENOMEM;
+               }
+               INIT_LIST_HEAD(&priv->ops_list);
+
+               if (bpf_map__set_private(map, priv, bpf_map_priv__clear)) {
+                       free(priv);
+                       return -BPF_LOADER_ERRNO__INTERNAL;
+               }
+       }
+
+       list_add_tail(&op->list, &priv->ops_list);
+       return 0;
+}
+
+static struct bpf_map_op *
+bpf_map__add_newop(struct bpf_map *map, struct parse_events_term *term)
+{
+       struct bpf_map_op *op;
+       int err;
+
+       op = bpf_map_op__new(term);
+       if (IS_ERR(op))
+               return op;
+
+       err = bpf_map__add_op(map, op);
+       if (err) {
+               bpf_map_op__delete(op);
+               return ERR_PTR(err);
+       }
+       return op;
+}
+
+static int
+__bpf_map__config_value(struct bpf_map *map,
+                       struct parse_events_term *term)
+{
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (def.type != BPF_MAP_TYPE_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+       if (def.key_size < sizeof(unsigned int)) {
+               pr_debug("Map %s has incorrect key size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE;
+       }
+       switch (def.value_size) {
+       case 1:
+       case 2:
+       case 4:
+       case 8:
+               break;
+       default:
+               pr_debug("Map %s has incorrect value size\n", map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_VALUE;
+       op->v.value = term->val.num;
+       return 0;
+}
+
+static int
+bpf_map__config_value(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist __maybe_unused)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_NUM) {
+               pr_debug("ERROR: wrong value type for 'value'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_value(map, term);
+}
+
+static int
+__bpf_map__config_event(struct bpf_map *map,
+                       struct parse_events_term *term,
+                       struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+       struct bpf_map_def def;
+       struct bpf_map_op *op;
+       const char *map_name;
+       int err;
+
+       map_name = bpf_map__get_name(map);
+       evsel = perf_evlist__find_evsel_by_str(evlist, term->val.str);
+       if (!evsel) {
+               pr_debug("Event (for '%s') '%s' doesn't exist\n",
+                        map_name, term->val.str);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("Unable to get map definition from '%s'\n",
+                        map_name);
+               return err;
+       }
+
+       /*
+        * No need to check key_size and value_size:
+        * kernel has already checked them.
+        */
+       if (def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+               pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+       }
+
+       op = bpf_map__add_newop(map, term);
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       op->op_type = BPF_MAP_OP_SET_EVSEL;
+       op->v.evsel = evsel;
+       return 0;
+}
+
+static int
+bpf_map__config_event(struct bpf_map *map,
+                     struct parse_events_term *term,
+                     struct perf_evlist *evlist)
+{
+       if (!term->err_val) {
+               pr_debug("Config value not set\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_CONF;
+       }
+
+       if (term->type_val != PARSE_EVENTS__TERM_TYPE_STR) {
+               pr_debug("ERROR: wrong value type for 'event'\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE;
+       }
+
+       return __bpf_map__config_event(map, term, evlist);
+}
+
+struct bpf_obj_config__map_func {
+       const char *config_opt;
+       int (*config_func)(struct bpf_map *, struct parse_events_term *,
+                          struct perf_evlist *);
+};
+
+struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = {
+       {"value", bpf_map__config_value},
+       {"event", bpf_map__config_event},
+};
+
+static int
+config_map_indices_range_check(struct parse_events_term *term,
+                              struct bpf_map *map,
+                              const char *map_name)
+{
+       struct parse_events_array *array = &term->array;
+       struct bpf_map_def def;
+       unsigned int i;
+       int err;
+
+       if (!array->nr_ranges)
+               return 0;
+       if (!array->ranges) {
+               pr_debug("ERROR: map %s: array->nr_ranges is %d but range array is NULL\n",
+                        map_name, (int)array->nr_ranges);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: Unable to get map definition from '%s'\n",
+                        map_name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       for (i = 0; i < array->nr_ranges; i++) {
+               unsigned int start = array->ranges[i].start;
+               size_t length = array->ranges[i].length;
+               unsigned int idx = start + length - 1;
+
+               if (idx >= def.max_entries) {
+                       pr_debug("ERROR: index %d too large\n", idx);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG;
+               }
+       }
+       return 0;
+}
+
+static int
+bpf__obj_config_map(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *key_scan_pos)
+{
+       /* key is "map:<mapname>.<config opt>" */
+       char *map_name = strdup(term->config + sizeof("map:") - 1);
+       struct bpf_map *map;
+       int err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+       char *map_opt;
+       size_t i;
+
+       if (!map_name)
+               return -ENOMEM;
+
+       map_opt = strchr(map_name, '.');
+       if (!map_opt) {
+               pr_debug("ERROR: Invalid map config: %s\n", map_name);
+               goto out;
+       }
+
+       *map_opt++ = '\0';
+       if (*map_opt == '\0') {
+               pr_debug("ERROR: Invalid map option: %s\n", term->config);
+               goto out;
+       }
+
+       map = bpf_object__get_map_by_name(obj, map_name);
+       if (!map) {
+               pr_debug("ERROR: Map %s doesn't exist\n", map_name);
+               err = -BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST;
+               goto out;
+       }
+
+       *key_scan_pos += strlen(map_opt);
+       err = config_map_indices_range_check(term, map, map_name);
+       if (err)
+               goto out;
+       *key_scan_pos -= strlen(map_opt);
+
+       for (i = 0; i < ARRAY_SIZE(bpf_obj_config__map_funcs); i++) {
+               struct bpf_obj_config__map_func *func =
+                               &bpf_obj_config__map_funcs[i];
+
+               if (strcmp(map_opt, func->config_opt) == 0) {
+                       err = func->config_func(map, term, evlist);
+                       goto out;
+               }
+       }
+
+       pr_debug("ERROR: Invalid map config option '%s'\n", map_opt);
+       err = -BPF_LOADER_ERRNO__OBJCONF_MAP_OPT;
+out:
+       free(map_name);
+       if (!err)
+               key_scan_pos += strlen(map_opt);
+       return err;
+}
+
+int bpf__config_obj(struct bpf_object *obj,
+                   struct parse_events_term *term,
+                   struct perf_evlist *evlist,
+                   int *error_pos)
+{
+       int key_scan_pos = 0;
+       int err;
+
+       if (!obj || !term || !term->config)
+               return -EINVAL;
+
+       if (!prefixcmp(term->config, "map:")) {
+               key_scan_pos = sizeof("map:") - 1;
+               err = bpf__obj_config_map(obj, term, evlist, &key_scan_pos);
+               goto out;
+       }
+       err = -BPF_LOADER_ERRNO__OBJCONF_OPT;
+out:
+       if (error_pos)
+               *error_pos = key_scan_pos;
+       return err;
+
+}
+
+typedef int (*map_config_func_t)(const char *name, int map_fd,
+                                struct bpf_map_def *pdef,
+                                struct bpf_map_op *op,
+                                void *pkey, void *arg);
+
+static int
+foreach_key_array_all(map_config_func_t func,
+                     void *arg, const char *name,
+                     int map_fd, struct bpf_map_def *pdef,
+                     struct bpf_map_op *op)
+{
+       unsigned int i;
+       int err;
+
+       for (i = 0; i < pdef->max_entries; i++) {
+               err = func(name, map_fd, pdef, op, &i, arg);
+               if (err) {
+                       pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                name, i);
+                       return err;
+               }
+       }
+       return 0;
+}
+
+static int
+foreach_key_array_ranges(map_config_func_t func, void *arg,
+                        const char *name, int map_fd,
+                        struct bpf_map_def *pdef,
+                        struct bpf_map_op *op)
+{
+       unsigned int i, j;
+       int err;
+
+       for (i = 0; i < op->k.array.nr_ranges; i++) {
+               unsigned int start = op->k.array.ranges[i].start;
+               size_t length = op->k.array.ranges[i].length;
+
+               for (j = 0; j < length; j++) {
+                       unsigned int idx = start + j;
+
+                       err = func(name, map_fd, pdef, op, &idx, arg);
+                       if (err) {
+                               pr_debug("ERROR: failed to insert value to %s[%u]\n",
+                                        name, idx);
+                               return err;
+                       }
+               }
+       }
+       return 0;
+}
+
+static int
+bpf_map_config_foreach_key(struct bpf_map *map,
+                          map_config_func_t func,
+                          void *arg)
+{
+       int err, map_fd;
+       const char *name;
+       struct bpf_map_op *op;
+       struct bpf_map_def def;
+       struct bpf_map_priv *priv;
+
+       name = bpf_map__get_name(map);
+
+       err = bpf_map__get_private(map, (void **)&priv);
+       if (err) {
+               pr_debug("ERROR: failed to get private from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       if (!priv || list_empty(&priv->ops_list)) {
+               pr_debug("INFO: nothing to config for map %s\n", name);
+               return 0;
+       }
+
+       err = bpf_map__get_def(map, &def);
+       if (err) {
+               pr_debug("ERROR: failed to get definition from map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       map_fd = bpf_map__get_fd(map);
+       if (map_fd < 0) {
+               pr_debug("ERROR: failed to get fd from map %s\n", name);
+               return map_fd;
+       }
+
+       list_for_each_entry(op, &priv->ops_list, list) {
+               switch (def.type) {
+               case BPF_MAP_TYPE_ARRAY:
+               case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+                       switch (op->key_type) {
+                       case BPF_MAP_KEY_ALL:
+                               err = foreach_key_array_all(func, arg, name,
+                                                           map_fd, &def, op);
+                               break;
+                       case BPF_MAP_KEY_RANGES:
+                               err = foreach_key_array_ranges(func, arg, name,
+                                                              map_fd, &def,
+                                                              op);
+                               break;
+                       default:
+                               pr_debug("ERROR: keytype for map '%s' invalid\n",
+                                        name);
+                               return -BPF_LOADER_ERRNO__INTERNAL;
+                       }
+                       if (err)
+                               return err;
+                       break;
+               default:
+                       pr_debug("ERROR: type of '%s' incorrect\n", name);
+                       return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
+               }
+       }
+
+       return 0;
+}
+
+static int
+apply_config_value_for_key(int map_fd, void *pkey,
+                          size_t val_size, u64 val)
+{
+       int err = 0;
+
+       switch (val_size) {
+       case 1: {
+               u8 _val = (u8)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 2: {
+               u16 _val = (u16)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 4: {
+               u32 _val = (u32)(val);
+               err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY);
+               break;
+       }
+       case 8: {
+               err = bpf_map_update_elem(map_fd, pkey, &val, BPF_ANY);
+               break;
+       }
+       default:
+               pr_debug("ERROR: invalid value size\n");
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE;
+       }
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_config_evsel_for_key(const char *name, int map_fd, void *pkey,
+                          struct perf_evsel *evsel)
+{
+       struct xyarray *xy = evsel->fd;
+       struct perf_event_attr *attr;
+       unsigned int key, events;
+       bool check_pass = false;
+       int *evt_fd;
+       int err;
+
+       if (!xy) {
+               pr_debug("ERROR: evsel not ready for map %s\n", name);
+               return -BPF_LOADER_ERRNO__INTERNAL;
+       }
+
+       if (xy->row_size / xy->entry_size != 1) {
+               pr_debug("ERROR: Dimension of target event is incorrect for map %s\n",
+                        name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM;
+       }
+
+       attr = &evsel->attr;
+       if (attr->inherit) {
+               pr_debug("ERROR: Can't put inherit event into map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH;
+       }
+
+       if (perf_evsel__is_bpf_output(evsel))
+               check_pass = true;
+       if (attr->type == PERF_TYPE_RAW)
+               check_pass = true;
+       if (attr->type == PERF_TYPE_HARDWARE)
+               check_pass = true;
+       if (!check_pass) {
+               pr_debug("ERROR: Event type is wrong for map %s\n", name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE;
+       }
+
+       events = xy->entries / (xy->row_size / xy->entry_size);
+       key = *((unsigned int *)pkey);
+       if (key >= events) {
+               pr_debug("ERROR: there is no event %d for map %s\n",
+                        key, name);
+               return -BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE;
+       }
+       evt_fd = xyarray__entry(xy, key, 0);
+       err = bpf_map_update_elem(map_fd, pkey, evt_fd, BPF_ANY);
+       if (err && errno)
+               err = -errno;
+       return err;
+}
+
+static int
+apply_obj_config_map_for_key(const char *name, int map_fd,
+                            struct bpf_map_def *pdef __maybe_unused,
+                            struct bpf_map_op *op,
+                            void *pkey, void *arg __maybe_unused)
+{
+       int err;
+
+       switch (op->op_type) {
+       case BPF_MAP_OP_SET_VALUE:
+               err = apply_config_value_for_key(map_fd, pkey,
+                                                pdef->value_size,
+                                                op->v.value);
+               break;
+       case BPF_MAP_OP_SET_EVSEL:
+               err = apply_config_evsel_for_key(name, map_fd, pkey,
+                                                op->v.evsel);
+               break;
+       default:
+               pr_debug("ERROR: unknown value type for '%s'\n", name);
+               err = -BPF_LOADER_ERRNO__INTERNAL;
+       }
+       return err;
+}
+
+static int
+apply_obj_config_map(struct bpf_map *map)
+{
+       return bpf_map_config_foreach_key(map,
+                                         apply_obj_config_map_for_key,
+                                         NULL);
+}
+
+static int
+apply_obj_config_object(struct bpf_object *obj)
+{
+       struct bpf_map *map;
+       int err;
+
+       bpf_map__for_each(map, obj) {
+               err = apply_obj_config_map(map);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
+int bpf__apply_obj_config(void)
+{
+       struct bpf_object *obj, *tmp;
+       int err;
+
+       bpf_object__for_each_safe(obj, tmp) {
+               err = apply_obj_config_object(obj);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
 #define ERRNO_OFFSET(e)                ((e) - __BPF_LOADER_ERRNO__START)
 #define ERRCODE_OFFSET(c)      ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
 #define NR_ERRNO       (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -753,6 +1431,20 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = {
        [ERRCODE_OFFSET(PROLOGUE)]      = "Failed to generate prologue",
        [ERRCODE_OFFSET(PROLOGUE2BIG)]  = "Prologue too big for program",
        [ERRCODE_OFFSET(PROLOGUEOOB)]   = "Offset out of bound for prologue",
+       [ERRCODE_OFFSET(OBJCONF_OPT)]   = "Invalid object config option",
+       [ERRCODE_OFFSET(OBJCONF_CONF)]  = "Config value not set (missing '=')",
+       [ERRCODE_OFFSET(OBJCONF_MAP_OPT)]       = "Invalid object map config option",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOTEXIST)]  = "Target map doesn't exist",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUE)]     = "Incorrect value type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_TYPE)]      = "Incorrect map type",
+       [ERRCODE_OFFSET(OBJCONF_MAP_KEYSIZE)]   = "Incorrect map key size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_VALUESIZE)] = "Incorrect map value size",
+       [ERRCODE_OFFSET(OBJCONF_MAP_NOEVT)]     = "Event not found for map setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_MAPSIZE)]   = "Invalid map size for event setting",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTDIM)]    = "Event dimension too large",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTINH)]    = "Doesn't support inherit event",
+       [ERRCODE_OFFSET(OBJCONF_MAP_EVTTYPE)]   = "Wrong event type for map",
+       [ERRCODE_OFFSET(OBJCONF_MAP_IDX2BIG)]   = "Index too large",
 };
 
 static int
@@ -872,3 +1564,29 @@ int bpf__strerror_load(struct bpf_object *obj,
        bpf__strerror_end(buf, size);
        return 0;
 }
+
+int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                            struct parse_events_term *term __maybe_unused,
+                            struct perf_evlist *evlist __maybe_unused,
+                            int *error_pos __maybe_unused, int err,
+                            char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,
+                           "Can't use this config term with this map type");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
+
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
+{
+       bpf__strerror_head(err, buf, size);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,
+                           "Cannot set event to BPF map in multi-thread tracing");
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,
+                           "%s (Hint: use -i to turn off inherit)", emsg);
+       bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,
+                           "Can only put raw, hardware and BPF output event into a BPF map");
+       bpf__strerror_end(buf, size);
+       return 0;
+}
index 6fdc045..be43119 100644 (file)
@@ -10,6 +10,7 @@
 #include <string.h>
 #include <bpf/libbpf.h>
 #include "probe-event.h"
+#include "evlist.h"
 #include "debug.h"
 
 enum bpf_loader_errno {
@@ -24,10 +25,25 @@ enum bpf_loader_errno {
        BPF_LOADER_ERRNO__PROLOGUE,     /* Failed to generate prologue */
        BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */
        BPF_LOADER_ERRNO__PROLOGUEOOB,  /* Offset out of bound for prologue */
+       BPF_LOADER_ERRNO__OBJCONF_OPT,  /* Invalid object config option */
+       BPF_LOADER_ERRNO__OBJCONF_CONF, /* Config value not set (lost '=')) */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_OPT,      /* Invalid object map config option */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST, /* Target map not exist */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE,    /* Incorrect value type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE,     /* Incorrect map type */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE,  /* Incorrect map key size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE,/* Incorrect map value size */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT,    /* Event not found for map setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE,  /* Invalid map size for event setting */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM,   /* Event dimension too large */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH,   /* Doesn't support inherit event */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE,  /* Wrong event type for map */
+       BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG,  /* Index too large */
        __BPF_LOADER_ERRNO__END,
 };
 
 struct bpf_object;
+struct parse_events_term;
 #define PERF_BPF_PROBE_GROUP "perf_bpf_probe"
 
 typedef int (*bpf_prog_iter_callback_t)(struct probe_trace_event *tev,
@@ -53,6 +69,16 @@ int bpf__strerror_load(struct bpf_object *obj, int err,
                       char *buf, size_t size);
 int bpf__foreach_tev(struct bpf_object *obj,
                     bpf_prog_iter_callback_t func, void *arg);
+
+int bpf__config_obj(struct bpf_object *obj, struct parse_events_term *term,
+                   struct perf_evlist *evlist, int *error_pos);
+int bpf__strerror_config_obj(struct bpf_object *obj,
+                            struct parse_events_term *term,
+                            struct perf_evlist *evlist,
+                            int *error_pos, int err, char *buf,
+                            size_t size);
+int bpf__apply_obj_config(void);
+int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
 #else
 static inline struct bpf_object *
 bpf__prepare_load(const char *filename __maybe_unused,
@@ -83,6 +109,21 @@ bpf__foreach_tev(struct bpf_object *obj __maybe_unused,
        return 0;
 }
 
+static inline int
+bpf__config_obj(struct bpf_object *obj __maybe_unused,
+               struct parse_events_term *term __maybe_unused,
+               struct perf_evlist *evlist __maybe_unused,
+               int *error_pos __maybe_unused)
+{
+       return 0;
+}
+
+static inline int
+bpf__apply_obj_config(void)
+{
+       return 0;
+}
+
 static inline int
 __bpf_strerror(char *buf, size_t size)
 {
@@ -118,5 +159,23 @@ static inline int bpf__strerror_load(struct bpf_object *obj __maybe_unused,
 {
        return __bpf_strerror(buf, size);
 }
+
+static inline int
+bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused,
+                        struct parse_events_term *term __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused,
+                        int *error_pos __maybe_unused,
+                        int err __maybe_unused,
+                        char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
+
+static inline int
+bpf__strerror_apply_obj_config(int err __maybe_unused,
+                              char *buf, size_t size)
+{
+       return __bpf_strerror(buf, size);
+}
 #endif
 #endif
index 6a7e273..f1479ee 100644 (file)
@@ -166,6 +166,50 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
        return build_id__filename(build_id_hex, bf, size);
 }
 
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size)
+{
+       char *id_name, *ch;
+       struct stat sb;
+
+       id_name = dso__build_id_filename(dso, bf, size);
+       if (!id_name)
+               goto err;
+       if (access(id_name, F_OK))
+               goto err;
+       if (lstat(id_name, &sb) == -1)
+               goto err;
+       if ((size_t)sb.st_size > size - 1)
+               goto err;
+       if (readlink(id_name, bf, size - 1) < 0)
+               goto err;
+
+       bf[sb.st_size] = '\0';
+
+       /*
+        * link should be:
+        * ../../lib/modules/4.4.0-rc4/kernel/net/ipv4/netfilter/nf_nat_ipv4.ko/a09fe3eb3147dafa4e3b31dbd6257e4d696bdc92
+        */
+       ch = strrchr(bf, '/');
+       if (!ch)
+               goto err;
+       if (ch - 3 < bf)
+               goto err;
+
+       return strncmp(".ko", ch - 3, 3) == 0;
+err:
+       /*
+        * If dso__build_id_filename work, get id_name again,
+        * because id_name points to bf and is broken.
+        */
+       if (id_name)
+               id_name = dso__build_id_filename(dso, bf, size);
+       pr_err("Invalid build id: %s\n", id_name ? :
+                                        dso->long_name ? :
+                                        dso->short_name ? :
+                                        "[unknown]");
+       return false;
+}
+
 #define dsos__for_each_with_build_id(pos, head)        \
        list_for_each_entry(pos, head, node)    \
                if (!pos->has_build_id)         \
@@ -211,6 +255,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
        dsos__for_each_with_build_id(pos, &machine->dsos.head) {
                const char *name;
                size_t name_len;
+               bool in_kernel = false;
 
                if (!pos->hit)
                        continue;
@@ -227,8 +272,11 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                        name_len = pos->long_name_len + 1;
                }
 
+               in_kernel = pos->kernel ||
+                               is_kernel_module(name,
+                                       PERF_RECORD_MISC_CPUMODE_UNKNOWN);
                err = write_buildid(name, name_len, pos->build_id, machine->pid,
-                                   pos->kernel ? kmisc : umisc, fd);
+                                   in_kernel ? kmisc : umisc, fd);
                if (err)
                        break;
        }
index 27a14a8..64af3e2 100644 (file)
@@ -16,6 +16,7 @@ int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
 int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
 
 char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size);
+bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size);
 
 int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
                           struct perf_sample *sample, struct perf_evsel *evsel,
index 07b5d63..3ca453f 100644 (file)
@@ -23,6 +23,8 @@
 #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
 #define PERF_PAGER_ENVIRONMENT "PERF_PAGER"
 
+extern const char *config_exclusive_filename;
+
 typedef int (*config_fn_t)(const char *, const char *, void *);
 extern int perf_default_config(const char *, const char *, void *);
 extern int perf_config(config_fn_t fn, void *);
@@ -31,6 +33,7 @@ extern u64 perf_config_u64(const char *, const char *);
 extern int perf_config_bool(const char *, const char *);
 extern int config_error_nonbool(const char *);
 extern const char *perf_config_dirname(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
 
 char *alias_lookup(const char *alias);
 int split_cmdline(char *cmdline, const char ***argv);
index 53c43eb..24b4bd0 100644 (file)
@@ -416,7 +416,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
 /*
  * Fill the node with callchain values
  */
-static void
+static int
 fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
 {
        struct callchain_cursor_node *cursor_node;
@@ -433,7 +433,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                call = zalloc(sizeof(*call));
                if (!call) {
                        perror("not enough memory for the code path tree");
-                       return;
+                       return -1;
                }
                call->ip = cursor_node->ip;
                call->ms.sym = cursor_node->sym;
@@ -443,6 +443,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
                callchain_cursor_advance(cursor);
                cursor_node = callchain_cursor_current(cursor);
        }
+       return 0;
 }
 
 static struct callchain_node *
@@ -453,7 +454,19 @@ add_child(struct callchain_node *parent,
        struct callchain_node *new;
 
        new = create_child(parent, false);
-       fill_node(new, cursor);
+       if (new == NULL)
+               return NULL;
+
+       if (fill_node(new, cursor) < 0) {
+               struct callchain_list *call, *tmp;
+
+               list_for_each_entry_safe(call, tmp, &new->val, list) {
+                       list_del(&call->list);
+                       free(call);
+               }
+               free(new);
+               return NULL;
+       }
 
        new->children_hit = 0;
        new->hit = period;
@@ -462,16 +475,32 @@ add_child(struct callchain_node *parent,
        return new;
 }
 
-static s64 match_chain(struct callchain_cursor_node *node,
-                     struct callchain_list *cnode)
+enum match_result {
+       MATCH_ERROR  = -1,
+       MATCH_EQ,
+       MATCH_LT,
+       MATCH_GT,
+};
+
+static enum match_result match_chain(struct callchain_cursor_node *node,
+                                    struct callchain_list *cnode)
 {
        struct symbol *sym = node->sym;
+       u64 left, right;
 
        if (cnode->ms.sym && sym &&
-           callchain_param.key == CCKEY_FUNCTION)
-               return cnode->ms.sym->start - sym->start;
-       else
-               return cnode->ip - node->ip;
+           callchain_param.key == CCKEY_FUNCTION) {
+               left = cnode->ms.sym->start;
+               right = sym->start;
+       } else {
+               left = cnode->ip;
+               right = node->ip;
+       }
+
+       if (left == right)
+               return MATCH_EQ;
+
+       return left > right ? MATCH_GT : MATCH_LT;
 }
 
 /*
@@ -479,7 +508,7 @@ static s64 match_chain(struct callchain_cursor_node *node,
  * give a part of its callchain to the created child.
  * Then create another child to host the given callchain of new branch
  */
-static void
+static int
 split_add_child(struct callchain_node *parent,
                struct callchain_cursor *cursor,
                struct callchain_list *to_split,
@@ -491,6 +520,8 @@ split_add_child(struct callchain_node *parent,
 
        /* split */
        new = create_child(parent, true);
+       if (new == NULL)
+               return -1;
 
        /* split the callchain and move a part to the new child */
        old_tail = parent->val.prev;
@@ -524,6 +555,8 @@ split_add_child(struct callchain_node *parent,
 
                node = callchain_cursor_current(cursor);
                new = add_child(parent, cursor, period);
+               if (new == NULL)
+                       return -1;
 
                /*
                 * This is second child since we moved parent's children
@@ -534,7 +567,7 @@ split_add_child(struct callchain_node *parent,
                cnode = list_first_entry(&first->val, struct callchain_list,
                                         list);
 
-               if (match_chain(node, cnode) < 0)
+               if (match_chain(node, cnode) == MATCH_LT)
                        pp = &p->rb_left;
                else
                        pp = &p->rb_right;
@@ -545,14 +578,15 @@ split_add_child(struct callchain_node *parent,
                parent->hit = period;
                parent->count = 1;
        }
+       return 0;
 }
 
-static int
+static enum match_result
 append_chain(struct callchain_node *root,
             struct callchain_cursor *cursor,
             u64 period);
 
-static void
+static int
 append_chain_children(struct callchain_node *root,
                      struct callchain_cursor *cursor,
                      u64 period)
@@ -564,36 +598,42 @@ append_chain_children(struct callchain_node *root,
 
        node = callchain_cursor_current(cursor);
        if (!node)
-               return;
+               return -1;
 
        /* lookup in childrens */
        while (*p) {
-               s64 ret;
+               enum match_result ret;
 
                parent = *p;
                rnode = rb_entry(parent, struct callchain_node, rb_node_in);
 
                /* If at least first entry matches, rely to children */
                ret = append_chain(rnode, cursor, period);
-               if (ret == 0)
+               if (ret == MATCH_EQ)
                        goto inc_children_hit;
+               if (ret == MATCH_ERROR)
+                       return -1;
 
-               if (ret < 0)
+               if (ret == MATCH_LT)
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;
        }
        /* nothing in children, add to the current node */
        rnode = add_child(root, cursor, period);
+       if (rnode == NULL)
+               return -1;
+
        rb_link_node(&rnode->rb_node_in, parent, p);
        rb_insert_color(&rnode->rb_node_in, &root->rb_root_in);
 
 inc_children_hit:
        root->children_hit += period;
        root->children_count++;
+       return 0;
 }
 
-static int
+static enum match_result
 append_chain(struct callchain_node *root,
             struct callchain_cursor *cursor,
             u64 period)
@@ -602,7 +642,7 @@ append_chain(struct callchain_node *root,
        u64 start = cursor->pos;
        bool found = false;
        u64 matches;
-       int cmp = 0;
+       enum match_result cmp = MATCH_ERROR;
 
        /*
         * Lookup in the current node
@@ -618,7 +658,7 @@ append_chain(struct callchain_node *root,
                        break;
 
                cmp = match_chain(node, cnode);
-               if (cmp)
+               if (cmp != MATCH_EQ)
                        break;
 
                found = true;
@@ -628,7 +668,7 @@ append_chain(struct callchain_node *root,
 
        /* matches not, relay no the parent */
        if (!found) {
-               WARN_ONCE(!cmp, "Chain comparison error\n");
+               WARN_ONCE(cmp == MATCH_ERROR, "Chain comparison error\n");
                return cmp;
        }
 
@@ -636,21 +676,25 @@ append_chain(struct callchain_node *root,
 
        /* we match only a part of the node. Split it and add the new chain */
        if (matches < root->val_nr) {
-               split_add_child(root, cursor, cnode, start, matches, period);
-               return 0;
+               if (split_add_child(root, cursor, cnode, start, matches,
+                                   period) < 0)
+                       return MATCH_ERROR;
+
+               return MATCH_EQ;
        }
 
        /* we match 100% of the path, increment the hit */
        if (matches == root->val_nr && cursor->pos == cursor->nr) {
                root->hit += period;
                root->count++;
-               return 0;
+               return MATCH_EQ;
        }
 
        /* We match the node and still have a part remaining */
-       append_chain_children(root, cursor, period);
+       if (append_chain_children(root, cursor, period) < 0)
+               return MATCH_ERROR;
 
-       return 0;
+       return MATCH_EQ;
 }
 
 int callchain_append(struct callchain_root *root,
@@ -662,7 +706,8 @@ int callchain_append(struct callchain_root *root,
 
        callchain_cursor_commit(cursor);
 
-       append_chain_children(&root->node, cursor, period);
+       if (append_chain_children(&root->node, cursor, period) < 0)
+               return -1;
 
        if (cursor->nr > root->max_depth)
                root->max_depth = cursor->nr;
@@ -690,7 +735,8 @@ merge_chain_branch(struct callchain_cursor *cursor,
 
        if (src->hit) {
                callchain_cursor_commit(cursor);
-               append_chain_children(dst, cursor, src->hit);
+               if (append_chain_children(dst, cursor, src->hit) < 0)
+                       return -1;
        }
 
        n = rb_first(&src->rb_root_in);
index e5fb88b..43e84aa 100644 (file)
@@ -32,14 +32,15 @@ int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
        return 0;
 }
 
-int perf_color_default_config(const char *var, const char *value, void *cb)
+int perf_color_default_config(const char *var, const char *value,
+                             void *cb __maybe_unused)
 {
        if (!strcmp(var, "color.ui")) {
                perf_use_color_default = perf_config_colorbool(var, value, -1);
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int __color_vsnprintf(char *bf, size_t size, const char *color,
index d3e12e3..4e72763 100644 (file)
@@ -26,7 +26,7 @@ static const char *config_file_name;
 static int config_linenr;
 static int config_file_eof;
 
-static const char *config_exclusive_filename;
+const char *config_exclusive_filename;
 
 static int get_next_char(void)
 {
@@ -434,7 +434,7 @@ static int perf_config_from_file(config_fn_t fn, const char *filename, void *dat
        return ret;
 }
 
-static const char *perf_etc_perfconfig(void)
+const char *perf_etc_perfconfig(void)
 {
        static const char *system_wide;
        if (!system_wide)
index fa93509..9bcf2be 100644 (file)
@@ -8,6 +8,10 @@
 #include <linux/bitmap.h>
 #include "asm/bug.h"
 
+static int max_cpu_num;
+static int max_node_num;
+static int *cpunode_map;
+
 static struct cpu_map *cpu_map__default_new(void)
 {
        struct cpu_map *cpus;
@@ -486,6 +490,32 @@ out:
                pr_err("Failed to read max nodes, using default of %d\n", max_node_num);
 }
 
+int cpu__max_node(void)
+{
+       if (unlikely(!max_node_num))
+               set_max_node_num();
+
+       return max_node_num;
+}
+
+int cpu__max_cpu(void)
+{
+       if (unlikely(!max_cpu_num))
+               set_max_cpu_num();
+
+       return max_cpu_num;
+}
+
+int cpu__get_node(int cpu)
+{
+       if (unlikely(cpunode_map == NULL)) {
+               pr_debug("cpu_map not initialized\n");
+               return -1;
+       }
+
+       return cpunode_map[cpu];
+}
+
 static int init_cpunode_map(void)
 {
        int i;
index 71c41b9..81a2562 100644 (file)
@@ -57,37 +57,11 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
        return map ? map->map[0] == -1 : true;
 }
 
-int max_cpu_num;
-int max_node_num;
-int *cpunode_map;
-
 int cpu__setup_cpunode_map(void);
 
-static inline int cpu__max_node(void)
-{
-       if (unlikely(!max_node_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_node_num;
-}
-
-static inline int cpu__max_cpu(void)
-{
-       if (unlikely(!max_cpu_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_cpu_num;
-}
-
-static inline int cpu__get_node(int cpu)
-{
-       if (unlikely(cpunode_map == NULL)) {
-               pr_debug("cpu_map not initialized\n");
-               return -1;
-       }
-
-       return cpunode_map[cpu];
-}
+int cpu__max_node(void);
+int cpu__max_cpu(void);
+int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
                       int (*f)(struct cpu_map *map, int cpu, void *data),
index aada3ac..d4a5a21 100644 (file)
@@ -31,9 +31,18 @@ unsigned char sane_ctype[256] = {
 };
 
 const char *graph_line =
+       "_____________________________________________________________________"
        "_____________________________________________________________________"
        "_____________________________________________________________________";
 const char *graph_dotted_line =
        "---------------------------------------------------------------------"
        "---------------------------------------------------------------------"
        "---------------------------------------------------------------------";
+const char *spaces =
+       "                                                                     "
+       "                                                                     "
+       "                                                                     ";
+const char *dots =
+       "....................................................................."
+       "....................................................................."
+       ".....................................................................";
index 34cd1e4..811af89 100644 (file)
@@ -352,6 +352,84 @@ static int add_tracepoint_values(struct ctf_writer *cw,
        return ret;
 }
 
+static int
+add_bpf_output_values(struct bt_ctf_event_class *event_class,
+                     struct bt_ctf_event *event,
+                     struct perf_sample *sample)
+{
+       struct bt_ctf_field_type *len_type, *seq_type;
+       struct bt_ctf_field *len_field, *seq_field;
+       unsigned int raw_size = sample->raw_size;
+       unsigned int nr_elements = raw_size / sizeof(u32);
+       unsigned int i;
+       int ret;
+
+       if (nr_elements * sizeof(u32) != raw_size)
+               pr_warning("Incorrect raw_size (%u) in bpf output event, skip %lu bytes\n",
+                          raw_size, nr_elements * sizeof(u32) - raw_size);
+
+       len_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_len");
+       len_field = bt_ctf_field_create(len_type);
+       if (!len_field) {
+               pr_err("failed to create 'raw_len' for bpf output event\n");
+               ret = -1;
+               goto put_len_type;
+       }
+
+       ret = bt_ctf_field_unsigned_integer_set_value(len_field, nr_elements);
+       if (ret) {
+               pr_err("failed to set field value for raw_len\n");
+               goto put_len_field;
+       }
+       ret = bt_ctf_event_set_payload(event, "raw_len", len_field);
+       if (ret) {
+               pr_err("failed to set payload to raw_len\n");
+               goto put_len_field;
+       }
+
+       seq_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_data");
+       seq_field = bt_ctf_field_create(seq_type);
+       if (!seq_field) {
+               pr_err("failed to create 'raw_data' for bpf output event\n");
+               ret = -1;
+               goto put_seq_type;
+       }
+
+       ret = bt_ctf_field_sequence_set_length(seq_field, len_field);
+       if (ret) {
+               pr_err("failed to set length of 'raw_data'\n");
+               goto put_seq_field;
+       }
+
+       for (i = 0; i < nr_elements; i++) {
+               struct bt_ctf_field *elem_field =
+                       bt_ctf_field_sequence_get_field(seq_field, i);
+
+               ret = bt_ctf_field_unsigned_integer_set_value(elem_field,
+                               ((u32 *)(sample->raw_data))[i]);
+
+               bt_ctf_field_put(elem_field);
+               if (ret) {
+                       pr_err("failed to set raw_data[%d]\n", i);
+                       goto put_seq_field;
+               }
+       }
+
+       ret = bt_ctf_event_set_payload(event, "raw_data", seq_field);
+       if (ret)
+               pr_err("failed to set payload for raw_data\n");
+
+put_seq_field:
+       bt_ctf_field_put(seq_field);
+put_seq_type:
+       bt_ctf_field_type_put(seq_type);
+put_len_field:
+       bt_ctf_field_put(len_field);
+put_len_type:
+       bt_ctf_field_type_put(len_type);
+       return ret;
+}
+
 static int add_generic_values(struct ctf_writer *cw,
                              struct bt_ctf_event *event,
                              struct perf_evsel *evsel,
@@ -597,6 +675,12 @@ static int process_sample_event(struct perf_tool *tool,
                        return -1;
        }
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_values(event_class, event, sample);
+               if (ret)
+                       return -1;
+       }
+
        cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
        if (cs) {
                if (is_flush_needed(cs))
@@ -744,6 +828,25 @@ static int add_tracepoint_types(struct ctf_writer *cw,
        return ret;
 }
 
+static int add_bpf_output_types(struct ctf_writer *cw,
+                               struct bt_ctf_event_class *class)
+{
+       struct bt_ctf_field_type *len_type = cw->data.u32;
+       struct bt_ctf_field_type *seq_base_type = cw->data.u32_hex;
+       struct bt_ctf_field_type *seq_type;
+       int ret;
+
+       ret = bt_ctf_event_class_add_field(class, len_type, "raw_len");
+       if (ret)
+               return ret;
+
+       seq_type = bt_ctf_field_type_sequence_create(seq_base_type, "raw_len");
+       if (!seq_type)
+               return -1;
+
+       return bt_ctf_event_class_add_field(class, seq_type, "raw_data");
+}
+
 static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
                             struct bt_ctf_event_class *event_class)
 {
@@ -755,7 +858,8 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
         *                              ctf event header
         *   PERF_SAMPLE_READ         - TODO
         *   PERF_SAMPLE_CALLCHAIN    - TODO
-        *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
+        *   PERF_SAMPLE_RAW          - tracepoint fields and BPF output
+        *                              are handled separately
         *   PERF_SAMPLE_BRANCH_STACK - TODO
         *   PERF_SAMPLE_REGS_USER    - TODO
         *   PERF_SAMPLE_STACK_USER   - TODO
@@ -824,6 +928,12 @@ static int add_event(struct ctf_writer *cw, struct perf_evsel *evsel)
                        goto err;
        }
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               ret = add_bpf_output_types(cw, event_class);
+               if (ret)
+                       goto err;
+       }
+
        ret = bt_ctf_stream_class_add_event_class(cw->stream_class, event_class);
        if (ret) {
                pr("Failed to add event class into stream.\n");
@@ -858,6 +968,23 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
        return 0;
 }
 
+static void cleanup_events(struct perf_session *session)
+{
+       struct perf_evlist *evlist = session->evlist;
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               struct evsel_priv *priv;
+
+               priv = evsel->priv;
+               bt_ctf_event_class_put(priv->event_class);
+               zfree(&evsel->priv);
+       }
+
+       perf_evlist__delete(evlist);
+       session->evlist = NULL;
+}
+
 static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
 {
        struct ctf_stream **stream;
@@ -953,6 +1080,12 @@ static struct bt_ctf_field_type *create_int_type(int size, bool sign, bool hex)
            bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL))
                goto err;
 
+#if __BYTE_ORDER == __BIG_ENDIAN
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_BIG_ENDIAN);
+#else
+       bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_LITTLE_ENDIAN);
+#endif
+
        pr2("Created type: INTEGER %d-bit %ssigned %s\n",
            size, sign ? "un" : "", hex ? "hex" : "");
        return type;
@@ -1100,7 +1233,7 @@ static int convert__config(const char *var, const char *value, void *cb)
                return 0;
        }
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 int bt_convert__perf2ctf(const char *input, const char *path, bool force)
@@ -1171,6 +1304,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
                (double) c.events_size / 1024.0 / 1024.0,
                c.events_count);
 
+       cleanup_events(session);
        perf_session__delete(session);
        ctf_writer__cleanup(cw);
 
index 86d9c73..8c4212a 100644 (file)
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <api/debug.h>
 
 #include "cache.h"
 #include "color.h"
@@ -22,7 +23,7 @@ int debug_ordered_events;
 static int redirect_to_stderr;
 int debug_data_convert;
 
-static int _eprintf(int level, int var, const char *fmt, va_list args)
+int veprintf(int level, int var, const char *fmt, va_list args)
 {
        int ret = 0;
 
@@ -36,24 +37,19 @@ static int _eprintf(int level, int var, const char *fmt, va_list args)
        return ret;
 }
 
-int veprintf(int level, int var, const char *fmt, va_list args)
-{
-       return _eprintf(level, var, fmt, args);
-}
-
 int eprintf(int level, int var, const char *fmt, ...)
 {
        va_list args;
        int ret;
 
        va_start(args, fmt);
-       ret = _eprintf(level, var, fmt, args);
+       ret = veprintf(level, var, fmt, args);
        va_end(args);
 
        return ret;
 }
 
-static int __eprintf_time(u64 t, const char *fmt, va_list args)
+static int veprintf_time(u64 t, const char *fmt, va_list args)
 {
        int ret = 0;
        u64 secs, usecs, nsecs = t;
@@ -75,7 +71,7 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...)
 
        if (var >= level) {
                va_start(args, fmt);
-               ret = __eprintf_time(t, fmt, args);
+               ret = veprintf_time(t, fmt, args);
                va_end(args);
        }
 
@@ -91,7 +87,7 @@ void pr_stat(const char *fmt, ...)
        va_list args;
 
        va_start(args, fmt);
-       _eprintf(1, verbose, fmt, args);
+       veprintf(1, verbose, fmt, args);
        va_end(args);
        eprintf(1, verbose, "\n");
 }
@@ -110,40 +106,61 @@ int dump_printf(const char *fmt, ...)
        return ret;
 }
 
+static void trace_event_printer(enum binary_printer_ops op,
+                               unsigned int val, void *extra)
+{
+       const char *color = PERF_COLOR_BLUE;
+       union perf_event *event = (union perf_event *)extra;
+       unsigned char ch = (unsigned char)val;
+
+       switch (op) {
+       case BINARY_PRINT_DATA_BEGIN:
+               printf(".");
+               color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+                               event->header.size);
+               break;
+       case BINARY_PRINT_LINE_BEGIN:
+               printf(".");
+               break;
+       case BINARY_PRINT_ADDR:
+               color_fprintf(stdout, color, "  %04x: ", val);
+               break;
+       case BINARY_PRINT_NUM_DATA:
+               color_fprintf(stdout, color, " %02x", val);
+               break;
+       case BINARY_PRINT_NUM_PAD:
+               color_fprintf(stdout, color, "   ");
+               break;
+       case BINARY_PRINT_SEP:
+               color_fprintf(stdout, color, "  ");
+               break;
+       case BINARY_PRINT_CHAR_DATA:
+               color_fprintf(stdout, color, "%c",
+                             isprint(ch) ? ch : '.');
+               break;
+       case BINARY_PRINT_CHAR_PAD:
+               color_fprintf(stdout, color, " ");
+               break;
+       case BINARY_PRINT_LINE_END:
+               color_fprintf(stdout, color, "\n");
+               break;
+       case BINARY_PRINT_DATA_END:
+               printf("\n");
+               break;
+       default:
+               break;
+       }
+}
+
 void trace_event(union perf_event *event)
 {
        unsigned char *raw_event = (void *)event;
-       const char *color = PERF_COLOR_BLUE;
-       int i, j;
 
        if (!dump_trace)
                return;
 
-       printf(".");
-       color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
-                     event->header.size);
-
-       for (i = 0; i < event->header.size; i++) {
-               if ((i & 15) == 0) {
-                       printf(".");
-                       color_fprintf(stdout, color, "  %04x: ", i);
-               }
-
-               color_fprintf(stdout, color, " %02x", raw_event[i]);
-
-               if (((i & 15) == 15) || i == event->header.size-1) {
-                       color_fprintf(stdout, color, "  ");
-                       for (j = 0; j < 15-(i & 15); j++)
-                               color_fprintf(stdout, color, "   ");
-                       for (j = i & ~15; j <= i; j++) {
-                               color_fprintf(stdout, color, "%c",
-                                             isprint(raw_event[j]) ?
-                                             raw_event[j] : '.');
-                       }
-                       color_fprintf(stdout, color, "\n");
-               }
-       }
-       printf(".\n");
+       print_binary(raw_event, event->header.size, 16,
+                    trace_event_printer, event);
 }
 
 static struct debug_variable {
@@ -192,3 +209,23 @@ int perf_debug_option(const char *str)
        free(s);
        return 0;
 }
+
+#define DEBUG_WRAPPER(__n, __l)                                \
+static int pr_ ## __n ## _wrapper(const char *fmt, ...)        \
+{                                                      \
+       va_list args;                                   \
+       int ret;                                        \
+                                                       \
+       va_start(args, fmt);                            \
+       ret = veprintf(__l, verbose, fmt, args);        \
+       va_end(args);                                   \
+       return ret;                                     \
+}
+
+DEBUG_WRAPPER(warning, 0);
+DEBUG_WRAPPER(debug, 1);
+
+void perf_debug_setup(void)
+{
+       libapi_set_print(pr_warning_wrapper, pr_warning_wrapper, pr_debug_wrapper);
+}
index 8b9a088..14bafda 100644 (file)
@@ -53,5 +53,6 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__(
 int veprintf(int level, int var, const char *fmt, va_list args);
 
 int perf_debug_option(const char *str);
+void perf_debug_setup(void);
 
 #endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
new file mode 100644 (file)
index 0000000..3e6062a
--- /dev/null
@@ -0,0 +1,199 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include "util.h"
+#include "debug.h"
+#include "symbol.h"
+
+#include "demangle-java.h"
+
+enum {
+       MODE_PREFIX = 0,
+       MODE_CLASS  = 1,
+       MODE_FUNC   = 2,
+       MODE_TYPE   = 3,
+       MODE_CTYPE  = 3, /* class arg */
+};
+
+#define BASE_ENT(c, n) [c - 'A']=n
+static const char *base_types['Z' - 'A' + 1] = {
+       BASE_ENT('B', "byte" ),
+       BASE_ENT('C', "char" ),
+       BASE_ENT('D', "double" ),
+       BASE_ENT('F', "float" ),
+       BASE_ENT('I', "int" ),
+       BASE_ENT('J', "long" ),
+       BASE_ENT('S', "short" ),
+       BASE_ENT('Z', "bool" ),
+};
+
+/*
+ * demangle Java symbol between str and end positions and stores
+ * up to maxlen characters into buf. The parser starts in mode.
+ *
+ * Use MODE_PREFIX to process entire prototype till end position
+ * Use MODE_TYPE to process return type if str starts on return type char
+ *
+ *  Return:
+ *     success: buf
+ *     error  : NULL
+ */
+static char *
+__demangle_java_sym(const char *str, const char *end, char *buf, int maxlen, int mode)
+{
+       int rlen = 0;
+       int array = 0;
+       int narg = 0;
+       const char *q;
+
+       if (!end)
+               end = str + strlen(str);
+
+       for (q = str; q != end; q++) {
+
+               if (rlen == (maxlen - 1))
+                       break;
+
+               switch (*q) {
+               case 'L':
+                       if (mode == MODE_PREFIX || mode == MODE_CTYPE) {
+                               if (mode == MODE_CTYPE) {
+                                       if (narg)
+                                               rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                                       narg++;
+                               }
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "class ");
+                               if (mode == MODE_PREFIX)
+                                       mode = MODE_CLASS;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'B':
+               case 'C':
+               case 'D':
+               case 'F':
+               case 'I':
+               case 'J':
+               case 'S':
+               case 'Z':
+                       if (mode == MODE_TYPE) {
+                               if (narg)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, ", ");
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "%s", base_types[*q - 'A']);
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                               narg++;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case 'V':
+                       if (mode == MODE_TYPE) {
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, "void");
+                               while (array--)
+                                       rlen += scnprintf(buf + rlen, maxlen - rlen, "[]");
+                               array = 0;
+                       } else
+                               buf[rlen++] = *q;
+                       break;
+               case '[':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       array++;
+                       break;
+               case '(':
+                       if (mode != MODE_FUNC)
+                               goto error;
+                       buf[rlen++] = *q;
+                       mode = MODE_TYPE;
+                       break;
+               case ')':
+                       if (mode != MODE_TYPE)
+                               goto error;
+                       buf[rlen++] = *q;
+                       narg = 0;
+                       break;
+               case ';':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       /* safe because at least one other char to process */
+                       if (isalpha(*(q + 1)))
+                               rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       if (mode == MODE_CLASS)
+                               mode = MODE_FUNC;
+                       else if (mode == MODE_CTYPE)
+                               mode = MODE_TYPE;
+                       break;
+               case '/':
+                       if (mode != MODE_CLASS && mode != MODE_CTYPE)
+                               goto error;
+                       rlen += scnprintf(buf + rlen, maxlen - rlen, ".");
+                       break;
+               default :
+                       buf[rlen++] = *q;
+               }
+       }
+       buf[rlen] = '\0';
+       return buf;
+error:
+       return NULL;
+}
+
+/*
+ * Demangle Java function signature (openJDK, not GCJ)
+ * input:
+ *     str: string to parse. String is not modified
+ *    flags: comobination of JAVA_DEMANGLE_* flags to modify demangling
+ * return:
+ *     if input can be demangled, then a newly allocated string is returned.
+ *     if input cannot be demangled, then NULL is returned
+ *
+ * Note: caller is responsible for freeing demangled string
+ */
+char *
+java_demangle_sym(const char *str, int flags)
+{
+       char *buf, *ptr;
+       char *p;
+       size_t len, l1 = 0;
+
+       if (!str)
+               return NULL;
+
+       /* find start of retunr type */
+       p = strrchr(str, ')');
+       if (!p)
+               return NULL;
+
+       /*
+        * expansion factor estimated to 3x
+        */
+       len = strlen(str) * 3 + 1;
+       buf = malloc(len);
+       if (!buf)
+               return NULL;
+
+       buf[0] = '\0';
+       if (!(flags & JAVA_DEMANGLE_NORET)) {
+               /*
+                * get return type first
+                */
+               ptr = __demangle_java_sym(p + 1, NULL, buf, len, MODE_TYPE);
+               if (!ptr)
+                       goto error;
+
+               /* add space between return type and function prototype */
+               l1 = strlen(buf);
+               buf[l1++] = ' ';
+       }
+
+       /* process function up to return type */
+       ptr = __demangle_java_sym(str, p + 1, buf + l1, len - l1, MODE_PREFIX);
+       if (!ptr)
+               goto error;
+
+       return buf;
+error:
+       free(buf);
+       return NULL;
+}
diff --git a/tools/perf/util/demangle-java.h b/tools/perf/util/demangle-java.h
new file mode 100644 (file)
index 0000000..a981c1f
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __PERF_DEMANGLE_JAVA
+#define __PERF_DEMANGLE_JAVA 1
+/*
+ * demangle function flags
+ */
+#define JAVA_DEMANGLE_NORET    0x1 /* do not process return type */
+
+char * java_demangle_sym(const char *str, int flags);
+
+#endif /* __PERF_DEMANGLE_JAVA */
index e8e9a9d..8e63954 100644 (file)
@@ -52,6 +52,11 @@ int dso__read_binary_type_filename(const struct dso *dso,
                        debuglink--;
                if (*debuglink == '/')
                        debuglink++;
+
+               ret = -1;
+               if (!is_regular_file(filename))
+                       break;
+
                ret = filename__read_debuglink(filename, debuglink,
                                               size - (debuglink - filename));
                }
index 7dd5939..49a11d9 100644 (file)
@@ -6,6 +6,8 @@ struct perf_env perf_env;
 
 void perf_env__exit(struct perf_env *env)
 {
+       int i;
+
        zfree(&env->hostname);
        zfree(&env->os_release);
        zfree(&env->version);
@@ -19,6 +21,10 @@ void perf_env__exit(struct perf_env *env)
        zfree(&env->numa_nodes);
        zfree(&env->pmu_mappings);
        zfree(&env->cpu);
+
+       for (i = 0; i < env->caches_cnt; i++)
+               cpu_cache_level__free(&env->caches[i]);
+       zfree(&env->caches);
 }
 
 int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[])
@@ -75,3 +81,10 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
        env->nr_cpus_avail = nr_cpus;
        return 0;
 }
+
+void cpu_cache_level__free(struct cpu_cache_level *cache)
+{
+       free(cache->type);
+       free(cache->map);
+       free(cache->size);
+}
index 0132b95..56cffb6 100644 (file)
@@ -1,11 +1,23 @@
 #ifndef __PERF_ENV_H
 #define __PERF_ENV_H
 
+#include <linux/types.h>
+
 struct cpu_topology_map {
        int     socket_id;
        int     core_id;
 };
 
+struct cpu_cache_level {
+       u32     level;
+       u32     line_size;
+       u32     sets;
+       u32     ways;
+       char    *type;
+       char    *size;
+       char    *map;
+};
+
 struct perf_env {
        char                    *hostname;
        char                    *os_release;
@@ -31,6 +43,8 @@ struct perf_env {
        char                    *numa_nodes;
        char                    *pmu_mappings;
        struct cpu_topology_map *cpu;
+       struct cpu_cache_level  *caches;
+       int                      caches_cnt;
 };
 
 extern struct perf_env perf_env;
@@ -41,4 +55,5 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]);
 
 int perf_env__read_cpu_topology_map(struct perf_env *env);
 
+void cpu_cache_level__free(struct cpu_cache_level *cache);
 #endif /* __PERF_ENV_H */
index 85155e9..7bad5c3 100644 (file)
@@ -282,7 +282,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                strcpy(execname, "");
 
                /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n",
+               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %[^\n]\n",
                       &event->mmap2.start, &event->mmap2.len, prot,
                       &event->mmap2.pgoff, &event->mmap2.maj,
                       &event->mmap2.min,
index d81f13d..86a0383 100644 (file)
@@ -1181,12 +1181,12 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
         */
        if (cpus != evlist->cpus) {
                cpu_map__put(evlist->cpus);
-               evlist->cpus = cpus;
+               evlist->cpus = cpu_map__get(cpus);
        }
 
        if (threads != evlist->threads) {
                thread_map__put(evlist->threads);
-               evlist->threads = threads;
+               evlist->threads = thread_map__get(threads);
        }
 
        perf_evlist__propagate_maps(evlist);
@@ -1223,6 +1223,9 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter)
        int err = 0;
 
        evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+                       continue;
+
                err = perf_evsel__set_filter(evsel, filter);
                if (err)
                        break;
@@ -1624,7 +1627,7 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp)
        return printed + fprintf(fp, "\n");
 }
 
-int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
+int perf_evlist__strerror_open(struct perf_evlist *evlist,
                               int err, char *buf, size_t size)
 {
        int printed, value;
@@ -1652,7 +1655,25 @@ int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
                                    "Hint:\tTry: 'sudo sh -c \"echo -1 > /proc/sys/kernel/perf_event_paranoid\"'\n"
                                    "Hint:\tThe current value is %d.", value);
                break;
+       case EINVAL: {
+               struct perf_evsel *first = perf_evlist__first(evlist);
+               int max_freq;
+
+               if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0)
+                       goto out_default;
+
+               if (first->attr.sample_freq < (u64)max_freq)
+                       goto out_default;
+
+               printed = scnprintf(buf, size,
+                                   "Error:\t%s.\n"
+                                   "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n"
+                                   "Hint:\tThe current value is %d and %" PRIu64 " is being requested.",
+                                   emsg, max_freq, first->attr.sample_freq);
+               break;
+       }
        default:
+out_default:
                scnprintf(buf, size, "%s", emsg);
                break;
        }
@@ -1723,3 +1744,19 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
 
        tracking_evsel->tracking = true;
 }
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist,
+                              const char *str)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               if (!evsel->name)
+                       continue;
+               if (strcmp(str, evsel->name) == 0)
+                       return evsel;
+       }
+
+       return NULL;
+}
index 7c4d9a2..a0d1522 100644 (file)
@@ -294,4 +294,7 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
                                     struct perf_evsel *tracking_evsel);
 
 void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr);
+
+struct perf_evsel *
+perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, const char *str);
 #endif /* __PERF_EVLIST_H */
index cdbaf9b..0902fe4 100644 (file)
@@ -225,6 +225,11 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
        if (evsel != NULL)
                perf_evsel__init(evsel, attr, idx);
 
+       if (perf_evsel__is_bpf_output(evsel)) {
+               evsel->attr.sample_type |= PERF_SAMPLE_RAW;
+               evsel->attr.sample_period = 1;
+       }
+
        return evsel;
 }
 
@@ -898,6 +903,16 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
        if (evsel->precise_max)
                perf_event_attr__set_max_precise_ip(attr);
 
+       if (opts->all_user) {
+               attr->exclude_kernel = 1;
+               attr->exclude_user   = 0;
+       }
+
+       if (opts->all_kernel) {
+               attr->exclude_kernel = 0;
+               attr->exclude_user   = 1;
+       }
+
        /*
         * Apply event specific term settings,
         * it overloads any global configuration.
@@ -2362,12 +2377,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
        case EPERM:
        case EACCES:
                return scnprintf(msg, size,
-                "You may not have permission to collect %sstats.\n"
-                "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
-                " -1 - Not paranoid at all\n"
-                "  0 - Disallow raw tracepoint access for unpriv\n"
-                "  1 - Disallow cpu events for unpriv\n"
-                "  2 - Disallow kernel profiling for unpriv",
+                "You may not have permission to collect %sstats.\n\n"
+                "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
+                "which controls use of the performance events system by\n"
+                "unprivileged users (without CAP_SYS_ADMIN).\n\n"
+                "The default value is 1:\n\n"
+                "  -1: Allow use of (almost) all events by all users\n"
+                ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
+                ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
+                ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN",
                                 target->system_wide ? "system-wide " : "");
        case ENOENT:
                return scnprintf(msg, size, "The %s event is not supported.",
index 8e75434..501ea6e 100644 (file)
@@ -93,10 +93,8 @@ struct perf_evsel {
        const char              *unit;
        struct event_format     *tp_format;
        off_t                   id_offset;
-       union {
-               void            *priv;
-               u64             db_id;
-       };
+       void                    *priv;
+       u64                     db_id;
        struct cgroup_sel       *cgrp;
        void                    *handler;
        struct cpu_map          *cpus;
@@ -364,6 +362,14 @@ static inline bool perf_evsel__is_function_event(struct perf_evsel *evsel)
 #undef FUNCTION_EVENT
 }
 
+static inline bool perf_evsel__is_bpf_output(struct perf_evsel *evsel)
+{
+       struct perf_event_attr *attr = &evsel->attr;
+
+       return (attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
+               (attr->type == PERF_TYPE_SOFTWARE);
+}
+
 struct perf_attr_details {
        bool freq;
        bool verbose;
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
new file mode 100644 (file)
index 0000000..c1ef805
--- /dev/null
@@ -0,0 +1,449 @@
+/*
+ * genelf.c
+ * Copyright (C) 2014, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@gmail.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define JVMTI
+
+#define BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef HAVE_LIBCRYPTO
+
+#define BUILD_ID_MD5
+#undef BUILD_ID_SHA    /* does not seem to work well when linked with Java */
+#undef BUILD_ID_URANDOM /* different uuid for each run */
+
+#ifdef BUILD_ID_SHA
+#include <openssl/sha.h>
+#endif
+
+#ifdef BUILD_ID_MD5
+#include <openssl/md5.h>
+#endif
+#endif
+
+
+typedef struct {
+  unsigned int namesz;  /* Size of entry's owner string */
+  unsigned int descsz;  /* Size of the note descriptor */
+  unsigned int type;    /* Interpretation of the descriptor */
+  char         name[0]; /* Start of the name+desc data */
+} Elf_Note;
+
+struct options {
+       char *output;
+       int fd;
+};
+
+static char shd_string_table[] = {
+       0,
+       '.', 't', 'e', 'x', 't', 0,                     /*  1 */
+       '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', 0, /*  7 */
+       '.', 's', 'y', 'm', 't', 'a', 'b', 0,           /* 17 */
+       '.', 's', 't', 'r', 't', 'a', 'b', 0,           /* 25 */
+       '.', 'n', 'o', 't', 'e', '.', 'g', 'n', 'u', '.', 'b', 'u', 'i', 'l', 'd', '-', 'i', 'd', 0, /* 33 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'l', 'i', 'n', 'e', 0, /* 52 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'i', 'n', 'f', 'o', 0, /* 64 */
+       '.', 'd', 'e', 'b', 'u', 'g', '_', 'a', 'b', 'b', 'r', 'e', 'v', 0, /* 76 */
+};
+
+static struct buildid_note {
+       Elf_Note desc;          /* descsz: size of build-id, must be multiple of 4 */
+       char     name[4];       /* GNU\0 */
+       char     build_id[20];
+} bnote;
+
+static Elf_Sym symtab[]={
+       /* symbol 0 MUST be the undefined symbol */
+       { .st_name  = 0, /* index in sym_string table */
+         .st_info  = ELF_ST_TYPE(STT_NOTYPE),
+         .st_shndx = 0, /* for now */
+         .st_value = 0x0,
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0,
+       },
+       { .st_name  = 1, /* index in sym_string table */
+         .st_info  = ELF_ST_BIND(STB_LOCAL) | ELF_ST_TYPE(STT_FUNC),
+         .st_shndx = 1,
+         .st_value = 0, /* for now */
+         .st_other = ELF_ST_VIS(STV_DEFAULT),
+         .st_size  = 0, /* for now */
+       }
+};
+
+#ifdef BUILD_ID_URANDOM
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code __maybe_unused,
+            size_t csize __maybe_unused)
+{
+       int fd;
+       size_t sz = sizeof(note->build_id);
+       ssize_t sret;
+
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd == -1)
+               err(1, "cannot access /dev/urandom for builid");
+
+       sret = read(fd, note->build_id, sz);
+
+       close(fd);
+
+       if (sret != (ssize_t)sz)
+               memset(note->build_id, 0, sz);
+}
+#endif
+
+#ifdef BUILD_ID_SHA
+static void
+gen_build_id(struct buildid_note *note,
+            unsigned long load_addr __maybe_unused,
+            const void *code,
+            size_t csize)
+{
+       if (sizeof(note->build_id) < SHA_DIGEST_LENGTH)
+               errx(1, "build_id too small for SHA1");
+
+       SHA1(code, csize, (unsigned char *)note->build_id);
+}
+#endif
+
+#ifdef BUILD_ID_MD5
+static void
+gen_build_id(struct buildid_note *note, unsigned long load_addr, const void *code, size_t csize)
+{
+       MD5_CTX context;
+
+       if (sizeof(note->build_id) < 16)
+               errx(1, "build_id too small for MD5");
+
+       MD5_Init(&context);
+       MD5_Update(&context, &load_addr, sizeof(load_addr));
+       MD5_Update(&context, code, csize);
+       MD5_Final((unsigned char *)note->build_id, &context);
+}
+#endif
+
+/*
+ * fd: file descriptor open for writing for the output file
+ * load_addr: code load address (could be zero, just used for buildid)
+ * sym: function name (for native code - used as the symbol)
+ * code: the native code
+ * csize: the code size in bytes
+ */
+int
+jit_write_elf(int fd, uint64_t load_addr, const char *sym,
+             const void *code, int csize,
+             void *debug, int nr_debug_entries)
+{
+       Elf *e;
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Ehdr *ehdr;
+       Elf_Shdr *shdr;
+       char *strsym = NULL;
+       int symlen;
+       int retval = -1;
+
+       if (elf_version(EV_CURRENT) == EV_NONE) {
+               warnx("ELF initialization failed");
+               return -1;
+       }
+
+       e = elf_begin(fd, ELF_C_WRITE, NULL);
+       if (!e) {
+               warnx("elf_begin failed");
+               goto error;
+       }
+
+       /*
+        * setup ELF header
+        */
+       ehdr = elf_newehdr(e);
+       if (!ehdr) {
+               warnx("cannot get ehdr");
+               goto error;
+       }
+
+       ehdr->e_ident[EI_DATA] = GEN_ELF_ENDIAN;
+       ehdr->e_ident[EI_CLASS] = GEN_ELF_CLASS;
+       ehdr->e_machine = GEN_ELF_ARCH;
+       ehdr->e_type = ET_DYN;
+       ehdr->e_entry = GEN_ELF_TEXT_OFFSET;
+       ehdr->e_version = EV_CURRENT;
+       ehdr->e_shstrndx= 2; /* shdr index for section name */
+
+       /*
+        * setup text section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 16;
+       d->d_off = 0LL;
+       d->d_buf = (void *)code;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = csize;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 1;
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = GEN_ELF_TEXT_OFFSET;
+       shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup section headers string table
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = shd_string_table;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(shd_string_table);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 7; /* offset of '.shstrtab' in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup symtab section
+        */
+       symtab[1].st_size  = csize;
+       symtab[1].st_value = GEN_ELF_TEXT_OFFSET;
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 8;
+       d->d_off = 0LL;
+       d->d_buf = symtab;
+       d->d_type = ELF_T_SYM;
+       d->d_size = sizeof(symtab);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 17; /* offset of '.symtab' in shd_string_table */
+       shdr->sh_type = SHT_SYMTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = sizeof(Elf_Sym);
+       shdr->sh_link = 4; /* index of .strtab section */
+
+       /*
+        * setup symbols string table
+        * 2 = 1 for 0 in 1st entry, 1 for the 0 at end of symbol for 2nd entry
+        */
+       symlen = 2 + strlen(sym);
+       strsym = calloc(1, symlen);
+       if (!strsym) {
+               warnx("cannot allocate strsym");
+               goto error;
+       }
+       strcpy(strsym + 1, sym);
+
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = strsym;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = symlen;
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 25; /* offset in shd_string_table */
+       shdr->sh_type = SHT_STRTAB;
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup build-id section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               goto error;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               goto error;
+       }
+
+       /*
+        * build-id generation
+        */
+       gen_build_id(&bnote, load_addr, code, csize);
+       bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */
+       bnote.desc.descsz = sizeof(bnote.build_id);
+       bnote.desc.type   = NT_GNU_BUILD_ID;
+       strcpy(bnote.name, "GNU");
+
+       d->d_align = 4;
+       d->d_off = 0LL;
+       d->d_buf = &bnote;
+       d->d_type = ELF_T_BYTE;
+       d->d_size = sizeof(bnote);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               goto error;
+       }
+
+       shdr->sh_name = 33; /* offset in shd_string_table */
+       shdr->sh_type = SHT_NOTE;
+       shdr->sh_addr = 0x0;
+       shdr->sh_flags = SHF_ALLOC;
+       shdr->sh_size = sizeof(bnote);
+       shdr->sh_entsize = 0;
+
+       if (debug && nr_debug_entries) {
+               retval = jit_add_debug_info(e, load_addr, debug, nr_debug_entries);
+               if (retval)
+                       goto error;
+       } else {
+               if (elf_update(e, ELF_C_WRITE) < 0) {
+                       warnx("elf_update 4 failed");
+                       goto error;
+               }
+       }
+
+       retval = 0;
+error:
+       (void)elf_end(e);
+
+       free(strsym);
+
+
+       return retval;
+}
+
+#ifndef JVMTI
+
+static unsigned char x86_code[] = {
+    0xBB, 0x2A, 0x00, 0x00, 0x00, /* movl $42, %ebx */
+    0xB8, 0x01, 0x00, 0x00, 0x00, /* movl $1, %eax */
+    0xCD, 0x80            /* int $0x80 */
+};
+
+static struct options options;
+
+int main(int argc, char **argv)
+{
+       int c, fd, ret;
+
+       while ((c = getopt(argc, argv, "o:h")) != -1) {
+               switch (c) {
+               case 'o':
+                       options.output = optarg;
+                       break;
+               case 'h':
+                       printf("Usage: genelf -o output_file [-h]\n");
+                       return 0;
+               default:
+                       errx(1, "unknown option");
+               }
+       }
+
+       fd = open(options.output, O_CREAT|O_TRUNC|O_RDWR, 0666);
+       if (fd == -1)
+               err(1, "cannot create file %s", options.output);
+
+       ret = jit_write_elf(fd, "main", x86_code, sizeof(x86_code));
+       close(fd);
+
+       if (ret != 0)
+               unlink(options.output);
+
+       return ret;
+}
+#endif
diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h
new file mode 100644 (file)
index 0000000..45bf9c6
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef __GENELF_H__
+#define __GENELF_H__
+
+/* genelf.c */
+extern int jit_write_elf(int fd, uint64_t code_addr, const char *sym,
+                        const void *code, int csize,
+                        void *debug, int nr_debug_entries);
+/* genelf_debug.c */
+extern int jit_add_debug_info(Elf *e, uint64_t code_addr,
+                             void *debug, int nr_debug_entries);
+
+#if   defined(__arm__)
+#define GEN_ELF_ARCH   EM_ARM
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__aarch64__)
+#define GEN_ELF_ARCH   EM_AARCH64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__x86_64__)
+#define GEN_ELF_ARCH   EM_X86_64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__i386__)
+#define GEN_ELF_ARCH   EM_386
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS32
+#elif defined(__ppcle__)
+#define GEN_ELF_ARCH   EM_PPC
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpc__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2MSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#elif defined(__powerpcle__)
+#define GEN_ELF_ARCH   EM_PPC64
+#define GEN_ELF_ENDIAN ELFDATA2LSB
+#define GEN_ELF_CLASS  ELFCLASS64
+#else
+#error "unsupported architecture"
+#endif
+
+#if GEN_ELF_CLASS == ELFCLASS64
+#define elf_newehdr    elf64_newehdr
+#define elf_getshdr    elf64_getshdr
+#define Elf_Ehdr       Elf64_Ehdr
+#define Elf_Shdr       Elf64_Shdr
+#define Elf_Sym                Elf64_Sym
+#define ELF_ST_TYPE(a) ELF64_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF64_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF64_ST_VISIBILITY(a)
+#else
+#define elf_newehdr    elf32_newehdr
+#define elf_getshdr    elf32_getshdr
+#define Elf_Ehdr       Elf32_Ehdr
+#define Elf_Shdr       Elf32_Shdr
+#define Elf_Sym                Elf32_Sym
+#define ELF_ST_TYPE(a) ELF32_ST_TYPE(a)
+#define ELF_ST_BIND(a) ELF32_ST_BIND(a)
+#define ELF_ST_VIS(a)  ELF32_ST_VISIBILITY(a)
+#endif
+
+/* The .text section is directly after the ELF header */
+#define GEN_ELF_TEXT_OFFSET sizeof(Elf_Ehdr)
+
+#endif
diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c
new file mode 100644 (file)
index 0000000..5980f7d
--- /dev/null
@@ -0,0 +1,610 @@
+/*
+ * genelf_debug.c
+ * Copyright (C) 2015, Google, Inc
+ *
+ * Contributed by:
+ *     Stephane Eranian <eranian@google.com>
+ *
+ * Released under the GPL v2.
+ *
+ * based on GPLv2 source code from Oprofile
+ * @remark Copyright 2007 OProfile authors
+ * @author Philippe Elie
+ */
+#include <sys/types.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <stddef.h>
+#include <libelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <err.h>
+#include <dwarf.h>
+
+#include "perf.h"
+#include "genelf.h"
+#include "../util/jitdump.h"
+
+#define BUFFER_EXT_DFL_SIZE    (4 * 1024)
+
+typedef uint32_t uword;
+typedef uint16_t uhalf;
+typedef int32_t  sword;
+typedef int16_t  shalf;
+typedef uint8_t  ubyte;
+typedef int8_t   sbyte;
+
+struct buffer_ext {
+       size_t cur_pos;
+       size_t max_sz;
+       void *data;
+};
+
+static void
+buffer_ext_dump(struct buffer_ext *be, const char *msg)
+{
+       size_t i;
+       warnx("DUMP for %s", msg);
+       for (i = 0 ; i < be->cur_pos; i++)
+               warnx("%4zu 0x%02x", i, (((char *)be->data)[i]) & 0xff);
+}
+
+static inline int
+buffer_ext_add(struct buffer_ext *be, void *addr, size_t sz)
+{
+       void *tmp;
+       size_t be_sz = be->max_sz;
+
+retry:
+       if ((be->cur_pos + sz) < be_sz) {
+               memcpy(be->data + be->cur_pos, addr, sz);
+               be->cur_pos += sz;
+               return 0;
+       }
+
+       if (!be_sz)
+               be_sz = BUFFER_EXT_DFL_SIZE;
+       else
+               be_sz <<= 1;
+
+       tmp = realloc(be->data, be_sz);
+       if (!tmp)
+               return -1;
+
+       be->data   = tmp;
+       be->max_sz = be_sz;
+
+       goto retry;
+}
+
+static void
+buffer_ext_init(struct buffer_ext *be)
+{
+       be->data = NULL;
+       be->cur_pos = 0;
+       be->max_sz = 0;
+}
+
+static inline size_t
+buffer_ext_size(struct buffer_ext *be)
+{
+       return be->cur_pos;
+}
+
+static inline void *
+buffer_ext_addr(struct buffer_ext *be)
+{
+       return be->data;
+}
+
+struct debug_line_header {
+       // Not counting this field
+       uword total_length;
+       // version number (2 currently)
+       uhalf version;
+       // relative offset from next field to
+       // program statement
+       uword prolog_length;
+       ubyte minimum_instruction_length;
+       ubyte default_is_stmt;
+       // line_base - see DWARF 2 specs
+       sbyte line_base;
+       // line_range - see DWARF 2 specs
+       ubyte line_range;
+       // number of opcode + 1
+       ubyte opcode_base;
+       /* follow the array of opcode args nr: ubytes [nr_opcode_base] */
+       /* follow the search directories index, zero terminated string
+        * terminated by an empty string.
+        */
+       /* follow an array of { filename, LEB128, LEB128, LEB128 }, first is
+        * the directory index entry, 0 means current directory, then mtime
+        * and filesize, last entry is followed by en empty string.
+        */
+       /* follow the first program statement */
+} __attribute__((packed));
+
+/* DWARF 2 spec talk only about one possible compilation unit header while
+ * binutils can handle two flavours of dwarf 2, 32 and 64 bits, this is not
+ * related to the used arch, an ELF 32 can hold more than 4 Go of debug
+ * information. For now we handle only DWARF 2 32 bits comp unit. It'll only
+ * become a problem if we generate more than 4GB of debug information.
+ */
+struct compilation_unit_header {
+       uword total_length;
+       uhalf version;
+       uword debug_abbrev_offset;
+       ubyte pointer_size;
+} __attribute__((packed));
+
+#define DW_LNS_num_opcode (DW_LNS_set_isa + 1)
+
+/* field filled at run time are marked with -1 */
+static struct debug_line_header const default_debug_line_header = {
+       .total_length = -1,
+       .version = 2,
+       .prolog_length = -1,
+       .minimum_instruction_length = 1,        /* could be better when min instruction size != 1 */
+       .default_is_stmt = 1,   /* we don't take care about basic block */
+       .line_base = -5,        /* sensible value for line base ... */
+       .line_range = -14,     /* ... and line range are guessed statically */
+       .opcode_base = DW_LNS_num_opcode
+};
+
+static ubyte standard_opcode_length[] =
+{
+       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1
+};
+#if 0
+{
+       [DW_LNS_advance_pc]   = 1,
+       [DW_LNS_advance_line] = 1,
+       [DW_LNS_set_file] =  1,
+       [DW_LNS_set_column] = 1,
+       [DW_LNS_fixed_advance_pc] = 1,
+       [DW_LNS_set_isa] = 1,
+};
+#endif
+
+/* field filled at run time are marked with -1 */
+static struct compilation_unit_header default_comp_unit_header = {
+       .total_length = -1,
+       .version = 2,
+       .debug_abbrev_offset = 0,     /* we reuse the same abbrev entries for all comp unit */
+       .pointer_size = sizeof(void *)
+};
+
+static void emit_uword(struct buffer_ext *be, uword data)
+{
+       buffer_ext_add(be, &data, sizeof(uword));
+}
+
+static void emit_string(struct buffer_ext *be, const char *s)
+{
+       buffer_ext_add(be, (void *)s, strlen(s) + 1);
+}
+
+static void emit_unsigned_LEB128(struct buffer_ext *be,
+                                unsigned long data)
+{
+       do {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (data)
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       } while (data);
+}
+
+static void emit_signed_LEB128(struct buffer_ext *be, long data)
+{
+       int more = 1;
+       int negative = data < 0;
+       int size = sizeof(long) * CHAR_BIT;
+       while (more) {
+               ubyte cur = data & 0x7F;
+               data >>= 7;
+               if (negative)
+                       data |= - (1 << (size - 7));
+               if ((data == 0 && !(cur & 0x40)) ||
+                   (data == -1l && (cur & 0x40)))
+                       more = 0;
+               else
+                       cur |= 0x80;
+               buffer_ext_add(be, &cur, 1);
+       }
+}
+
+static void emit_extended_opcode(struct buffer_ext *be, ubyte opcode,
+                                void *data, size_t data_len)
+{
+       buffer_ext_add(be, (char *)"", 1);
+
+       emit_unsigned_LEB128(be, data_len + 1);
+
+       buffer_ext_add(be, &opcode, 1);
+       buffer_ext_add(be, data, data_len);
+}
+
+static void emit_opcode(struct buffer_ext *be, ubyte opcode)
+{
+       buffer_ext_add(be, &opcode, 1);
+}
+
+static void emit_opcode_signed(struct buffer_ext  *be,
+                              ubyte opcode, long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_signed_LEB128(be, data);
+}
+
+static void emit_opcode_unsigned(struct buffer_ext *be, ubyte opcode,
+                                unsigned long data)
+{
+       buffer_ext_add(be, &opcode, 1);
+       emit_unsigned_LEB128(be, data);
+}
+
+static void emit_advance_pc(struct buffer_ext *be, unsigned long delta_pc)
+{
+       emit_opcode_unsigned(be, DW_LNS_advance_pc, delta_pc);
+}
+
+static void emit_advance_lineno(struct buffer_ext  *be, long delta_lineno)
+{
+       emit_opcode_signed(be, DW_LNS_advance_line, delta_lineno);
+}
+
+static void emit_lne_end_of_sequence(struct buffer_ext *be)
+{
+       emit_extended_opcode(be, DW_LNE_end_sequence, NULL, 0);
+}
+
+static void emit_set_file(struct buffer_ext *be, unsigned long idx)
+{
+       emit_opcode_unsigned(be, DW_LNS_set_file, idx);
+}
+
+static void emit_lne_define_filename(struct buffer_ext *be,
+                                    const char *filename)
+{
+       buffer_ext_add(be, (void *)"", 1);
+
+       /* LNE field, strlen(filename) + zero termination, 3 bytes for: the dir entry, timestamp, filesize */
+       emit_unsigned_LEB128(be, strlen(filename) + 5);
+       emit_opcode(be, DW_LNE_define_file);
+       emit_string(be, filename);
+       /* directory index 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+       /* last modification date on file 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+       /* filesize 0=do not know */
+        emit_unsigned_LEB128(be, 0);
+}
+
+static void emit_lne_set_address(struct buffer_ext *be,
+                                void *address)
+{
+       emit_extended_opcode(be, DW_LNE_set_address, &address, sizeof(unsigned long));
+}
+
+static ubyte get_special_opcode(struct debug_entry *ent,
+                               unsigned int last_line,
+                               unsigned long last_vma)
+{
+       unsigned int temp;
+       unsigned long delta_addr;
+
+       /*
+        * delta from line_base
+        */
+       temp = (ent->lineno - last_line) - default_debug_line_header.line_base;
+
+       if (temp >= default_debug_line_header.line_range)
+               return 0;
+
+       /*
+        * delta of addresses
+        */
+       delta_addr = (ent->addr - last_vma) / default_debug_line_header.minimum_instruction_length;
+
+       /* This is not sufficient to ensure opcode will be in [0-256] but
+        * sufficient to ensure when summing with the delta lineno we will
+        * not overflow the unsigned long opcode */
+
+       if (delta_addr <= 256 / default_debug_line_header.line_range) {
+               unsigned long opcode = temp +
+                       (delta_addr * default_debug_line_header.line_range) +
+                       default_debug_line_header.opcode_base;
+
+               return opcode <= 255 ? opcode : 0;
+       }
+       return 0;
+}
+
+static void emit_lineno_info(struct buffer_ext *be,
+                            struct debug_entry *ent, size_t nr_entry,
+                            unsigned long code_addr)
+{
+       size_t i;
+
+       /*
+        * Machine state at start of a statement program
+        * address = 0
+        * file    = 1
+        * line    = 1
+        * column  = 0
+        * is_stmt = default_is_stmt as given in the debug_line_header
+        * basic block = 0
+        * end sequence = 0
+        */
+
+       /* start state of the state machine we take care of */
+       unsigned long last_vma = code_addr;
+       char const  *cur_filename = NULL;
+       unsigned long cur_file_idx = 0;
+       int last_line = 1;
+
+       emit_lne_set_address(be, (void *)code_addr);
+
+       for (i = 0; i < nr_entry; i++, ent = debug_entry_next(ent)) {
+               int need_copy = 0;
+               ubyte special_opcode;
+
+               /*
+                * check if filename changed, if so add it
+                */
+               if (!cur_filename || strcmp(cur_filename, ent->name)) {
+                       emit_lne_define_filename(be, ent->name);
+                       cur_filename = ent->name;
+                       emit_set_file(be, ++cur_file_idx);
+                       need_copy = 1;
+               }
+
+               special_opcode = get_special_opcode(ent, last_line, last_vma);
+               if (special_opcode != 0) {
+                       last_line = ent->lineno;
+                       last_vma  = ent->addr;
+                       emit_opcode(be, special_opcode);
+               } else {
+                       /*
+                        * lines differ, emit line delta
+                        */
+                       if (last_line != ent->lineno) {
+                               emit_advance_lineno(be, ent->lineno - last_line);
+                               last_line = ent->lineno;
+                               need_copy = 1;
+                       }
+                       /*
+                        * addresses differ, emit address delta
+                        */
+                       if (last_vma != ent->addr) {
+                               emit_advance_pc(be, ent->addr - last_vma);
+                               last_vma = ent->addr;
+                               need_copy = 1;
+                       }
+                       /*
+                        * add new row to matrix
+                        */
+                       if (need_copy)
+                               emit_opcode(be, DW_LNS_copy);
+               }
+       }
+}
+
+static void add_debug_line(struct buffer_ext *be,
+       struct debug_entry *ent, size_t nr_entry,
+       unsigned long code_addr)
+{
+       struct debug_line_header * dbg_header;
+       size_t old_size;
+
+       old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, (void *)&default_debug_line_header,
+                sizeof(default_debug_line_header));
+
+       buffer_ext_add(be, &standard_opcode_length,  sizeof(standard_opcode_length));
+
+       // empty directory entry
+       buffer_ext_add(be, (void *)"", 1);
+
+       // empty filename directory
+       buffer_ext_add(be, (void *)"", 1);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->prolog_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, minimum_instruction_length);
+
+       emit_lineno_info(be, ent, nr_entry, code_addr);
+
+       emit_lne_end_of_sequence(be);
+
+       dbg_header = buffer_ext_addr(be) + old_size;
+       dbg_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct debug_line_header, version);
+}
+
+static void
+add_debug_abbrev(struct buffer_ext *be)
+{
+        emit_unsigned_LEB128(be, 1);
+        emit_unsigned_LEB128(be, DW_TAG_compile_unit);
+        emit_unsigned_LEB128(be, DW_CHILDREN_yes);
+        emit_unsigned_LEB128(be, DW_AT_stmt_list);
+        emit_unsigned_LEB128(be, DW_FORM_data4);
+        emit_unsigned_LEB128(be, 0);
+        emit_unsigned_LEB128(be, 0);
+        emit_unsigned_LEB128(be, 0);
+}
+
+static void
+add_compilation_unit(struct buffer_ext *be,
+                    size_t offset_debug_line)
+{
+       struct compilation_unit_header *comp_unit_header;
+       size_t old_size = buffer_ext_size(be);
+
+       buffer_ext_add(be, &default_comp_unit_header,
+                      sizeof(default_comp_unit_header));
+
+       emit_unsigned_LEB128(be, 1);
+       emit_uword(be, offset_debug_line);
+
+       comp_unit_header = buffer_ext_addr(be) + old_size;
+       comp_unit_header->total_length = (buffer_ext_size(be) - old_size) -
+               offsetof(struct compilation_unit_header, version);
+}
+
+static int
+jit_process_debug_info(uint64_t code_addr,
+                      void *debug, int nr_debug_entries,
+                      struct buffer_ext *dl,
+                      struct buffer_ext *da,
+                      struct buffer_ext *di)
+{
+       struct debug_entry *ent = debug;
+       int i;
+
+       for (i = 0; i < nr_debug_entries; i++) {
+               ent->addr = ent->addr - code_addr;
+               ent = debug_entry_next(ent);
+       }
+       add_compilation_unit(di, buffer_ext_size(dl));
+       add_debug_line(dl, debug, nr_debug_entries, 0);
+       add_debug_abbrev(da);
+       if (0) buffer_ext_dump(da, "abbrev");
+
+       return 0;
+}
+
+int
+jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_entries)
+{
+       Elf_Data *d;
+       Elf_Scn *scn;
+       Elf_Shdr *shdr;
+       struct buffer_ext dl, di, da;
+       int ret;
+
+       buffer_ext_init(&dl);
+       buffer_ext_init(&di);
+       buffer_ext_init(&da);
+
+       ret = jit_process_debug_info(code_addr, debug, nr_debug_entries, &dl, &da, &di);
+       if (ret)
+               return -1;
+       /*
+        * setup .debug_line section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&dl);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&dl);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 52; /* .debug_line */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_info section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&di);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&di);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 64; /* .debug_info */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * setup .debug_abbrev section
+        */
+       scn = elf_newscn(e);
+       if (!scn) {
+               warnx("cannot create section");
+               return -1;
+       }
+
+       d = elf_newdata(scn);
+       if (!d) {
+               warnx("cannot get new data");
+               return -1;
+       }
+
+       d->d_align = 1;
+       d->d_off = 0LL;
+       d->d_buf = buffer_ext_addr(&da);
+       d->d_type = ELF_T_BYTE;
+       d->d_size = buffer_ext_size(&da);
+       d->d_version = EV_CURRENT;
+
+       shdr = elf_getshdr(scn);
+       if (!shdr) {
+               warnx("cannot get section header");
+               return -1;
+       }
+
+       shdr->sh_name = 76; /* .debug_info */
+       shdr->sh_type = SHT_PROGBITS;
+       shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */
+       shdr->sh_flags = 0;
+       shdr->sh_entsize = 0;
+
+       /*
+        * now we update the ELF image with all the sections
+        */
+       if (elf_update(e, ELF_C_WRITE) < 0) {
+               warnx("elf_update debug failed");
+               return -1;
+       }
+       return 0;
+}
index f50b723..73e38e4 100644 (file)
@@ -23,6 +23,8 @@
 #include "strbuf.h"
 #include "build-id.h"
 #include "data.h"
+#include <api/fs/fs.h>
+#include "asm/bug.h"
 
 /*
  * magic2 = "PERFILE2"
@@ -868,6 +870,199 @@ static int write_auxtrace(int fd, struct perf_header *h,
        return err;
 }
 
+static int cpu_cache_level__sort(const void *a, const void *b)
+{
+       struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a;
+       struct cpu_cache_level *cache_b = (struct cpu_cache_level *)b;
+
+       return cache_a->level - cache_b->level;
+}
+
+static bool cpu_cache_level__cmp(struct cpu_cache_level *a, struct cpu_cache_level *b)
+{
+       if (a->level != b->level)
+               return false;
+
+       if (a->line_size != b->line_size)
+               return false;
+
+       if (a->sets != b->sets)
+               return false;
+
+       if (a->ways != b->ways)
+               return false;
+
+       if (strcmp(a->type, b->type))
+               return false;
+
+       if (strcmp(a->size, b->size))
+               return false;
+
+       if (strcmp(a->map, b->map))
+               return false;
+
+       return true;
+}
+
+static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 level)
+{
+       char path[PATH_MAX], file[PATH_MAX];
+       struct stat st;
+       size_t len;
+
+       scnprintf(path, PATH_MAX, "devices/system/cpu/cpu%d/cache/index%d/", cpu, level);
+       scnprintf(file, PATH_MAX, "%s/%s", sysfs__mountpoint(), path);
+
+       if (stat(file, &st))
+               return 1;
+
+       scnprintf(file, PATH_MAX, "%s/level", path);
+       if (sysfs__read_int(file, (int *) &cache->level))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/coherency_line_size", path);
+       if (sysfs__read_int(file, (int *) &cache->line_size))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/number_of_sets", path);
+       if (sysfs__read_int(file, (int *) &cache->sets))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/ways_of_associativity", path);
+       if (sysfs__read_int(file, (int *) &cache->ways))
+               return -1;
+
+       scnprintf(file, PATH_MAX, "%s/type", path);
+       if (sysfs__read_str(file, &cache->type, &len))
+               return -1;
+
+       cache->type[len] = 0;
+       cache->type = rtrim(cache->type);
+
+       scnprintf(file, PATH_MAX, "%s/size", path);
+       if (sysfs__read_str(file, &cache->size, &len)) {
+               free(cache->type);
+               return -1;
+       }
+
+       cache->size[len] = 0;
+       cache->size = rtrim(cache->size);
+
+       scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
+       if (sysfs__read_str(file, &cache->map, &len)) {
+               free(cache->map);
+               free(cache->type);
+               return -1;
+       }
+
+       cache->map[len] = 0;
+       cache->map = rtrim(cache->map);
+       return 0;
+}
+
+static void cpu_cache_level__fprintf(FILE *out, struct cpu_cache_level *c)
+{
+       fprintf(out, "L%d %-15s %8s [%s]\n", c->level, c->type, c->size, c->map);
+}
+
+static int build_caches(struct cpu_cache_level caches[], u32 size, u32 *cntp)
+{
+       u32 i, cnt = 0;
+       long ncpus;
+       u32 nr, cpu;
+       u16 level;
+
+       ncpus = sysconf(_SC_NPROCESSORS_CONF);
+       if (ncpus < 0)
+               return -1;
+
+       nr = (u32)(ncpus & UINT_MAX);
+
+       for (cpu = 0; cpu < nr; cpu++) {
+               for (level = 0; level < 10; level++) {
+                       struct cpu_cache_level c;
+                       int err;
+
+                       err = cpu_cache_level__read(&c, cpu, level);
+                       if (err < 0)
+                               return err;
+
+                       if (err == 1)
+                               break;
+
+                       for (i = 0; i < cnt; i++) {
+                               if (cpu_cache_level__cmp(&c, &caches[i]))
+                                       break;
+                       }
+
+                       if (i == cnt)
+                               caches[cnt++] = c;
+                       else
+                               cpu_cache_level__free(&c);
+
+                       if (WARN_ONCE(cnt == size, "way too many cpu caches.."))
+                               goto out;
+               }
+       }
+ out:
+       *cntp = cnt;
+       return 0;
+}
+
+#define MAX_CACHES 2000
+
+static int write_cache(int fd, struct perf_header *h __maybe_unused,
+                         struct perf_evlist *evlist __maybe_unused)
+{
+       struct cpu_cache_level caches[MAX_CACHES];
+       u32 cnt = 0, i, version = 1;
+       int ret;
+
+       ret = build_caches(caches, MAX_CACHES, &cnt);
+       if (ret)
+               goto out;
+
+       qsort(&caches, cnt, sizeof(struct cpu_cache_level), cpu_cache_level__sort);
+
+       ret = do_write(fd, &version, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       ret = do_write(fd, &cnt, sizeof(u32));
+       if (ret < 0)
+               goto out;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level *c = &caches[i];
+
+               #define _W(v)                                   \
+                       ret = do_write(fd, &c->v, sizeof(u32)); \
+                       if (ret < 0)                            \
+                               goto out;
+
+               _W(level)
+               _W(line_size)
+               _W(sets)
+               _W(ways)
+               #undef _W
+
+               #define _W(v)                                           \
+                       ret = do_write_string(fd, (const char *) c->v); \
+                       if (ret < 0)                                    \
+                               goto out;
+
+               _W(type)
+               _W(size)
+               _W(map)
+               #undef _W
+       }
+
+out:
+       for (i = 0; i < cnt; i++)
+               cpu_cache_level__free(&caches[i]);
+       return ret;
+}
+
 static int write_stat(int fd __maybe_unused,
                      struct perf_header *h __maybe_unused,
                      struct perf_evlist *evlist __maybe_unused)
@@ -1172,6 +1367,18 @@ static void print_stat(struct perf_header *ph __maybe_unused,
        fprintf(fp, "# contains stat data\n");
 }
 
+static void print_cache(struct perf_header *ph __maybe_unused,
+                       int fd __maybe_unused, FILE *fp __maybe_unused)
+{
+       int i;
+
+       fprintf(fp, "# CPU cache info:\n");
+       for (i = 0; i < ph->env.caches_cnt; i++) {
+               fprintf(fp, "#  ");
+               cpu_cache_level__fprintf(fp, &ph->env.caches[i]);
+       }
+}
+
 static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
                               FILE *fp)
 {
@@ -1920,6 +2127,68 @@ static int process_auxtrace(struct perf_file_section *section,
        return err;
 }
 
+static int process_cache(struct perf_file_section *section __maybe_unused,
+                        struct perf_header *ph __maybe_unused, int fd __maybe_unused,
+                        void *data __maybe_unused)
+{
+       struct cpu_cache_level *caches;
+       u32 cnt, i, version;
+
+       if (readn(fd, &version, sizeof(version)) != sizeof(version))
+               return -1;
+
+       if (ph->needs_swap)
+               version = bswap_32(version);
+
+       if (version != 1)
+               return -1;
+
+       if (readn(fd, &cnt, sizeof(cnt)) != sizeof(cnt))
+               return -1;
+
+       if (ph->needs_swap)
+               cnt = bswap_32(cnt);
+
+       caches = zalloc(sizeof(*caches) * cnt);
+       if (!caches)
+               return -1;
+
+       for (i = 0; i < cnt; i++) {
+               struct cpu_cache_level c;
+
+               #define _R(v)                                           \
+                       if (readn(fd, &c.v, sizeof(u32)) != sizeof(u32))\
+                               goto out_free_caches;                   \
+                       if (ph->needs_swap)                             \
+                               c.v = bswap_32(c.v);                    \
+
+               _R(level)
+               _R(line_size)
+               _R(sets)
+               _R(ways)
+               #undef _R
+
+               #define _R(v)                           \
+                       c.v = do_read_string(fd, ph);   \
+                       if (!c.v)                       \
+                               goto out_free_caches;
+
+               _R(type)
+               _R(size)
+               _R(map)
+               #undef _R
+
+               caches[i] = c;
+       }
+
+       ph->env.caches = caches;
+       ph->env.caches_cnt = cnt;
+       return 0;
+out_free_caches:
+       free(caches);
+       return -1;
+}
+
 struct feature_ops {
        int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
        void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1962,6 +2231,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
        FEAT_OPP(HEADER_GROUP_DESC,     group_desc),
        FEAT_OPP(HEADER_AUXTRACE,       auxtrace),
        FEAT_OPA(HEADER_STAT,           stat),
+       FEAT_OPF(HEADER_CACHE,          cache),
 };
 
 struct header_print_data {
index cff9892..3d87ca8 100644 (file)
@@ -32,6 +32,7 @@ enum {
        HEADER_GROUP_DESC,
        HEADER_AUXTRACE,
        HEADER_STAT,
+       HEADER_CACHE,
        HEADER_LAST_FEATURE,
        HEADER_FEAT_BITS        = 256,
 };
index dc1e41c..43a98a4 100644 (file)
@@ -6,7 +6,8 @@
 static int autocorrect;
 static struct cmdnames aliases;
 
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+static int perf_unknown_cmd_config(const char *var, const char *value,
+                                  void *cb __maybe_unused)
 {
        if (!strcmp(var, "help.autocorrect"))
                autocorrect = perf_config_int(var,value);
@@ -14,7 +15,7 @@ static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
        if (!prefixcmp(var, "alias."))
                add_cmdname(&aliases, var + 6, strlen(var + 6));
 
-       return perf_default_config(var, value, cb);
+       return 0;
 }
 
 static int levenshtein_compare(const void *p1, const void *p2)
index c226303..290b3cb 100644 (file)
@@ -131,6 +131,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
                        symlen = unresolved_col_width + 4 + 2;
                        hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL,
                                           symlen);
+                       hists__new_col_len(hists, HISTC_MEM_DCACHELINE,
+                                          symlen);
                }
 
                if (h->mem_info->iaddr.sym) {
@@ -177,6 +179,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        if (h->transaction)
                hists__new_col_len(hists, HISTC_TRANSACTION,
                                   hist_entry__transaction_len());
+
+       if (h->trace_output)
+               hists__new_col_len(hists, HISTC_TRACE, strlen(h->trace_output));
 }
 
 void hists__output_recalc_col_len(struct hists *hists, int max_rows)
@@ -243,6 +248,8 @@ static void he_stat__decay(struct he_stat *he_stat)
        /* XXX need decay for weight too? */
 }
 
+static void hists__delete_entry(struct hists *hists, struct hist_entry *he);
+
 static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
 {
        u64 prev_period = he->stat.period;
@@ -258,21 +265,45 @@ static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
 
        diff = prev_period - he->stat.period;
 
-       hists->stats.total_period -= diff;
-       if (!he->filtered)
-               hists->stats.total_non_filtered_period -= diff;
+       if (!he->depth) {
+               hists->stats.total_period -= diff;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period -= diff;
+       }
+
+       if (!he->leaf) {
+               struct hist_entry *child;
+               struct rb_node *node = rb_first(&he->hroot_out);
+               while (node) {
+                       child = rb_entry(node, struct hist_entry, rb_node);
+                       node = rb_next(node);
+
+                       if (hists__decay_entry(hists, child))
+                               hists__delete_entry(hists, child);
+               }
+       }
 
        return he->stat.period == 0;
 }
 
 static void hists__delete_entry(struct hists *hists, struct hist_entry *he)
 {
-       rb_erase(&he->rb_node, &hists->entries);
+       struct rb_root *root_in;
+       struct rb_root *root_out;
 
-       if (sort__need_collapse)
-               rb_erase(&he->rb_node_in, &hists->entries_collapsed);
-       else
-               rb_erase(&he->rb_node_in, hists->entries_in);
+       if (he->parent_he) {
+               root_in  = &he->parent_he->hroot_in;
+               root_out = &he->parent_he->hroot_out;
+       } else {
+               if (sort__need_collapse)
+                       root_in = &hists->entries_collapsed;
+               else
+                       root_in = hists->entries_in;
+               root_out = &hists->entries;
+       }
+
+       rb_erase(&he->rb_node_in, root_in);
+       rb_erase(&he->rb_node, root_out);
 
        --hists->nr_entries;
        if (!he->filtered)
@@ -391,6 +422,9 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                }
                INIT_LIST_HEAD(&he->pairs.node);
                thread__get(he->thread);
+
+               if (!symbol_conf.report_hierarchy)
+                       he->leaf = true;
        }
 
        return he;
@@ -403,6 +437,16 @@ static u8 symbol__parent_filter(const struct symbol *parent)
        return 0;
 }
 
+static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period)
+{
+       if (!symbol_conf.use_callchain)
+               return;
+
+       he->hists->callchain_period += period;
+       if (!he->filtered)
+               he->hists->callchain_non_filtered_period += period;
+}
+
 static struct hist_entry *hists__findnew_entry(struct hists *hists,
                                               struct hist_entry *entry,
                                               struct addr_location *al,
@@ -430,8 +474,10 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
                cmp = hist_entry__cmp(he, entry);
 
                if (!cmp) {
-                       if (sample_self)
+                       if (sample_self) {
                                he_stat__add_period(&he->stat, period, weight);
+                               hist_entry__add_callchain_period(he, period);
+                       }
                        if (symbol_conf.cumulate_callchain)
                                he_stat__add_period(he->stat_acc, period, weight);
 
@@ -464,6 +510,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
        if (!he)
                return NULL;
 
+       if (sample_self)
+               hist_entry__add_callchain_period(he, period);
        hists->nr_entries++;
 
        rb_link_node(&he->rb_node_in, parent, p);
@@ -949,10 +997,15 @@ out:
 int64_t
 hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 {
+       struct hists *hists = left->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                cmp = fmt->cmp(fmt, left, right);
                if (cmp)
                        break;
@@ -964,10 +1017,15 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 int64_t
 hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 {
+       struct hists *hists = left->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   !perf_hpp__defined_dynamic_entry(fmt, hists))
+                       continue;
+
                cmp = fmt->collapse(fmt, left, right);
                if (cmp)
                        break;
@@ -1003,18 +1061,251 @@ void hist_entry__delete(struct hist_entry *he)
        free(he);
 }
 
+/*
+ * If this is not the last column, then we need to pad it according to the
+ * pre-calculated max lenght for this column, otherwise don't bother adding
+ * spaces because that would break viewing this with, for instance, 'less',
+ * that would show tons of trailing spaces when a long C++ demangled method
+ * names is sampled.
+*/
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed)
+{
+       if (!list_is_last(&fmt->list, &he->hists->hpp_list->fields)) {
+               const int width = fmt->width(fmt, hpp, hists_to_evsel(he->hists));
+               if (printed < width) {
+                       advance_hpp(hpp, printed);
+                       printed = scnprintf(hpp->buf, hpp->size, "%-*s", width - printed, " ");
+               }
+       }
+
+       return printed;
+}
+
 /*
  * collapse the histogram
  */
 
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
-                                 struct rb_root *root, struct hist_entry *he)
+static void hists__apply_filters(struct hists *hists, struct hist_entry *he);
+static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *he,
+                                      enum hist_filter type);
+
+typedef bool (*fmt_chk_fn)(struct perf_hpp_fmt *fmt);
+
+static bool check_thread_entry(struct perf_hpp_fmt *fmt)
+{
+       return perf_hpp__is_thread_entry(fmt) || perf_hpp__is_comm_entry(fmt);
+}
+
+static void hist_entry__check_and_remove_filter(struct hist_entry *he,
+                                               enum hist_filter type,
+                                               fmt_chk_fn check)
+{
+       struct perf_hpp_fmt *fmt;
+       bool type_match = false;
+       struct hist_entry *parent = he->parent_he;
+
+       switch (type) {
+       case HIST_FILTER__THREAD:
+               if (symbol_conf.comm_list == NULL &&
+                   symbol_conf.pid_list == NULL &&
+                   symbol_conf.tid_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__DSO:
+               if (symbol_conf.dso_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__SYMBOL:
+               if (symbol_conf.sym_list == NULL)
+                       return;
+               break;
+       case HIST_FILTER__PARENT:
+       case HIST_FILTER__GUEST:
+       case HIST_FILTER__HOST:
+       case HIST_FILTER__SOCKET:
+       default:
+               return;
+       }
+
+       /* if it's filtered by own fmt, it has to have filter bits */
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (check(fmt)) {
+                       type_match = true;
+                       break;
+               }
+       }
+
+       if (type_match) {
+               /*
+                * If the filter is for current level entry, propagate
+                * filter marker to parents.  The marker bit was
+                * already set by default so it only needs to clear
+                * non-filtered entries.
+                */
+               if (!(he->filtered & (1 << type))) {
+                       while (parent) {
+                               parent->filtered &= ~(1 << type);
+                               parent = parent->parent_he;
+                       }
+               }
+       } else {
+               /*
+                * If current entry doesn't have matching formats, set
+                * filter marker for upper level entries.  it will be
+                * cleared if its lower level entries is not filtered.
+                *
+                * For lower-level entries, it inherits parent's
+                * filter bit so that lower level entries of a
+                * non-filtered entry won't set the filter marker.
+                */
+               if (parent == NULL)
+                       he->filtered |= (1 << type);
+               else
+                       he->filtered |= (parent->filtered & (1 << type));
+       }
+}
+
+static void hist_entry__apply_hierarchy_filters(struct hist_entry *he)
+{
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__THREAD,
+                                           check_thread_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__DSO,
+                                           perf_hpp__is_dso_entry);
+
+       hist_entry__check_and_remove_filter(he, HIST_FILTER__SYMBOL,
+                                           perf_hpp__is_sym_entry);
+
+       hists__apply_filters(he->hists, he);
+}
+
+static struct hist_entry *hierarchy_insert_entry(struct hists *hists,
+                                                struct rb_root *root,
+                                                struct hist_entry *he,
+                                                struct hist_entry *parent_he,
+                                                struct perf_hpp_list *hpp_list)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter, *new;
+       struct perf_hpp_fmt *fmt;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node_in);
+
+               cmp = 0;
+               perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+                       cmp = fmt->collapse(fmt, iter, he);
+                       if (cmp)
+                               break;
+               }
+
+               if (!cmp) {
+                       he_stat__add_stat(&iter->stat, &he->stat);
+                       return iter;
+               }
+
+               if (cmp < 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       new = hist_entry__new(he, true);
+       if (new == NULL)
+               return NULL;
+
+       hists->nr_entries++;
+
+       /* save related format list for output */
+       new->hpp_list = hpp_list;
+       new->parent_he = parent_he;
+
+       hist_entry__apply_hierarchy_filters(new);
+
+       /* some fields are now passed to 'new' */
+       perf_hpp_list__for_each_sort_list(hpp_list, fmt) {
+               if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt))
+                       he->trace_output = NULL;
+               else
+                       new->trace_output = NULL;
+
+               if (perf_hpp__is_srcline_entry(fmt))
+                       he->srcline = NULL;
+               else
+                       new->srcline = NULL;
+
+               if (perf_hpp__is_srcfile_entry(fmt))
+                       he->srcfile = NULL;
+               else
+                       new->srcfile = NULL;
+       }
+
+       rb_link_node(&new->rb_node_in, parent, p);
+       rb_insert_color(&new->rb_node_in, root);
+       return new;
+}
+
+static int hists__hierarchy_insert_entry(struct hists *hists,
+                                        struct rb_root *root,
+                                        struct hist_entry *he)
+{
+       struct perf_hpp_list_node *node;
+       struct hist_entry *new_he = NULL;
+       struct hist_entry *parent = NULL;
+       int depth = 0;
+       int ret = 0;
+
+       list_for_each_entry(node, &hists->hpp_formats, list) {
+               /* skip period (overhead) and elided columns */
+               if (node->level == 0 || node->skip)
+                       continue;
+
+               /* insert copy of 'he' for each fmt into the hierarchy */
+               new_he = hierarchy_insert_entry(hists, root, he, parent, &node->hpp);
+               if (new_he == NULL) {
+                       ret = -1;
+                       break;
+               }
+
+               root = &new_he->hroot_in;
+               new_he->depth = depth++;
+               parent = new_he;
+       }
+
+       if (new_he) {
+               new_he->leaf = true;
+
+               if (symbol_conf.use_callchain) {
+                       callchain_cursor_reset(&callchain_cursor);
+                       if (callchain_merge(&callchain_cursor,
+                                           new_he->callchain,
+                                           he->callchain) < 0)
+                               ret = -1;
+               }
+       }
+
+       /* 'he' is no longer used */
+       hist_entry__delete(he);
+
+       /* return 0 (or -1) since it already applied filters */
+       return ret;
+}
+
+int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root,
+                                struct hist_entry *he)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct hist_entry *iter;
        int64_t cmp;
 
+       if (symbol_conf.report_hierarchy)
+               return hists__hierarchy_insert_entry(hists, root, he);
+
        while (*p != NULL) {
                parent = *p;
                iter = rb_entry(parent, struct hist_entry, rb_node_in);
@@ -1022,18 +1313,21 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
                cmp = hist_entry__collapse(iter, he);
 
                if (!cmp) {
+                       int ret = 0;
+
                        he_stat__add_stat(&iter->stat, &he->stat);
                        if (symbol_conf.cumulate_callchain)
                                he_stat__add_stat(iter->stat_acc, he->stat_acc);
 
                        if (symbol_conf.use_callchain) {
                                callchain_cursor_reset(&callchain_cursor);
-                               callchain_merge(&callchain_cursor,
-                                               iter->callchain,
-                                               he->callchain);
+                               if (callchain_merge(&callchain_cursor,
+                                                   iter->callchain,
+                                                   he->callchain) < 0)
+                                       ret = -1;
                        }
                        hist_entry__delete(he);
-                       return false;
+                       return ret;
                }
 
                if (cmp < 0)
@@ -1045,7 +1339,7 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
 
        rb_link_node(&he->rb_node_in, parent, p);
        rb_insert_color(&he->rb_node_in, root);
-       return true;
+       return 1;
 }
 
 struct rb_root *hists__get_rotate_entries_in(struct hists *hists)
@@ -1071,14 +1365,15 @@ static void hists__apply_filters(struct hists *hists, struct hist_entry *he)
        hists__filter_entry_by_socket(hists, he);
 }
 
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
 {
        struct rb_root *root;
        struct rb_node *next;
        struct hist_entry *n;
+       int ret;
 
        if (!sort__need_collapse)
-               return;
+               return 0;
 
        hists->nr_entries = 0;
 
@@ -1093,7 +1388,11 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                next = rb_next(&n->rb_node_in);
 
                rb_erase(&n->rb_node_in, root);
-               if (hists__collapse_insert_entry(hists, &hists->entries_collapsed, n)) {
+               ret = hists__collapse_insert_entry(hists, &hists->entries_collapsed, n);
+               if (ret < 0)
+                       return -1;
+
+               if (ret) {
                        /*
                         * If it wasn't combined with one of the entries already
                         * collapsed, we need to apply the filters that may have
@@ -1104,14 +1403,16 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
                if (prog)
                        ui_progress__update(prog, 1);
        }
+       return 0;
 }
 
 static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b)
 {
+       struct hists *hists = a->hists;
        struct perf_hpp_fmt *fmt;
        int64_t cmp = 0;
 
-       perf_hpp__for_each_sort_list(fmt) {
+       hists__for_each_sort_list(hists, fmt) {
                if (perf_hpp__should_skip(fmt, a->hists))
                        continue;
 
@@ -1152,6 +1453,113 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h)
        hists->stats.total_period += h->stat.period;
 }
 
+static void hierarchy_recalc_total_periods(struct hists *hists)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       node = rb_first(&hists->entries);
+
+       hists->stats.total_period = 0;
+       hists->stats.total_non_filtered_period = 0;
+
+       /*
+        * recalculate total period using top-level entries only
+        * since lower level entries only see non-filtered entries
+        * but upper level entries have sum of both entries.
+        */
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node);
+               node = rb_next(node);
+
+               hists->stats.total_period += he->stat.period;
+               if (!he->filtered)
+                       hists->stats.total_non_filtered_period += he->stat.period;
+       }
+}
+
+static void hierarchy_insert_output_entry(struct rb_root *root,
+                                         struct hist_entry *he)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       /* update column width of dynamic entry */
+       perf_hpp_list__for_each_sort_list(he->hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt))
+                       fmt->sort(fmt, he, NULL);
+       }
+}
+
+static void hists__hierarchy_output_resort(struct hists *hists,
+                                          struct ui_progress *prog,
+                                          struct rb_root *root_in,
+                                          struct rb_root *root_out,
+                                          u64 min_callchain_hits,
+                                          bool use_callchain)
+{
+       struct rb_node *node;
+       struct hist_entry *he;
+
+       *root_out = RB_ROOT;
+       node = rb_first(root_in);
+
+       while (node) {
+               he = rb_entry(node, struct hist_entry, rb_node_in);
+               node = rb_next(node);
+
+               hierarchy_insert_output_entry(root_out, he);
+
+               if (prog)
+                       ui_progress__update(prog, 1);
+
+               if (!he->leaf) {
+                       hists__hierarchy_output_resort(hists, prog,
+                                                      &he->hroot_in,
+                                                      &he->hroot_out,
+                                                      min_callchain_hits,
+                                                      use_callchain);
+                       hists->nr_entries++;
+                       if (!he->filtered) {
+                               hists->nr_non_filtered_entries++;
+                               hists__calc_col_len(hists, he);
+                       }
+
+                       continue;
+               }
+
+               if (!use_callchain)
+                       continue;
+
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
+
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
+
+               callchain_param.sort(&he->sorted_chain, he->callchain,
+                                    min_callchain_hits, &callchain_param);
+       }
+}
+
 static void __hists__insert_output_entry(struct rb_root *entries,
                                         struct hist_entry *he,
                                         u64 min_callchain_hits,
@@ -1160,10 +1568,20 @@ static void __hists__insert_output_entry(struct rb_root *entries,
        struct rb_node **p = &entries->rb_node;
        struct rb_node *parent = NULL;
        struct hist_entry *iter;
+       struct perf_hpp_fmt *fmt;
+
+       if (use_callchain) {
+               if (callchain_param.mode == CHAIN_GRAPH_REL) {
+                       u64 total = he->stat.period;
 
-       if (use_callchain)
+                       if (symbol_conf.cumulate_callchain)
+                               total = he->stat_acc->period;
+
+                       min_callchain_hits = total * (callchain_param.min_percent / 100);
+               }
                callchain_param.sort(&he->sorted_chain, he->callchain,
                                      min_callchain_hits, &callchain_param);
+       }
 
        while (*p != NULL) {
                parent = *p;
@@ -1177,23 +1595,41 @@ static void __hists__insert_output_entry(struct rb_root *entries,
 
        rb_link_node(&he->rb_node, parent, p);
        rb_insert_color(&he->rb_node, entries);
+
+       perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) {
+               if (perf_hpp__is_dynamic_entry(fmt) &&
+                   perf_hpp__defined_dynamic_entry(fmt, he->hists))
+                       fmt->sort(fmt, he, NULL);  /* update column width */
+       }
 }
 
-void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+static void output_resort(struct hists *hists, struct ui_progress *prog,
+                         bool use_callchain)
 {
        struct rb_root *root;
        struct rb_node *next;
        struct hist_entry *n;
+       u64 callchain_total;
        u64 min_callchain_hits;
-       struct perf_evsel *evsel = hists_to_evsel(hists);
-       bool use_callchain;
 
-       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
-               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
-       else
-               use_callchain = symbol_conf.use_callchain;
+       callchain_total = hists->callchain_period;
+       if (symbol_conf.filter_relative)
+               callchain_total = hists->callchain_non_filtered_period;
 
-       min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100);
+       min_callchain_hits = callchain_total * (callchain_param.min_percent / 100);
+
+       hists__reset_stats(hists);
+       hists__reset_col_len(hists);
+
+       if (symbol_conf.report_hierarchy) {
+               hists__hierarchy_output_resort(hists, prog,
+                                              &hists->entries_collapsed,
+                                              &hists->entries,
+                                              min_callchain_hits,
+                                              use_callchain);
+               hierarchy_recalc_total_periods(hists);
+               return;
+       }
 
        if (sort__need_collapse)
                root = &hists->entries_collapsed;
@@ -1203,9 +1639,6 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
        next = rb_first(root);
        hists->entries = RB_ROOT;
 
-       hists__reset_stats(hists);
-       hists__reset_col_len(hists);
-
        while (next) {
                n = rb_entry(next, struct hist_entry, rb_node_in);
                next = rb_next(&n->rb_node_in);
@@ -1221,15 +1654,136 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog)
        }
 }
 
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog)
+{
+       bool use_callchain;
+
+       if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph)
+               use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN;
+       else
+               use_callchain = symbol_conf.use_callchain;
+
+       output_resort(evsel__hists(evsel), prog, use_callchain);
+}
+
+void hists__output_resort(struct hists *hists, struct ui_progress *prog)
+{
+       output_resort(hists, prog, symbol_conf.use_callchain);
+}
+
+static bool can_goto_child(struct hist_entry *he, enum hierarchy_move_dir hmd)
+{
+       if (he->leaf || hmd == HMD_FORCE_SIBLING)
+               return false;
+
+       if (he->unfolded || hmd == HMD_FORCE_CHILD)
+               return true;
+
+       return false;
+}
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       while (can_goto_child(he, HMD_NORMAL)) {
+               node = rb_last(&he->hroot_out);
+               he = rb_entry(node, struct hist_entry, rb_node);
+       }
+       return node;
+}
+
+struct rb_node *__rb_hierarchy_next(struct rb_node *node, enum hierarchy_move_dir hmd)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       if (can_goto_child(he, hmd))
+               node = rb_first(&he->hroot_out);
+       else
+               node = rb_next(node);
+
+       while (node == NULL) {
+               he = he->parent_he;
+               if (he == NULL)
+                       break;
+
+               node = rb_next(&he->rb_node);
+       }
+       return node;
+}
+
+struct rb_node *rb_hierarchy_prev(struct rb_node *node)
+{
+       struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node);
+
+       node = rb_prev(node);
+       if (node)
+               return rb_hierarchy_last(node);
+
+       he = he->parent_he;
+       if (he == NULL)
+               return NULL;
+
+       return &he->rb_node;
+}
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit)
+{
+       struct rb_node *node;
+       struct hist_entry *child;
+       float percent;
+
+       if (he->leaf)
+               return false;
+
+       node = rb_first(&he->hroot_out);
+       child = rb_entry(node, struct hist_entry, rb_node);
+
+       while (node && child->filtered) {
+               node = rb_next(node);
+               child = rb_entry(node, struct hist_entry, rb_node);
+       }
+
+       if (node)
+               percent = hist_entry__get_percent_limit(child);
+       else
+               percent = 0;
+
+       return node && percent >= limit;
+}
+
 static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h,
                                       enum hist_filter filter)
 {
        h->filtered &= ~(1 << filter);
+
+       if (symbol_conf.report_hierarchy) {
+               struct hist_entry *parent = h->parent_he;
+
+               while (parent) {
+                       he_stat__add_stat(&parent->stat, &h->stat);
+
+                       parent->filtered &= ~(1 << filter);
+
+                       if (parent->filtered)
+                               goto next;
+
+                       /* force fold unfiltered entry for simplicity */
+                       parent->unfolded = false;
+                       parent->has_no_entry = false;
+                       parent->row_offset = 0;
+                       parent->nr_rows = 0;
+next:
+                       parent = parent->parent_he;
+               }
+       }
+
        if (h->filtered)
                return;
 
        /* force fold unfiltered entry for simplicity */
        h->unfolded = false;
+       h->has_no_entry = false;
        h->row_offset = 0;
        h->nr_rows = 0;
 
@@ -1252,28 +1806,6 @@ static bool hists__filter_entry_by_dso(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_dso(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (symbol_conf.exclude_other && !h->parent)
-                       continue;
-
-               if (hists__filter_entry_by_dso(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__DSO);
-       }
-}
-
 static bool hists__filter_entry_by_thread(struct hists *hists,
                                          struct hist_entry *he)
 {
@@ -1286,25 +1818,6 @@ static bool hists__filter_entry_by_thread(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_thread(struct hists *hists)
-{
-       struct rb_node *nd;
-
-       hists->stats.nr_non_filtered_samples = 0;
-
-       hists__reset_filter_stats(hists);
-       hists__reset_col_len(hists);
-
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
-               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-               if (hists__filter_entry_by_thread(hists, h))
-                       continue;
-
-               hists__remove_entry_filter(hists, h, HIST_FILTER__THREAD);
-       }
-}
-
 static bool hists__filter_entry_by_symbol(struct hists *hists,
                                          struct hist_entry *he)
 {
@@ -1318,7 +1831,21 @@ static bool hists__filter_entry_by_symbol(struct hists *hists,
        return false;
 }
 
-void hists__filter_by_symbol(struct hists *hists)
+static bool hists__filter_entry_by_socket(struct hists *hists,
+                                         struct hist_entry *he)
+{
+       if ((hists->socket_filter > -1) &&
+           (he->socket != hists->socket_filter)) {
+               he->filtered |= (1 << HIST_FILTER__SOCKET);
+               return true;
+       }
+
+       return false;
+}
+
+typedef bool (*filter_fn_t)(struct hists *hists, struct hist_entry *he);
+
+static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t filter)
 {
        struct rb_node *nd;
 
@@ -1330,42 +1857,155 @@ void hists__filter_by_symbol(struct hists *hists)
        for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
-               if (hists__filter_entry_by_symbol(hists, h))
+               if (filter(hists, h))
                        continue;
 
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SYMBOL);
+               hists__remove_entry_filter(hists, h, type);
        }
 }
 
-static bool hists__filter_entry_by_socket(struct hists *hists,
-                                         struct hist_entry *he)
+static void resort_filtered_entry(struct rb_root *root, struct hist_entry *he)
 {
-       if ((hists->socket_filter > -1) &&
-           (he->socket != hists->socket_filter)) {
-               he->filtered |= (1 << HIST_FILTER__SOCKET);
-               return true;
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       struct rb_root new_root = RB_ROOT;
+       struct rb_node *nd;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (hist_entry__sort(he, iter) > 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
        }
 
-       return false;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, root);
+
+       if (he->leaf || he->filtered)
+               return;
+
+       nd = rb_first(&he->hroot_out);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &he->hroot_out);
+
+               resort_filtered_entry(&new_root, h);
+       }
+
+       he->hroot_out = new_root;
 }
 
-void hists__filter_by_socket(struct hists *hists)
+static void hists__filter_hierarchy(struct hists *hists, int type, const void *arg)
 {
        struct rb_node *nd;
+       struct rb_root new_root = RB_ROOT;
 
        hists->stats.nr_non_filtered_samples = 0;
 
        hists__reset_filter_stats(hists);
        hists__reset_col_len(hists);
 
-       for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
+       nd = rb_first(&hists->entries);
+       while (nd) {
                struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+               int ret;
 
-               if (hists__filter_entry_by_socket(hists, h))
-                       continue;
+               ret = hist_entry__filter(h, type, arg);
 
-               hists__remove_entry_filter(hists, h, HIST_FILTER__SOCKET);
+               /*
+                * case 1. non-matching type
+                * zero out the period, set filter marker and move to child
+                */
+               if (ret < 0) {
+                       memset(&h->stat, 0, sizeof(h->stat));
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_CHILD);
+               }
+               /*
+                * case 2. matched type (filter out)
+                * set filter marker and move to next
+                */
+               else if (ret == 1) {
+                       h->filtered |= (1 << type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+               /*
+                * case 3. ok (not filtered)
+                * add period to hists and parents, erase the filter marker
+                * and move to next sibling
+                */
+               else {
+                       hists__remove_entry_filter(hists, h, type);
+
+                       nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING);
+               }
+       }
+
+       hierarchy_recalc_total_periods(hists);
+
+       /*
+        * resort output after applying a new filter since filter in a lower
+        * hierarchy can change periods in a upper hierarchy.
+        */
+       nd = rb_first(&hists->entries);
+       while (nd) {
+               struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+               nd = rb_next(nd);
+               rb_erase(&h->rb_node, &hists->entries);
+
+               resort_filtered_entry(&new_root, h);
        }
+
+       hists->entries = new_root;
+}
+
+void hists__filter_by_thread(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__THREAD,
+                                       hists->thread_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__THREAD,
+                                     hists__filter_entry_by_thread);
+}
+
+void hists__filter_by_dso(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__DSO,
+                                       hists->dso_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__DSO,
+                                     hists__filter_entry_by_dso);
+}
+
+void hists__filter_by_symbol(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SYMBOL,
+                                       hists->symbol_filter_str);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SYMBOL,
+                                     hists__filter_entry_by_symbol);
+}
+
+void hists__filter_by_socket(struct hists *hists)
+{
+       if (symbol_conf.report_hierarchy)
+               hists__filter_hierarchy(hists, HIST_FILTER__SOCKET,
+                                       &hists->socket_filter);
+       else
+               hists__filter_by_type(hists, HIST_FILTER__SOCKET,
+                                     hists__filter_entry_by_socket);
 }
 
 void events_stats__inc(struct events_stats *stats, u32 type)
@@ -1583,7 +2223,7 @@ int perf_hist_config(const char *var, const char *value)
        return 0;
 }
 
-int __hists__init(struct hists *hists)
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list)
 {
        memset(hists, 0, sizeof(*hists));
        hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT;
@@ -1592,6 +2232,8 @@ int __hists__init(struct hists *hists)
        hists->entries = RB_ROOT;
        pthread_mutex_init(&hists->lock, NULL);
        hists->socket_filter = -1;
+       hists->hpp_list = hpp_list;
+       INIT_LIST_HEAD(&hists->hpp_formats);
        return 0;
 }
 
@@ -1620,15 +2262,26 @@ static void hists__delete_all_entries(struct hists *hists)
 static void hists_evsel__exit(struct perf_evsel *evsel)
 {
        struct hists *hists = evsel__hists(evsel);
+       struct perf_hpp_fmt *fmt, *pos;
+       struct perf_hpp_list_node *node, *tmp;
 
        hists__delete_all_entries(hists);
+
+       list_for_each_entry_safe(node, tmp, &hists->hpp_formats, list) {
+               perf_hpp_list__for_each_format_safe(&node->hpp, fmt, pos) {
+                       list_del(&fmt->list);
+                       free(fmt);
+               }
+               list_del(&node->list);
+               free(node);
+       }
 }
 
 static int hists_evsel__init(struct perf_evsel *evsel)
 {
        struct hists *hists = evsel__hists(evsel);
 
-       __hists__init(hists);
+       __hists__init(hists, &perf_hpp_list);
        return 0;
 }
 
@@ -1647,3 +2300,9 @@ int hists__init(void)
 
        return err;
 }
+
+void perf_hpp_list__init(struct perf_hpp_list *list)
+{
+       INIT_LIST_HEAD(&list->fields);
+       INIT_LIST_HEAD(&list->sorts);
+}
index d4ec482..ead18c8 100644 (file)
@@ -66,6 +66,8 @@ struct hists {
        struct rb_root          entries_collapsed;
        u64                     nr_entries;
        u64                     nr_non_filtered_entries;
+       u64                     callchain_period;
+       u64                     callchain_non_filtered_period;
        struct thread           *thread_filter;
        const struct dso        *dso_filter;
        const char              *uid_filter_str;
@@ -75,6 +77,9 @@ struct hists {
        u64                     event_stream;
        u16                     col_len[HISTC_NR_COLS];
        int                     socket_filter;
+       struct perf_hpp_list    *hpp_list;
+       struct list_head        hpp_formats;
+       int                     nr_hpp_node;
 };
 
 struct hist_entry_iter;
@@ -121,15 +126,21 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
                         int max_stack_depth, void *arg);
 
+struct perf_hpp;
+struct perf_hpp_fmt;
+
 int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
 int hist_entry__transaction_len(void);
 int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size,
                              struct hists *hists);
+int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp,
+                                  struct perf_hpp_fmt *fmt, int printed);
 void hist_entry__delete(struct hist_entry *he);
 
+void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog);
 void hists__output_resort(struct hists *hists, struct ui_progress *prog);
-void hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
+int hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
 
 void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
 void hists__delete_entries(struct hists *hists);
@@ -185,10 +196,10 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel)
 }
 
 int hists__init(void);
-int __hists__init(struct hists *hists);
+int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list);
 
 struct rb_root *hists__get_rotate_entries_in(struct hists *hists);
-bool hists__collapse_insert_entry(struct hists *hists __maybe_unused,
+int hists__collapse_insert_entry(struct hists *hists,
                                  struct rb_root *root, struct hist_entry *he);
 
 struct perf_hpp {
@@ -214,28 +225,64 @@ struct perf_hpp_fmt {
                            struct hist_entry *a, struct hist_entry *b);
        int64_t (*sort)(struct perf_hpp_fmt *fmt,
                        struct hist_entry *a, struct hist_entry *b);
+       bool (*equal)(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
+       void (*free)(struct perf_hpp_fmt *fmt);
 
        struct list_head list;
        struct list_head sort_list;
        bool elide;
        int len;
        int user_len;
+       int idx;
+       int level;
+};
+
+struct perf_hpp_list {
+       struct list_head fields;
+       struct list_head sorts;
 };
 
-extern struct list_head perf_hpp__list;
-extern struct list_head perf_hpp__sort_list;
+extern struct perf_hpp_list perf_hpp_list;
+
+struct perf_hpp_list_node {
+       struct list_head        list;
+       struct perf_hpp_list    hpp;
+       int                     level;
+       bool                    skip;
+};
+
+void perf_hpp_list__column_register(struct perf_hpp_list *list,
+                                   struct perf_hpp_fmt *format);
+void perf_hpp_list__register_sort_field(struct perf_hpp_list *list,
+                                       struct perf_hpp_fmt *format);
+
+static inline void perf_hpp__column_register(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__column_register(&perf_hpp_list, format);
+}
+
+static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format)
+{
+       perf_hpp_list__register_sort_field(&perf_hpp_list, format);
+}
+
+#define perf_hpp_list__for_each_format(_list, format) \
+       list_for_each_entry(format, &(_list)->fields, list)
 
-#define perf_hpp__for_each_format(format) \
-       list_for_each_entry(format, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_format_safe(_list, format, tmp)        \
+       list_for_each_entry_safe(format, tmp, &(_list)->fields, list)
 
-#define perf_hpp__for_each_format_safe(format, tmp)    \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__list, list)
+#define perf_hpp_list__for_each_sort_list(_list, format) \
+       list_for_each_entry(format, &(_list)->sorts, sort_list)
 
-#define perf_hpp__for_each_sort_list(format) \
-       list_for_each_entry(format, &perf_hpp__sort_list, sort_list)
+#define perf_hpp_list__for_each_sort_list_safe(_list, format, tmp)     \
+       list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list)
 
-#define perf_hpp__for_each_sort_list_safe(format, tmp) \
-       list_for_each_entry_safe(format, tmp, &perf_hpp__sort_list, sort_list)
+#define hists__for_each_format(hists, format) \
+       perf_hpp_list__for_each_format((hists)->hpp_list, fmt)
+
+#define hists__for_each_sort_list(hists, format) \
+       perf_hpp_list__for_each_sort_list((hists)->hpp_list, fmt)
 
 extern struct perf_hpp_fmt perf_hpp__format[];
 
@@ -254,21 +301,29 @@ enum {
 };
 
 void perf_hpp__init(void);
-void perf_hpp__column_register(struct perf_hpp_fmt *format);
 void perf_hpp__column_unregister(struct perf_hpp_fmt *format);
-void perf_hpp__column_enable(unsigned col);
-void perf_hpp__column_disable(unsigned col);
 void perf_hpp__cancel_cumulate(void);
+void perf_hpp__setup_output_field(struct perf_hpp_list *list);
+void perf_hpp__reset_output_field(struct perf_hpp_list *list);
+void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
+int perf_hpp__setup_hists_formats(struct perf_hpp_list *list,
+                                 struct perf_evlist *evlist);
 
-void perf_hpp__register_sort_field(struct perf_hpp_fmt *format);
-void perf_hpp__setup_output_field(void);
-void perf_hpp__reset_output_field(void);
-void perf_hpp__append_sort_keys(void);
 
 bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format);
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b);
 bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format);
 bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists);
+bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt);
+bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt);
+
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt);
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg);
 
 static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format,
                                         struct hists *hists)
@@ -372,6 +427,7 @@ static inline int script_browse(const char *script_opt __maybe_unused)
 #endif
 
 unsigned int hists__sort_list_width(struct hists *hists);
+unsigned int hists__overhead_width(struct hists *hists);
 
 void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
                          struct perf_sample *sample, bool nonany_branch_mode);
@@ -381,4 +437,26 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
                            const char *arg, int unset __maybe_unused);
 int perf_hist_config(const char *var, const char *value);
 
+void perf_hpp_list__init(struct perf_hpp_list *list);
+
+enum hierarchy_move_dir {
+       HMD_NORMAL,
+       HMD_FORCE_SIBLING,
+       HMD_FORCE_CHILD,
+};
+
+struct rb_node *rb_hierarchy_last(struct rb_node *node);
+struct rb_node *__rb_hierarchy_next(struct rb_node *node,
+                                   enum hierarchy_move_dir hmd);
+struct rb_node *rb_hierarchy_prev(struct rb_node *node);
+
+static inline struct rb_node *rb_hierarchy_next(struct rb_node *node)
+{
+       return __rb_hierarchy_next(node, HMD_NORMAL);
+}
+
+#define HIERARCHY_INDENT  3
+
+bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit);
+
 #endif /* __PERF_HIST_H */
index 81a2eb7..05d8158 100644 (file)
@@ -2068,6 +2068,15 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
                err = -ENOMEM;
                goto err_free_queues;
        }
+
+       /*
+        * Since this thread will not be kept in any rbtree not in a
+        * list, initialize its list node so that at thread__put() the
+        * current thread lifetime assuption is kept and we don't segfault
+        * at list_del_init().
+        */
+       INIT_LIST_HEAD(&pt->unknown_thread->node);
+
        err = thread__set_comm(pt->unknown_thread, "unknown", 0);
        if (err)
                goto err_delete_thread;
diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h
new file mode 100644 (file)
index 0000000..a1e99da
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef __JIT_H__
+#define __JIT_H__
+
+#include <data.h>
+
+extern int jit_process(struct perf_session *session,
+                      struct perf_data_file *output,
+                      struct machine *machine,
+                      char *filename,
+                      pid_t pid,
+                      u64 *nbytes);
+
+extern int jit_inject_record(const char *filename);
+
+#endif /* __JIT_H__ */
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
new file mode 100644 (file)
index 0000000..cd272cc
--- /dev/null
@@ -0,0 +1,697 @@
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <byteswap.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "util.h"
+#include "event.h"
+#include "debug.h"
+#include "evlist.h"
+#include "symbol.h"
+#include "strlist.h"
+#include <elf.h>
+
+#include "session.h"
+#include "jit.h"
+#include "jitdump.h"
+#include "genelf.h"
+#include "../builtin.h"
+
+struct jit_buf_desc {
+       struct perf_data_file *output;
+       struct perf_session *session;
+       struct machine *machine;
+       union jr_entry   *entry;
+       void             *buf;
+       uint64_t         sample_type;
+       size_t           bufsize;
+       FILE             *in;
+       bool             needs_bswap; /* handles cross-endianess */
+       void             *debug_data;
+       size_t           nr_debug_entries;
+       uint32_t         code_load_count;
+       u64              bytes_written;
+       struct rb_root   code_root;
+       char             dir[PATH_MAX];
+};
+
+struct debug_line_info {
+       unsigned long vma;
+       unsigned int lineno;
+       /* The filename format is unspecified, absolute path, relative etc. */
+       char const filename[0];
+};
+
+struct jit_tool {
+       struct perf_tool tool;
+       struct perf_data_file   output;
+       struct perf_data_file   input;
+       u64 bytes_written;
+};
+
+#define hmax(a, b) ((a) > (b) ? (a) : (b))
+#define get_jit_tool(t) (container_of(tool, struct jit_tool, tool))
+
+static int
+jit_emit_elf(char *filename,
+            const char *sym,
+            uint64_t code_addr,
+            const void *code,
+            int csize,
+            void *debug,
+            int nr_debug_entries)
+{
+       int ret, fd;
+
+       if (verbose > 0)
+               fprintf(stderr, "write ELF image %s\n", filename);
+
+       fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0644);
+       if (fd == -1) {
+               pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(errno));
+               return -1;
+       }
+
+        ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize, debug, nr_debug_entries);
+
+        close(fd);
+
+        if (ret)
+                unlink(filename);
+
+       return ret;
+}
+
+static void
+jit_close(struct jit_buf_desc *jd)
+{
+       if (!(jd && jd->in))
+               return;
+       funlockfile(jd->in);
+       fclose(jd->in);
+       jd->in = NULL;
+}
+
+static int
+jit_validate_events(struct perf_session *session)
+{
+       struct perf_evsel *evsel;
+
+       /*
+        * check that all events use CLOCK_MONOTONIC
+        */
+       evlist__for_each(session->evlist, evsel) {
+               if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC)
+                       return -1;
+       }
+       return 0;
+}
+
+static int
+jit_open(struct jit_buf_desc *jd, const char *name)
+{
+       struct jitheader header;
+       struct jr_prefix *prefix;
+       ssize_t bs, bsz = 0;
+       void *n, *buf = NULL;
+       int ret, retval = -1;
+
+       jd->in = fopen(name, "r");
+       if (!jd->in)
+               return -1;
+
+       bsz = hmax(sizeof(header), sizeof(*prefix));
+
+       buf = malloc(bsz);
+       if (!buf)
+               goto error;
+
+       /*
+        * protect from writer modifying the file while we are reading it
+        */
+       flockfile(jd->in);
+
+       ret = fread(buf, sizeof(header), 1, jd->in);
+       if (ret != 1)
+               goto error;
+
+       memcpy(&header, buf, sizeof(header));
+
+       if (header.magic != JITHEADER_MAGIC) {
+               if (header.magic != JITHEADER_MAGIC_SW)
+                       goto error;
+               jd->needs_bswap = true;
+       }
+
+       if (jd->needs_bswap) {
+               header.version    = bswap_32(header.version);
+               header.total_size = bswap_32(header.total_size);
+               header.pid        = bswap_32(header.pid);
+               header.elf_mach   = bswap_32(header.elf_mach);
+               header.timestamp  = bswap_64(header.timestamp);
+               header.flags      = bswap_64(header.flags);
+       }
+
+       if (verbose > 2)
+               pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n",
+                       header.version,
+                       header.total_size,
+                       (unsigned long long)header.timestamp,
+                       header.pid,
+                       header.elf_mach);
+
+       if (header.flags & JITDUMP_FLAGS_RESERVED) {
+               pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n",
+                      (unsigned long long)header.flags & JITDUMP_FLAGS_RESERVED);
+               goto error;
+       }
+
+       /*
+        * validate event is using the correct clockid
+        */
+       if (jit_validate_events(jd->session)) {
+               pr_err("error, jitted code must be sampled with perf record -k 1\n");
+               goto error;
+       }
+
+       bs = header.total_size - sizeof(header);
+
+       if (bs > bsz) {
+               n = realloc(buf, bs);
+               if (!n)
+                       goto error;
+               bsz = bs;
+               buf = n;
+               /* read extra we do not know about */
+               ret = fread(buf, bs - bsz, 1, jd->in);
+               if (ret != 1)
+                       goto error;
+       }
+       /*
+        * keep dirname for generating files and mmap records
+        */
+       strcpy(jd->dir, name);
+       dirname(jd->dir);
+
+       return 0;
+error:
+       funlockfile(jd->in);
+       fclose(jd->in);
+       return retval;
+}
+
+static union jr_entry *
+jit_get_next_entry(struct jit_buf_desc *jd)
+{
+       struct jr_prefix *prefix;
+       union jr_entry *jr;
+       void *addr;
+       size_t bs, size;
+       int id, ret;
+
+       if (!(jd && jd->in))
+               return NULL;
+
+       if (jd->buf == NULL) {
+               size_t sz = getpagesize();
+               if (sz < sizeof(*prefix))
+                       sz = sizeof(*prefix);
+
+               jd->buf = malloc(sz);
+               if (jd->buf == NULL)
+                       return NULL;
+
+               jd->bufsize = sz;
+       }
+
+       prefix = jd->buf;
+
+       /*
+        * file is still locked at this point
+        */
+       ret = fread(prefix, sizeof(*prefix), 1, jd->in);
+       if (ret  != 1)
+               return NULL;
+
+       if (jd->needs_bswap) {
+               prefix->id         = bswap_32(prefix->id);
+               prefix->total_size = bswap_32(prefix->total_size);
+               prefix->timestamp  = bswap_64(prefix->timestamp);
+       }
+       id   = prefix->id;
+       size = prefix->total_size;
+
+       bs = (size_t)size;
+       if (bs < sizeof(*prefix))
+               return NULL;
+
+       if (id >= JIT_CODE_MAX) {
+               pr_warning("next_entry: unknown prefix %d, skipping\n", id);
+               return NULL;
+       }
+       if (bs > jd->bufsize) {
+               void *n;
+               n = realloc(jd->buf, bs);
+               if (!n)
+                       return NULL;
+               jd->buf = n;
+               jd->bufsize = bs;
+       }
+
+       addr = ((void *)jd->buf) + sizeof(*prefix);
+
+       ret = fread(addr, bs - sizeof(*prefix), 1, jd->in);
+       if (ret != 1)
+               return NULL;
+
+       jr = (union jr_entry *)jd->buf;
+
+       switch(id) {
+       case JIT_CODE_DEBUG_INFO:
+               if (jd->needs_bswap) {
+                       uint64_t n;
+                       jr->info.code_addr = bswap_64(jr->info.code_addr);
+                       jr->info.nr_entry  = bswap_64(jr->info.nr_entry);
+                       for (n = 0 ; n < jr->info.nr_entry; n++) {
+                               jr->info.entries[n].addr    = bswap_64(jr->info.entries[n].addr);
+                               jr->info.entries[n].lineno  = bswap_32(jr->info.entries[n].lineno);
+                               jr->info.entries[n].discrim = bswap_32(jr->info.entries[n].discrim);
+                       }
+               }
+               break;
+       case JIT_CODE_CLOSE:
+               break;
+       case JIT_CODE_LOAD:
+               if (jd->needs_bswap) {
+                       jr->load.pid       = bswap_32(jr->load.pid);
+                       jr->load.tid       = bswap_32(jr->load.tid);
+                       jr->load.vma       = bswap_64(jr->load.vma);
+                       jr->load.code_addr = bswap_64(jr->load.code_addr);
+                       jr->load.code_size = bswap_64(jr->load.code_size);
+                       jr->load.code_index= bswap_64(jr->load.code_index);
+               }
+               jd->code_load_count++;
+               break;
+       case JIT_CODE_MOVE:
+               if (jd->needs_bswap) {
+                       jr->move.pid           = bswap_32(jr->move.pid);
+                       jr->move.tid           = bswap_32(jr->move.tid);
+                       jr->move.vma           = bswap_64(jr->move.vma);
+                       jr->move.old_code_addr = bswap_64(jr->move.old_code_addr);
+                       jr->move.new_code_addr = bswap_64(jr->move.new_code_addr);
+                       jr->move.code_size     = bswap_64(jr->move.code_size);
+                       jr->move.code_index    = bswap_64(jr->move.code_index);
+               }
+               break;
+       case JIT_CODE_MAX:
+       default:
+               return NULL;
+       }
+       return jr;
+}
+
+static int
+jit_inject_event(struct jit_buf_desc *jd, union perf_event *event)
+{
+       ssize_t size;
+
+       size = perf_data_file__write(jd->output, event, event->header.size);
+       if (size < 0)
+               return -1;
+
+       jd->bytes_written += size;
+       return 0;
+}
+
+static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       uint64_t code, addr;
+       uintptr_t uaddr;
+       char *filename;
+       struct stat st;
+       size_t size;
+       u16 idr_size;
+       const char *sym;
+       uint32_t count;
+       int ret, csize;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid   = jr->load.pid;
+       tid   = jr->load.tid;
+       csize = jr->load.code_size;
+       addr  = jr->load.code_addr;
+       sym   = (void *)((unsigned long)jr + sizeof(jr->load));
+       code  = (unsigned long)jr + jr->load.p.total_size - csize;
+       count = jr->load.code_index;
+       idr_size = jd->machine->id_hdr_size;
+
+       event = calloc(1, sizeof(*event) + idr_size);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so",
+                       jd->dir,
+                       pid,
+                       count);
+
+       size++; /* for \0 */
+
+       size = PERF_ALIGN(size, sizeof(u64));
+       uaddr = (uintptr_t)code;
+       ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize, jd->debug_data, jd->nr_debug_entries);
+
+       if (jd->debug_data && jd->nr_debug_entries) {
+               free(jd->debug_data);
+               jd->debug_data = NULL;
+               jd->nr_debug_entries = 0;
+       }
+
+       if (ret) {
+               free(event);
+               return -1;
+       }
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(stat));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = addr;
+       event->mmap2.len   = csize;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       /*
+        * mark dso as use to generate buildid in the header
+        */
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       struct perf_sample sample;
+       union perf_event *event;
+       struct perf_tool *tool = jd->session->tool;
+       char *filename;
+       size_t size;
+       struct stat st;
+       u16 idr_size;
+       int ret;
+       pid_t pid, tid;
+       struct {
+               u32 pid, tid;
+               u64 time;
+       } *id;
+
+       pid = jr->move.pid;
+       tid =  jr->move.tid;
+       idr_size = jd->machine->id_hdr_size;
+
+       /*
+        * +16 to account for sample_id_all (hack)
+        */
+       event = calloc(1, sizeof(*event) + 16);
+       if (!event)
+               return -1;
+
+       filename = event->mmap2.filename;
+       size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64,
+                jd->dir,
+                pid,
+                jr->move.code_index);
+
+       size++; /* for \0 */
+
+       if (stat(filename, &st))
+               memset(&st, 0, sizeof(stat));
+
+       size = PERF_ALIGN(size, sizeof(u64));
+
+       event->mmap2.header.type = PERF_RECORD_MMAP2;
+       event->mmap2.header.misc = PERF_RECORD_MISC_USER;
+       event->mmap2.header.size = (sizeof(event->mmap2) -
+                       (sizeof(event->mmap2.filename) - size) + idr_size);
+       event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET;
+       event->mmap2.start = jr->move.new_code_addr;
+       event->mmap2.len   = jr->move.code_size;
+       event->mmap2.pid   = pid;
+       event->mmap2.tid   = tid;
+       event->mmap2.ino   = st.st_ino;
+       event->mmap2.maj   = major(st.st_dev);
+       event->mmap2.min   = minor(st.st_dev);
+       event->mmap2.prot  = st.st_mode;
+       event->mmap2.flags = MAP_SHARED;
+       event->mmap2.ino_generation = 1;
+
+       id = (void *)((unsigned long)event + event->mmap.header.size - idr_size);
+       if (jd->sample_type & PERF_SAMPLE_TID) {
+               id->pid  = pid;
+               id->tid  = tid;
+       }
+       if (jd->sample_type & PERF_SAMPLE_TIME)
+               id->time = jr->load.p.timestamp;
+
+       /*
+        * create pseudo sample to induce dso hit increment
+        * use first address as sample address
+        */
+       memset(&sample, 0, sizeof(sample));
+       sample.pid  = pid;
+       sample.tid  = tid;
+       sample.time = id->time;
+       sample.ip   = jr->move.new_code_addr;
+
+       ret = perf_event__process_mmap2(tool, event, &sample, jd->machine);
+       if (ret)
+               return ret;
+
+       ret = jit_inject_event(jd, event);
+       if (!ret)
+               build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine);
+
+       return ret;
+}
+
+static int jit_repipe_debug_info(struct jit_buf_desc *jd, union jr_entry *jr)
+{
+       void *data;
+       size_t sz;
+
+       if (!(jd && jr))
+               return -1;
+
+       sz  = jr->prefix.total_size - sizeof(jr->info);
+       data = malloc(sz);
+       if (!data)
+               return -1;
+
+       memcpy(data, &jr->info.entries, sz);
+
+       jd->debug_data       = data;
+
+       /*
+        * we must use nr_entry instead of size here because
+        * we cannot distinguish actual entry from padding otherwise
+        */
+       jd->nr_debug_entries = jr->info.nr_entry;
+
+       return 0;
+}
+
+static int
+jit_process_dump(struct jit_buf_desc *jd)
+{
+       union jr_entry *jr;
+       int ret;
+
+       while ((jr = jit_get_next_entry(jd))) {
+               switch(jr->prefix.id) {
+               case JIT_CODE_LOAD:
+                       ret = jit_repipe_code_load(jd, jr);
+                       break;
+               case JIT_CODE_MOVE:
+                       ret = jit_repipe_code_move(jd, jr);
+                       break;
+               case JIT_CODE_DEBUG_INFO:
+                       ret = jit_repipe_debug_info(jd, jr);
+                       break;
+               default:
+                       ret = 0;
+                       continue;
+               }
+       }
+       return ret;
+}
+
+static int
+jit_inject(struct jit_buf_desc *jd, char *path)
+{
+       int ret;
+
+       if (verbose > 0)
+               fprintf(stderr, "injecting: %s\n", path);
+
+       ret = jit_open(jd, path);
+       if (ret)
+               return -1;
+
+       ret = jit_process_dump(jd);
+
+       jit_close(jd);
+
+       if (verbose > 0)
+               fprintf(stderr, "injected: %s (%d)\n", path, ret);
+
+       return 0;
+}
+
+/*
+ * File must be with pattern .../jit-XXXX.dump
+ * where XXXX is the PID of the process which did the mmap()
+ * as captured in the RECORD_MMAP record
+ */
+static int
+jit_detect(char *mmap_name, pid_t pid)
+ {
+       char *p;
+       char *end = NULL;
+       pid_t pid2;
+
+       if (verbose > 2)
+               fprintf(stderr, "jit marker trying : %s\n", mmap_name);
+       /*
+        * get file name
+        */
+       p = strrchr(mmap_name, '/');
+       if (!p)
+               return -1;
+
+       /*
+        * match prefix
+        */
+       if (strncmp(p, "/jit-", 5))
+               return -1;
+
+       /*
+        * skip prefix
+        */
+       p += 5;
+
+       /*
+        * must be followed by a pid
+        */
+       if (!isdigit(*p))
+               return -1;
+
+       pid2 = (int)strtol(p, &end, 10);
+       if (!end)
+               return -1;
+
+       /*
+        * pid does not match mmap pid
+        * pid==0 in system-wide mode (synthesized)
+        */
+       if (pid && pid2 != pid)
+               return -1;
+       /*
+        * validate suffix
+        */
+       if (strcmp(end, ".dump"))
+               return -1;
+
+       if (verbose > 0)
+               fprintf(stderr, "jit marker found: %s\n", mmap_name);
+
+       return 0;
+}
+
+int
+jit_process(struct perf_session *session,
+           struct perf_data_file *output,
+           struct machine *machine,
+           char *filename,
+           pid_t pid,
+           u64 *nbytes)
+{
+       struct perf_evsel *first;
+       struct jit_buf_desc jd;
+       int ret;
+
+       /*
+        * first, detect marker mmap (i.e., the jitdump mmap)
+        */
+       if (jit_detect(filename, pid))
+               return 0;
+
+       memset(&jd, 0, sizeof(jd));
+
+       jd.session = session;
+       jd.output  = output;
+       jd.machine = machine;
+
+       /*
+        * track sample_type to compute id_all layout
+        * perf sets the same sample type to all events as of now
+        */
+       first = perf_evlist__first(session->evlist);
+       jd.sample_type = first->attr.sample_type;
+
+       *nbytes = 0;
+
+       ret = jit_inject(&jd, filename);
+       if (!ret) {
+               *nbytes = jd.bytes_written;
+               ret = 1;
+       }
+
+       return ret;
+}
diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h
new file mode 100644 (file)
index 0000000..b66c1f5
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * jitdump.h: jitted code info encapsulation file format
+ *
+ * Adapted from OProfile GPLv2 support jidump.h:
+ * Copyright 2007 OProfile authors
+ * Jens Wilke
+ * Daniel Hansel
+ * Copyright IBM Corporation 2007
+ */
+#ifndef JITDUMP_H
+#define JITDUMP_H
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdint.h>
+
+/* JiTD */
+#define JITHEADER_MAGIC                0x4A695444
+#define JITHEADER_MAGIC_SW     0x4454694A
+
+#define PADDING_8ALIGNED(x) ((((x) + 7) & 7) ^ 7)
+
+#define JITHEADER_VERSION 1
+
+enum jitdump_flags_bits {
+       JITDUMP_FLAGS_MAX_BIT,
+};
+
+#define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? \
+                               (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0)
+
+struct jitheader {
+       uint32_t magic;         /* characters "jItD" */
+       uint32_t version;       /* header version */
+       uint32_t total_size;    /* total size of header */
+       uint32_t elf_mach;      /* elf mach target */
+       uint32_t pad1;          /* reserved */
+       uint32_t pid;           /* JIT process id */
+       uint64_t timestamp;     /* timestamp */
+       uint64_t flags;         /* flags */
+};
+
+enum jit_record_type {
+       JIT_CODE_LOAD           = 0,
+        JIT_CODE_MOVE           = 1,
+       JIT_CODE_DEBUG_INFO     = 2,
+       JIT_CODE_CLOSE          = 3,
+
+       JIT_CODE_MAX,
+};
+
+/* record prefix (mandatory in each record) */
+struct jr_prefix {
+       uint32_t id;
+       uint32_t total_size;
+       uint64_t timestamp;
+};
+
+struct jr_code_load {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct jr_code_close {
+       struct jr_prefix p;
+};
+
+struct jr_code_move {
+       struct jr_prefix p;
+
+       uint32_t pid;
+       uint32_t tid;
+       uint64_t vma;
+       uint64_t old_code_addr;
+       uint64_t new_code_addr;
+       uint64_t code_size;
+       uint64_t code_index;
+};
+
+struct debug_entry {
+       uint64_t addr;
+       int lineno;         /* source line number starting at 1 */
+       int discrim;        /* column discriminator, 0 is default */
+       const char name[0]; /* null terminated filename, \xff\0 if same as previous entry */
+};
+
+struct jr_code_debug_info {
+       struct jr_prefix p;
+
+       uint64_t code_addr;
+       uint64_t nr_entry;
+       struct debug_entry entries[0];
+};
+
+union jr_entry {
+        struct jr_code_debug_info info;
+        struct jr_code_close close;
+        struct jr_code_load load;
+        struct jr_code_move move;
+        struct jr_prefix prefix;
+};
+
+static inline struct debug_entry *
+debug_entry_next(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       size_t l = strlen(ent->name) + 1;
+       return a + l;
+}
+
+static inline char *
+debug_entry_file(struct debug_entry *ent)
+{
+       void *a = ent + 1;
+       return a;
+}
+
+#endif /* !JITDUMP_H */
index ae825d4..d01e735 100644 (file)
@@ -122,6 +122,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
 
 bool kvm_exit_event(struct perf_evsel *evsel);
 bool kvm_entry_event(struct perf_evsel *evsel);
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 
 #define define_exit_reasons_table(name, symbols)       \
        static struct exit_reasons_table name[] = {     \
@@ -133,8 +134,13 @@ bool kvm_entry_event(struct perf_evsel *evsel);
  */
 int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
 
-extern const char * const kvm_events_tp[];
+extern const char *kvm_events_tp[];
 extern struct kvm_reg_events_ops kvm_reg_events_ops[];
 extern const char * const kvm_skip_events[];
+extern const char *vcpu_id_str;
+extern const int decode_str_len;
+extern const char *kvm_exit_reason;
+extern const char *kvm_entry_trace;
+extern const char *kvm_exit_trace;
 
 #endif /* __PERF_KVM_STAT_H */
index 2c2b443..1a3e45b 100644 (file)
@@ -179,6 +179,16 @@ struct symbol *machine__find_kernel_symbol(struct machine *machine,
                                       mapp, filter);
 }
 
+static inline
+struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine,
+                                                  enum map_type type, const char *name,
+                                                  struct map **mapp,
+                                                  symbol_filter_t filter)
+{
+       return map_groups__find_symbol_by_name(&machine->kmaps, type, name,
+                                              mapp, filter);
+}
+
 static inline
 struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr,
                                             struct map **mapp,
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
new file mode 100644 (file)
index 0000000..75465f8
--- /dev/null
@@ -0,0 +1,255 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <api/fs/fs.h>
+#include "mem-events.h"
+#include "debug.h"
+#include "symbol.h"
+
+#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+       E("ldlat-loads",        "cpu/mem-loads,ldlat=30/P",     "mem-loads"),
+       E("ldlat-stores",       "cpu/mem-stores/P",             "mem-stores"),
+};
+#undef E
+
+#undef E
+
+char *perf_mem_events__name(int i)
+{
+       return (char *)perf_mem_events[i].name;
+}
+
+int perf_mem_events__parse(const char *str)
+{
+       char *tok, *saveptr = NULL;
+       bool found = false;
+       char *buf;
+       int j;
+
+       /* We need buffer that we know we can write to. */
+       buf = malloc(strlen(str) + 1);
+       if (!buf)
+               return -ENOMEM;
+
+       strcpy(buf, str);
+
+       tok = strtok_r((char *)buf, ",", &saveptr);
+
+       while (tok) {
+               for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+                       struct perf_mem_event *e = &perf_mem_events[j];
+
+                       if (strstr(e->tag, tok))
+                               e->record = found = true;
+               }
+
+               tok = strtok_r(NULL, ",", &saveptr);
+       }
+
+       free(buf);
+
+       if (found)
+               return 0;
+
+       pr_err("failed: event '%s' not found, use '-e list' to get list of available events\n", str);
+       return -1;
+}
+
+int perf_mem_events__init(void)
+{
+       const char *mnt = sysfs__mount();
+       bool found = false;
+       int j;
+
+       if (!mnt)
+               return -ENOENT;
+
+       for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+               char path[PATH_MAX];
+               struct perf_mem_event *e = &perf_mem_events[j];
+               struct stat st;
+
+               scnprintf(path, PATH_MAX, "%s/devices/cpu/events/%s",
+                         mnt, e->sysfs_name);
+
+               if (!stat(path, &st))
+                       e->supported = found = true;
+       }
+
+       return found ? 0 : -ENOENT;
+}
+
+static const char * const tlb_access[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "L2",
+       "Walker",
+       "Fault",
+};
+
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t l = 0, i;
+       u64 m = PERF_MEM_TLB_NA;
+       u64 hit, miss;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_dtlb;
+
+       hit = m & PERF_MEM_TLB_HIT;
+       miss = m & PERF_MEM_TLB_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, tlb_access[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const mem_lvl[] = {
+       "N/A",
+       "HIT",
+       "MISS",
+       "L1",
+       "LFB",
+       "L2",
+       "L3",
+       "Local RAM",
+       "Remote RAM (1 hop)",
+       "Remote RAM (2 hops)",
+       "Remote Cache (1 hop)",
+       "Remote Cache (2 hops)",
+       "I/O",
+       "Uncached",
+};
+
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m =  PERF_MEM_LVL_NA;
+       u64 hit, miss;
+
+       if (mem_info)
+               m  = mem_info->data_src.mem_lvl;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       hit = m & PERF_MEM_LVL_HIT;
+       miss = m & PERF_MEM_LVL_MISS;
+
+       /* already taken care of */
+       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
+
+       for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, mem_lvl[i]);
+       }
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+       if (hit)
+               l += scnprintf(out + l, sz - l, " hit");
+       if (miss)
+               l += scnprintf(out + l, sz - l, " miss");
+
+       return l;
+}
+
+static const char * const snoop_access[] = {
+       "N/A",
+       "None",
+       "Miss",
+       "Hit",
+       "HitM",
+};
+
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       size_t i, l = 0;
+       u64 m = PERF_MEM_SNOOP_NA;
+
+       sz -= 1; /* -1 for null termination */
+       out[0] = '\0';
+
+       if (mem_info)
+               m = mem_info->data_src.mem_snoop;
+
+       for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) {
+               if (!(m & 0x1))
+                       continue;
+               if (l) {
+                       strcat(out, " or ");
+                       l += 4;
+               }
+               l += scnprintf(out + l, sz - l, snoop_access[i]);
+       }
+
+       if (*out == '\0')
+               l += scnprintf(out, sz - l, "N/A");
+
+       return l;
+}
+
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       u64 mask = PERF_MEM_LOCK_NA;
+       int l;
+
+       if (mem_info)
+               mask = mem_info->data_src.mem_lock;
+
+       if (mask & PERF_MEM_LOCK_NA)
+               l = scnprintf(out, sz, "N/A");
+       else if (mask & PERF_MEM_LOCK_LOCKED)
+               l = scnprintf(out, sz, "Yes");
+       else
+               l = scnprintf(out, sz, "No");
+
+       return l;
+}
+
+int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
+{
+       int i = 0;
+
+       i += perf_mem__lvl_scnprintf(out, sz, mem_info);
+       i += scnprintf(out + i, sz - i, "|SNP ");
+       i += perf_mem__snp_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|TLB ");
+       i += perf_mem__tlb_scnprintf(out + i, sz - i, mem_info);
+       i += scnprintf(out + i, sz - i, "|LCK ");
+       i += perf_mem__lck_scnprintf(out + i, sz - i, mem_info);
+
+       return i;
+}
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
new file mode 100644 (file)
index 0000000..5d6d930
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __PERF_MEM_EVENTS_H
+#define __PERF_MEM_EVENTS_H
+
+#include <stdbool.h>
+
+struct perf_mem_event {
+       bool            record;
+       bool            supported;
+       const char      *tag;
+       const char      *name;
+       const char      *sysfs_name;
+};
+
+enum {
+       PERF_MEM_EVENTS__LOAD,
+       PERF_MEM_EVENTS__STORE,
+       PERF_MEM_EVENTS__MAX,
+};
+
+extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
+
+int perf_mem_events__parse(const char *str);
+int perf_mem_events__init(void);
+
+char *perf_mem_events__name(int i);
+
+struct mem_info;
+int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
+
+int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info);
+
+#endif /* __PERF_MEM_EVENTS_H */
index 4f7b0ef..4c19d5e 100644 (file)
@@ -279,7 +279,24 @@ const char *event_type(int type)
        return "unknown";
 }
 
+static int parse_events__is_name_term(struct parse_events_term *term)
+{
+       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
+}
+
+static char *get_config_name(struct list_head *head_terms)
+{
+       struct parse_events_term *term;
+
+       if (!head_terms)
+               return NULL;
 
+       list_for_each_entry(term, head_terms, list)
+               if (parse_events__is_name_term(term))
+                       return term->val.str;
+
+       return NULL;
+}
 
 static struct perf_evsel *
 __add_event(struct list_head *list, int *idx,
@@ -333,11 +350,25 @@ static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES]
        return -1;
 }
 
+typedef int config_term_func_t(struct perf_event_attr *attr,
+                              struct parse_events_term *term,
+                              struct parse_events_error *err);
+static int config_term_common(struct perf_event_attr *attr,
+                             struct parse_events_term *term,
+                             struct parse_events_error *err);
+static int config_attr(struct perf_event_attr *attr,
+                      struct list_head *head,
+                      struct parse_events_error *err,
+                      config_term_func_t config_term);
+
 int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2)
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *err,
+                          struct list_head *head_config)
 {
        struct perf_event_attr attr;
-       char name[MAX_NAME_LEN];
+       LIST_HEAD(config_terms);
+       char name[MAX_NAME_LEN], *config_name;
        int cache_type = -1, cache_op = -1, cache_result = -1;
        char *op_result[2] = { op_result1, op_result2 };
        int i, n;
@@ -351,6 +382,7 @@ int parse_events_add_cache(struct list_head *list, int *idx,
        if (cache_type == -1)
                return -EINVAL;
 
+       config_name = get_config_name(head_config);
        n = snprintf(name, MAX_NAME_LEN, "%s", type);
 
        for (i = 0; (i < 2) && (op_result[i]); i++) {
@@ -391,7 +423,16 @@ int parse_events_add_cache(struct list_head *list, int *idx,
        memset(&attr, 0, sizeof(attr));
        attr.config = cache_type | (cache_op << 8) | (cache_result << 16);
        attr.type = PERF_TYPE_HW_CACHE;
-       return add_event(list, idx, &attr, name, NULL);
+
+       if (head_config) {
+               if (config_attr(&attr, head_config, err,
+                               config_term_common))
+                       return -EINVAL;
+
+               if (get_config_terms(head_config, &config_terms))
+                       return -ENOMEM;
+       }
+       return add_event(list, idx, &attr, config_name ? : name, &config_terms);
 }
 
 static void tracepoint_error(struct parse_events_error *e, int err,
@@ -399,6 +440,9 @@ static void tracepoint_error(struct parse_events_error *e, int err,
 {
        char help[BUFSIZ];
 
+       if (!e)
+               return;
+
        /*
         * We get error directly from syscall errno ( > 0),
         * or from encoded pointer's error ( < 0).
@@ -537,6 +581,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
 struct __add_bpf_event_param {
        struct parse_events_evlist *data;
        struct list_head *list;
+       struct list_head *head_config;
 };
 
 static int add_bpf_event(struct probe_trace_event *tev, int fd,
@@ -553,7 +598,8 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
                 tev->group, tev->event, fd);
 
        err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, tev->group,
-                                         tev->event, evlist->error, NULL);
+                                         tev->event, evlist->error,
+                                         param->head_config);
        if (err) {
                struct perf_evsel *evsel, *tmp;
 
@@ -578,11 +624,12 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd,
 
 int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                              struct list_head *list,
-                             struct bpf_object *obj)
+                             struct bpf_object *obj,
+                             struct list_head *head_config)
 {
        int err;
        char errbuf[BUFSIZ];
-       struct __add_bpf_event_param param = {data, list};
+       struct __add_bpf_event_param param = {data, list, head_config};
        static bool registered_unprobe_atexit = false;
 
        if (IS_ERR(obj) || !obj) {
@@ -628,17 +675,99 @@ errout:
        return err;
 }
 
+static int
+parse_events_config_bpf(struct parse_events_evlist *data,
+                       struct bpf_object *obj,
+                       struct list_head *head_config)
+{
+       struct parse_events_term *term;
+       int error_pos;
+
+       if (!head_config || list_empty(head_config))
+               return 0;
+
+       list_for_each_entry(term, head_config, list) {
+               char errbuf[BUFSIZ];
+               int err;
+
+               if (term->type_term != PARSE_EVENTS__TERM_TYPE_USER) {
+                       snprintf(errbuf, sizeof(errbuf),
+                                "Invalid config term for BPF object");
+                       errbuf[BUFSIZ - 1] = '\0';
+
+                       data->error->idx = term->err_term;
+                       data->error->str = strdup(errbuf);
+                       return -EINVAL;
+               }
+
+               err = bpf__config_obj(obj, term, data->evlist, &error_pos);
+               if (err) {
+                       bpf__strerror_config_obj(obj, term, data->evlist,
+                                                &error_pos, err, errbuf,
+                                                sizeof(errbuf));
+                       data->error->help = strdup(
+"Hint:\tValid config terms:\n"
+"     \tmap:[<arraymap>].value<indices>=[value]\n"
+"     \tmap:[<eventmap>].event<indices>=[event]\n"
+"\n"
+"     \twhere <indices> is something like [0,3...5] or [all]\n"
+"     \t(add -v to see detail)");
+                       data->error->str = strdup(errbuf);
+                       if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE)
+                               data->error->idx = term->err_val;
+                       else
+                               data->error->idx = term->err_term + error_pos;
+                       return err;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Split config terms:
+ * perf record -e bpf.c/call-graph=fp,map:array.value[0]=1/ ...
+ *  'call-graph=fp' is 'evt config', should be applied to each
+ *  events in bpf.c.
+ * 'map:array.value[0]=1' is 'obj config', should be processed
+ * with parse_events_config_bpf.
+ *
+ * Move object config terms from the first list to obj_head_config.
+ */
+static void
+split_bpf_config_terms(struct list_head *evt_head_config,
+                      struct list_head *obj_head_config)
+{
+       struct parse_events_term *term, *temp;
+
+       /*
+        * Currectly, all possible user config term
+        * belong to bpf object. parse_events__is_hardcoded_term()
+        * happends to be a good flag.
+        *
+        * See parse_events_config_bpf() and
+        * config_term_tracepoint().
+        */
+       list_for_each_entry_safe(term, temp, evt_head_config, list)
+               if (!parse_events__is_hardcoded_term(term))
+                       list_move_tail(&term->list, obj_head_config);
+}
+
 int parse_events_load_bpf(struct parse_events_evlist *data,
                          struct list_head *list,
                          char *bpf_file_name,
-                         bool source)
+                         bool source,
+                         struct list_head *head_config)
 {
+       int err;
        struct bpf_object *obj;
+       LIST_HEAD(obj_head_config);
+
+       if (head_config)
+               split_bpf_config_terms(head_config, &obj_head_config);
 
        obj = bpf__prepare_load(bpf_file_name, source);
        if (IS_ERR(obj)) {
                char errbuf[BUFSIZ];
-               int err;
 
                err = PTR_ERR(obj);
 
@@ -656,7 +785,18 @@ int parse_events_load_bpf(struct parse_events_evlist *data,
                return err;
        }
 
-       return parse_events_load_bpf_obj(data, list, obj);
+       err = parse_events_load_bpf_obj(data, list, obj, head_config);
+       if (err)
+               return err;
+       err = parse_events_config_bpf(data, obj, &obj_head_config);
+
+       /*
+        * Caller doesn't know anything about obj_head_config,
+        * so combine them together again before returnning.
+        */
+       if (head_config)
+               list_splice_tail(&obj_head_config, head_config);
+       return err;
 }
 
 static int
@@ -743,9 +883,59 @@ static int check_type_val(struct parse_events_term *term,
        return -EINVAL;
 }
 
-typedef int config_term_func_t(struct perf_event_attr *attr,
-                              struct parse_events_term *term,
-                              struct parse_events_error *err);
+/*
+ * Update according to parse-events.l
+ */
+static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = {
+       [PARSE_EVENTS__TERM_TYPE_USER]                  = "<sysfs term>",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG]                = "config",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG1]               = "config1",
+       [PARSE_EVENTS__TERM_TYPE_CONFIG2]               = "config2",
+       [PARSE_EVENTS__TERM_TYPE_NAME]                  = "name",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD]         = "period",
+       [PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ]           = "freq",
+       [PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE]    = "branch_type",
+       [PARSE_EVENTS__TERM_TYPE_TIME]                  = "time",
+       [PARSE_EVENTS__TERM_TYPE_CALLGRAPH]             = "call-graph",
+       [PARSE_EVENTS__TERM_TYPE_STACKSIZE]             = "stack-size",
+       [PARSE_EVENTS__TERM_TYPE_NOINHERIT]             = "no-inherit",
+       [PARSE_EVENTS__TERM_TYPE_INHERIT]               = "inherit",
+};
+
+static bool config_term_shrinked;
+
+static bool
+config_term_avail(int term_type, struct parse_events_error *err)
+{
+       if (term_type < 0 || term_type >= __PARSE_EVENTS__TERM_TYPE_NR) {
+               err->str = strdup("Invalid term_type");
+               return false;
+       }
+       if (!config_term_shrinked)
+               return true;
+
+       switch (term_type) {
+       case PARSE_EVENTS__TERM_TYPE_CONFIG:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG1:
+       case PARSE_EVENTS__TERM_TYPE_CONFIG2:
+       case PARSE_EVENTS__TERM_TYPE_NAME:
+               return true;
+       default:
+               if (!err)
+                       return false;
+
+               /* term_type is validated so indexing is safe */
+               if (asprintf(&err->str, "'%s' is not usable in 'perf stat'",
+                            config_term_names[term_type]) < 0)
+                       err->str = NULL;
+               return false;
+       }
+}
+
+void parse_events__shrink_config_terms(void)
+{
+       config_term_shrinked = true;
+}
 
 static int config_term_common(struct perf_event_attr *attr,
                              struct parse_events_term *term,
@@ -812,6 +1002,17 @@ do {                                                                         \
                return -EINVAL;
        }
 
+       /*
+        * Check term availbility after basic checking so
+        * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
+        *
+        * If check availbility at the entry of this function,
+        * user will see "'<sysfs term>' is not usable in 'perf stat'"
+        * if an invalid config term is provided for legacy events
+        * (for example, instructions/badterm/...), which is confusing.
+        */
+       if (!config_term_avail(term->type_term, err))
+               return -EINVAL;
        return 0;
 #undef CHECK_TYPE_VAL
 }
@@ -958,23 +1159,8 @@ int parse_events_add_numeric(struct parse_events_evlist *data,
                        return -ENOMEM;
        }
 
-       return add_event(list, &data->idx, &attr, NULL, &config_terms);
-}
-
-static int parse_events__is_name_term(struct parse_events_term *term)
-{
-       return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
-}
-
-static char *pmu_event_name(struct list_head *head_terms)
-{
-       struct parse_events_term *term;
-
-       list_for_each_entry(term, head_terms, list)
-               if (parse_events__is_name_term(term))
-                       return term->val.str;
-
-       return NULL;
+       return add_event(list, &data->idx, &attr,
+                        get_config_name(head_config), &config_terms);
 }
 
 int parse_events_add_pmu(struct parse_events_evlist *data,
@@ -1021,7 +1207,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
                return -EINVAL;
 
        evsel = __add_event(list, &data->idx, &attr,
-                           pmu_event_name(head_config), pmu->cpus,
+                           get_config_name(head_config), pmu->cpus,
                            &config_terms);
        if (evsel) {
                evsel->unit = info.unit;
@@ -1383,8 +1569,7 @@ int parse_events_terms(struct list_head *terms, const char *str)
                return 0;
        }
 
-       if (data.terms)
-               parse_events__free_terms(data.terms);
+       parse_events_terms__delete(data.terms);
        return ret;
 }
 
@@ -1392,9 +1577,10 @@ int parse_events(struct perf_evlist *evlist, const char *str,
                 struct parse_events_error *err)
 {
        struct parse_events_evlist data = {
-               .list  = LIST_HEAD_INIT(data.list),
-               .idx   = evlist->nr_entries,
-               .error = err,
+               .list   = LIST_HEAD_INIT(data.list),
+               .idx    = evlist->nr_entries,
+               .error  = err,
+               .evlist = evlist,
        };
        int ret;
 
@@ -2065,12 +2251,29 @@ int parse_events_term__clone(struct parse_events_term **new,
                        term->err_term, term->err_val);
 }
 
-void parse_events__free_terms(struct list_head *terms)
+void parse_events_terms__purge(struct list_head *terms)
 {
        struct parse_events_term *term, *h;
 
-       list_for_each_entry_safe(term, h, terms, list)
+       list_for_each_entry_safe(term, h, terms, list) {
+               if (term->array.nr_ranges)
+                       free(term->array.ranges);
+               list_del_init(&term->list);
                free(term);
+       }
+}
+
+void parse_events_terms__delete(struct list_head *terms)
+{
+       if (!terms)
+               return;
+       parse_events_terms__purge(terms);
+       free(terms);
+}
+
+void parse_events__clear_array(struct parse_events_array *a)
+{
+       free(a->ranges);
 }
 
 void parse_events_evlist_error(struct parse_events_evlist *data,
@@ -2085,6 +2288,33 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
        WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
 }
 
+static void config_terms_list(char *buf, size_t buf_sz)
+{
+       int i;
+       bool first = true;
+
+       buf[0] = '\0';
+       for (i = 0; i < __PARSE_EVENTS__TERM_TYPE_NR; i++) {
+               const char *name = config_term_names[i];
+
+               if (!config_term_avail(i, NULL))
+                       continue;
+               if (!name)
+                       continue;
+               if (name[0] == '<')
+                       continue;
+
+               if (strlen(buf) + strlen(name) + 2 >= buf_sz)
+                       return;
+
+               if (!first)
+                       strcat(buf, ",");
+               else
+                       first = false;
+               strcat(buf, name);
+       }
+}
+
 /*
  * Return string contains valid config terms of an event.
  * @additional_terms: For terms such as PMU sysfs terms.
@@ -2092,17 +2322,18 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
 char *parse_events_formats_error_string(char *additional_terms)
 {
        char *str;
-       static const char *static_terms = "config,config1,config2,name,"
-                                         "period,freq,branch_type,time,"
-                                         "call-graph,stack-size\n";
+       /* "branch_type" is the longest name */
+       char static_terms[__PARSE_EVENTS__TERM_TYPE_NR *
+                         (sizeof("branch_type") - 1)];
 
+       config_terms_list(static_terms, sizeof(static_terms));
        /* valid terms */
        if (additional_terms) {
-               if (!asprintf(&str, "valid terms: %s,%s",
-                             additional_terms, static_terms))
+               if (asprintf(&str, "valid terms: %s,%s",
+                            additional_terms, static_terms) < 0)
                        goto fail;
        } else {
-               if (!asprintf(&str, "valid terms: %s", static_terms))
+               if (asprintf(&str, "valid terms: %s", static_terms) < 0)
                        goto fail;
        }
        return str;
index f1a6db1..67e4930 100644 (file)
@@ -68,11 +68,21 @@ enum {
        PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
        PARSE_EVENTS__TERM_TYPE_STACKSIZE,
        PARSE_EVENTS__TERM_TYPE_NOINHERIT,
-       PARSE_EVENTS__TERM_TYPE_INHERIT
+       PARSE_EVENTS__TERM_TYPE_INHERIT,
+       __PARSE_EVENTS__TERM_TYPE_NR,
+};
+
+struct parse_events_array {
+       size_t nr_ranges;
+       struct {
+               unsigned int start;
+               size_t length;
+       } *ranges;
 };
 
 struct parse_events_term {
        char *config;
+       struct parse_events_array array;
        union {
                char *str;
                u64  num;
@@ -98,12 +108,14 @@ struct parse_events_evlist {
        int                        idx;
        int                        nr_groups;
        struct parse_events_error *error;
+       struct perf_evlist        *evlist;
 };
 
 struct parse_events_terms {
        struct list_head *terms;
 };
 
+void parse_events__shrink_config_terms(void);
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
 int parse_events_term__num(struct parse_events_term **term,
                           int type_term, char *config, u64 num,
@@ -115,7 +127,9 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
                              char *config, unsigned idx);
 int parse_events_term__clone(struct parse_events_term **new,
                             struct parse_events_term *term);
-void parse_events__free_terms(struct list_head *terms);
+void parse_events_terms__delete(struct list_head *terms);
+void parse_events_terms__purge(struct list_head *terms);
+void parse_events__clear_array(struct parse_events_array *a);
 int parse_events__modifier_event(struct list_head *list, char *str, bool add);
 int parse_events__modifier_group(struct list_head *list, char *event_mod);
 int parse_events_name(struct list_head *list, char *name);
@@ -126,18 +140,22 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 int parse_events_load_bpf(struct parse_events_evlist *data,
                          struct list_head *list,
                          char *bpf_file_name,
-                         bool source);
+                         bool source,
+                         struct list_head *head_config);
 /* Provide this function for perf test */
 struct bpf_object;
 int parse_events_load_bpf_obj(struct parse_events_evlist *data,
                              struct list_head *list,
-                             struct bpf_object *obj);
+                             struct bpf_object *obj,
+                             struct list_head *head_config);
 int parse_events_add_numeric(struct parse_events_evlist *data,
                             struct list_head *list,
                             u32 type, u64 config,
                             struct list_head *head_config);
 int parse_events_add_cache(struct list_head *list, int *idx,
-                          char *type, char *op_result1, char *op_result2);
+                          char *type, char *op_result1, char *op_result2,
+                          struct parse_events_error *error,
+                          struct list_head *head_config);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
                                void *ptr, char *type, u64 len);
 int parse_events_add_pmu(struct parse_events_evlist *data,
index 58c5831..1477fbc 100644 (file)
@@ -9,8 +9,8 @@
 %{
 #include <errno.h>
 #include "../perf.h"
-#include "parse-events-bison.h"
 #include "parse-events.h"
+#include "parse-events-bison.h"
 
 char *parse_events_get_text(yyscan_t yyscanner);
 YYSTYPE *parse_events_get_lval(yyscan_t yyscanner);
@@ -111,6 +111,7 @@ do {                                                        \
 %x mem
 %s config
 %x event
+%x array
 
 group          [^,{}/]*[{][^}]*[}][^,{}/]*
 event_pmu      [^,{}/]+[/][^/]*[/][^,{}/]*
@@ -122,7 +123,7 @@ num_dec             [0-9]+
 num_hex                0x[a-fA-F0-9]+
 num_raw_hex    [a-fA-F0-9]+
 name           [a-zA-Z_*?][a-zA-Z0-9_*?.]*
-name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.]*
+name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]*
 /* If you add a modifier you need to update check_modifier() */
 modifier_event [ukhpPGHSDI]+
 modifier_bp    [rwx]{1,3}
@@ -176,10 +177,17 @@ modifier_bp       [rwx]{1,3}
 
 }
 
+<array>{
+"]"                    { BEGIN(config); return ']'; }
+{num_dec}              { return value(yyscanner, 10); }
+{num_hex}              { return value(yyscanner, 16); }
+,                      { return ','; }
+"\.\.\."               { return PE_ARRAY_RANGE; }
+}
+
 <config>{
        /*
-        * Please update parse_events_formats_error_string any time
-        * new static term is added.
+        * Please update config_term_names when new static term is added.
         */
 config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
 config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
@@ -196,6 +204,8 @@ no-inherit          { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
 ,                      { return ','; }
 "/"                    { BEGIN(INITIAL); return '/'; }
 {name_minus}           { return str(yyscanner, PE_NAME); }
+\[all\]                        { return PE_ARRAY_ALL; }
+"["                    { BEGIN(array); return '['; }
 }
 
 <mem>{
@@ -238,6 +248,7 @@ cpu-migrations|migrations                   { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COU
 alignment-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); }
 emulation-faults                               { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy                                          { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
+bpf-output                                     { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
 
        /*
         * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately.
index ad37996..5be4a5f 100644 (file)
@@ -28,7 +28,7 @@ do { \
        INIT_LIST_HEAD(list);         \
 } while (0)
 
-static inc_group_count(struct list_head *list,
+static void inc_group_count(struct list_head *list,
                       struct parse_events_evlist *data)
 {
        /* Count groups only have more than 1 members */
@@ -48,6 +48,7 @@ static inc_group_count(struct list_head *list,
 %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
 %token PE_ERROR
 %token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
+%token PE_ARRAY_ALL PE_ARRAY_RANGE
 %type <num> PE_VALUE
 %type <num> PE_VALUE_SYM_HW
 %type <num> PE_VALUE_SYM_SW
@@ -64,6 +65,7 @@ static inc_group_count(struct list_head *list,
 %type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
 %type <num> value_sym
 %type <head> event_config
+%type <head> opt_event_config
 %type <term> event_term
 %type <head> event_pmu
 %type <head> event_legacy_symbol
@@ -82,6 +84,9 @@ static inc_group_count(struct list_head *list,
 %type <head> group_def
 %type <head> group
 %type <head> groups
+%type <array> array
+%type <array> array_term
+%type <array> array_terms
 
 %union
 {
@@ -93,6 +98,7 @@ static inc_group_count(struct list_head *list,
                char *sys;
                char *event;
        } tracepoint_name;
+       struct parse_events_array array;
 }
 %%
 
@@ -211,24 +217,14 @@ event_def: event_pmu |
           event_bpf_file
 
 event_pmu:
-PE_NAME '/' event_config '/'
+PE_NAME opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
-       parse_events__free_terms($3);
-       $$ = list;
-}
-|
-PE_NAME '/' '/'
-{
-       struct parse_events_evlist *data = _data;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
+       ABORT_ON(parse_events_add_pmu(data, list, $1, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 |
@@ -246,7 +242,7 @@ PE_KERNEL_PMU_EVENT sep_dc
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
        $$ = list;
 }
 |
@@ -266,7 +262,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events__free_terms(head);
+       parse_events_terms__delete(head);
        $$ = list;
 }
 
@@ -285,7 +281,7 @@ value_sym '/' event_config '/'
 
        ALLOC_LIST(list);
        ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
-       parse_events__free_terms($3);
+       parse_events_terms__delete($3);
        $$ = list;
 }
 |
@@ -302,33 +298,39 @@ value_sym sep_slash_dc
 }
 
 event_legacy_cache:
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5, error, $6));
+       parse_events_terms__delete($6);
        $$ = list;
 }
 |
-PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT
+PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL, error, $4));
+       parse_events_terms__delete($4);
        $$ = list;
 }
 |
-PE_NAME_CACHE_TYPE
+PE_NAME_CACHE_TYPE opt_event_config
 {
        struct parse_events_evlist *data = _data;
+       struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL));
+       ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL, error, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
@@ -378,24 +380,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc
 }
 
 event_legacy_tracepoint:
-tracepoint_name
-{
-       struct parse_events_evlist *data = _data;
-       struct parse_events_error *error = data->error;
-       struct list_head *list;
-
-       ALLOC_LIST(list);
-       if (error)
-               error->idx = @1.first_column;
-
-       if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, NULL))
-               return -1;
-
-       $$ = list;
-}
-|
-tracepoint_name '/' event_config '/'
+tracepoint_name opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct parse_events_error *error = data->error;
@@ -406,7 +391,7 @@ tracepoint_name '/' event_config '/'
                error->idx = @1.first_column;
 
        if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
-                                       error, $3))
+                                       error, $2))
                return -1;
 
        $$ = list;
@@ -433,49 +418,68 @@ PE_NAME ':' PE_NAME
 }
 
 event_legacy_numeric:
-PE_VALUE ':' PE_VALUE
+PE_VALUE ':' PE_VALUE opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, $4));
+       parse_events_terms__delete($4);
        $$ = list;
 }
 
 event_legacy_raw:
-PE_RAW
+PE_RAW opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
 event_bpf_file:
-PE_BPF_OBJECT
+PE_BPF_OBJECT opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct parse_events_error *error = data->error;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, false));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, false, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 |
-PE_BPF_SOURCE
+PE_BPF_SOURCE opt_event_config
 {
        struct parse_events_evlist *data = _data;
        struct list_head *list;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_load_bpf(data, list, $1, true));
+       ABORT_ON(parse_events_load_bpf(data, list, $1, true, $2));
+       parse_events_terms__delete($2);
        $$ = list;
 }
 
+opt_event_config:
+'/' event_config '/'
+{
+       $$ = $2;
+}
+|
+'/' '/'
+{
+       $$ = NULL;
+}
+|
+{
+       $$ = NULL;
+}
+
 start_terms: event_config
 {
        struct parse_events_terms *data = _data;
@@ -573,6 +577,86 @@ PE_TERM
        ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
        $$ = term;
 }
+|
+PE_NAME array '=' PE_NAME
+{
+       struct parse_events_term *term;
+       int i;
+
+       ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+
+       term->array = $2;
+       $$ = term;
+}
+|
+PE_NAME array '=' PE_VALUE
+{
+       struct parse_events_term *term;
+
+       ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                       $1, $4, &@1, &@4));
+       term->array = $2;
+       $$ = term;
+}
+
+array:
+'[' array_terms ']'
+{
+       $$ = $2;
+}
+|
+PE_ARRAY_ALL
+{
+       $$.nr_ranges = 0;
+       $$.ranges = NULL;
+}
+
+array_terms:
+array_terms ',' array_term
+{
+       struct parse_events_array new_array;
+
+       new_array.nr_ranges = $1.nr_ranges + $3.nr_ranges;
+       new_array.ranges = malloc(sizeof(new_array.ranges[0]) *
+                                 new_array.nr_ranges);
+       ABORT_ON(!new_array.ranges);
+       memcpy(&new_array.ranges[0], $1.ranges,
+              $1.nr_ranges * sizeof(new_array.ranges[0]));
+       memcpy(&new_array.ranges[$1.nr_ranges], $3.ranges,
+              $3.nr_ranges * sizeof(new_array.ranges[0]));
+       free($1.ranges);
+       free($3.ranges);
+       $$ = new_array;
+}
+|
+array_term
+
+array_term:
+PE_VALUE
+{
+       struct parse_events_array array;
+
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = 1;
+       $$ = array;
+}
+|
+PE_VALUE PE_ARRAY_RANGE PE_VALUE
+{
+       struct parse_events_array array;
+
+       ABORT_ON($3 < $1);
+       array.nr_ranges = 1;
+       array.ranges = malloc(sizeof(array.ranges[0]));
+       ABORT_ON(!array.ranges);
+       array.ranges[0].start = $1;
+       array.ranges[0].length = $3 - $1 + 1;
+       $$ = array;
+}
 
 sep_dc: ':' |
 
index b597bcc..adef23b 100644 (file)
@@ -98,7 +98,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
        char scale[128];
        int fd, ret = -1;
        char path[PATH_MAX];
-       const char *lc;
+       char *lc;
 
        snprintf(path, PATH_MAX, "%s/%s.scale", dir, name);
 
@@ -123,6 +123,17 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
         */
        lc = setlocale(LC_NUMERIC, NULL);
 
+       /*
+        * The lc string may be allocated in static storage,
+        * so get a dynamic copy to make it survive setlocale
+        * call below.
+        */
+       lc = strdup(lc);
+       if (!lc) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
        /*
         * force to C locale to ensure kernel
         * scale string is converted correctly.
@@ -135,6 +146,8 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
        /* restore locale */
        setlocale(LC_NUMERIC, lc);
 
+       free(lc);
+
        ret = 0;
 error:
        close(fd);
@@ -153,7 +166,7 @@ static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *n
        if (fd == -1)
                return -1;
 
-               sret = read(fd, alias->unit, UNIT_MAX_LEN);
+       sret = read(fd, alias->unit, UNIT_MAX_LEN);
        if (sret < 0)
                goto error;
 
@@ -284,13 +297,12 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
 {
        struct dirent *evt_ent;
        DIR *event_dir;
-       int ret = 0;
 
        event_dir = opendir(dir);
        if (!event_dir)
                return -EINVAL;
 
-       while (!ret && (evt_ent = readdir(event_dir))) {
+       while ((evt_ent = readdir(event_dir))) {
                char path[PATH_MAX];
                char *name = evt_ent->d_name;
                FILE *file;
@@ -306,17 +318,19 @@ static int pmu_aliases_parse(char *dir, struct list_head *head)
 
                snprintf(path, PATH_MAX, "%s/%s", dir, name);
 
-               ret = -EINVAL;
                file = fopen(path, "r");
-               if (!file)
-                       break;
+               if (!file) {
+                       pr_debug("Cannot open %s\n", path);
+                       continue;
+               }
 
-               ret = perf_pmu__new_alias(head, dir, name, file);
+               if (perf_pmu__new_alias(head, dir, name, file) < 0)
+                       pr_debug("Cannot set up %s\n", name);
                fclose(file);
        }
 
        closedir(event_dir);
-       return ret;
+       return 0;
 }
 
 /*
@@ -354,7 +368,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias,
        list_for_each_entry(term, &alias->terms, list) {
                ret = parse_events_term__clone(&cloned, term);
                if (ret) {
-                       parse_events__free_terms(&list);
+                       parse_events_terms__purge(&list);
                        return ret;
                }
                list_add_tail(&cloned->list, &list);
index 2be10fb..4ce5c5e 100644 (file)
@@ -686,8 +686,9 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf)
                pf->fb_ops = NULL;
 #if _ELFUTILS_PREREQ(0, 142)
        } else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa &&
-                  pf->cfi != NULL) {
-               if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 ||
+                  (pf->cfi_eh != NULL || pf->cfi_dbg != NULL)) {
+               if ((dwarf_cfi_addrframe(pf->cfi_eh, pf->addr, &frame) != 0 &&
+                    (dwarf_cfi_addrframe(pf->cfi_dbg, pf->addr, &frame) != 0)) ||
                    dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) {
                        pr_warning("Failed to get call frame on 0x%jx\n",
                                   (uintmax_t)pf->addr);
@@ -1015,8 +1016,7 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
        return DWARF_CB_OK;
 }
 
-/* Find probe points from debuginfo */
-static int debuginfo__find_probes(struct debuginfo *dbg,
+static int debuginfo__find_probe_location(struct debuginfo *dbg,
                                  struct probe_finder *pf)
 {
        struct perf_probe_point *pp = &pf->pev->point;
@@ -1025,27 +1025,6 @@ static int debuginfo__find_probes(struct debuginfo *dbg,
        Dwarf_Die *diep;
        int ret = 0;
 
-#if _ELFUTILS_PREREQ(0, 142)
-       Elf *elf;
-       GElf_Ehdr ehdr;
-       GElf_Shdr shdr;
-
-       /* Get the call frame information from this dwarf */
-       elf = dwarf_getelf(dbg->dbg);
-       if (elf == NULL)
-               return -EINVAL;
-
-       if (gelf_getehdr(elf, &ehdr) == NULL)
-               return -EINVAL;
-
-       if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) &&
-           shdr.sh_type == SHT_PROGBITS) {
-               pf->cfi = dwarf_getcfi_elf(elf);
-       } else {
-               pf->cfi = dwarf_getcfi(dbg->dbg);
-       }
-#endif
-
        off = 0;
        pf->lcache = intlist__new(NULL);
        if (!pf->lcache)
@@ -1108,6 +1087,39 @@ found:
        return ret;
 }
 
+/* Find probe points from debuginfo */
+static int debuginfo__find_probes(struct debuginfo *dbg,
+                                 struct probe_finder *pf)
+{
+       int ret = 0;
+
+#if _ELFUTILS_PREREQ(0, 142)
+       Elf *elf;
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr;
+
+       if (pf->cfi_eh || pf->cfi_dbg)
+               return debuginfo__find_probe_location(dbg, pf);
+
+       /* Get the call frame information from this dwarf */
+       elf = dwarf_getelf(dbg->dbg);
+       if (elf == NULL)
+               return -EINVAL;
+
+       if (gelf_getehdr(elf, &ehdr) == NULL)
+               return -EINVAL;
+
+       if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) &&
+           shdr.sh_type == SHT_PROGBITS)
+               pf->cfi_eh = dwarf_getcfi_elf(elf);
+
+       pf->cfi_dbg = dwarf_getcfi(dbg->dbg);
+#endif
+
+       ret = debuginfo__find_probe_location(dbg, pf);
+       return ret;
+}
+
 struct local_vars_finder {
        struct probe_finder *pf;
        struct perf_probe_arg *args;
index bed8271..0aec770 100644 (file)
@@ -76,7 +76,10 @@ struct probe_finder {
 
        /* For variable searching */
 #if _ELFUTILS_PREREQ(0, 142)
-       Dwarf_CFI               *cfi;           /* Call Frame Information */
+       /* Call Frame Information from .eh_frame */
+       Dwarf_CFI               *cfi_eh;
+       /* Call Frame Information from .debug_frame */
+       Dwarf_CFI               *cfi_dbg;
 #endif
        Dwarf_Op                *fb_ops;        /* Frame base attribute */
        struct perf_probe_arg   *pvar;          /* Current target variable */
index 544509c..b3aabc0 100644 (file)
@@ -187,6 +187,9 @@ static void define_event_symbols(struct event_format *event,
                                 const char *ev_name,
                                 struct print_arg *args)
 {
+       if (args == NULL)
+               return;
+
        switch (args->type) {
        case PRINT_NULL:
                break;
index d72fafc..fbd0524 100644 (file)
@@ -205,6 +205,9 @@ static void define_event_symbols(struct event_format *event,
                                 const char *ev_name,
                                 struct print_arg *args)
 {
+       if (args == NULL)
+               return;
+
        switch (args->type) {
        case PRINT_NULL:
                break;
@@ -1091,8 +1094,6 @@ static int python_start_script(const char *script, int argc, const char **argv)
                goto error;
        }
 
-       free(command_line);
-
        set_table_handlers(tables);
 
        if (tables->db_export_mode) {
@@ -1101,6 +1102,8 @@ static int python_start_script(const char *script, int argc, const char **argv)
                        goto error;
        }
 
+       free(command_line);
+
        return err;
 error:
        Py_Finalize();
index d5636ba..60b3593 100644 (file)
@@ -240,14 +240,6 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused,
        return 0;
 }
 
-static int process_build_id_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
 static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
                                       union perf_event *event __maybe_unused,
                                       struct ordered_events *oe __maybe_unused)
@@ -260,23 +252,6 @@ static int process_finished_round(struct perf_tool *tool,
                                  union perf_event *event,
                                  struct ordered_events *oe);
 
-static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *perf_session
-                                __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
-static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
-                               union perf_event *event __maybe_unused,
-                               struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
 static int skipn(int fd, off_t n)
 {
        char buf[4096];
@@ -303,10 +278,9 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
        return event->auxtrace.size;
 }
 
-static
-int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
-                                     union perf_event *event __maybe_unused,
-                                     struct perf_session *session __maybe_unused)
+static int process_event_op2_stub(struct perf_tool *tool __maybe_unused,
+                                 union perf_event *event __maybe_unused,
+                                 struct perf_session *session __maybe_unused)
 {
        dump_printf(": unhandled!\n");
        return 0;
@@ -410,7 +384,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
        if (tool->tracing_data == NULL)
                tool->tracing_data = process_event_synth_tracing_data_stub;
        if (tool->build_id == NULL)
-               tool->build_id = process_build_id_stub;
+               tool->build_id = process_event_op2_stub;
        if (tool->finished_round == NULL) {
                if (tool->ordered_events)
                        tool->finished_round = process_finished_round;
@@ -418,13 +392,13 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
                        tool->finished_round = process_finished_round_stub;
        }
        if (tool->id_index == NULL)
-               tool->id_index = process_id_index_stub;
+               tool->id_index = process_event_op2_stub;
        if (tool->auxtrace_info == NULL)
-               tool->auxtrace_info = process_event_auxtrace_info_stub;
+               tool->auxtrace_info = process_event_op2_stub;
        if (tool->auxtrace == NULL)
                tool->auxtrace = process_event_auxtrace_stub;
        if (tool->auxtrace_error == NULL)
-               tool->auxtrace_error = process_event_auxtrace_error_stub;
+               tool->auxtrace_error = process_event_op2_stub;
        if (tool->thread_map == NULL)
                tool->thread_map = process_event_thread_map_stub;
        if (tool->cpu_map == NULL)
@@ -1149,7 +1123,7 @@ static struct machine *machines__find_for_cpumode(struct machines *machines,
 
                machine = machines__find(machines, pid);
                if (!machine)
-                       machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID);
+                       machine = machines__findnew(machines, DEFAULT_GUEST_KERNEL_ID);
                return machine;
        }
 
index 1833103..c868098 100644 (file)
@@ -22,6 +22,7 @@ cflags = getenv('CFLAGS', '').split()
 # switch off several checks (need to be at the end of cflags list)
 cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ]
 
+src_perf  = getenv('srctree') + '/tools/perf'
 build_lib = getenv('PYTHON_EXTBUILD_LIB')
 build_tmp = getenv('PYTHON_EXTBUILD_TMP')
 libtraceevent = getenv('LIBTRACEEVENT')
@@ -30,6 +31,9 @@ libapikfs = getenv('LIBAPI')
 ext_sources = [f.strip() for f in file('util/python-ext-sources')
                                if len(f.strip()) > 0 and f[0] != '#']
 
+# use full paths with source files
+ext_sources = map(lambda x: '%s/%s' % (src_perf, x) , ext_sources)
+
 perf = Extension('perf',
                  sources = ext_sources,
                  include_dirs = ['util/include'],
index ec72234..93fa136 100644 (file)
@@ -6,6 +6,7 @@
 #include "evsel.h"
 #include "evlist.h"
 #include <traceevent/event-parse.h>
+#include "mem-events.h"
 
 regex_t                parent_regex;
 const char     default_parent_pattern[] = "^sys_|^do_page_fault";
@@ -25,9 +26,19 @@ int          sort__has_parent = 0;
 int            sort__has_sym = 0;
 int            sort__has_dso = 0;
 int            sort__has_socket = 0;
+int            sort__has_thread = 0;
+int            sort__has_comm = 0;
 enum sort_mode sort__mode = SORT_MODE__NORMAL;
 
-
+/*
+ * Replaces all occurrences of a char used with the:
+ *
+ * -t, --field-separator
+ *
+ * option, that uses a special separator character and don't pad with spaces,
+ * replacing all occurances of this separator in symbol names (and other
+ * output) with a '.' character, that thus it's the only non valid separator.
+*/
 static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...)
 {
        int n;
@@ -80,10 +91,21 @@ static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf,
                               width, width, comm ?: "");
 }
 
+static int hist_entry__thread_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct thread *th = arg;
+
+       if (type != HIST_FILTER__THREAD)
+               return -1;
+
+       return th && he->thread != th;
+}
+
 struct sort_entry sort_thread = {
        .se_header      = "  Pid:Command",
        .se_cmp         = sort__thread_cmp,
        .se_snprintf    = hist_entry__thread_snprintf,
+       .se_filter      = hist_entry__thread_filter,
        .se_width_idx   = HISTC_THREAD,
 };
 
@@ -121,6 +143,7 @@ struct sort_entry sort_comm = {
        .se_collapse    = sort__comm_collapse,
        .se_sort        = sort__comm_sort,
        .se_snprintf    = hist_entry__comm_snprintf,
+       .se_filter      = hist_entry__thread_filter,
        .se_width_idx   = HISTC_COMM,
 };
 
@@ -170,10 +193,21 @@ static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf,
        return _hist_entry__dso_snprintf(he->ms.map, bf, size, width);
 }
 
+static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->ms.map || he->ms.map->dso != dso);
+}
+
 struct sort_entry sort_dso = {
        .se_header      = "Shared Object",
        .se_cmp         = sort__dso_cmp,
        .se_snprintf    = hist_entry__dso_snprintf,
+       .se_filter      = hist_entry__dso_filter,
        .se_width_idx   = HISTC_DSO,
 };
 
@@ -246,10 +280,8 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                        ret += repsep_snprintf(bf + ret, size - ret, "%s", sym->name);
                        ret += repsep_snprintf(bf + ret, size - ret, "+0x%llx",
                                        ip - map->unmap_ip(map, sym->start));
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
                } else {
-                       ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+                       ret += repsep_snprintf(bf + ret, size - ret, "%.*s",
                                               width - ret,
                                               sym->name);
                }
@@ -257,14 +289,9 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
                size_t len = BITS_PER_LONG / 4;
                ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx",
                                       len, ip);
-               ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
-                                      width - ret, "");
        }
 
-       if (ret > width)
-               bf[width] = '\0';
-
-       return width;
+       return ret;
 }
 
 static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
@@ -274,46 +301,56 @@ static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
                                         he->level, bf, size, width);
 }
 
+static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && (!he->ms.sym || !strstr(he->ms.sym->name, sym));
+}
+
 struct sort_entry sort_sym = {
        .se_header      = "Symbol",
        .se_cmp         = sort__sym_cmp,
        .se_sort        = sort__sym_sort,
        .se_snprintf    = hist_entry__sym_snprintf,
+       .se_filter      = hist_entry__sym_filter,
        .se_width_idx   = HISTC_SYMBOL,
 };
 
 /* --sort srcline */
 
+static char *hist_entry__get_srcline(struct hist_entry *he)
+{
+       struct map *map = he->ms.map;
+
+       if (!map)
+               return SRCLINE_UNKNOWN;
+
+       return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
+                          he->ms.sym, true);
+}
+
 static int64_t
 sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-       if (!left->srcline) {
-               if (!left->ms.map)
-                       left->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = left->ms.map;
-                       left->srcline = get_srcline(map->dso,
-                                          map__rip_2objdump(map, left->ip),
-                                                   left->ms.sym, true);
-               }
-       }
-       if (!right->srcline) {
-               if (!right->ms.map)
-                       right->srcline = SRCLINE_UNKNOWN;
-               else {
-                       struct map *map = right->ms.map;
-                       right->srcline = get_srcline(map->dso,
-                                            map__rip_2objdump(map, right->ip),
-                                                    right->ms.sym, true);
-               }
-       }
+       if (!left->srcline)
+               left->srcline = hist_entry__get_srcline(left);
+       if (!right->srcline)
+               right->srcline = hist_entry__get_srcline(right);
+
        return strcmp(right->srcline, left->srcline);
 }
 
 static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
                                        size_t size, unsigned int width)
 {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcline);
+       if (!he->srcline)
+               he->srcline = hist_entry__get_srcline(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcline);
 }
 
 struct sort_entry sort_srcline = {
@@ -327,11 +364,14 @@ struct sort_entry sort_srcline = {
 
 static char no_srcfile[1];
 
-static char *get_srcfile(struct hist_entry *e)
+static char *hist_entry__get_srcfile(struct hist_entry *e)
 {
        char *sf, *p;
        struct map *map = e->ms.map;
 
+       if (!map)
+               return no_srcfile;
+
        sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
                         e->ms.sym, false, true);
        if (!strcmp(sf, SRCLINE_UNKNOWN))
@@ -348,25 +388,21 @@ static char *get_srcfile(struct hist_entry *e)
 static int64_t
 sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-       if (!left->srcfile) {
-               if (!left->ms.map)
-                       left->srcfile = no_srcfile;
-               else
-                       left->srcfile = get_srcfile(left);
-       }
-       if (!right->srcfile) {
-               if (!right->ms.map)
-                       right->srcfile = no_srcfile;
-               else
-                       right->srcfile = get_srcfile(right);
-       }
+       if (!left->srcfile)
+               left->srcfile = hist_entry__get_srcfile(left);
+       if (!right->srcfile)
+               right->srcfile = hist_entry__get_srcfile(right);
+
        return strcmp(right->srcfile, left->srcfile);
 }
 
 static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
                                        size_t size, unsigned int width)
 {
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile);
+       if (!he->srcfile)
+               he->srcfile = hist_entry__get_srcfile(he);
+
+       return repsep_snprintf(bf, size, "%-.*s", width, he->srcfile);
 }
 
 struct sort_entry sort_srcfile = {
@@ -439,10 +475,21 @@ static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf,
        return repsep_snprintf(bf, size, "%*.*d", width, width-3, he->socket);
 }
 
+static int hist_entry__socket_filter(struct hist_entry *he, int type, const void *arg)
+{
+       int sk = *(const int *)arg;
+
+       if (type != HIST_FILTER__SOCKET)
+               return -1;
+
+       return sk >= 0 && he->socket != sk;
+}
+
 struct sort_entry sort_socket = {
        .se_header      = "Socket",
        .se_cmp         = sort__socket_cmp,
        .se_snprintf    = hist_entry__socket_snprintf,
+       .se_filter      = hist_entry__socket_filter,
        .se_width_idx   = HISTC_SOCKET,
 };
 
@@ -483,9 +530,6 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right)
        if (right->trace_output == NULL)
                right->trace_output = get_trace_output(right);
 
-       hists__new_col_len(left->hists, HISTC_TRACE, strlen(left->trace_output));
-       hists__new_col_len(right->hists, HISTC_TRACE, strlen(right->trace_output));
-
        return strcmp(right->trace_output, left->trace_output);
 }
 
@@ -496,11 +540,11 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
 
        evsel = hists_to_evsel(he->hists);
        if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
-               return scnprintf(bf, size, "%-*.*s", width, width, "N/A");
+               return scnprintf(bf, size, "%-.*s", width, "N/A");
 
        if (he->trace_output == NULL)
                he->trace_output = get_trace_output(he);
-       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->trace_output);
+       return repsep_snprintf(bf, size, "%-.*s", width, he->trace_output);
 }
 
 struct sort_entry sort_trace = {
@@ -532,6 +576,18 @@ static int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf,
                return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__dso_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->from.map ||
+                      he->branch_info->from.map->dso != dso);
+}
+
 static int64_t
 sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -552,6 +608,18 @@ static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf,
                return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__dso_to_filter(struct hist_entry *he, int type,
+                                    const void *arg)
+{
+       const struct dso *dso = arg;
+
+       if (type != HIST_FILTER__DSO)
+               return -1;
+
+       return dso && (!he->branch_info || !he->branch_info->to.map ||
+                      he->branch_info->to.map->dso != dso);
+}
+
 static int64_t
 sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -613,10 +681,35 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf,
        return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A");
 }
 
+static int hist_entry__sym_from_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->from.sym &&
+                       strstr(he->branch_info->from.sym->name, sym));
+}
+
+static int hist_entry__sym_to_filter(struct hist_entry *he, int type,
+                                      const void *arg)
+{
+       const char *sym = arg;
+
+       if (type != HIST_FILTER__SYMBOL)
+               return -1;
+
+       return sym && !(he->branch_info && he->branch_info->to.sym &&
+                       strstr(he->branch_info->to.sym->name, sym));
+}
+
 struct sort_entry sort_dso_from = {
        .se_header      = "Source Shared Object",
        .se_cmp         = sort__dso_from_cmp,
        .se_snprintf    = hist_entry__dso_from_snprintf,
+       .se_filter      = hist_entry__dso_from_filter,
        .se_width_idx   = HISTC_DSO_FROM,
 };
 
@@ -624,6 +717,7 @@ struct sort_entry sort_dso_to = {
        .se_header      = "Target Shared Object",
        .se_cmp         = sort__dso_to_cmp,
        .se_snprintf    = hist_entry__dso_to_snprintf,
+       .se_filter      = hist_entry__dso_to_filter,
        .se_width_idx   = HISTC_DSO_TO,
 };
 
@@ -631,6 +725,7 @@ struct sort_entry sort_sym_from = {
        .se_header      = "Source Symbol",
        .se_cmp         = sort__sym_from_cmp,
        .se_snprintf    = hist_entry__sym_from_snprintf,
+       .se_filter      = hist_entry__sym_from_filter,
        .se_width_idx   = HISTC_SYMBOL_FROM,
 };
 
@@ -638,6 +733,7 @@ struct sort_entry sort_sym_to = {
        .se_header      = "Target Symbol",
        .se_cmp         = sort__sym_to_cmp,
        .se_snprintf    = hist_entry__sym_to_snprintf,
+       .se_filter      = hist_entry__sym_to_filter,
        .se_width_idx   = HISTC_SYMBOL_TO,
 };
 
@@ -797,20 +893,10 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right)
 static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
-       const char *out;
-       u64 mask = PERF_MEM_LOCK_NA;
+       char out[10];
 
-       if (he->mem_info)
-               mask = he->mem_info->data_src.mem_lock;
-
-       if (mask & PERF_MEM_LOCK_NA)
-               out = "N/A";
-       else if (mask & PERF_MEM_LOCK_LOCKED)
-               out = "Yes";
-       else
-               out = "No";
-
-       return repsep_snprintf(bf, size, "%-*s", width, out);
+       perf_mem__lck_scnprintf(out, sizeof(out), he->mem_info);
+       return repsep_snprintf(bf, size, "%.*s", width, out);
 }
 
 static int64_t
@@ -832,54 +918,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
 }
 
-static const char * const tlb_access[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "L2",
-       "Walker",
-       "Fault",
-};
-#define NUM_TLB_ACCESS (sizeof(tlb_access)/sizeof(const char *))
-
 static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t l = 0, i;
-       u64 m = PERF_MEM_TLB_NA;
-       u64 hit, miss;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_dtlb;
-
-       hit = m & PERF_MEM_TLB_HIT;
-       miss = m & PERF_MEM_TLB_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS);
-
-       for (i = 0; m && i < NUM_TLB_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, tlb_access[i], sz - l);
-               l += strlen(tlb_access[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
 
+       perf_mem__tlb_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
@@ -902,61 +946,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl);
 }
 
-static const char * const mem_lvl[] = {
-       "N/A",
-       "HIT",
-       "MISS",
-       "L1",
-       "LFB",
-       "L2",
-       "L3",
-       "Local RAM",
-       "Remote RAM (1 hop)",
-       "Remote RAM (2 hops)",
-       "Remote Cache (1 hop)",
-       "Remote Cache (2 hops)",
-       "I/O",
-       "Uncached",
-};
-#define NUM_MEM_LVL (sizeof(mem_lvl)/sizeof(const char *))
-
 static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m =  PERF_MEM_LVL_NA;
-       u64 hit, miss;
-
-       if (he->mem_info)
-               m  = he->mem_info->data_src.mem_lvl;
-
-       out[0] = '\0';
-
-       hit = m & PERF_MEM_LVL_HIT;
-       miss = m & PERF_MEM_LVL_MISS;
-
-       /* already taken care of */
-       m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
-
-       for (i = 0; m && i < NUM_MEM_LVL; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, mem_lvl[i], sz - l);
-               l += strlen(mem_lvl[i]);
-       }
-       if (*out == '\0')
-               strcpy(out, "N/A");
-       if (hit)
-               strncat(out, " hit", sz - l);
-       if (miss)
-               strncat(out, " miss", sz - l);
 
+       perf_mem__lvl_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
@@ -979,51 +974,15 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right)
        return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop);
 }
 
-static const char * const snoop_access[] = {
-       "N/A",
-       "None",
-       "Miss",
-       "Hit",
-       "HitM",
-};
-#define NUM_SNOOP_ACCESS (sizeof(snoop_access)/sizeof(const char *))
-
 static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
        char out[64];
-       size_t sz = sizeof(out) - 1; /* -1 for null termination */
-       size_t i, l = 0;
-       u64 m = PERF_MEM_SNOOP_NA;
-
-       out[0] = '\0';
-
-       if (he->mem_info)
-               m = he->mem_info->data_src.mem_snoop;
-
-       for (i = 0; m && i < NUM_SNOOP_ACCESS; i++, m >>= 1) {
-               if (!(m & 0x1))
-                       continue;
-               if (l) {
-                       strcat(out, " or ");
-                       l += 4;
-               }
-               strncat(out, snoop_access[i], sz - l);
-               l += strlen(snoop_access[i]);
-       }
-
-       if (*out == '\0')
-               strcpy(out, "N/A");
 
+       perf_mem__snp_scnprintf(out, sizeof(out), he->mem_info);
        return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
-static inline  u64 cl_address(u64 address)
-{
-       /* return the cacheline of the address */
-       return (address & ~(cacheline_size - 1));
-}
-
 static int64_t
 sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -1440,20 +1399,6 @@ struct hpp_sort_entry {
        struct sort_entry *se;
 };
 
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
-{
-       struct hpp_sort_entry *hse_a;
-       struct hpp_sort_entry *hse_b;
-
-       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
-               return false;
-
-       hse_a = container_of(a, struct hpp_sort_entry, hpp);
-       hse_b = container_of(b, struct hpp_sort_entry, hpp);
-
-       return hse_a->se == hse_b->se;
-}
-
 void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
 {
        struct hpp_sort_entry *hse;
@@ -1539,8 +1484,56 @@ static int64_t __sort__hpp_sort(struct perf_hpp_fmt *fmt,
        return sort_fn(a, b);
 }
 
+bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+{
+       return format->header == __sort__hpp_header;
+}
+
+#define MK_SORT_ENTRY_CHK(key)                                 \
+bool perf_hpp__is_ ## key ## _entry(struct perf_hpp_fmt *fmt)  \
+{                                                              \
+       struct hpp_sort_entry *hse;                             \
+                                                               \
+       if (!perf_hpp__is_sort_entry(fmt))                      \
+               return false;                                   \
+                                                               \
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);    \
+       return hse->se == &sort_ ## key ;                       \
+}
+
+MK_SORT_ENTRY_CHK(trace)
+MK_SORT_ENTRY_CHK(srcline)
+MK_SORT_ENTRY_CHK(srcfile)
+MK_SORT_ENTRY_CHK(thread)
+MK_SORT_ENTRY_CHK(comm)
+MK_SORT_ENTRY_CHK(dso)
+MK_SORT_ENTRY_CHK(sym)
+
+
+static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_sort_entry *hse_a;
+       struct hpp_sort_entry *hse_b;
+
+       if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b))
+               return false;
+
+       hse_a = container_of(a, struct hpp_sort_entry, hpp);
+       hse_b = container_of(b, struct hpp_sort_entry, hpp);
+
+       return hse_a->se == hse_b->se;
+}
+
+static void hse_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_sort_entry *hse;
+
+       hse = container_of(fmt, struct hpp_sort_entry, hpp);
+       free(hse);
+}
+
 static struct hpp_sort_entry *
-__sort_dimension__alloc_hpp(struct sort_dimension *sd)
+__sort_dimension__alloc_hpp(struct sort_dimension *sd, int level)
 {
        struct hpp_sort_entry *hse;
 
@@ -1560,40 +1553,92 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd)
        hse->hpp.cmp = __sort__hpp_cmp;
        hse->hpp.collapse = __sort__hpp_collapse;
        hse->hpp.sort = __sort__hpp_sort;
+       hse->hpp.equal = __sort__hpp_equal;
+       hse->hpp.free = hse_free;
 
        INIT_LIST_HEAD(&hse->hpp.list);
        INIT_LIST_HEAD(&hse->hpp.sort_list);
        hse->hpp.elide = false;
        hse->hpp.len = 0;
        hse->hpp.user_len = 0;
+       hse->hpp.level = level;
 
        return hse;
 }
 
-bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format)
+static void hpp_free(struct perf_hpp_fmt *fmt)
 {
-       return format->header == __sort__hpp_header;
+       free(fmt);
+}
+
+static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd,
+                                                      int level)
+{
+       struct perf_hpp_fmt *fmt;
+
+       fmt = memdup(hd->fmt, sizeof(*fmt));
+       if (fmt) {
+               INIT_LIST_HEAD(&fmt->list);
+               INIT_LIST_HEAD(&fmt->sort_list);
+               fmt->free = hpp_free;
+               fmt->level = level;
+       }
+
+       return fmt;
+}
+
+int hist_entry__filter(struct hist_entry *he, int type, const void *arg)
+{
+       struct perf_hpp_fmt *fmt;
+       struct hpp_sort_entry *hse;
+       int ret = -1;
+       int r;
+
+       perf_hpp_list__for_each_format(he->hpp_list, fmt) {
+               if (!perf_hpp__is_sort_entry(fmt))
+                       continue;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               if (hse->se->se_filter == NULL)
+                       continue;
+
+               /*
+                * hist entry is filtered if any of sort key in the hpp list
+                * is applied.  But it should skip non-matched filter types.
+                */
+               r = hse->se->se_filter(he, type, arg);
+               if (r >= 0) {
+                       if (ret < 0)
+                               ret = 0;
+                       ret |= r;
+               }
+       }
+
+       return ret;
 }
 
-static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd,
+                                         struct perf_hpp_list *list,
+                                         int level)
 {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, level);
 
        if (hse == NULL)
                return -1;
 
-       perf_hpp__register_sort_field(&hse->hpp);
+       perf_hpp_list__register_sort_field(list, &hse->hpp);
        return 0;
 }
 
-static int __sort_dimension__add_hpp_output(struct sort_dimension *sd)
+static int __sort_dimension__add_hpp_output(struct sort_dimension *sd,
+                                           struct perf_hpp_list *list)
 {
-       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd);
+       struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, 0);
 
        if (hse == NULL)
                return -1;
 
-       perf_hpp__column_register(&hse->hpp);
+       perf_hpp_list__column_register(list, &hse->hpp);
        return 0;
 }
 
@@ -1727,6 +1772,9 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
        if (hde->raw_trace)
                goto raw_field;
 
+       if (!he->trace_output)
+               he->trace_output = get_trace_output(he);
+
        field = hde->field;
        namelen = strlen(field->name);
        str = he->trace_output;
@@ -1776,6 +1824,11 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
 
        hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
 
+       if (b == NULL) {
+               update_dynamic_len(hde, a);
+               return 0;
+       }
+
        field = hde->field;
        if (field->flags & FIELD_IS_DYNAMIC) {
                unsigned long long dyn;
@@ -1790,9 +1843,6 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
        } else {
                offset = field->offset;
                size = field->size;
-
-               update_dynamic_len(hde, a);
-               update_dynamic_len(hde, b);
        }
 
        return memcmp(a->raw_data + offset, b->raw_data + offset, size);
@@ -1803,8 +1853,31 @@ bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt)
        return fmt->cmp == __sort__hde_cmp;
 }
 
+static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b)
+{
+       struct hpp_dynamic_entry *hde_a;
+       struct hpp_dynamic_entry *hde_b;
+
+       if (!perf_hpp__is_dynamic_entry(a) || !perf_hpp__is_dynamic_entry(b))
+               return false;
+
+       hde_a = container_of(a, struct hpp_dynamic_entry, hpp);
+       hde_b = container_of(b, struct hpp_dynamic_entry, hpp);
+
+       return hde_a->field == hde_b->field;
+}
+
+static void hde_free(struct perf_hpp_fmt *fmt)
+{
+       struct hpp_dynamic_entry *hde;
+
+       hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+       free(hde);
+}
+
 static struct hpp_dynamic_entry *
-__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
+__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field,
+                     int level)
 {
        struct hpp_dynamic_entry *hde;
 
@@ -1827,16 +1900,47 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field)
        hde->hpp.cmp = __sort__hde_cmp;
        hde->hpp.collapse = __sort__hde_cmp;
        hde->hpp.sort = __sort__hde_cmp;
+       hde->hpp.equal = __sort__hde_equal;
+       hde->hpp.free = hde_free;
 
        INIT_LIST_HEAD(&hde->hpp.list);
        INIT_LIST_HEAD(&hde->hpp.sort_list);
        hde->hpp.elide = false;
        hde->hpp.len = 0;
        hde->hpp.user_len = 0;
+       hde->hpp.level = level;
 
        return hde;
 }
 
+struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
+{
+       struct perf_hpp_fmt *new_fmt = NULL;
+
+       if (perf_hpp__is_sort_entry(fmt)) {
+               struct hpp_sort_entry *hse, *new_hse;
+
+               hse = container_of(fmt, struct hpp_sort_entry, hpp);
+               new_hse = memdup(hse, sizeof(*hse));
+               if (new_hse)
+                       new_fmt = &new_hse->hpp;
+       } else if (perf_hpp__is_dynamic_entry(fmt)) {
+               struct hpp_dynamic_entry *hde, *new_hde;
+
+               hde = container_of(fmt, struct hpp_dynamic_entry, hpp);
+               new_hde = memdup(hde, sizeof(*hde));
+               if (new_hde)
+                       new_fmt = &new_hde->hpp;
+       } else {
+               new_fmt = memdup(fmt, sizeof(*fmt));
+       }
+
+       INIT_LIST_HEAD(&new_fmt->list);
+       INIT_LIST_HEAD(&new_fmt->sort_list);
+
+       return new_fmt;
+}
+
 static int parse_field_name(char *str, char **event, char **field, char **opt)
 {
        char *event_name, *field_name, *opt_name;
@@ -1908,11 +2012,11 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam
 
 static int __dynamic_dimension__add(struct perf_evsel *evsel,
                                    struct format_field *field,
-                                   bool raw_trace)
+                                   bool raw_trace, int level)
 {
        struct hpp_dynamic_entry *hde;
 
-       hde = __alloc_dynamic_entry(evsel, field);
+       hde = __alloc_dynamic_entry(evsel, field, level);
        if (hde == NULL)
                return -ENOMEM;
 
@@ -1922,14 +2026,14 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel,
        return 0;
 }
 
-static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
+static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace, int level)
 {
        int ret;
        struct format_field *field;
 
        field = evsel->tp_format->format.fields;
        while (field) {
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                if (ret < 0)
                        return ret;
 
@@ -1938,7 +2042,8 @@ static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace)
        return 0;
 }
 
-static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
+static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace,
+                                 int level)
 {
        int ret;
        struct perf_evsel *evsel;
@@ -1947,7 +2052,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
                if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
                        continue;
 
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
                if (ret < 0)
                        return ret;
        }
@@ -1955,7 +2060,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace)
 }
 
 static int add_all_matching_fields(struct perf_evlist *evlist,
-                                  char *field_name, bool raw_trace)
+                                  char *field_name, bool raw_trace, int level)
 {
        int ret = -ESRCH;
        struct perf_evsel *evsel;
@@ -1969,14 +2074,15 @@ static int add_all_matching_fields(struct perf_evlist *evlist,
                if (field == NULL)
                        continue;
 
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
                if (ret < 0)
                        break;
        }
        return ret;
 }
 
-static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
+static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok,
+                            int level)
 {
        char *str, *event_name, *field_name, *opt_name;
        struct perf_evsel *evsel;
@@ -2006,12 +2112,12 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
        }
 
        if (!strcmp(field_name, "trace_fields")) {
-               ret = add_all_dynamic_fields(evlist, raw_trace);
+               ret = add_all_dynamic_fields(evlist, raw_trace, level);
                goto out;
        }
 
        if (event_name == NULL) {
-               ret = add_all_matching_fields(evlist, field_name, raw_trace);
+               ret = add_all_matching_fields(evlist, field_name, raw_trace, level);
                goto out;
        }
 
@@ -2029,7 +2135,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
        }
 
        if (!strcmp(field_name, "*")) {
-               ret = add_evsel_fields(evsel, raw_trace);
+               ret = add_evsel_fields(evsel, raw_trace, level);
        } else {
                field = pevent_find_any_field(evsel->tp_format, field_name);
                if (field == NULL) {
@@ -2038,7 +2144,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok)
                        return -ENOENT;
                }
 
-               ret = __dynamic_dimension__add(evsel, field, raw_trace);
+               ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
        }
 
 out:
@@ -2046,12 +2152,14 @@ out:
        return ret;
 }
 
-static int __sort_dimension__add(struct sort_dimension *sd)
+static int __sort_dimension__add(struct sort_dimension *sd,
+                                struct perf_hpp_list *list,
+                                int level)
 {
        if (sd->taken)
                return 0;
 
-       if (__sort_dimension__add_hpp_sort(sd) < 0)
+       if (__sort_dimension__add_hpp_sort(sd, list, level) < 0)
                return -1;
 
        if (sd->entry->se_collapse)
@@ -2062,46 +2170,63 @@ static int __sort_dimension__add(struct sort_dimension *sd)
        return 0;
 }
 
-static int __hpp_dimension__add(struct hpp_dimension *hd)
+static int __hpp_dimension__add(struct hpp_dimension *hd,
+                               struct perf_hpp_list *list,
+                               int level)
 {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
 
-               perf_hpp__register_sort_field(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, level);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__register_sort_field(list, fmt);
        return 0;
 }
 
-static int __sort_dimension__add_output(struct sort_dimension *sd)
+static int __sort_dimension__add_output(struct perf_hpp_list *list,
+                                       struct sort_dimension *sd)
 {
        if (sd->taken)
                return 0;
 
-       if (__sort_dimension__add_hpp_output(sd) < 0)
+       if (__sort_dimension__add_hpp_output(sd, list) < 0)
                return -1;
 
        sd->taken = 1;
        return 0;
 }
 
-static int __hpp_dimension__add_output(struct hpp_dimension *hd)
+static int __hpp_dimension__add_output(struct perf_hpp_list *list,
+                                      struct hpp_dimension *hd)
 {
-       if (!hd->taken) {
-               hd->taken = 1;
+       struct perf_hpp_fmt *fmt;
 
-               perf_hpp__column_register(hd->fmt);
-       }
+       if (hd->taken)
+               return 0;
+
+       fmt = __hpp_dimension__alloc_hpp(hd, 0);
+       if (!fmt)
+               return -1;
+
+       hd->taken = 1;
+       perf_hpp_list__column_register(list, fmt);
        return 0;
 }
 
 int hpp_dimension__add_output(unsigned col)
 {
        BUG_ON(col >= PERF_HPP__MAX_INDEX);
-       return __hpp_dimension__add_output(&hpp_sort_dimensions[col]);
+       return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]);
 }
 
-static int sort_dimension__add(const char *tok,
-                              struct perf_evlist *evlist __maybe_unused)
+static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
+                              struct perf_evlist *evlist __maybe_unused,
+                              int level)
 {
        unsigned int i;
 
@@ -2136,9 +2261,13 @@ static int sort_dimension__add(const char *tok,
                        sort__has_dso = 1;
                } else if (sd->entry == &sort_socket) {
                        sort__has_socket = 1;
+               } else if (sd->entry == &sort_thread) {
+                       sort__has_thread = 1;
+               } else if (sd->entry == &sort_comm) {
+                       sort__has_comm = 1;
                }
 
-               return __sort_dimension__add(sd);
+               return __sort_dimension__add(sd, list, level);
        }
 
        for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2147,7 +2276,7 @@ static int sort_dimension__add(const char *tok,
                if (strncasecmp(tok, hd->name, strlen(tok)))
                        continue;
 
-               return __hpp_dimension__add(hd);
+               return __hpp_dimension__add(hd, list, level);
        }
 
        for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2162,7 +2291,7 @@ static int sort_dimension__add(const char *tok,
                if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
                        sort__has_sym = 1;
 
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                return 0;
        }
 
@@ -2178,16 +2307,60 @@ static int sort_dimension__add(const char *tok,
                if (sd->entry == &sort_mem_daddr_sym)
                        sort__has_sym = 1;
 
-               __sort_dimension__add(sd);
+               __sort_dimension__add(sd, list, level);
                return 0;
        }
 
-       if (!add_dynamic_entry(evlist, tok))
+       if (!add_dynamic_entry(evlist, tok, level))
                return 0;
 
        return -ESRCH;
 }
 
+static int setup_sort_list(struct perf_hpp_list *list, char *str,
+                          struct perf_evlist *evlist)
+{
+       char *tmp, *tok;
+       int ret = 0;
+       int level = 0;
+       int next_level = 1;
+       bool in_group = false;
+
+       do {
+               tok = str;
+               tmp = strpbrk(str, "{}, ");
+               if (tmp) {
+                       if (in_group)
+                               next_level = level;
+                       else
+                               next_level = level + 1;
+
+                       if (*tmp == '{')
+                               in_group = true;
+                       else if (*tmp == '}')
+                               in_group = false;
+
+                       *tmp = '\0';
+                       str = tmp + 1;
+               }
+
+               if (*tok) {
+                       ret = sort_dimension__add(list, tok, evlist, level);
+                       if (ret == -EINVAL) {
+                               error("Invalid --sort key: `%s'", tok);
+                               break;
+                       } else if (ret == -ESRCH) {
+                               error("Unknown --sort key: `%s'", tok);
+                               break;
+                       }
+               }
+
+               level = next_level;
+       } while (tmp);
+
+       return ret;
+}
+
 static const char *get_default_sort_order(struct perf_evlist *evlist)
 {
        const char *default_sort_orders[] = {
@@ -2282,7 +2455,7 @@ static char *setup_overhead(char *keys)
 
 static int __setup_sorting(struct perf_evlist *evlist)
 {
-       char *tmp, *tok, *str;
+       char *str;
        const char *sort_keys;
        int ret = 0;
 
@@ -2320,17 +2493,7 @@ static int __setup_sorting(struct perf_evlist *evlist)
                }
        }
 
-       for (tok = strtok_r(str, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = sort_dimension__add(tok, evlist);
-               if (ret == -EINVAL) {
-                       error("Invalid --sort key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --sort key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_sort_list(&perf_hpp_list, str, evlist);
 
        free(str);
        return ret;
@@ -2341,7 +2504,7 @@ void perf_hpp__set_elide(int idx, bool elide)
        struct perf_hpp_fmt *fmt;
        struct hpp_sort_entry *hse;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2401,7 +2564,7 @@ void sort__setup_elide(FILE *output)
        struct perf_hpp_fmt *fmt;
        struct hpp_sort_entry *hse;
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2413,7 +2576,7 @@ void sort__setup_elide(FILE *output)
         * It makes no sense to elide all of sort entries.
         * Just revert them to show up again.
         */
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2421,7 +2584,7 @@ void sort__setup_elide(FILE *output)
                        return;
        }
 
-       perf_hpp__for_each_format(fmt) {
+       perf_hpp_list__for_each_format(&perf_hpp_list, fmt) {
                if (!perf_hpp__is_sort_entry(fmt))
                        continue;
 
@@ -2429,7 +2592,7 @@ void sort__setup_elide(FILE *output)
        }
 }
 
-static int output_field_add(char *tok)
+static int output_field_add(struct perf_hpp_list *list, char *tok)
 {
        unsigned int i;
 
@@ -2439,7 +2602,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) {
@@ -2448,7 +2611,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, hd->name, strlen(tok)))
                        continue;
 
-               return __hpp_dimension__add_output(hd);
+               return __hpp_dimension__add_output(list, hd);
        }
 
        for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
@@ -2457,7 +2620,7 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        for (i = 0; i < ARRAY_SIZE(memory_sort_dimensions); i++) {
@@ -2466,12 +2629,32 @@ static int output_field_add(char *tok)
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
-               return __sort_dimension__add_output(sd);
+               return __sort_dimension__add_output(list, sd);
        }
 
        return -ESRCH;
 }
 
+static int setup_output_list(struct perf_hpp_list *list, char *str)
+{
+       char *tmp, *tok;
+       int ret = 0;
+
+       for (tok = strtok_r(str, ", ", &tmp);
+                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
+               ret = output_field_add(list, tok);
+               if (ret == -EINVAL) {
+                       error("Invalid --fields key: `%s'", tok);
+                       break;
+               } else if (ret == -ESRCH) {
+                       error("Unknown --fields key: `%s'", tok);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 static void reset_dimensions(void)
 {
        unsigned int i;
@@ -2496,7 +2679,7 @@ bool is_strict_order(const char *order)
 
 static int __setup_output_field(void)
 {
-       char *tmp, *tok, *str, *strp;
+       char *str, *strp;
        int ret = -EINVAL;
 
        if (field_order == NULL)
@@ -2516,17 +2699,7 @@ static int __setup_output_field(void)
                goto out;
        }
 
-       for (tok = strtok_r(strp, ", ", &tmp);
-                       tok; tok = strtok_r(NULL, ", ", &tmp)) {
-               ret = output_field_add(tok);
-               if (ret == -EINVAL) {
-                       error("Invalid --fields key: `%s'", tok);
-                       break;
-               } else if (ret == -ESRCH) {
-                       error("Unknown --fields key: `%s'", tok);
-                       break;
-               }
-       }
+       ret = setup_output_list(&perf_hpp_list, strp);
 
 out:
        free(str);
@@ -2542,7 +2715,7 @@ int setup_sorting(struct perf_evlist *evlist)
                return err;
 
        if (parent_pattern != default_parent_pattern) {
-               err = sort_dimension__add("parent", evlist);
+               err = sort_dimension__add(&perf_hpp_list, "parent", evlist, -1);
                if (err < 0)
                        return err;
        }
@@ -2560,9 +2733,13 @@ int setup_sorting(struct perf_evlist *evlist)
                return err;
 
        /* copy sort keys to output fields */
-       perf_hpp__setup_output_field();
+       perf_hpp__setup_output_field(&perf_hpp_list);
        /* and then copy output fields to sort keys */
-       perf_hpp__append_sort_keys();
+       perf_hpp__append_sort_keys(&perf_hpp_list);
+
+       /* setup hists-specific output fields */
+       if (perf_hpp__setup_hists_formats(&perf_hpp_list, evlist) < 0)
+               return -1;
 
        return 0;
 }
@@ -2578,5 +2755,5 @@ void reset_output_field(void)
        sort_order = NULL;
 
        reset_dimensions();
-       perf_hpp__reset_output_field();
+       perf_hpp__reset_output_field(&perf_hpp_list);
 }
index 687bbb1..3f4e359 100644 (file)
@@ -32,9 +32,12 @@ extern const char default_sort_order[];
 extern regex_t ignore_callees_regex;
 extern int have_ignore_callees;
 extern int sort__need_collapse;
+extern int sort__has_dso;
 extern int sort__has_parent;
 extern int sort__has_sym;
 extern int sort__has_socket;
+extern int sort__has_thread;
+extern int sort__has_comm;
 extern enum sort_mode sort__mode;
 extern struct sort_entry sort_comm;
 extern struct sort_entry sort_dso;
@@ -94,9 +97,11 @@ struct hist_entry {
        s32                     socket;
        s32                     cpu;
        u8                      cpumode;
+       u8                      depth;
 
        /* We are added by hists__add_dummy_entry. */
        bool                    dummy;
+       bool                    leaf;
 
        char                    level;
        u8                      filtered;
@@ -113,18 +118,28 @@ struct hist_entry {
                        bool    init_have_children;
                        bool    unfolded;
                        bool    has_children;
+                       bool    has_no_entry;
                };
        };
        char                    *srcline;
        char                    *srcfile;
        struct symbol           *parent;
-       struct rb_root          sorted_chain;
        struct branch_info      *branch_info;
        struct hists            *hists;
        struct mem_info         *mem_info;
        void                    *raw_data;
        u32                     raw_size;
        void                    *trace_output;
+       struct perf_hpp_list    *hpp_list;
+       struct hist_entry       *parent_he;
+       union {
+               /* this is for hierarchical entry structure */
+               struct {
+                       struct rb_root  hroot_in;
+                       struct rb_root  hroot_out;
+               };                              /* non-leaf entries */
+               struct rb_root  sorted_chain;   /* leaf entry has callchains */
+       };
        struct callchain_root   callchain[0]; /* must be last member */
 };
 
@@ -160,6 +175,17 @@ static inline float hist_entry__get_percent_limit(struct hist_entry *he)
        return period * 100.0 / total_period;
 }
 
+static inline u64 cl_address(u64 address)
+{
+       /* return the cacheline of the address */
+       return (address & ~(cacheline_size - 1));
+}
+
+static inline u64 cl_offset(u64 address)
+{
+       /* return the cacheline of the address */
+       return (address & (cacheline_size - 1));
+}
 
 enum sort_mode {
        SORT_MODE__NORMAL,
@@ -221,6 +247,7 @@ struct sort_entry {
        int64_t (*se_sort)(struct hist_entry *, struct hist_entry *);
        int     (*se_snprintf)(struct hist_entry *he, char *bf, size_t size,
                               unsigned int width);
+       int     (*se_filter)(struct hist_entry *he, int type, const void *arg);
        u8      se_width_idx;
 };
 
index 6ac0314..b33ffb2 100644 (file)
@@ -2,6 +2,7 @@
 #include "evsel.h"
 #include "stat.h"
 #include "color.h"
+#include "pmu.h"
 
 enum {
        CTX_BIT_USER    = 1 << 0,
@@ -14,6 +15,13 @@ enum {
 
 #define NUM_CTX CTX_BIT_MAX
 
+/*
+ * AGGR_GLOBAL: Use CPU 0
+ * AGGR_SOCKET: Use first CPU of socket
+ * AGGR_CORE: Use first CPU of core
+ * AGGR_NONE: Use matching CPU
+ * AGGR_THREAD: Not supported?
+ */
 static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
 static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
@@ -28,9 +36,15 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static bool have_frontend_stalled;
 
 struct stats walltime_nsecs_stats;
 
+void perf_stat__init_shadow_stats(void)
+{
+       have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
+}
+
 static int evsel_context(struct perf_evsel *evsel)
 {
        int ctx = 0;
@@ -137,9 +151,10 @@ static const char *get_ratio_color(enum grc_type type, double ratio)
        return color;
 }
 
-static void print_stalled_cycles_frontend(FILE *out, int cpu,
+static void print_stalled_cycles_frontend(int cpu,
                                          struct perf_evsel *evsel
-                                         __maybe_unused, double avg)
+                                         __maybe_unused, double avg,
+                                         struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -152,14 +167,17 @@ static void print_stalled_cycles_frontend(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " frontend cycles idle   ");
+       if (ratio)
+               out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle",
+                                 ratio);
+       else
+               out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0);
 }
 
-static void print_stalled_cycles_backend(FILE *out, int cpu,
+static void print_stalled_cycles_backend(int cpu,
                                         struct perf_evsel *evsel
-                                        __maybe_unused, double avg)
+                                        __maybe_unused, double avg,
+                                        struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -172,14 +190,13 @@ static void print_stalled_cycles_backend(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " backend  cycles idle   ");
+       out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio);
 }
 
-static void print_branch_misses(FILE *out, int cpu,
+static void print_branch_misses(int cpu,
                                struct perf_evsel *evsel __maybe_unused,
-                               double avg)
+                               double avg,
+                               struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -192,14 +209,13 @@ static void print_branch_misses(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all branches        ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio);
 }
 
-static void print_l1_dcache_misses(FILE *out, int cpu,
+static void print_l1_dcache_misses(int cpu,
                                   struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -212,14 +228,13 @@ static void print_l1_dcache_misses(FILE *out, int cpu,
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-dcache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio);
 }
 
-static void print_l1_icache_misses(FILE *out, int cpu,
+static void print_l1_icache_misses(int cpu,
                                   struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
+                                  double avg,
+                                  struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -231,15 +246,13 @@ static void print_l1_icache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all L1-icache hits  ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio);
 }
 
-static void print_dtlb_cache_misses(FILE *out, int cpu,
+static void print_dtlb_cache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -251,15 +264,13 @@ static void print_dtlb_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all dTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio);
 }
 
-static void print_itlb_cache_misses(FILE *out, int cpu,
+static void print_itlb_cache_misses(int cpu,
                                    struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
+                                   double avg,
+                                   struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -271,15 +282,13 @@ static void print_itlb_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all iTLB cache hits ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio);
 }
 
-static void print_ll_cache_misses(FILE *out, int cpu,
+static void print_ll_cache_misses(int cpu,
                                  struct perf_evsel *evsel __maybe_unused,
-                                 double avg)
+                                 double avg,
+                                 struct perf_stat_output_ctx *out)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -291,15 +300,15 @@ static void print_ll_cache_misses(FILE *out, int cpu,
                ratio = avg / total * 100.0;
 
        color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(out, " #  ");
-       color_fprintf(out, color, "%6.2f%%", ratio);
-       fprintf(out, " of all LL-cache hits   ");
+       out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
 }
 
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr)
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out)
 {
+       void *ctxp = out->ctx;
+       print_metric_t print_metric = out->print_metric;
        double total, ratio = 0.0, total2;
        int ctx = evsel_context(evsel);
 
@@ -307,119 +316,145 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                if (total) {
                        ratio = avg / total;
-                       fprintf(out, " #   %5.2f  insns per cycle        ", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "insn per cycle", ratio);
                } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "insn per cycle", 0);
                }
                total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
                total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
 
                if (total && avg) {
+                       out->new_line(ctxp);
                        ratio = total / avg;
-                       fprintf(out, "\n");
-                       if (aggr == AGGR_NONE)
-                               fprintf(out, "        ");
-                       fprintf(out, "                                                  #   %5.2f  stalled cycles per insn", ratio);
+                       print_metric(ctxp, NULL, "%7.2f ",
+                                       "stalled cycles per insn",
+                                       ratio);
+               } else if (have_frontend_stalled) {
+                       print_metric(ctxp, NULL, NULL,
+                                    "stalled cycles per insn", 0);
                }
-
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-                       runtime_branches_stats[ctx][cpu].n != 0) {
-               print_branch_misses(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) {
+               if (runtime_branches_stats[ctx][cpu].n != 0)
+                       print_branch_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all branches", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_dcache_stats[ctx][cpu].n != 0) {
-               print_l1_dcache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_dcache_stats[ctx][cpu].n != 0)
+                       print_l1_dcache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_icache_stats[ctx][cpu].n != 0) {
-               print_l1_icache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_l1_icache_stats[ctx][cpu].n != 0)
+                       print_l1_icache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
-               print_dtlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_dtlb_cache_stats[ctx][cpu].n != 0)
+                       print_dtlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_itlb_cache_stats[ctx][cpu].n != 0) {
-               print_itlb_cache_misses(out, cpu, evsel, avg);
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_itlb_cache_stats[ctx][cpu].n != 0)
+                       print_itlb_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0);
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_ll_cache_stats[ctx][cpu].n != 0) {
-               print_ll_cache_misses(out, cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-                       runtime_cacherefs_stats[ctx][cpu].n != 0) {
+                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) {
+               if (runtime_ll_cache_stats[ctx][cpu].n != 0)
+                       print_ll_cache_misses(cpu, evsel, avg, out);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) {
                total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
 
                if (total)
                        ratio = avg * 100 / total;
 
-               fprintf(out, " # %8.3f %% of all cache refs    ", ratio);
-
+               if (runtime_cacherefs_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.3f %%",
+                                    "of all cache refs", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "of all cache refs", 0);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-               print_stalled_cycles_frontend(out, cpu, evsel, avg);
+               print_stalled_cycles_frontend(cpu, evsel, avg, out);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-               print_stalled_cycles_backend(out, cpu, evsel, avg);
+               print_stalled_cycles_backend(cpu, evsel, avg, out);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
                if (total) {
                        ratio = avg / total;
-                       fprintf(out, " # %8.3f GHz                    ", ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "GHz", ratio);
                } else {
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "Ghz", 0);
                }
        } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                if (total)
-                       fprintf(out,
-                               " #   %5.2f%% transactional cycles   ",
-                               100.0 * (avg / total));
+                       print_metric(ctxp, NULL,
+                                       "%7.2f%%", "transactional cycles",
+                                       100.0 * (avg / total));
+               else
+                       print_metric(ctxp, NULL, NULL, "transactional cycles",
+                                    0);
        } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
                total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
                total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
                if (total2 < avg)
                        total2 = avg;
                if (total)
-                       fprintf(out,
-                               " #   %5.2f%% aborted cycles         ",
+                       print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles",
                                100.0 * ((total2-avg) / total));
-       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               else
+                       print_metric(ctxp, NULL, NULL, "aborted cycles", 0);
+       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) {
                total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
                if (avg)
                        ratio = total / avg;
 
-               fprintf(out, " # %8.0f cycles / transaction   ", ratio);
-       } else if (perf_stat_evsel__is(evsel, ELISION_START) &&
-                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0)
+                       print_metric(ctxp, NULL, "%8.0f",
+                                    "cycles / transaction", ratio);
+               else
+                       print_metric(ctxp, NULL, NULL, "cycles / transaction",
+                                    0);
+       } else if (perf_stat_evsel__is(evsel, ELISION_START)) {
                total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
                if (avg)
                        ratio = total / avg;
 
-               fprintf(out, " # %8.0f cycles / elision       ", ratio);
+               print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
        } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) {
                if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
-                       fprintf(out, " # %8.3f CPUs utilized          ", avg / ratio);
+                       print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
+                                    avg / ratio);
                else
-                       fprintf(out, "                                   ");
+                       print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
        } else if (runtime_nsecs_stats[cpu].n != 0) {
                char unit = 'M';
+               char unit_buf[10];
 
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
@@ -429,9 +464,9 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
                        ratio *= 1000;
                        unit = 'K';
                }
-
-               fprintf(out, " # %8.3f %c/sec                  ", ratio, unit);
+               snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+               print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
        } else {
-               fprintf(out, "                                   ");
+               print_metric(ctxp, NULL, NULL, NULL, 0);
        }
 }
index 2f901d1..4d9b481 100644 (file)
@@ -97,7 +97,7 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel)
        }
 }
 
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
 {
        int i;
        struct perf_stat_evsel *ps = evsel->priv;
@@ -108,7 +108,7 @@ void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
        perf_stat_evsel_id_init(evsel);
 }
 
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
 {
        evsel->priv = zalloc(sizeof(struct perf_stat_evsel));
        if (evsel->priv == NULL)
@@ -117,13 +117,13 @@ int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
        return 0;
 }
 
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
 {
        zfree(&evsel->priv);
 }
 
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads)
+static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
+                                            int ncpus, int nthreads)
 {
        struct perf_counts *counts;
 
@@ -134,13 +134,13 @@ int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
        return counts ? 0 : -ENOMEM;
 }
 
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
+static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
 {
        perf_counts__delete(evsel->prev_raw_counts);
        evsel->prev_raw_counts = NULL;
 }
 
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
+static int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw)
 {
        int ncpus = perf_evsel__nr_cpus(evsel);
        int nthreads = thread_map__nr(evsel->threads);
@@ -310,7 +310,16 @@ int perf_stat_process_counter(struct perf_stat_config *config,
        int i, ret;
 
        aggr->val = aggr->ena = aggr->run = 0;
-       init_stats(ps->res_stats);
+
+       /*
+        * We calculate counter's data every interval,
+        * and the display code shows ps->res_stats
+        * avg value. We need to zero the stats for
+        * interval mode, otherwise overall avg running
+        * averages will be shown for each interval.
+        */
+       if (config->interval)
+               init_stats(ps->res_stats);
 
        if (counter->per_pkg)
                zero_per_pkg(counter);
index 086f4e1..0150e78 100644 (file)
@@ -68,21 +68,23 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel);
 
 extern struct stats walltime_nsecs_stats;
 
+typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit,
+                              const char *fmt, double val);
+typedef void (*new_line_t )(void *ctx);
+
+void perf_stat__init_shadow_stats(void);
 void perf_stat__reset_shadow_stats(void);
 void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
                                    int cpu);
-void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
-                                  double avg, int cpu, enum aggr_mode aggr);
-
-void perf_evsel__reset_stat_priv(struct perf_evsel *evsel);
-int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel);
-void perf_evsel__free_stat_priv(struct perf_evsel *evsel);
-
-int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel,
-                                     int ncpus, int nthreads);
-void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel);
+struct perf_stat_output_ctx {
+       void *ctx;
+       print_metric_t print_metric;
+       new_line_t new_line;
+};
 
-int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw);
+void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
+                                  double avg, int cpu,
+                                  struct perf_stat_output_ctx *out);
 
 int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
 void perf_evlist__free_stats(struct perf_evlist *evlist);
index 25671fa..d3d2792 100644 (file)
@@ -51,30 +51,6 @@ void strbuf_grow(struct strbuf *sb, size_t extra)
        ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
 }
 
-static void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
-                                  const void *data, size_t dlen)
-{
-       if (pos + len < pos)
-               die("you want to use way too much memory");
-       if (pos > sb->len)
-               die("`pos' is too far after the end of the buffer");
-       if (pos + len > sb->len)
-               die("`pos + len' is too far after the end of the buffer");
-
-       if (dlen >= len)
-               strbuf_grow(sb, dlen - len);
-       memmove(sb->buf + pos + dlen,
-                       sb->buf + pos + len,
-                       sb->len - pos - len);
-       memcpy(sb->buf + pos, data, dlen);
-       strbuf_setlen(sb, sb->len + dlen - len);
-}
-
-void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
-{
-       strbuf_splice(sb, pos, len, NULL, 0);
-}
-
 void strbuf_add(struct strbuf *sb, const void *data, size_t len)
 {
        strbuf_grow(sb, len);
index 529f2f0..7a32c83 100644 (file)
@@ -77,8 +77,6 @@ static inline void strbuf_addch(struct strbuf *sb, int c) {
        sb->buf[sb->len] = '\0';
 }
 
-extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
-
 extern void strbuf_add(struct strbuf *, const void *, size_t);
 static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
        strbuf_add(sb, s, strlen(s));
index 562b8eb..b1dd68f 100644 (file)
@@ -6,6 +6,7 @@
 #include <inttypes.h>
 
 #include "symbol.h"
+#include "demangle-java.h"
 #include "machine.h"
 #include "vdso.h"
 #include <symbol/kallsyms.h>
@@ -1077,6 +1078,8 @@ new_symbol:
                                demangle_flags = DMGL_PARAMS | DMGL_ANSI;
 
                        demangled = bfd_demangle(NULL, elf_name, demangle_flags);
+                       if (demangled == NULL)
+                               demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET);
                        if (demangled != NULL)
                                elf_name = demangled;
                }
index 3b2de6e..e7588dc 100644 (file)
@@ -1466,7 +1466,8 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
         * Read the build id if possible. This is required for
         * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work
         */
-       if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0)
+       if (is_regular_file(name) &&
+           filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0)
                dso__set_build_id(dso, build_id);
 
        /*
@@ -1487,6 +1488,9 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                                                   root_dir, name, PATH_MAX))
                        continue;
 
+               if (!is_regular_file(name))
+                       continue;
+
                /* Name is now the name of the next image to try */
                if (symsrc__init(ss, dso, name, symtab_type) < 0)
                        continue;
@@ -1525,6 +1529,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
        if (!runtime_ss && syms_ss)
                runtime_ss = syms_ss;
 
+       if (syms_ss && syms_ss->type == DSO_BINARY_TYPE__BUILD_ID_CACHE)
+               if (dso__build_id_is_kmod(dso, name, PATH_MAX))
+                       kmod = true;
+
        if (syms_ss)
                ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, kmod);
        else
index ccd1caa..a937053 100644 (file)
@@ -110,7 +110,8 @@ struct symbol_conf {
                        has_filter,
                        show_ref_callgraph,
                        hide_unresolved,
-                       raw_trace;
+                       raw_trace,
+                       report_hierarchy;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,
index 802bb86..8ae051e 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/err.h>
 #include <traceevent/event-parse.h>
 #include <api/fs/tracing_path.h>
+#include <api/fs/fs.h>
 #include "trace-event.h"
 #include "machine.h"
 #include "util.h"
index 4d4210d..1b74164 100644 (file)
@@ -19,7 +19,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc)
        u64 quot, rem;
 
        quot = cyc >> tc->time_shift;
-       rem  = cyc & ((1 << tc->time_shift) - 1);
+       rem  = cyc & (((u64)1 << tc->time_shift) - 1);
        return tc->time_zero + quot * tc->time_mult +
               ((rem * tc->time_mult) >> tc->time_shift);
 }
index ead9509..b7766c5 100644 (file)
@@ -14,6 +14,7 @@
 #include <limits.h>
 #include <byteswap.h>
 #include <linux/kernel.h>
+#include <linux/log2.h>
 #include <unistd.h>
 #include "callchain.h"
 #include "strlist.h"
@@ -507,54 +508,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
        return ret;
 }
 
-int filename__read_str(const char *filename, char **buf, size_t *sizep)
-{
-       size_t size = 0, alloc_size = 0;
-       void *bf = NULL, *nbf;
-       int fd, n, err = 0;
-       char sbuf[STRERR_BUFSIZE];
-
-       fd = open(filename, O_RDONLY);
-       if (fd < 0)
-               return -errno;
-
-       do {
-               if (size == alloc_size) {
-                       alloc_size += BUFSIZ;
-                       nbf = realloc(bf, alloc_size);
-                       if (!nbf) {
-                               err = -ENOMEM;
-                               break;
-                       }
-
-                       bf = nbf;
-               }
-
-               n = read(fd, bf + size, alloc_size - size);
-               if (n < 0) {
-                       if (size) {
-                               pr_warning("read failed %d: %s\n", errno,
-                                        strerror_r(errno, sbuf, sizeof(sbuf)));
-                               err = 0;
-                       } else
-                               err = -errno;
-
-                       break;
-               }
-
-               size += n;
-       } while (n > 0);
-
-       if (!err) {
-               *sizep = size;
-               *buf   = bf;
-       } else
-               free(bf);
-
-       close(fd);
-       return err;
-}
-
 const char *get_filename_for_perf_kvm(void)
 {
        const char *filename;
@@ -691,3 +644,66 @@ out:
 
        return tip;
 }
+
+bool is_regular_file(const char *file)
+{
+       struct stat st;
+
+       if (stat(file, &st))
+               return false;
+
+       return S_ISREG(st.st_mode);
+}
+
+int fetch_current_timestamp(char *buf, size_t sz)
+{
+       struct timeval tv;
+       struct tm tm;
+       char dt[32];
+
+       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
+               return -1;
+
+       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
+               return -1;
+
+       scnprintf(buf, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
+
+       return 0;
+}
+
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra)
+{
+       size_t i, j, mask;
+
+       if (!printer)
+               return;
+
+       bytes_per_line = roundup_pow_of_two(bytes_per_line);
+       mask = bytes_per_line - 1;
+
+       printer(BINARY_PRINT_DATA_BEGIN, 0, extra);
+       for (i = 0; i < len; i++) {
+               if ((i & mask) == 0) {
+                       printer(BINARY_PRINT_LINE_BEGIN, -1, extra);
+                       printer(BINARY_PRINT_ADDR, i, extra);
+               }
+
+               printer(BINARY_PRINT_NUM_DATA, data[i], extra);
+
+               if (((i & mask) == mask) || i == len - 1) {
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_NUM_PAD, -1, extra);
+
+                       printer(BINARY_PRINT_SEP, i, extra);
+                       for (j = i & ~mask; j <= i; j++)
+                               printer(BINARY_PRINT_CHAR_DATA, data[j], extra);
+                       for (j = 0; j < mask-(i & mask); j++)
+                               printer(BINARY_PRINT_CHAR_PAD, i, extra);
+                       printer(BINARY_PRINT_LINE_END, -1, extra);
+               }
+       }
+       printer(BINARY_PRINT_DATA_END, -1, extra);
+}
index fe915e6..d0d50ce 100644 (file)
@@ -82,6 +82,8 @@
 
 extern const char *graph_line;
 extern const char *graph_dotted_line;
+extern const char *spaces;
+extern const char *dots;
 extern char buildid_dir[];
 
 /* On most systems <limits.h> would have given us this, but
@@ -303,7 +305,6 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
                  bool show_sym, bool unwind_inlines);
 void free_srcline(char *srcline);
 
-int filename__read_str(const char *filename, char **buf, size_t *sizep);
 int perf_event_paranoid(void);
 
 void mem_bswap_64(void *src, int byte_size);
@@ -343,5 +344,27 @@ int fetch_kernel_version(unsigned int *puint,
 #define KVER_PARAM(x)  KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x)
 
 const char *perf_tip(const char *dirpath);
+bool is_regular_file(const char *file);
+int fetch_current_timestamp(char *buf, size_t sz);
+
+enum binary_printer_ops {
+       BINARY_PRINT_DATA_BEGIN,
+       BINARY_PRINT_LINE_BEGIN,
+       BINARY_PRINT_ADDR,
+       BINARY_PRINT_NUM_DATA,
+       BINARY_PRINT_NUM_PAD,
+       BINARY_PRINT_SEP,
+       BINARY_PRINT_CHAR_DATA,
+       BINARY_PRINT_CHAR_PAD,
+       BINARY_PRINT_LINE_END,
+       BINARY_PRINT_DATA_END,
+};
+
+typedef void (*print_binary_t)(enum binary_printer_ops,
+                              unsigned int val,
+                              void *extra);
 
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra);
 #endif /* GIT_COMPAT_UTIL_H */
index 0dac7e0..3fa94e2 100644 (file)
@@ -1970,7 +1970,7 @@ int has_config_tdp(unsigned int family, unsigned int model)
 }
 
 static void
-dump_cstate_pstate_config_info(family, model)
+dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
 {
        if (!do_nhm_platform_info)
                return;
@@ -2142,7 +2142,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
 #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
 #define        RAPL_TIME_GRANULARITY   0x3F /* 6 bit time granularity */
 
-double get_tdp(model)
+double get_tdp(unsigned int model)
 {
        unsigned long long msr;
 
@@ -2256,7 +2256,7 @@ void rapl_probe(unsigned int family, unsigned int model)
        return;
 }
 
-void perf_limit_reasons_probe(family, model)
+void perf_limit_reasons_probe(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
                return;
@@ -2792,7 +2792,7 @@ void process_cpuid()
        perf_limit_reasons_probe(family, model);
 
        if (debug)
-               dump_cstate_pstate_config_info();
+               dump_cstate_pstate_config_info(family, model);
 
        if (has_skl_msrs(family, model))
                calculate_tsc_tweak();
index 7ec7df9..0c1a7e6 100644 (file)
@@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
 }
 EXPORT_SYMBOL(__wrap_devm_memremap_pages);
 
-pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
 {
        struct nfit_test_resource *nfit_res = get_nfit_res(addr);
 
index 90bd2ea..b3281dc 100644 (file)
@@ -217,13 +217,16 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
        return rc;
 }
 
+#define NFIT_TEST_ARS_RECORDS 4
+
 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
                unsigned int buf_len)
 {
        if (buf_len < sizeof(*nd_cmd))
                return -EINVAL;
 
-       nd_cmd->max_ars_out = 256;
+       nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
+               + NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
        nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
 
        return 0;
@@ -246,7 +249,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
        if (buf_len < sizeof(*nd_cmd))
                return -EINVAL;
 
-       nd_cmd->out_length = 256;
+       nd_cmd->out_length = sizeof(struct nd_cmd_ars_status);
+       /* TODO: emit error records */
        nd_cmd->num_records = 0;
        nd_cmd->address = 0;
        nd_cmd->length = -1ULL;
index 77edcdc..0572784 100755 (executable)
@@ -88,7 +88,11 @@ test_delete()
                exit 1
        fi
 
-       rm $file
+       rm $file 2>/dev/null
+       if [ $? -ne 0 ]; then
+               chattr -i $file
+               rm $file
+       fi
 
        if [ -e $file ]; then
                echo "$file couldn't be deleted" >&2
@@ -111,6 +115,7 @@ test_zero_size_delete()
                exit 1
        fi
 
+       chattr -i $file
        printf "$attrs" > $file
 
        if [ -e $file ]; then
@@ -141,7 +146,11 @@ test_valid_filenames()
                        echo "$file could not be created" >&2
                        ret=1
                else
-                       rm $file
+                       rm $file 2>/dev/null
+                       if [ $? -ne 0 ]; then
+                               chattr -i $file
+                               rm $file
+                       fi
                fi
        done
 
@@ -174,7 +183,11 @@ test_invalid_filenames()
 
                if [ -e $file ]; then
                        echo "Creating $file should have failed" >&2
-                       rm $file
+                       rm $file 2>/dev/null
+                       if [ $? -ne 0 ]; then
+                               chattr -i $file
+                               rm $file
+                       fi
                        ret=1
                fi
        done
index 8c07644..4af74f7 100644 (file)
@@ -1,10 +1,68 @@
+#include <errno.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <sys/ioctl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <linux/fs.h>
+
+static int set_immutable(const char *path, int immutable)
+{
+       unsigned int flags;
+       int fd;
+       int rc;
+       int error;
+
+       fd = open(path, O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       rc = ioctl(fd, FS_IOC_GETFLAGS, &flags);
+       if (rc < 0) {
+               error = errno;
+               close(fd);
+               errno = error;
+               return rc;
+       }
+
+       if (immutable)
+               flags |= FS_IMMUTABLE_FL;
+       else
+               flags &= ~FS_IMMUTABLE_FL;
+
+       rc = ioctl(fd, FS_IOC_SETFLAGS, &flags);
+       error = errno;
+       close(fd);
+       errno = error;
+       return rc;
+}
+
+static int get_immutable(const char *path)
+{
+       unsigned int flags;
+       int fd;
+       int rc;
+       int error;
+
+       fd = open(path, O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       rc = ioctl(fd, FS_IOC_GETFLAGS, &flags);
+       if (rc < 0) {
+               error = errno;
+               close(fd);
+               errno = error;
+               return rc;
+       }
+       close(fd);
+       if (flags & FS_IMMUTABLE_FL)
+               return 1;
+       return 0;
+}
 
 int main(int argc, char **argv)
 {
@@ -27,7 +85,7 @@ int main(int argc, char **argv)
        buf[4] = 0;
 
        /* create a test variable */
-       fd = open(path, O_WRONLY | O_CREAT);
+       fd = open(path, O_WRONLY | O_CREAT, 0600);
        if (fd < 0) {
                perror("open(O_WRONLY)");
                return EXIT_FAILURE;
@@ -41,6 +99,18 @@ int main(int argc, char **argv)
 
        close(fd);
 
+       rc = get_immutable(path);
+       if (rc < 0) {
+               perror("ioctl(FS_IOC_GETFLAGS)");
+               return EXIT_FAILURE;
+       } else if (rc) {
+               rc = set_immutable(path, 0);
+               if (rc < 0) {
+                       perror("ioctl(FS_IOC_SETFLAGS)");
+                       return EXIT_FAILURE;
+               }
+       }
+
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
index 773e276..1e1abe0 100644 (file)
@@ -39,28 +39,23 @@ instance_slam() {
 }
 
 instance_slam &
-x=`jobs -l`
-p1=`echo $x | cut -d' ' -f2`
+p1=$!
 echo $p1
 
 instance_slam &
-x=`jobs -l | tail -1`
-p2=`echo $x | cut -d' ' -f2`
+p2=$!
 echo $p2
 
 instance_slam &
-x=`jobs -l | tail -1`
-p3=`echo $x | cut -d' ' -f2`
+p3=$!
 echo $p3
 
 instance_slam &
-x=`jobs -l | tail -1`
-p4=`echo $x | cut -d' ' -f2`
+p4=$!
 echo $p4
 
 instance_slam &
-x=`jobs -l | tail -1`
-p5=`echo $x | cut -d' ' -f2`
+p5=$!
 echo $p5
 
 ls -lR >/dev/null
index 844787a..5eb49b7 100755 (executable)
@@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file
 then
        print_warning Console output contains nul bytes, old qemu still running?
 fi
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
 if test -s $1.diags
 then
        print_warning Assertion failure in $file $title
@@ -64,10 +64,12 @@ then
        then
                summary="$summary  lockdep: $n_badness"
        fi
-       n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1`
+       n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1`
        if test "$n_stalls" -ne 0
        then
                summary="$summary  Stalls: $n_stalls"
        fi
        print_warning Summary: $summary
+else
+       rm $1.diags
 fi
index e86d937..60fe3c5 100644 (file)
@@ -45,7 +45,17 @@ static inline int ksft_exit_fail(void)
 }
 #endif
 
-#define NSEC_PER_SEC 1000000000L
+#define NSEC_PER_SEC 1000000000LL
+#define USEC_PER_SEC 1000000LL
+
+#define ADJ_SETOFFSET 0x0100
+
+#include <sys/syscall.h>
+static int clock_adjtime(clockid_t id, struct timex *tx)
+{
+       return syscall(__NR_clock_adjtime, id, tx);
+}
+
 
 /* clear NTP time_status & time_state */
 int clear_time_state(void)
@@ -193,10 +203,137 @@ out:
 }
 
 
+int set_offset(long long offset, int use_nano)
+{
+       struct timex tmx = {};
+       int ret;
+
+       tmx.modes = ADJ_SETOFFSET;
+       if (use_nano) {
+               tmx.modes |= ADJ_NANO;
+
+               tmx.time.tv_sec = offset / NSEC_PER_SEC;
+               tmx.time.tv_usec = offset % NSEC_PER_SEC;
+
+               if (offset < 0 && tmx.time.tv_usec) {
+                       tmx.time.tv_sec -= 1;
+                       tmx.time.tv_usec += NSEC_PER_SEC;
+               }
+       } else {
+               tmx.time.tv_sec = offset / USEC_PER_SEC;
+               tmx.time.tv_usec = offset % USEC_PER_SEC;
+
+               if (offset < 0 && tmx.time.tv_usec) {
+                       tmx.time.tv_sec -= 1;
+                       tmx.time.tv_usec += USEC_PER_SEC;
+               }
+       }
+
+       ret = clock_adjtime(CLOCK_REALTIME, &tmx);
+       if (ret < 0) {
+               printf("(sec: %ld  usec: %ld) ", tmx.time.tv_sec, tmx.time.tv_usec);
+               printf("[FAIL]\n");
+               return -1;
+       }
+       return 0;
+}
+
+int set_bad_offset(long sec, long usec, int use_nano)
+{
+       struct timex tmx = {};
+       int ret;
+
+       tmx.modes = ADJ_SETOFFSET;
+       if (use_nano)
+               tmx.modes |= ADJ_NANO;
+
+       tmx.time.tv_sec = sec;
+       tmx.time.tv_usec = usec;
+       ret = clock_adjtime(CLOCK_REALTIME, &tmx);
+       if (ret >= 0) {
+               printf("Invalid (sec: %ld  usec: %ld) did not fail! ", tmx.time.tv_sec, tmx.time.tv_usec);
+               printf("[FAIL]\n");
+               return -1;
+       }
+       return 0;
+}
+
+int validate_set_offset(void)
+{
+       printf("Testing ADJ_SETOFFSET... ");
+
+       /* Test valid values */
+       if (set_offset(NSEC_PER_SEC - 1, 1))
+               return -1;
+
+       if (set_offset(-NSEC_PER_SEC + 1, 1))
+               return -1;
+
+       if (set_offset(-NSEC_PER_SEC - 1, 1))
+               return -1;
+
+       if (set_offset(5 * NSEC_PER_SEC, 1))
+               return -1;
+
+       if (set_offset(-5 * NSEC_PER_SEC, 1))
+               return -1;
+
+       if (set_offset(5 * NSEC_PER_SEC + NSEC_PER_SEC / 2, 1))
+               return -1;
+
+       if (set_offset(-5 * NSEC_PER_SEC - NSEC_PER_SEC / 2, 1))
+               return -1;
+
+       if (set_offset(USEC_PER_SEC - 1, 0))
+               return -1;
+
+       if (set_offset(-USEC_PER_SEC + 1, 0))
+               return -1;
+
+       if (set_offset(-USEC_PER_SEC - 1, 0))
+               return -1;
+
+       if (set_offset(5 * USEC_PER_SEC, 0))
+               return -1;
+
+       if (set_offset(-5 * USEC_PER_SEC, 0))
+               return -1;
+
+       if (set_offset(5 * USEC_PER_SEC + USEC_PER_SEC / 2, 0))
+               return -1;
+
+       if (set_offset(-5 * USEC_PER_SEC - USEC_PER_SEC / 2, 0))
+               return -1;
+
+       /* Test invalid values */
+       if (set_bad_offset(0, -1, 1))
+               return -1;
+       if (set_bad_offset(0, -1, 0))
+               return -1;
+       if (set_bad_offset(0, 2 * NSEC_PER_SEC, 1))
+               return -1;
+       if (set_bad_offset(0, 2 * USEC_PER_SEC, 0))
+               return -1;
+       if (set_bad_offset(0, NSEC_PER_SEC, 1))
+               return -1;
+       if (set_bad_offset(0, USEC_PER_SEC, 0))
+               return -1;
+       if (set_bad_offset(0, -NSEC_PER_SEC, 1))
+               return -1;
+       if (set_bad_offset(0, -USEC_PER_SEC, 0))
+               return -1;
+
+       printf("[OK]\n");
+       return 0;
+}
+
 int main(int argc, char **argv)
 {
        if (validate_freq())
                return ksft_exit_fail();
 
+       if (validate_set_offset())
+               return ksft_exit_fail();
+
        return ksft_exit_pass();
 }
index d0c473f..d5ce7d7 100644 (file)
@@ -4,15 +4,16 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \
+                       check_initial_reg_state sigreturn ldt_gdt
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
                        test_FCMOV test_FCOMI test_FISTTP \
-                       ldt_gdt \
                        vdso_restorer
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
-BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
+BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
 
 CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
 
@@ -40,7 +41,7 @@ clean:
 $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
        $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
 
-$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
+$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
        $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
 
 # x86_64 users should be encouraged to install 32-bit libraries
@@ -65,3 +66,9 @@ endif
 sysret_ss_attrs_64: thunks.S
 ptrace_syscall_32: raw_syscall_helper_32.S
 test_syscall_vdso_32: thunks_32.S
+
+# check_initial_reg_state is special: it needs a custom entry, and it
+# needs to be static so that its interpreter doesn't destroy its initial
+# state.
+check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
+check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
new file mode 100644 (file)
index 0000000..6aaed9b
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * check_initial_reg_state.c - check that execve sets the correct state
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+
+unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
+unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
+
+asm (
+       ".pushsection .text\n\t"
+       ".type real_start, @function\n\t"
+       ".global real_start\n\t"
+       "real_start:\n\t"
+#ifdef __x86_64__
+       "mov %rax, ax\n\t"
+       "mov %rbx, bx\n\t"
+       "mov %rcx, cx\n\t"
+       "mov %rdx, dx\n\t"
+       "mov %rsi, si\n\t"
+       "mov %rdi, di\n\t"
+       "mov %rbp, bp\n\t"
+       "mov %rsp, sp\n\t"
+       "mov %r8, r8\n\t"
+       "mov %r9, r9\n\t"
+       "mov %r10, r10\n\t"
+       "mov %r11, r11\n\t"
+       "mov %r12, r12\n\t"
+       "mov %r13, r13\n\t"
+       "mov %r14, r14\n\t"
+       "mov %r15, r15\n\t"
+       "pushfq\n\t"
+       "popq flags\n\t"
+#else
+       "mov %eax, ax\n\t"
+       "mov %ebx, bx\n\t"
+       "mov %ecx, cx\n\t"
+       "mov %edx, dx\n\t"
+       "mov %esi, si\n\t"
+       "mov %edi, di\n\t"
+       "mov %ebp, bp\n\t"
+       "mov %esp, sp\n\t"
+       "pushfl\n\t"
+       "popl flags\n\t"
+#endif
+       "jmp _start\n\t"
+       ".size real_start, . - real_start\n\t"
+       ".popsection");
+
+int main()
+{
+       int nerrs = 0;
+
+       if (sp == 0) {
+               printf("[FAIL]\tTest was built incorrectly\n");
+               return 1;
+       }
+
+       if (ax || bx || cx || dx || si || di || bp
+#ifdef __x86_64__
+           || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15
+#endif
+               ) {
+               printf("[FAIL]\tAll GPRs except SP should be 0\n");
+#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
+               SHOW(ax);
+               SHOW(bx);
+               SHOW(cx);
+               SHOW(dx);
+               SHOW(si);
+               SHOW(di);
+               SHOW(bp);
+               SHOW(sp);
+#ifdef __x86_64__
+               SHOW(r8);
+               SHOW(r9);
+               SHOW(r10);
+               SHOW(r11);
+               SHOW(r12);
+               SHOW(r13);
+               SHOW(r14);
+               SHOW(r15);
+#endif
+               nerrs++;
+       } else {
+               printf("[OK]\tAll GPRs except SP are 0\n");
+       }
+
+       if (flags != 0x202) {
+               printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
+               nerrs++;
+       } else {
+               printf("[OK]\tFLAGS is 0x202\n");
+       }
+
+       return nerrs ? 1 : 0;
+}
index 5105b49..4214567 100644 (file)
@@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
                err(1, "sigaction");
 }
 
+static void setsigign(int sig, int flags)
+{
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_sigaction = (void *)SIG_IGN;
+       sa.sa_flags = flags;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
 static void clearhandler(int sig)
 {
        struct sigaction sa;
@@ -187,7 +198,7 @@ static void test_ptrace_syscall_restart(void)
 
        printf("[RUN]\tSYSEMU\n");
        if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
        wait_trap(chld);
 
        if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -218,7 +229,7 @@ static void test_ptrace_syscall_restart(void)
                err(1, "PTRACE_SETREGS");
 
        if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
        wait_trap(chld);
 
        if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -250,7 +261,7 @@ static void test_ptrace_syscall_restart(void)
                err(1, "PTRACE_SETREGS");
 
        if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
-               err(1, "PTRACE_SYSCALL");
+               err(1, "PTRACE_SYSEMU");
        wait_trap(chld);
 
        if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
@@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void)
        }
 }
 
+static void test_restart_under_ptrace(void)
+{
+       printf("[RUN]\tkernel syscall restart under ptrace\n");
+       pid_t chld = fork();
+       if (chld < 0)
+               err(1, "fork");
+
+       if (chld == 0) {
+               if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+                       err(1, "PTRACE_TRACEME");
+
+               printf("\tChild will take a nap until signaled\n");
+               setsigign(SIGUSR1, SA_RESTART);
+               raise(SIGSTOP);
+
+               syscall(SYS_pause, 0, 0, 0, 0, 0, 0);
+               _exit(0);
+       }
+
+       int status;
+
+       /* Wait for SIGSTOP. */
+       if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+               err(1, "waitpid");
+
+       struct user_regs_struct regs;
+
+       printf("[RUN]\tSYSCALL\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       /* We should be stopped at pause(2) entry. */
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tInitial nr and args are correct\n");
+       }
+
+       /* Interrupt it. */
+       kill(chld, SIGUSR1);
+
+       /* Advance.  We should be stopped at exit. */
+       printf("[RUN]\tSYSCALL\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n",
+                      (long)regs.user_ax);
+       }
+
+       /* Poke the regs back in.  This must not break anything. */
+       if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_SETREGS");
+
+       /* Catch the (ignored) SIGUSR1. */
+       if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+               err(1, "PTRACE_CONT");
+       if (waitpid(chld, &status, 0) != chld)
+               err(1, "waitpid");
+       if (!WIFSTOPPED(status)) {
+               printf("[FAIL]\tChild was stopped for SIGUSR1 (status = 0x%x)\n", status);
+               nerrs++;
+       } else {
+               printf("[OK]\tChild got SIGUSR1\n");
+       }
+
+       /* The next event should be pause(2) again. */
+       printf("[RUN]\tStep again\n");
+       if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+               err(1, "PTRACE_SYSCALL");
+       wait_trap(chld);
+
+       /* We should be stopped at pause(2) entry. */
+
+       if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+               err(1, "PTRACE_GETREGS");
+
+       if (regs.user_syscall_nr != SYS_pause ||
+           regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+           regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+           regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+               printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+               nerrs++;
+       } else {
+               printf("[OK]\tpause(2) restarted correctly\n");
+       }
+
+       /* Kill it. */
+       kill(chld, SIGKILL);
+       if (waitpid(chld, &status, 0) != chld)
+               err(1, "waitpid");
+}
+
 int main()
 {
        printf("[RUN]\tCheck int80 return regs\n");
@@ -290,5 +414,7 @@ int main()
 
        test_ptrace_syscall_restart();
 
+       test_restart_under_ptrace();
+
        return 0;
 }
index b5aa1ba..8a577e7 100644 (file)
 #include <sys/ptrace.h>
 #include <sys/user.h>
 
+/* Pull in AR_xyz defines. */
+typedef unsigned int u32;
+typedef unsigned short u16;
+#include "../../../../arch/x86/include/asm/desc_defs.h"
+
+/*
+ * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc
+ * headers.
+ */
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext.  All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ *     saved CS is not 64-bit)
+ *         new SS = saved SS  (will fail IRET and signal if invalid)
+ * else
+ *         new SS = a flat 32-bit data segment
+ */
+#define UC_SIGCONTEXT_SS       0x2
+#define UC_STRICT_RESTORE_SS   0x4
+#endif
+
 /*
  * In principle, this test can run on Linux emulation layers (e.g.
  * Illumos "LX branded zones").  Solaris-based kernels reserve LDT
@@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs;
 /* Instructions for the SIGUSR1 handler. */
 static volatile unsigned short sig_cs, sig_ss;
 static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
+#ifdef __x86_64__
+static volatile sig_atomic_t sig_corrupt_final_ss;
+#endif
 
 /* Abstractions for some 32-bit vs 64-bit differences. */
 #ifdef __x86_64__
@@ -305,9 +339,105 @@ static greg_t *csptr(ucontext_t *ctx)
 }
 #endif
 
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+int cs_bitness(unsigned short cs)
+{
+       uint32_t valid = 0, ar;
+       asm ("lar %[cs], %[ar]\n\t"
+            "jnz 1f\n\t"
+            "mov $1, %[valid]\n\t"
+            "1:"
+            : [ar] "=r" (ar), [valid] "+rm" (valid)
+            : [cs] "r" (cs));
+
+       if (!valid)
+               return -1;
+
+       bool db = (ar & (1 << 22));
+       bool l = (ar & (1 << 21));
+
+       if (!(ar & (1<<11)))
+           return -1;  /* Not code. */
+
+       if (l && !db)
+               return 64;
+       else if (!l && db)
+               return 32;
+       else if (!l && !db)
+               return 16;
+       else
+               return -1;      /* Unknown bitness. */
+}
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+bool is_valid_ss(unsigned short cs)
+{
+       uint32_t valid = 0, ar;
+       asm ("lar %[cs], %[ar]\n\t"
+            "jnz 1f\n\t"
+            "mov $1, %[valid]\n\t"
+            "1:"
+            : [ar] "=r" (ar), [valid] "+rm" (valid)
+            : [cs] "r" (cs));
+
+       if (!valid)
+               return false;
+
+       if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA &&
+           (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN)
+               return false;
+
+       return (ar & AR_P);
+}
+
 /* Number of errors in the current test case. */
 static volatile sig_atomic_t nerrs;
 
+static void validate_signal_ss(int sig, ucontext_t *ctx)
+{
+#ifdef __x86_64__
+       bool was_64bit = (cs_bitness(*csptr(ctx)) == 64);
+
+       if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) {
+               printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n");
+               nerrs++;
+
+               /*
+                * This happens on Linux 4.1.  The rest will fail, too, so
+                * return now to reduce the noise.
+                */
+               return;
+       }
+
+       /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */
+       if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) {
+               printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n",
+                      sig);
+               nerrs++;
+       }
+
+       if (is_valid_ss(*ssptr(ctx))) {
+               /*
+                * DOSEMU was written before 64-bit sigcontext had SS, and
+                * it tries to figure out the signal source SS by looking at
+                * the physical register.  Make sure that keeps working.
+                */
+               unsigned short hw_ss;
+               asm ("mov %%ss, %0" : "=rm" (hw_ss));
+               if (hw_ss != *ssptr(ctx)) {
+                       printf("[FAIL]\tHW SS didn't match saved SS\n");
+                       nerrs++;
+               }
+       }
+#endif
+}
+
 /*
  * SIGUSR1 handler.  Sets CS and SS as requested and points IP to the
  * int3 trampoline.  Sets SP to a large known value so that we can see
@@ -317,6 +447,8 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
 {
        ucontext_t *ctx = (ucontext_t*)ctx_void;
 
+       validate_signal_ss(sig, ctx);
+
        memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
 
        *csptr(ctx) = sig_cs;
@@ -334,13 +466,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
 }
 
 /*
- * Called after a successful sigreturn.  Restores our state so that
- * the original raise(SIGUSR1) returns.
+ * Called after a successful sigreturn (via int3) or from a failed
+ * sigreturn (directly by kernel).  Restores our state so that the
+ * original raise(SIGUSR1) returns.
  */
 static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 {
        ucontext_t *ctx = (ucontext_t*)ctx_void;
 
+       validate_signal_ss(sig, ctx);
+
        sig_err = ctx->uc_mcontext.gregs[REG_ERR];
        sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
 
@@ -358,41 +493,62 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
        memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
        memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
 
+#ifdef __x86_64__
+       if (sig_corrupt_final_ss) {
+               if (ctx->uc_flags & UC_STRICT_RESTORE_SS) {
+                       printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n");
+                       nerrs++;
+               } else {
+                       /*
+                        * DOSEMU transitions from 32-bit to 64-bit mode by
+                        * adjusting sigcontext, and it requires that this work
+                        * even if the saved SS is bogus.
+                        */
+                       printf("\tCorrupting SS on return to 64-bit mode\n");
+                       *ssptr(ctx) = 0;
+               }
+       }
+#endif
+
        sig_trapped = sig;
 }
 
-/*
- * Checks a given selector for its code bitness or returns -1 if it's not
- * a usable code segment selector.
- */
-int cs_bitness(unsigned short cs)
+#ifdef __x86_64__
+/* Tests recovery if !UC_STRICT_RESTORE_SS */
+static void sigusr2(int sig, siginfo_t *info, void *ctx_void)
 {
-       uint32_t valid = 0, ar;
-       asm ("lar %[cs], %[ar]\n\t"
-            "jnz 1f\n\t"
-            "mov $1, %[valid]\n\t"
-            "1:"
-            : [ar] "=r" (ar), [valid] "+rm" (valid)
-            : [cs] "r" (cs));
+       ucontext_t *ctx = (ucontext_t*)ctx_void;
 
-       if (!valid)
-               return -1;
+       if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) {
+               printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n");
+               nerrs++;
+               return;  /* We can't do the rest. */
+       }
 
-       bool db = (ar & (1 << 22));
-       bool l = (ar & (1 << 21));
+       ctx->uc_flags &= ~UC_STRICT_RESTORE_SS;
+       *ssptr(ctx) = 0;
 
-       if (!(ar & (1<<11)))
-           return -1;  /* Not code. */
+       /* Return.  The kernel should recover without sending another signal. */
+}
 
-       if (l && !db)
-               return 64;
-       else if (!l && db)
-               return 32;
-       else if (!l && !db)
-               return 16;
-       else
-               return -1;      /* Unknown bitness. */
+static int test_nonstrict_ss(void)
+{
+       clearhandler(SIGUSR1);
+       clearhandler(SIGTRAP);
+       clearhandler(SIGSEGV);
+       clearhandler(SIGILL);
+       sethandler(SIGUSR2, sigusr2, 0);
+
+       nerrs = 0;
+
+       printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n");
+       raise(SIGUSR2);
+       if (!nerrs)
+               printf("[OK]\tIt worked\n");
+
+       return nerrs;
 }
+#endif
 
 /* Finds a usable code segment of the requested bitness. */
 int find_cs(int bitness)
@@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
                       errdesc, strsignal(sig_trapped));
                return 0;
        } else {
+               /*
+                * This also implicitly tests UC_STRICT_RESTORE_SS:
+                * We check that these signals set UC_STRICT_RESTORE_SS and,
+                * if UC_STRICT_RESTORE_SS doesn't cause strict behavior,
+                * then we won't get SIGSEGV.
+                */
                printf("[FAIL]\tDid not get SIGSEGV\n");
                return 1;
        }
@@ -632,6 +794,14 @@ int main()
                                                    GDT3(gdt_data16_idx));
        }
 
+#ifdef __x86_64__
+       /* Nasty ABI case: check SS corruption handling. */
+       sig_corrupt_final_ss = 1;
+       total_nerrs += test_valid_sigreturn(32, false, -1);
+       total_nerrs += test_valid_sigreturn(32, true, -1);
+       sig_corrupt_final_ss = 0;
+#endif
+
        /*
         * We're done testing valid sigreturn cases.  Now we test states
         * for which sigreturn itself will succeed but the subsequent
@@ -680,5 +850,9 @@ int main()
        if (gdt_npdata32_idx)
                test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
 
+#ifdef __x86_64__
+       total_nerrs += test_nonstrict_ss();
+#endif
+
        return total_nerrs ? 1 : 0;
 }
index 60c06af..43fcab3 100644 (file)
@@ -17,6 +17,9 @@
 
 #include <stdio.h>
 #include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <err.h>
 #include <sys/syscall.h>
 #include <asm/processor-flags.h>
 
@@ -26,6 +29,8 @@
 # define WIDTH "l"
 #endif
 
+static unsigned int nerrs;
+
 static unsigned long get_eflags(void)
 {
        unsigned long eflags;
@@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags)
                      : : "rm" (eflags) : "flags");
 }
 
-int main()
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+                      int flags)
 {
-       printf("[RUN]\tSet NT and issue a syscall\n");
-       set_eflags(get_eflags() | X86_EFLAGS_NT);
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_sigaction = handler;
+       sa.sa_flags = SA_SIGINFO | flags;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+static void do_it(unsigned long extraflags)
+{
+       unsigned long flags;
+
+       set_eflags(get_eflags() | extraflags);
        syscall(SYS_getpid);
-       if (get_eflags() & X86_EFLAGS_NT) {
-               printf("[OK]\tThe syscall worked and NT is still set\n");
-               return 0;
+       flags = get_eflags();
+       if ((flags & extraflags) == extraflags) {
+               printf("[OK]\tThe syscall worked and flags are still set\n");
        } else {
-               printf("[FAIL]\tThe syscall worked but NT was cleared\n");
-               return 1;
+               printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n",
+                      flags, extraflags);
+               nerrs++;
        }
 }
+
+int main(void)
+{
+       printf("[RUN]\tSet NT and issue a syscall\n");
+       do_it(X86_EFLAGS_NT);
+
+       /*
+        * Now try it again with TF set -- TF forces returns via IRET in all
+        * cases except non-ptregs-using 64-bit full fast path syscalls.
+        */
+
+       sethandler(SIGTRAP, sigtrap, 0);
+
+       printf("[RUN]\tSet NT|TF and issue a syscall\n");
+       do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
+
+       return nerrs == 0 ? 0 : 1;
+}
index 26b7926..ba34f9e 100644 (file)
@@ -1,15 +1,19 @@
 #if defined(__i386__) || defined(__x86_64__)
 #define barrier() asm volatile("" ::: "memory")
-#define mb() __sync_synchronize()
-
-#define smp_mb()       mb()
-# define dma_rmb()     barrier()
-# define dma_wmb()     barrier()
-# define smp_rmb()     barrier()
-# define smp_wmb()     barrier()
+#define virt_mb() __sync_synchronize()
+#define virt_rmb() barrier()
+#define virt_wmb() barrier()
+/* Atomic store should be enough, but gcc generates worse code in that case. */
+#define virt_store_mb(var, value)  do { \
+       typeof(var) virt_store_mb_value = (value); \
+       __atomic_exchange(&(var), &virt_store_mb_value, &virt_store_mb_value, \
+                         __ATOMIC_SEQ_CST); \
+       barrier(); \
+} while (0);
 /* Weak barriers should be used. If not - it's a bug */
-# define rmb() abort()
-# define wmb() abort()
+# define mb() abort()
+# define rmb() abort()
+# define wmb() abort()
 #else
 #error Please fill in barrier macros
 #endif
diff --git a/tools/virtio/linux/compiler.h b/tools/virtio/linux/compiler.h
new file mode 100644 (file)
index 0000000..845960e
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef LINUX_COMPILER_H
+#define LINUX_COMPILER_H
+
+#define WRITE_ONCE(var, val) \
+       (*((volatile typeof(val) *)(&(var))) = (val))
+
+#define READ_ONCE(var) (*((volatile typeof(val) *)(&(var))))
+
+#endif
index 4db7d56..0338499 100644 (file)
@@ -8,6 +8,7 @@
 #include <assert.h>
 #include <stdarg.h>
 
+#include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/printk.h>
 #include <linux/bug.h>
diff --git a/tools/virtio/ringtest/Makefile b/tools/virtio/ringtest/Makefile
new file mode 100644 (file)
index 0000000..feaa64a
--- /dev/null
@@ -0,0 +1,22 @@
+all:
+
+all: ring virtio_ring_0_9 virtio_ring_poll
+
+CFLAGS += -Wall
+CFLAGS += -pthread -O2 -ggdb
+LDFLAGS += -pthread -O2 -ggdb
+
+main.o: main.c main.h
+ring.o: ring.c main.h
+virtio_ring_0_9.o: virtio_ring_0_9.c main.h
+virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h
+ring: ring.o main.o
+virtio_ring_0_9: virtio_ring_0_9.o main.o
+virtio_ring_poll: virtio_ring_poll.o main.o
+clean:
+       -rm main.o
+       -rm ring.o ring
+       -rm virtio_ring_0_9.o virtio_ring_0_9
+       -rm virtio_ring_poll.o virtio_ring_poll
+
+.PHONY: all clean
diff --git a/tools/virtio/ringtest/README b/tools/virtio/ringtest/README
new file mode 100644 (file)
index 0000000..34e94c4
--- /dev/null
@@ -0,0 +1,2 @@
+Partial implementation of various ring layouts, useful to tune virtio design.
+Uses shared memory heavily.
diff --git a/tools/virtio/ringtest/main.c b/tools/virtio/ringtest/main.c
new file mode 100644 (file)
index 0000000..3a5ff43
--- /dev/null
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Command line processing and common functions for ring benchmarking.
+ */
+#define _GNU_SOURCE
+#include <getopt.h>
+#include <pthread.h>
+#include <assert.h>
+#include <sched.h>
+#include "main.h"
+#include <sys/eventfd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <limits.h>
+
+int runcycles = 10000000;
+int max_outstanding = INT_MAX;
+int batch = 1;
+
+bool do_sleep = false;
+bool do_relax = false;
+bool do_exit = true;
+
+unsigned ring_size = 256;
+
+static int kickfd = -1;
+static int callfd = -1;
+
+void notify(int fd)
+{
+       unsigned long long v = 1;
+       int r;
+
+       vmexit();
+       r = write(fd, &v, sizeof v);
+       assert(r == sizeof v);
+       vmentry();
+}
+
+void wait_for_notify(int fd)
+{
+       unsigned long long v = 1;
+       int r;
+
+       vmexit();
+       r = read(fd, &v, sizeof v);
+       assert(r == sizeof v);
+       vmentry();
+}
+
+void kick(void)
+{
+       notify(kickfd);
+}
+
+void wait_for_kick(void)
+{
+       wait_for_notify(kickfd);
+}
+
+void call(void)
+{
+       notify(callfd);
+}
+
+void wait_for_call(void)
+{
+       wait_for_notify(callfd);
+}
+
+void set_affinity(const char *arg)
+{
+       cpu_set_t cpuset;
+       int ret;
+       pthread_t self;
+       long int cpu;
+       char *endptr;
+
+       if (!arg)
+               return;
+
+       cpu = strtol(arg, &endptr, 0);
+       assert(!*endptr);
+
+       assert(cpu >= 0 || cpu < CPU_SETSIZE);
+
+       self = pthread_self();
+       CPU_ZERO(&cpuset);
+       CPU_SET(cpu, &cpuset);
+
+       ret = pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset);
+       assert(!ret);
+}
+
+static void run_guest(void)
+{
+       int completed_before;
+       int completed = 0;
+       int started = 0;
+       int bufs = runcycles;
+       int spurious = 0;
+       int r;
+       unsigned len;
+       void *buf;
+       int tokick = batch;
+
+       for (;;) {
+               if (do_sleep)
+                       disable_call();
+               completed_before = completed;
+               do {
+                       if (started < bufs &&
+                           started - completed < max_outstanding) {
+                               r = add_inbuf(0, NULL, "Hello, world!");
+                               if (__builtin_expect(r == 0, true)) {
+                                       ++started;
+                                       if (!--tokick) {
+                                               tokick = batch;
+                                               if (do_sleep)
+                                                       kick_available();
+                                       }
+
+                               }
+                       } else
+                               r = -1;
+
+                       /* Flush out completed bufs if any */
+                       if (get_buf(&len, &buf)) {
+                               ++completed;
+                               if (__builtin_expect(completed == bufs, false))
+                                       return;
+                               r = 0;
+                       }
+               } while (r == 0);
+               if (completed == completed_before)
+                       ++spurious;
+               assert(completed <= bufs);
+               assert(started <= bufs);
+               if (do_sleep) {
+                       if (enable_call())
+                               wait_for_call();
+               } else {
+                       poll_used();
+               }
+       }
+}
+
+static void run_host(void)
+{
+       int completed_before;
+       int completed = 0;
+       int spurious = 0;
+       int bufs = runcycles;
+       unsigned len;
+       void *buf;
+
+       for (;;) {
+               if (do_sleep) {
+                       if (enable_kick())
+                               wait_for_kick();
+               } else {
+                       poll_avail();
+               }
+               if (do_sleep)
+                       disable_kick();
+               completed_before = completed;
+               while (__builtin_expect(use_buf(&len, &buf), true)) {
+                       if (do_sleep)
+                               call_used();
+                       ++completed;
+                       if (__builtin_expect(completed == bufs, false))
+                               return;
+               }
+               if (completed == completed_before)
+                       ++spurious;
+               assert(completed <= bufs);
+               if (completed == bufs)
+                       break;
+       }
+}
+
+void *start_guest(void *arg)
+{
+       set_affinity(arg);
+       run_guest();
+       pthread_exit(NULL);
+}
+
+void *start_host(void *arg)
+{
+       set_affinity(arg);
+       run_host();
+       pthread_exit(NULL);
+}
+
+static const char optstring[] = "";
+static const struct option longopts[] = {
+       {
+               .name = "help",
+               .has_arg = no_argument,
+               .val = 'h',
+       },
+       {
+               .name = "host-affinity",
+               .has_arg = required_argument,
+               .val = 'H',
+       },
+       {
+               .name = "guest-affinity",
+               .has_arg = required_argument,
+               .val = 'G',
+       },
+       {
+               .name = "ring-size",
+               .has_arg = required_argument,
+               .val = 'R',
+       },
+       {
+               .name = "run-cycles",
+               .has_arg = required_argument,
+               .val = 'C',
+       },
+       {
+               .name = "outstanding",
+               .has_arg = required_argument,
+               .val = 'o',
+       },
+       {
+               .name = "batch",
+               .has_arg = required_argument,
+               .val = 'b',
+       },
+       {
+               .name = "sleep",
+               .has_arg = no_argument,
+               .val = 's',
+       },
+       {
+               .name = "relax",
+               .has_arg = no_argument,
+               .val = 'x',
+       },
+       {
+               .name = "exit",
+               .has_arg = no_argument,
+               .val = 'e',
+       },
+       {
+       }
+};
+
+static void help(void)
+{
+       fprintf(stderr, "Usage: <test> [--help]"
+               " [--host-affinity H]"
+               " [--guest-affinity G]"
+               " [--ring-size R (default: %d)]"
+               " [--run-cycles C (default: %d)]"
+               " [--batch b]"
+               " [--outstanding o]"
+               " [--sleep]"
+               " [--relax]"
+               " [--exit]"
+               "\n",
+               ring_size,
+               runcycles);
+}
+
+int main(int argc, char **argv)
+{
+       int ret;
+       pthread_t host, guest;
+       void *tret;
+       char *host_arg = NULL;
+       char *guest_arg = NULL;
+       char *endptr;
+       long int c;
+
+       kickfd = eventfd(0, 0);
+       assert(kickfd >= 0);
+       callfd = eventfd(0, 0);
+       assert(callfd >= 0);
+
+       for (;;) {
+               int o = getopt_long(argc, argv, optstring, longopts, NULL);
+               switch (o) {
+               case -1:
+                       goto done;
+               case '?':
+                       help();
+                       exit(2);
+               case 'H':
+                       host_arg = optarg;
+                       break;
+               case 'G':
+                       guest_arg = optarg;
+                       break;
+               case 'R':
+                       ring_size = strtol(optarg, &endptr, 0);
+                       assert(ring_size && !(ring_size & (ring_size - 1)));
+                       assert(!*endptr);
+                       break;
+               case 'C':
+                       c = strtol(optarg, &endptr, 0);
+                       assert(!*endptr);
+                       assert(c > 0 && c < INT_MAX);
+                       runcycles = c;
+                       break;
+               case 'o':
+                       c = strtol(optarg, &endptr, 0);
+                       assert(!*endptr);
+                       assert(c > 0 && c < INT_MAX);
+                       max_outstanding = c;
+                       break;
+               case 'b':
+                       c = strtol(optarg, &endptr, 0);
+                       assert(!*endptr);
+                       assert(c > 0 && c < INT_MAX);
+                       batch = c;
+                       break;
+               case 's':
+                       do_sleep = true;
+                       break;
+               case 'x':
+                       do_relax = true;
+                       break;
+               case 'e':
+                       do_exit = true;
+                       break;
+               default:
+                       help();
+                       exit(4);
+                       break;
+               }
+       }
+
+       /* does nothing here, used to make sure all smp APIs compile */
+       smp_acquire();
+       smp_release();
+       smp_mb();
+done:
+
+       if (batch > max_outstanding)
+               batch = max_outstanding;
+
+       if (optind < argc) {
+               help();
+               exit(4);
+       }
+       alloc_ring();
+
+       ret = pthread_create(&host, NULL, start_host, host_arg);
+       assert(!ret);
+       ret = pthread_create(&guest, NULL, start_guest, guest_arg);
+       assert(!ret);
+
+       ret = pthread_join(guest, &tret);
+       assert(!ret);
+       ret = pthread_join(host, &tret);
+       assert(!ret);
+       return 0;
+}
diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h
new file mode 100644 (file)
index 0000000..16917ac
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Common macros and functions for ring benchmarking.
+ */
+#ifndef MAIN_H
+#define MAIN_H
+
+#include <stdbool.h>
+
+extern bool do_exit;
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "x86intrin.h"
+
+static inline void wait_cycles(unsigned long long cycles)
+{
+       unsigned long long t;
+
+       t = __rdtsc();
+       while (__rdtsc() - t < cycles) {}
+}
+
+#define VMEXIT_CYCLES 500
+#define VMENTRY_CYCLES 500
+
+#else
+static inline void wait_cycles(unsigned long long cycles)
+{
+       _Exit(5);
+}
+#define VMEXIT_CYCLES 0
+#define VMENTRY_CYCLES 0
+#endif
+
+static inline void vmexit(void)
+{
+       if (!do_exit)
+               return;
+       
+       wait_cycles(VMEXIT_CYCLES);
+}
+static inline void vmentry(void)
+{
+       if (!do_exit)
+               return;
+       
+       wait_cycles(VMENTRY_CYCLES);
+}
+
+/* implemented by ring */
+void alloc_ring(void);
+/* guest side */
+int add_inbuf(unsigned, void *, void *);
+void *get_buf(unsigned *, void **);
+void disable_call();
+bool enable_call();
+void kick_available();
+void poll_used();
+/* host side */
+void disable_kick();
+bool enable_kick();
+bool use_buf(unsigned *, void **);
+void call_used();
+void poll_avail();
+
+/* implemented by main */
+extern bool do_sleep;
+void kick(void);
+void wait_for_kick(void);
+void call(void);
+void wait_for_call(void);
+
+extern unsigned ring_size;
+
+/* Compiler barrier - similar to what Linux uses */
+#define barrier() asm volatile("" ::: "memory")
+
+/* Is there a portable way to do this? */
+#if defined(__x86_64__) || defined(__i386__)
+#define cpu_relax() asm ("rep; nop" ::: "memory")
+#else
+#define cpu_relax() assert(0)
+#endif
+
+extern bool do_relax;
+
+static inline void busy_wait(void)
+{
+       if (do_relax)
+               cpu_relax();
+       else
+               /* prevent compiler from removing busy loops */
+               barrier();
+} 
+
+/*
+ * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized
+ * with other __ATOMIC_SEQ_CST calls.
+ */
+#define smp_mb() __sync_synchronize()
+
+/*
+ * This abuses the atomic builtins for thread fences, and
+ * adds a compiler barrier.
+ */
+#define smp_release() do { \
+    barrier(); \
+    __atomic_thread_fence(__ATOMIC_RELEASE); \
+} while (0)
+
+#define smp_acquire() do { \
+    __atomic_thread_fence(__ATOMIC_ACQUIRE); \
+    barrier(); \
+} while (0)
+
+#endif
diff --git a/tools/virtio/ringtest/ring.c b/tools/virtio/ringtest/ring.c
new file mode 100644 (file)
index 0000000..c25c8d2
--- /dev/null
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Simple descriptor-based ring. virtio 0.9 compatible event index is used for
+ * signalling, unconditionally.
+ */
+#define _GNU_SOURCE
+#include "main.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Next - Where next entry will be written.
+ * Prev - "Next" value when event triggered previously.
+ * Event - Peer requested event after writing this entry.
+ */
+static inline bool need_event(unsigned short event,
+                             unsigned short next,
+                             unsigned short prev)
+{
+       return (unsigned short)(next - event - 1) < (unsigned short)(next - prev);
+}
+
+/* Design:
+ * Guest adds descriptors with unique index values and DESC_HW in flags.
+ * Host overwrites used descriptors with correct len, index, and DESC_HW clear.
+ * Flags are always set last.
+ */
+#define DESC_HW 0x1
+
+struct desc {
+       unsigned short flags;
+       unsigned short index;
+       unsigned len;
+       unsigned long long addr;
+};
+
+/* how much padding is needed to avoid false cache sharing */
+#define HOST_GUEST_PADDING 0x80
+
+/* Mostly read */
+struct event {
+       unsigned short kick_index;
+       unsigned char reserved0[HOST_GUEST_PADDING - 2];
+       unsigned short call_index;
+       unsigned char reserved1[HOST_GUEST_PADDING - 2];
+};
+
+struct data {
+       void *buf; /* descriptor is writeable, we can't get buf from there */
+       void *data;
+} *data;
+
+struct desc *ring;
+struct event *event;
+
+struct guest {
+       unsigned avail_idx;
+       unsigned last_used_idx;
+       unsigned num_free;
+       unsigned kicked_avail_idx;
+       unsigned char reserved[HOST_GUEST_PADDING - 12];
+} guest;
+
+struct host {
+       /* we do not need to track last avail index
+        * unless we have more than one in flight.
+        */
+       unsigned used_idx;
+       unsigned called_used_idx;
+       unsigned char reserved[HOST_GUEST_PADDING - 4];
+} host;
+
+/* implemented by ring */
+void alloc_ring(void)
+{
+       int ret;
+       int i;
+
+       ret = posix_memalign((void **)&ring, 0x1000, ring_size * sizeof *ring);
+       if (ret) {
+               perror("Unable to allocate ring buffer.\n");
+               exit(3);
+       }
+       event = malloc(sizeof *event);
+       if (!event) {
+               perror("Unable to allocate event buffer.\n");
+               exit(3);
+       }
+       memset(event, 0, sizeof *event);
+       guest.avail_idx = 0;
+       guest.kicked_avail_idx = -1;
+       guest.last_used_idx = 0;
+       host.used_idx = 0;
+       host.called_used_idx = -1;
+       for (i = 0; i < ring_size; ++i) {
+               struct desc desc = {
+                       .index = i,
+               };
+               ring[i] = desc;
+       }
+       guest.num_free = ring_size;
+       data = malloc(ring_size * sizeof *data);
+       if (!data) {
+               perror("Unable to allocate data buffer.\n");
+               exit(3);
+       }
+       memset(data, 0, ring_size * sizeof *data);
+}
+
+/* guest side */
+int add_inbuf(unsigned len, void *buf, void *datap)
+{
+       unsigned head, index;
+
+       if (!guest.num_free)
+               return -1;
+
+       guest.num_free--;
+       head = (ring_size - 1) & (guest.avail_idx++);
+
+       /* Start with a write. On MESI architectures this helps
+        * avoid a shared state with consumer that is polling this descriptor.
+        */
+       ring[head].addr = (unsigned long)(void*)buf;
+       ring[head].len = len;
+       /* read below might bypass write above. That is OK because it's just an
+        * optimization. If this happens, we will get the cache line in a
+        * shared state which is unfortunate, but probably not worth it to
+        * add an explicit full barrier to avoid this.
+        */
+       barrier();
+       index = ring[head].index;
+       data[index].buf = buf;
+       data[index].data = datap;
+       /* Barrier A (for pairing) */
+       smp_release();
+       ring[head].flags = DESC_HW;
+
+       return 0;
+}
+
+void *get_buf(unsigned *lenp, void **bufp)
+{
+       unsigned head = (ring_size - 1) & guest.last_used_idx;
+       unsigned index;
+       void *datap;
+
+       if (ring[head].flags & DESC_HW)
+               return NULL;
+       /* Barrier B (for pairing) */
+       smp_acquire();
+       *lenp = ring[head].len;
+       index = ring[head].index & (ring_size - 1);
+       datap = data[index].data;
+       *bufp = data[index].buf;
+       data[index].buf = NULL;
+       data[index].data = NULL;
+       guest.num_free++;
+       guest.last_used_idx++;
+       return datap;
+}
+
+void poll_used(void)
+{
+       unsigned head = (ring_size - 1) & guest.last_used_idx;
+
+       while (ring[head].flags & DESC_HW)
+               busy_wait();
+}
+
+void disable_call()
+{
+       /* Doing nothing to disable calls might cause
+        * extra interrupts, but reduces the number of cache misses.
+        */
+}
+
+bool enable_call()
+{
+       unsigned head = (ring_size - 1) & guest.last_used_idx;
+
+       event->call_index = guest.last_used_idx;
+       /* Flush call index write */
+       /* Barrier D (for pairing) */
+       smp_mb();
+       return ring[head].flags & DESC_HW;
+}
+
+void kick_available(void)
+{
+       /* Flush in previous flags write */
+       /* Barrier C (for pairing) */
+       smp_mb();
+       if (!need_event(event->kick_index,
+                       guest.avail_idx,
+                       guest.kicked_avail_idx))
+               return;
+
+       guest.kicked_avail_idx = guest.avail_idx;
+       kick();
+}
+
+/* host side */
+void disable_kick()
+{
+       /* Doing nothing to disable kicks might cause
+        * extra interrupts, but reduces the number of cache misses.
+        */
+}
+
+bool enable_kick()
+{
+       unsigned head = (ring_size - 1) & host.used_idx;
+
+       event->kick_index = host.used_idx;
+       /* Barrier C (for pairing) */
+       smp_mb();
+       return !(ring[head].flags & DESC_HW);
+}
+
+void poll_avail(void)
+{
+       unsigned head = (ring_size - 1) & host.used_idx;
+
+       while (!(ring[head].flags & DESC_HW))
+               busy_wait();
+}
+
+bool use_buf(unsigned *lenp, void **bufp)
+{
+       unsigned head = (ring_size - 1) & host.used_idx;
+
+       if (!(ring[head].flags & DESC_HW))
+               return false;
+
+       /* make sure length read below is not speculated */
+       /* Barrier A (for pairing) */
+       smp_acquire();
+
+       /* simple in-order completion: we don't need
+        * to touch index at all. This also means we
+        * can just modify the descriptor in-place.
+        */
+       ring[head].len--;
+       /* Make sure len is valid before flags.
+        * Note: alternative is to write len and flags in one access -
+        * possible on 64 bit architectures but wmb is free on Intel anyway
+        * so I have no way to test whether it's a gain.
+        */
+       /* Barrier B (for pairing) */
+       smp_release();
+       ring[head].flags = 0;
+       host.used_idx++;
+       return true;
+}
+
+void call_used(void)
+{
+       /* Flush in previous flags write */
+       /* Barrier D (for pairing) */
+       smp_mb();
+       if (!need_event(event->call_index,
+                       host.used_idx,
+                       host.called_used_idx))
+               return;
+
+       host.called_used_idx = host.used_idx;
+       call();
+}
diff --git a/tools/virtio/ringtest/run-on-all.sh b/tools/virtio/ringtest/run-on-all.sh
new file mode 100755 (executable)
index 0000000..52b0f71
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+#use last CPU for host. Why not the first?
+#many devices tend to use cpu0 by default so
+#it tends to be busier
+HOST_AFFINITY=$(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n|tail -1)
+
+#run command on all cpus
+for cpu in $(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n);
+do
+       #Don't run guest and host on same CPU
+       #It actually works ok if using signalling
+       if
+               (echo "$@" | grep -e "--sleep" > /dev/null) || \
+                       test $HOST_AFFINITY '!=' $cpu
+       then
+               echo "GUEST AFFINITY $cpu"
+               "$@" --host-affinity $HOST_AFFINITY --guest-affinity $cpu
+       fi
+done
+echo "NO GUEST AFFINITY"
+"$@" --host-affinity $HOST_AFFINITY
+echo "NO AFFINITY"
+"$@"
diff --git a/tools/virtio/ringtest/virtio_ring_0_9.c b/tools/virtio/ringtest/virtio_ring_0_9.c
new file mode 100644 (file)
index 0000000..47c9a1a
--- /dev/null
@@ -0,0 +1,316 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Partial implementation of virtio 0.9. event index is used for signalling,
+ * unconditionally. Design roughly follows linux kernel implementation in order
+ * to be able to judge its performance.
+ */
+#define _GNU_SOURCE
+#include "main.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <linux/virtio_ring.h>
+
+struct data {
+       void *data;
+} *data;
+
+struct vring ring;
+
+/* enabling the below activates experimental ring polling code
+ * (which skips index reads on consumer in favor of looking at
+ * high bits of ring id ^ 0x8000).
+ */
+/* #ifdef RING_POLL */
+
+/* how much padding is needed to avoid false cache sharing */
+#define HOST_GUEST_PADDING 0x80
+
+struct guest {
+       unsigned short avail_idx;
+       unsigned short last_used_idx;
+       unsigned short num_free;
+       unsigned short kicked_avail_idx;
+       unsigned short free_head;
+       unsigned char reserved[HOST_GUEST_PADDING - 10];
+} guest;
+
+struct host {
+       /* we do not need to track last avail index
+        * unless we have more than one in flight.
+        */
+       unsigned short used_idx;
+       unsigned short called_used_idx;
+       unsigned char reserved[HOST_GUEST_PADDING - 4];
+} host;
+
+/* implemented by ring */
+void alloc_ring(void)
+{
+       int ret;
+       int i;
+       void *p;
+
+       ret = posix_memalign(&p, 0x1000, vring_size(ring_size, 0x1000));
+       if (ret) {
+               perror("Unable to allocate ring buffer.\n");
+               exit(3);
+       }
+       memset(p, 0, vring_size(ring_size, 0x1000));
+       vring_init(&ring, ring_size, p, 0x1000);
+
+       guest.avail_idx = 0;
+       guest.kicked_avail_idx = -1;
+       guest.last_used_idx = 0;
+       /* Put everything in free lists. */
+       guest.free_head = 0;
+       for (i = 0; i < ring_size - 1; i++)
+               ring.desc[i].next = i + 1;
+       host.used_idx = 0;
+       host.called_used_idx = -1;
+       guest.num_free = ring_size;
+       data = malloc(ring_size * sizeof *data);
+       if (!data) {
+               perror("Unable to allocate data buffer.\n");
+               exit(3);
+       }
+       memset(data, 0, ring_size * sizeof *data);
+}
+
+/* guest side */
+int add_inbuf(unsigned len, void *buf, void *datap)
+{
+       unsigned head, avail;
+       struct vring_desc *desc;
+
+       if (!guest.num_free)
+               return -1;
+
+       head = guest.free_head;
+       guest.num_free--;
+
+       desc = ring.desc;
+       desc[head].flags = VRING_DESC_F_NEXT;
+       desc[head].addr = (unsigned long)(void *)buf;
+       desc[head].len = len;
+       /* We do it like this to simulate the way
+        * we'd have to flip it if we had multiple
+        * descriptors.
+        */
+       desc[head].flags &= ~VRING_DESC_F_NEXT;
+       guest.free_head = desc[head].next;
+
+       data[head].data = datap;
+
+#ifdef RING_POLL
+       /* Barrier A (for pairing) */
+       smp_release();
+       avail = guest.avail_idx++;
+       ring.avail->ring[avail & (ring_size - 1)] =
+               (head | (avail & ~(ring_size - 1))) ^ 0x8000;
+#else
+       avail = (ring_size - 1) & (guest.avail_idx++);
+       ring.avail->ring[avail] = head;
+       /* Barrier A (for pairing) */
+       smp_release();
+#endif
+       ring.avail->idx = guest.avail_idx;
+       return 0;
+}
+
+void *get_buf(unsigned *lenp, void **bufp)
+{
+       unsigned head;
+       unsigned index;
+       void *datap;
+
+#ifdef RING_POLL
+       head = (ring_size - 1) & guest.last_used_idx;
+       index = ring.used->ring[head].id;
+       if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1))
+               return NULL;
+       /* Barrier B (for pairing) */
+       smp_acquire();
+       index &= ring_size - 1;
+#else
+       if (ring.used->idx == guest.last_used_idx)
+               return NULL;
+       /* Barrier B (for pairing) */
+       smp_acquire();
+       head = (ring_size - 1) & guest.last_used_idx;
+       index = ring.used->ring[head].id;
+#endif
+       *lenp = ring.used->ring[head].len;
+       datap = data[index].data;
+       *bufp = (void*)(unsigned long)ring.desc[index].addr;
+       data[index].data = NULL;
+       ring.desc[index].next = guest.free_head;
+       guest.free_head = index;
+       guest.num_free++;
+       guest.last_used_idx++;
+       return datap;
+}
+
+void poll_used(void)
+{
+#ifdef RING_POLL
+       unsigned head = (ring_size - 1) & guest.last_used_idx;
+
+       for (;;) {
+               unsigned index = ring.used->ring[head].id;
+
+               if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1))
+                       busy_wait();
+               else
+                       break;
+       }
+#else
+       unsigned head = guest.last_used_idx;
+
+       while (ring.used->idx == head)
+               busy_wait();
+#endif
+}
+
+void disable_call()
+{
+       /* Doing nothing to disable calls might cause
+        * extra interrupts, but reduces the number of cache misses.
+        */
+}
+
+bool enable_call()
+{
+       unsigned short last_used_idx;
+
+       vring_used_event(&ring) = (last_used_idx = guest.last_used_idx);
+       /* Flush call index write */
+       /* Barrier D (for pairing) */
+       smp_mb();
+#ifdef RING_POLL
+       {
+               unsigned short head = last_used_idx & (ring_size - 1);
+               unsigned index = ring.used->ring[head].id;
+
+               return (index ^ last_used_idx ^ 0x8000) & ~(ring_size - 1);
+       }
+#else
+       return ring.used->idx == last_used_idx;
+#endif
+}
+
+void kick_available(void)
+{
+       /* Flush in previous flags write */
+       /* Barrier C (for pairing) */
+       smp_mb();
+       if (!vring_need_event(vring_avail_event(&ring),
+                             guest.avail_idx,
+                             guest.kicked_avail_idx))
+               return;
+
+       guest.kicked_avail_idx = guest.avail_idx;
+       kick();
+}
+
+/* host side */
+void disable_kick()
+{
+       /* Doing nothing to disable kicks might cause
+        * extra interrupts, but reduces the number of cache misses.
+        */
+}
+
+bool enable_kick()
+{
+       unsigned head = host.used_idx;
+
+       vring_avail_event(&ring) = head;
+       /* Barrier C (for pairing) */
+       smp_mb();
+#ifdef RING_POLL
+       {
+               unsigned index = ring.avail->ring[head & (ring_size - 1)];
+
+               return (index ^ head ^ 0x8000) & ~(ring_size - 1);
+       }
+#else
+       return head == ring.avail->idx;
+#endif
+}
+
+void poll_avail(void)
+{
+       unsigned head = host.used_idx;
+#ifdef RING_POLL
+       for (;;) {
+               unsigned index = ring.avail->ring[head & (ring_size - 1)];
+               if ((index ^ head ^ 0x8000) & ~(ring_size - 1))
+                       busy_wait();
+               else
+                       break;
+       }
+#else
+       while (ring.avail->idx == head)
+               busy_wait();
+#endif
+}
+
+bool use_buf(unsigned *lenp, void **bufp)
+{
+       unsigned used_idx = host.used_idx;
+       struct vring_desc *desc;
+       unsigned head;
+
+#ifdef RING_POLL
+       head = ring.avail->ring[used_idx & (ring_size - 1)];
+       if ((used_idx ^ head ^ 0x8000) & ~(ring_size - 1))
+               return false;
+       /* Barrier A (for pairing) */
+       smp_acquire();
+
+       used_idx &= ring_size - 1;
+       desc = &ring.desc[head & (ring_size - 1)];
+#else
+       if (used_idx == ring.avail->idx)
+               return false;
+
+       /* Barrier A (for pairing) */
+       smp_acquire();
+
+       used_idx &= ring_size - 1;
+       head = ring.avail->ring[used_idx];
+       desc = &ring.desc[head];
+#endif
+
+       *lenp = desc->len;
+       *bufp = (void *)(unsigned long)desc->addr;
+
+       /* now update used ring */
+       ring.used->ring[used_idx].id = head;
+       ring.used->ring[used_idx].len = desc->len - 1;
+       /* Barrier B (for pairing) */
+       smp_release();
+       host.used_idx++;
+       ring.used->idx = host.used_idx;
+       
+       return true;
+}
+
+void call_used(void)
+{
+       /* Flush in previous flags write */
+       /* Barrier D (for pairing) */
+       smp_mb();
+       if (!vring_need_event(vring_used_event(&ring),
+                             host.used_idx,
+                             host.called_used_idx))
+               return;
+
+       host.called_used_idx = host.used_idx;
+       call();
+}
diff --git a/tools/virtio/ringtest/virtio_ring_poll.c b/tools/virtio/ringtest/virtio_ring_poll.c
new file mode 100644 (file)
index 0000000..84fc2c5
--- /dev/null
@@ -0,0 +1,2 @@
+#define RING_POLL 1
+#include "virtio_ring_0_9.c"
index 69bca18..ea60646 100644 (file)
@@ -143,7 +143,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
  * Check if there was a change in the timer state (should we raise or lower
  * the line level to the GIC).
  */
-static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
+static int kvm_timer_update_state(struct kvm_vcpu *vcpu)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
@@ -154,10 +154,12 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
         * until we call this function from kvm_timer_flush_hwstate.
         */
        if (!vgic_initialized(vcpu->kvm))
-           return;
+               return -ENODEV;
 
        if (kvm_timer_should_fire(vcpu) != timer->irq.level)
                kvm_timer_update_irq(vcpu, !timer->irq.level);
+
+       return 0;
 }
 
 /*
@@ -218,7 +220,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
        bool phys_active;
        int ret;
 
-       kvm_timer_update_state(vcpu);
+       if (kvm_timer_update_state(vcpu))
+               return;
 
        /*
        * If we enter the guest with the virtual input level to the VGIC
index 043032c..00429b3 100644 (file)
@@ -1875,8 +1875,8 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-       int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
+       int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
+       int sz = nr_longs * sizeof(unsigned long);
        vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
        vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
        vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
index 3531599..65da997 100644 (file)
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
         * This memory barrier pairs with prepare_to_wait's set_current_state()
         */
        smp_mb();
-       if (waitqueue_active(&vcpu->wq))
-               wake_up_interruptible(&vcpu->wq);
+       if (swait_active(&vcpu->wq))
+               swake_up(&vcpu->wq);
 
        mmput(mm);
        kvm_put_kvm(vcpu->kvm);
@@ -172,7 +172,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
         * do alloc nowait since if we are going to sleep anyway we
         * may as well sleep faulting in page
         */
-       work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
+       work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
        if (!work)
                return 0;
 
index a11cfd2..5af50c3 100644 (file)
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
-       vcpu->halt_poll_ns = 0;
-       init_waitqueue_head(&vcpu->wq);
+       init_swait_queue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
 
        vcpu->pre_pcpu = -1;
@@ -1952,6 +1951,9 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
        else
                val *= halt_poll_ns_grow;
 
+       if (val > halt_poll_ns)
+               val = halt_poll_ns;
+
        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
 }
@@ -1990,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
        ktime_t start, cur;
-       DEFINE_WAIT(wait);
+       DECLARE_SWAITQUEUE(wait);
        bool waited = false;
        u64 block_ns;
 
@@ -2015,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        kvm_arch_vcpu_blocking(vcpu);
 
        for (;;) {
-               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;
@@ -2024,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                schedule();
        }
 
-       finish_wait(&vcpu->wq, &wait);
+       finish_swait(&vcpu->wq, &wait);
        cur = ktime_get();
 
        kvm_arch_vcpu_unblocking(vcpu);
@@ -2056,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
        int me;
        int cpu = vcpu->cpu;
-       wait_queue_head_t *wqp;
+       struct swait_queue_head *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
-       if (waitqueue_active(wqp)) {
-               wake_up_interruptible(wqp);
+       if (swait_active(wqp)) {
+               swake_up(wqp);
                ++vcpu->stat.halt_wakeup;
        }
 
@@ -2161,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (vcpu == me)
                                continue;
-                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
+                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;