perf_counter/x86: Remove the IRQ (non-NMI) handling bits
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30a..12cc05e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -40,7 +40,7 @@ struct cpu_hw_counters {
 struct x86_pmu {
        const char      *name;
        int             version;
-       int             (*handle_irq)(struct pt_regs *, int);
+       int             (*handle_irq)(struct pt_regs *);
        void            (*disable_all)(void);
        void            (*enable_all)(void);
        void            (*enable)(struct hw_perf_counter *, int);
@@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK                0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK          0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK      0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK              \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
+        CORE_EVNTSEL_EDGE_MASK  |      \
+        CORE_EVNTSEL_INV_MASK  |       \
         CORE_EVNTSEL_COUNTER_MASK)
 
        return event & CORE_EVNTSEL_MASK;
@@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK   0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK   0x000040000ULL
+#define K7_EVNTSEL_INV_MASK    0x000800000ULL
 #define K7_EVNTSEL_COUNTER_MASK        0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK                        \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
+        K7_EVNTSEL_EDGE_MASK  |        \
+        K7_EVNTSEL_INV_MASK   |        \
         K7_EVNTSEL_COUNTER_MASK)
 
        return event & K7_EVNTSEL_MASK;
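
Both the Intel and the AMD raw-event filters now also whitelist the edge-detect (bit 18) and invert (bit 23) bits of the event-select register, so raw hardware events may carry those qualifiers. A minimal userspace sketch of the Intel masking step; the raw config value below is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define CORE_EVNTSEL_EVENT_MASK    0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK     0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK     0x00040000ULL
#define CORE_EVNTSEL_INV_MASK      0x00800000ULL
#define CORE_EVNTSEL_COUNTER_MASK  0xFF000000ULL

#define CORE_EVNTSEL_MASK       (CORE_EVNTSEL_EVENT_MASK | \
                                 CORE_EVNTSEL_UNIT_MASK  | \
                                 CORE_EVNTSEL_EDGE_MASK  | \
                                 CORE_EVNTSEL_INV_MASK   | \
                                 CORE_EVNTSEL_COUNTER_MASK)

int main(void)
{
        /* made-up raw config: event 0x3c, umask 0x01, cmask 0x02, invert set */
        uint64_t raw = 0x0200013CULL | CORE_EVNTSEL_INV_MASK;

        /* only whitelisted bits survive, as in intel_pmu_raw_event() */
        printf("filtered config: %#llx\n",
               (unsigned long long)(raw & CORE_EVNTSEL_MASK));
        return 0;
}
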
@@ -239,11 +247,11 @@ static inline int x86_pmu_initialized(void)
 }
 
 /*
- * Setup the hardware configuration for a given hw_event_type
+ * Setup the hardware configuration for a given attr_type
  */
 static int __hw_perf_counter_init(struct perf_counter *counter)
 {
-       struct perf_counter_hw_event *hw_event = &counter->hw_event;
+       struct perf_counter_attr *attr = &counter->attr;
        struct hw_perf_counter *hwc = &counter->hw;
        int err;
 
@@ -271,39 +279,28 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        /*
         * Count user and OS events unless requested not to.
         */
-       if (!hw_event->exclude_user)
+       if (!attr->exclude_user)
                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-       if (!hw_event->exclude_kernel)
+       if (!attr->exclude_kernel)
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-       /*
-        * If privileged enough, allow NMI events:
-        */
-       hwc->nmi = 0;
-       if (hw_event->nmi) {
-               if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
-                       return -EACCES;
-               hwc->nmi = 1;
-       }
+       if (!hwc->sample_period)
+               hwc->sample_period = x86_pmu.max_period;
 
-       if (!hwc->irq_period)
-               hwc->irq_period = x86_pmu.max_period;
-
-       atomic64_set(&hwc->period_left,
-                       min(x86_pmu.max_period, hwc->irq_period));
+       atomic64_set(&hwc->period_left, hwc->sample_period);
 
        /*
         * Raw event type provides the config in the event structure
         */
-       if (perf_event_raw(hw_event)) {
-               hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
+       if (perf_event_raw(attr)) {
+               hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
        } else {
-               if (perf_event_id(hw_event) >= x86_pmu.max_events)
+               if (perf_event_id(attr) >= x86_pmu.max_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
-               hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
+               hwc->config |= x86_pmu.event_map(perf_event_id(attr));
        }
 
        counter->destroy = hw_perf_counter_destroy;
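
With the nmi flag and its CAP_SYS_ADMIN check gone, initialization reduces to building hwc->config from the attr and seeding the sample period from max_period when none was requested. Below is a rough standalone model of that flow; the limits, the event-map values and the helper names are illustrative stand-ins for the real x86_pmu callbacks, not the kernel's definitions, and the USR/OS exclude bits are omitted for brevity:

#include <stdint.h>
#include <stdio.h>

#define MAX_EVENTS  7               /* stand-in for x86_pmu.max_events */
#define MAX_PERIOD  (1ULL << 31)    /* stand-in for x86_pmu.max_period */

/* illustrative generic-id -> event-select map (x86_pmu.event_map() stand-in) */
static const uint64_t event_map[MAX_EVENTS] = {
        0x003c, 0x00c0, 0x4f2e, 0x412e, 0x00c4, 0x00c5, 0x013c,
};

struct hw_counter {
        uint64_t config;
        uint64_t sample_period;
        int64_t  period_left;
};

/* same shape as __hw_perf_counter_init() after this patch */
static int init_counter(struct hw_counter *hwc, int is_raw, uint64_t id)
{
        if (is_raw) {
                /* the kernel filters this through x86_pmu.raw_event() */
                hwc->config |= id;
        } else {
                if (id >= MAX_EVENTS)
                        return -1;      /* -EINVAL in the kernel */
                hwc->config |= event_map[id];
        }

        if (!hwc->sample_period)
                hwc->sample_period = MAX_PERIOD;
        hwc->period_left = hwc->sample_period;  /* atomic64_set() in the kernel */

        return 0;
}

int main(void)
{
        struct hw_counter hwc = { 0 };

        if (!init_counter(&hwc, 0, 1))  /* generic id 1, "instructions" here */
                printf("config=%#llx period=%llu\n",
                       (unsigned long long)hwc.config,
                       (unsigned long long)hwc.sample_period);
        return 0;
}
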
@@ -453,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * Set the next IRQ period, based on the hwc->period_left value.
  * To be called with the counter disabled in hw:
  */
-static void
+static int
 x86_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
 {
        s64 left = atomic64_read(&hwc->period_left);
-       s64 period = min(x86_pmu.max_period, hwc->irq_period);
-       int err;
+       s64 period = hwc->sample_period;
+       int err, ret = 0;
 
        /*
         * If we are way outside a reasonable range then just skip forward:
@@ -467,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter,
        if (unlikely(left <= -period)) {
                left = period;
                atomic64_set(&hwc->period_left, left);
+               ret = 1;
        }
 
        if (unlikely(left <= 0)) {
                left += period;
                atomic64_set(&hwc->period_left, left);
+               ret = 1;
        }
        /*
         * Quirk: certain CPUs dont like it if just 1 event is left:
@@ -479,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
        if (unlikely(left < 2))
                left = 2;
 
+       if (left > x86_pmu.max_period)
+               left = x86_pmu.max_period;
+
        per_cpu(prev_left[idx], smp_processor_id()) = left;
 
        /*
@@ -489,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
        err = checking_wrmsrl(hwc->counter_base + idx,
                             (u64)(-left) & x86_pmu.counter_mask);
+
+       return ret;
 }
 
 static inline void
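
Returning an int lets the interrupt handlers skip perf_counter_overflow() for counters whose period has not actually elapsed, and the clamp against max_period is now applied to the value being programmed rather than to the period itself, so long sample periods survive in period_left. A standalone sketch of that arithmetic, with an assumed 31-bit max period:

#include <stdint.h>
#include <stdio.h>

#define MAX_PERIOD ((1LL << 31) - 1)    /* assumed stand-in for x86_pmu.max_period */

/*
 * Mirrors the post-patch x86_perf_counter_set_period() arithmetic:
 * returns 1 when at least one full period has elapsed, and computes the
 * (negative) value that would be written into the up-counting hardware
 * counter.  The kernel additionally masks it with x86_pmu.counter_mask.
 */
static int set_period(int64_t *period_left, int64_t period, uint64_t *hw_val)
{
        int64_t left = *period_left;
        int ret = 0;

        if (left <= -period) {          /* way behind: skip forward */
                left = period;
                *period_left = left;
                ret = 1;
        }
        if (left <= 0) {                /* normal overflow: add one period */
                left += period;
                *period_left = left;
                ret = 1;
        }
        if (left < 2)                   /* quirk: some CPUs dislike a 1-event residue */
                left = 2;
        if (left > MAX_PERIOD)          /* cap what gets programmed, not the period */
                left = MAX_PERIOD;

        *hw_val = (uint64_t)(-left);
        return ret;
}

int main(void)
{
        int64_t left = -100;            /* example: 100 events past the last overflow */
        uint64_t val;
        int ret = set_period(&left, 10000, &val);

        printf("elapsed=%d next_left=%lld hw=%#llx\n",
               ret, (long long)left, (unsigned long long)val);
        return 0;
}
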
@@ -545,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
        if (!x86_pmu.num_counters_fixed)
                return -1;
 
-       if (unlikely(hwc->nmi))
-               return -1;
-
        event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
        if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
@@ -603,7 +604,7 @@ try_generic:
                hwc->counter_base = x86_pmu.perfctr;
        }
 
-       perf_counters_lapic_init(hwc->nmi);
+       perf_counters_lapic_init();
 
        x86_pmu.disable(hwc, idx);
 
@@ -616,6 +617,18 @@ try_generic:
        return 0;
 }
 
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       struct hw_perf_counter *hwc = &counter->hw;
+
+       if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+                               cpuc->counters[hwc->idx] != counter))
+               return;
+
+       x86_pmu.enable(hwc, hwc->idx);
+}
+
 void perf_counter_print_debug(void)
 {
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
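
The new unthrottle hook is the architecture half of the throttling that now lives in the generic perf_counter core: when perf_counter_overflow() signals that a counter fires too often, the interrupt handler disables it (see the hunks below), and the core later calls ->unthrottle so the x86 code simply re-enables the hardware counter after the WARN_ON_ONCE sanity check above. A toy model of that callback shape; the struct and names are simplified stand-ins, not the kernel's struct pmu:

#include <stdio.h>

/* simplified stand-ins for the kernel's struct pmu callbacks */
struct pmu_ops {
        void (*enable)(int idx);
        void (*disable)(int idx);
        void (*unthrottle)(int idx);
};

static void hw_enable(int idx)  { printf("enable  counter %d\n", idx); }
static void hw_disable(int idx) { printf("disable counter %d\n", idx); }

/* after this patch, unthrottling is just re-enabling the hardware counter */
static void hw_unthrottle(int idx) { hw_enable(idx); }

static const struct pmu_ops ops = {
        .enable     = hw_enable,
        .disable    = hw_disable,
        .unthrottle = hw_unthrottle,
};

int main(void)
{
        ops.disable(0);         /* core throttles a counter that fires too often */
        ops.unthrottle(0);      /* and lifts the throttle again via the new hook */
        return 0;
}
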
@@ -699,28 +712,50 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void intel_pmu_save_and_restart(struct perf_counter *counter)
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
 {
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;
+       int ret;
 
        x86_perf_counter_update(counter, hwc, idx);
-       x86_perf_counter_set_period(counter, hwc, idx);
+       ret = x86_perf_counter_set_period(counter, hwc, idx);
 
        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                intel_pmu_enable_counter(hwc, idx);
+
+       return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+       unsigned long flags;
+       int idx;
+
+       if (!x86_pmu.num_counters)
+               return;
+
+       local_irq_save(flags);
+
+       printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+               checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+       }
+       for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+               checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+       }
+
+       local_irq_restore(flags);
 }
 
-/*
- * Maximum interrupt frequency of 100KHz per CPU
- */
-#define PERFMON_MAX_INTERRUPTS (100000/HZ)
 
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
        struct cpu_hw_counters *cpuc;
        struct cpu_hw_counters;
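
intel_pmu_reset() is the escape hatch for the loop guard added above: if the handler spins more than 100 times it dumps the PMU state and zeroes every generic and fixed counter so the NMI storm stops. A userspace sketch of the same loop shape, using the usual architectural-perfmon MSR bases and a stub write in place of checking_wrmsrl():

#include <stdint.h>
#include <stdio.h>

#define MSR_ARCH_PERFMON_EVENTSEL0      0x186
#define MSR_ARCH_PERFMON_PERFCTR0       0x0c1
#define MSR_ARCH_PERFMON_FIXED_CTR0     0x309

/* stub: the kernel helper does a wrmsr and catches #GP faults */
static int checking_wrmsrl(unsigned int msr, uint64_t val)
{
        printf("wrmsr %#x <- %#llx\n", msr, (unsigned long long)val);
        return 0;
}

/* same shape as intel_pmu_reset(): zero every event select and counter */
static void pmu_reset(int num_counters, int num_counters_fixed)
{
        int idx;

        for (idx = 0; idx < num_counters; idx++) {
                checking_wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, 0ull);
                checking_wrmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, 0ull);
        }
        for (idx = 0; idx < num_counters_fixed; idx++)
                checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}

int main(void)
{
        pmu_reset(2, 3);        /* e.g. two generic and three fixed counters */
        return 0;
}
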
@@ -741,6 +776,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 again:
        if (++loops > 100) {
                WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+               perf_counter_print_debug();
+               intel_pmu_reset();
+               perf_enable();
                return 1;
        }
 
@@ -753,8 +791,10 @@ again:
                if (!test_bit(bit, cpuc->active_mask))
                        continue;
 
-               intel_pmu_save_and_restart(counter);
-               if (perf_counter_overflow(counter, nmi, regs, 0))
+               if (!intel_pmu_save_and_restart(counter))
+                       continue;
+
+               if (perf_counter_overflow(counter, 1, regs, 0))
                        intel_pmu_disable_counter(&counter->hw, bit);
        }
 
@@ -767,15 +807,14 @@ again:
        if (status)
                goto again;
 
-       if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
-               perf_enable();
+       perf_enable();
 
        return 1;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
-       int cpu, idx, throttle = 0, handled = 0;
+       int cpu, idx, handled = 0;
        struct cpu_hw_counters *cpuc;
        struct perf_counter *counter;
        struct hw_perf_counter *hwc;
@@ -784,71 +823,30 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-       if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
-               throttle = 1;
-               __perf_disable();
-               cpuc->enabled = 0;
-               barrier();
-       }
-
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-               int disable = 0;
-
                if (!test_bit(idx, cpuc->active_mask))
                        continue;
 
                counter = cpuc->counters[idx];
                hwc = &counter->hw;
 
-               if (counter->hw_event.nmi != nmi)
-                       goto next;
-
                val = x86_perf_counter_update(counter, hwc, idx);
                if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-                       goto next;
+                       continue;
 
                /* counter overflow */
-               x86_perf_counter_set_period(counter, hwc, idx);
                handled = 1;
                inc_irq_stat(apic_perf_irqs);
-               disable = perf_counter_overflow(counter, nmi, regs, 0);
+               if (!x86_perf_counter_set_period(counter, hwc, idx))
+                       continue;
 
-next:
-               if (disable || throttle)
+               if (perf_counter_overflow(counter, 1, regs, 0))
                        amd_pmu_disable_counter(hwc, idx);
        }
 
        return handled;
 }
 
-void perf_counter_unthrottle(void)
-{
-       struct cpu_hw_counters *cpuc;
-
-       if (!x86_pmu_initialized())
-               return;
-
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-       if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-               /*
-                * Clear them before re-enabling irqs/NMIs again:
-                */
-               cpuc->interrupts = 0;
-               perf_enable();
-       } else {
-               cpuc->interrupts = 0;
-       }
-}
-
-void smp_perf_counter_interrupt(struct pt_regs *regs)
-{
-       irq_enter();
-       apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-       ack_APIC_irq();
-       x86_pmu.handle_irq(regs, 0);
-       irq_exit();
-}
-
 void smp_perf_pending_interrupt(struct pt_regs *regs)
 {
        irq_enter();
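
With smp_perf_counter_interrupt() and perf_counter_unthrottle() gone, the PMU is always serviced through the NMI path, and the per-architecture rate cap disappears with it: PERFMON_MAX_INTERRUPTS allowed 100000/HZ overflow interrupts per tick (the "100KHz per CPU" limit mentioned in the removed comment), and that kind of rate limiting is now the generic core's job. The arithmetic of the removed cap, with HZ assumed to be 1000:

#include <stdio.h>

#define HZ 1000                               /* assumed tick rate */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)    /* the removed per-tick cap */

int main(void)
{
        printf("old cap: %d PMU interrupts per tick (%d per second per CPU)\n",
               PERFMON_MAX_INTERRUPTS, PERFMON_MAX_INTERRUPTS * HZ);
        return 0;
}
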
@@ -863,24 +861,15 @@ void set_perf_counter_pending(void)
        apic->send_IPI_self(LOCAL_PENDING_VECTOR);
 }
 
-void perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(void)
 {
-       u32 apic_val;
-
        if (!x86_pmu_initialized())
                return;
 
        /*
-        * Enable the performance counter vector in the APIC LVT:
+        * Always use NMI for PMU
         */
-       apic_val = apic_read(APIC_LVTERR);
-
-       apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
-       if (nmi)
-               apic_write(APIC_LVTPC, APIC_DM_NMI);
-       else
-               apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-       apic_write(APIC_LVTERR, apic_val);
+       apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
 static int __kprobes
@@ -912,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
         * If the first NMI handles both, the latter will be empty and daze
         * the CPU.
         */
-       x86_pmu.handle_irq(regs, 1);
+       x86_pmu.handle_irq(regs);
 
        return NOTIFY_STOP;
 }
@@ -1054,7 +1043,7 @@ void __init init_hw_perf_counters(void)
 
        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-       perf_counters_lapic_init(0);
+       perf_counters_lapic_init();
        register_die_notifier(&perf_counter_nmi_notifier);
 }
 
@@ -1067,6 +1056,7 @@ static const struct pmu pmu = {
        .enable         = x86_pmu_enable,
        .disable        = x86_pmu_disable,
        .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
 };
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)