Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3d4ebbd..2732e2c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -6,6 +6,7 @@
  *  Copyright (C) 2009 Jaswinder Singh Rajput
  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  *
 *  For licensing details see kernel-base/COPYING
  */
@@ -20,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
+#include <linux/cpu.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
 
 static u64 perf_counter_mask __read_mostly;
 
+/* The maximum number of PEBS counters: */
+#define MAX_PEBS_COUNTERS      4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE                (BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH            (BTS_RECORD_SIZE * 64)
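
These sizes follow directly from the record layout drained later in this patch (see intel_pmu_drain_bts_buffer() below): a BTS record is three 64-bit fields, so the per-CPU buffer holds 1024 records and the overflow threshold sits 64 records before the end of the buffer. A minimal standalone sketch of that arithmetic, using stdint types in place of the kernel's u64:

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the three-field record walked in intel_pmu_drain_bts_buffer(). */
    struct bts_record {
            uint64_t from;          /* branch source address */
            uint64_t to;            /* branch target address */
            uint64_t flags;         /* flag bits */
    };

    int main(void)
    {
            assert(sizeof(struct bts_record) == 24);        /* BTS_RECORD_SIZE */
            assert(24 * 1024 == 24576);                     /* BTS_BUFFER_SIZE, per CPU */
            assert(24 * 64 == 1536);                        /* BTS_OVFL_TH */
            return 0;
    }
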
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR                        (1 << 6)
+#define X86_DEBUGCTL_BTS               (1 << 7)
+#define X86_DEBUGCTL_BTINT             (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS                (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR       (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64-bit fields.
+ */
+struct debug_store {
+       u64     bts_buffer_base;
+       u64     bts_index;
+       u64     bts_absolute_maximum;
+       u64     bts_interrupt_threshold;
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
 struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        int                     enabled;
+       struct debug_store      *ds;
 };
 
 /*
@@ -58,6 +100,8 @@ struct x86_pmu {
        int             apic;
        u64             max_period;
        u64             intel_ctrl;
+       void            (*enable_bts)(u64 config);
+       void            (*disable_bts)(void);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter,
        u64 prev_raw_count, new_raw_count;
        s64 delta;
 
+       if (idx == X86_PMC_IDX_FIXED_BTS)
+               return 0;
+
        /*
         * Careful: an NMI might modify the previous counter value.
         *
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
 #endif
 }
 
+static inline bool bts_available(void)
+{
+       return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+       if (!ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)((u64)(unsigned long)ds),
+                    (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+       if (!per_cpu(cpu_hw_counters, cpu).ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
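
wrmsr_on_cpu() takes the 64-bit MSR payload as separate low and high 32-bit words, which is why init_debug_store_on_cpu() splits the DS area pointer with a cast and a shift. A quick standalone illustration of that split, using a made-up 64-bit address:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t ds = 0xffff880012345678ULL;    /* example address only */

            assert((uint32_t)ds == 0x12345678u);            /* low word  */
            assert((uint32_t)(ds >> 32) == 0xffff8800u);    /* high word */
            return 0;
    }
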
+
+static void release_bts_hardware(void)
+{
+       int cpu;
+
+       if (!bts_available())
+               return;
+
+       get_online_cpus();
+
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+               if (!ds)
+                       continue;
+
+               per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+               kfree((void *)(unsigned long)ds->bts_buffer_base);
+               kfree(ds);
+       }
+
+       put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+       int cpu, err = 0;
+
+       if (!bts_available())
+               return 0;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               struct debug_store *ds;
+               void *buffer;
+
+               err = -ENOMEM;
+               buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+               if (unlikely(!buffer))
+                       break;
+
+               ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+               if (unlikely(!ds)) {
+                       kfree(buffer);
+                       break;
+               }
+
+               ds->bts_buffer_base = (u64)(unsigned long)buffer;
+               ds->bts_index = ds->bts_buffer_base;
+               ds->bts_absolute_maximum =
+                       ds->bts_buffer_base + BTS_BUFFER_SIZE;
+               ds->bts_interrupt_threshold =
+                       ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+               per_cpu(cpu_hw_counters, cpu).ds = ds;
+               err = 0;
+       }
+
+       if (err)
+               release_bts_hardware();
+       else {
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       }
+
+       put_online_cpus();
+
+       return err;
+}
+
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
        if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
                release_pmc_hardware();
+               release_bts_hardware();
                mutex_unlock(&pmc_reserve_mutex);
        }
 }
@@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
        return 0;
 }
 
+static void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long debugctlmsr;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr |= X86_DEBUGCTL_TR;
+       debugctlmsr |= X86_DEBUGCTL_BTS;
+       debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+       update_debugctlmsr(debugctlmsr);
+}
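
For the only configuration that __hw_perf_counter_init() accepts further down (ARCH_PERFMON_EVENTSEL_OS is rejected, so tracing is user-mode only), the debugctl value assembled above works out to TR | BTS | BTINT | BTS_OFF_OS. A small standalone check, with the bit names shortened but the same bit positions as the X86_DEBUGCTL_* defines:

    #include <assert.h>

    #define TR          (1 << 6)
    #define BTS         (1 << 7)
    #define BTINT       (1 << 8)
    #define BTS_OFF_OS  (1 << 9)

    int main(void)
    {
            /* User-mode-only tracing: kernel branches suppressed via BTS_OFF_OS. */
            unsigned long debugctl = TR | BTS | BTINT | BTS_OFF_OS;

            assert(debugctl == 0x3c0);
            return 0;
    }
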
+
+static void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       unsigned long debugctlmsr;
+
+       if (!cpuc->ds)
+               return;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr &=
+               ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+                 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+       update_debugctlmsr(debugctlmsr);
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        err = 0;
        if (!atomic_inc_not_zero(&active_counters)) {
                mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
-                       err = -EBUSY;
-               else
+               if (atomic_read(&active_counters) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               err = reserve_bts_hardware();
+               }
+               if (!err)
                        atomic_inc(&active_counters);
                mutex_unlock(&pmc_reserve_mutex);
        }
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        if (config == -1LL)
                return -EINVAL;
 
+       /*
+        * Branch tracing:
+        */
+       if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+           (hwc->sample_period == 1)) {
+               /* BTS is not supported by this architecture. */
+               if (!bts_available())
+                       return -EOPNOTSUPP;
+
+               /* BTS is currently only allowed for user-mode. */
+               if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+                       return -EOPNOTSUPP;
+       }
+
        hwc->config |= config;
 
        return 0;
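
Seen from user space, the branch-tracing path above is selected by asking for PERF_COUNT_HW_BRANCH_INSTRUCTIONS with a sample period of one and without kernel-mode counting. A rough sketch of such a request against the perf_counter ABI of this era; the helper name is made up, and the exact attr field names should be checked against <linux/perf_counter.h>:

    #include <string.h>
    #include <linux/perf_counter.h>     /* struct perf_counter_attr, PERF_COUNT_HW_* */

    /* Hypothetical helper: an attr that fixed_mode_idx() would map to
     * X86_PMC_IDX_FIXED_BTS (branch instructions, period 1, user mode only). */
    static struct perf_counter_attr bts_attr(void)
    {
            struct perf_counter_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type           = PERF_TYPE_HARDWARE;
            attr.config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
            attr.sample_period  = 1;    /* period of one selects BTS */
            attr.exclude_kernel = 1;    /* keeps ARCH_PERFMON_EVENTSEL_OS clear */
            return attr;
    }
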
@@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void)
 
 static void intel_pmu_disable_all(void)
 {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+       if (!cpuc->enabled)
+               return;
+
+       cpuc->enabled = 0;
+       barrier();
+
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+               intel_pmu_disable_bts();
 }
 
 static void amd_pmu_disable_all(void)
@@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void)
 
 static void intel_pmu_enable_all(void)
 {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+       if (cpuc->enabled)
+               return;
+
+       cpuc->enabled = 1;
+       barrier();
+
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+       if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+               struct perf_counter *counter =
+                       cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+
+               if (WARN_ON_ONCE(!counter))
+                       return;
+
+               intel_pmu_enable_bts(counter->hw.config);
+       }
 }
 
 static void amd_pmu_enable_all(void)
@@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 static inline void
 intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+               intel_pmu_disable_bts();
+               return;
+       }
+
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_disable_fixed(hwc, idx);
                return;
@@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
        s64 period = hwc->sample_period;
        int err, ret = 0;
 
+       if (idx == X86_PMC_IDX_FIXED_BTS)
+               return 0;
+
        /*
         * If we are way outside a reasonable range then just skip forward:
         */
@@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 
 static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+               if (!__get_cpu_var(cpu_hw_counters).enabled)
+                       return;
+
+               intel_pmu_enable_bts(hwc->config);
+               return;
+       }
+
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_enable_fixed(hwc, idx);
                return;
@@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
        unsigned int event;
 
+       event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+       if (unlikely((event ==
+                     x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+                    (hwc->sample_period == 1)))
+               return X86_PMC_IDX_FIXED_BTS;
+
        if (!x86_pmu.num_counters_fixed)
                return -1;
 
-       event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
        if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter)
        int idx;
 
        idx = fixed_mode_idx(counter, hwc);
-       if (idx >= 0) {
+       if (idx == X86_PMC_IDX_FIXED_BTS) {
+               /* The BTS counter is already in use. */
+               if (test_and_set_bit(idx, cpuc->used_mask))
+                       return -EAGAIN;
+
+               hwc->config_base        = 0;
+               hwc->counter_base       = 0;
+               hwc->idx                = idx;
+       } else if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
@@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void)
        local_irq_restore(flags);
 }
 
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
+                                      struct perf_sample_data *data)
+{
+       struct debug_store *ds = cpuc->ds;
+       struct bts_record {
+               u64     from;
+               u64     to;
+               u64     flags;
+       };
+       struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+       unsigned long orig_ip = data->regs->ip;
+       struct bts_record *at, *top;
+
+       if (!counter)
+               return;
+
+       if (!ds)
+               return;
+
+       at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+       top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+       ds->bts_index = ds->bts_buffer_base;
+
+       for (; at < top; at++) {
+               data->regs->ip  = at->from;
+               data->addr      = at->to;
+
+               perf_counter_output(counter, 1, data);
+       }
+
+       data->regs->ip  = orig_ip;
+       data->addr      = 0;
+
+       /* There's new data available. */
+       counter->pending_kill = POLL_IN;
+}
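
Given the buffer geometry defined at the top of the file, the walk above emits (bts_index - bts_buffer_base) / BTS_RECORD_SIZE records per drain: roughly 960 when the BTINT threshold interrupt fires, and at most 1024 if the index reached the very end of the buffer. A quick standalone check of those numbers:

    #include <assert.h>

    int main(void)
    {
            const unsigned long record = 24, buffer = 24 * 1024, threshold = 24 * 64;

            /* Records in the buffer when the index reaches the interrupt threshold. */
            assert((buffer - threshold) / record == 960);

            /* Upper bound per drain. */
            assert(buffer / record == 1024);
            return 0;
    }
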
+
 static void x86_pmu_disable(struct perf_counter *counter)
 {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
+
+       /* Drain the remaining BTS records. */
+       if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+               struct perf_sample_data data;
+               struct pt_regs regs;
+
+               data.regs = &regs;
+               intel_pmu_drain_bts_buffer(cpuc, &data);
+       }
        cpuc->counters[idx] = NULL;
        clear_bit(idx, cpuc->used_mask);
 
@@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
 
 static void intel_pmu_reset(void)
 {
+       struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
        unsigned long flags;
        int idx;
 
@@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void)
        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
                checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
        }
+       if (ds)
+               ds->bts_index = ds->bts_buffer_base;
 
        local_irq_restore(flags);
 }
@@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
        cpuc = &__get_cpu_var(cpu_hw_counters);
 
        perf_disable();
+       intel_pmu_drain_bts_buffer(cpuc, &data);
        status = intel_pmu_get_status();
        if (!status) {
                perf_enable();
@@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = {
         * the generic counter period:
         */
        .max_period             = (1ULL << 31) - 1,
+       .enable_bts             = intel_pmu_enable_bts,
+       .disable_bts            = intel_pmu_disable_bts,
 };
 
 static struct x86_pmu amd_pmu = {
@@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 
        return entry;
 }
+
+void hw_perf_counter_setup_online(int cpu)
+{
+       init_debug_store_on_cpu(cpu);
+}