perf_counter: add comment to barrier
[cascardo/linux.git] arch/x86/kernel/cpu/perf_counter.c
/*
 * Performance counter x86 architecture code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2009 Jaswinder Singh Rajput
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_counters_generic __read_mostly;
static u64 perf_counter_mask __read_mostly;
static u64 counter_value_mask __read_mostly;
static int counter_value_bits __read_mostly;

static int nr_counters_fixed __read_mostly;

struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        u64                     throttle_ctrl;
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int                     enabled;
};

/*
 * struct pmc_x86_ops - performance counter x86 ops
 */
struct pmc_x86_ops {
        u64             (*save_disable_all)(void);
        void            (*restore_all)(u64);
        u64             (*get_status)(u64);
        void            (*ack_status)(u64);
        void            (*enable)(int, u64);
        void            (*disable)(int, u64);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
        u64             (*raw_event)(u64);
        int             max_events;
};

static struct pmc_x86_ops *pmc_ops;

static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
        .enabled = 1,
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static const u64 intel_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x003c,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x4f2e,
  [PERF_COUNT_CACHE_MISSES]             = 0x412e,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
  [PERF_COUNT_BUS_CYCLES]               = 0x013c,
};

static u64 pmc_intel_event_map(int event)
{
        return intel_perfmon_event_map[event];
}

static u64 pmc_intel_raw_event(u64 event)
{
#define CORE_EVNTSEL_EVENT_MASK         0x000000FF
#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00
#define CORE_EVNTSEL_COUNTER_MASK       0xFF000000

#define CORE_EVNTSEL_MASK               \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
         CORE_EVNTSEL_COUNTER_MASK)

        return event & CORE_EVNTSEL_MASK;
}

/*
 * AMD Performance Monitor K7 and later.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_CPU_CYCLES]               = 0x0076,
  [PERF_COUNT_INSTRUCTIONS]             = 0x00c0,
  [PERF_COUNT_CACHE_REFERENCES]         = 0x0080,
  [PERF_COUNT_CACHE_MISSES]             = 0x0081,
  [PERF_COUNT_BRANCH_INSTRUCTIONS]      = 0x00c4,
  [PERF_COUNT_BRANCH_MISSES]            = 0x00c5,
};

static u64 pmc_amd_event_map(int event)
{
        return amd_perfmon_event_map[event];
}

static u64 pmc_amd_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK   0x7000000FF
#define K7_EVNTSEL_UNIT_MASK    0x00000FF00
#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000

#define K7_EVNTSEL_MASK                 \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
         K7_EVNTSEL_COUNTER_MASK)

        return event & K7_EVNTSEL_MASK;
}

/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * The elapsed delta is added to the counter and subtracted from
 * the remaining irq period.
 */
static void
x86_perf_counter_update(struct perf_counter *counter,
                        struct hw_perf_counter *hwc, int idx)
{
        u64 prev_raw_count, new_raw_count, delta;

        /*
         * Careful: an NMI might modify the previous counter value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic counter atomically:
         */
again:
        prev_raw_count = atomic64_read(&hwc->prev_count);
        rdmsrl(hwc->counter_base + idx, new_raw_count);

        if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                        new_raw_count) != prev_raw_count)
                goto again;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (counter-)time and add that to the generic counter.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count, so we do that by clipping the delta to 32 bits:
         */
        delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);

        atomic64_add(delta, &counter->count);
        atomic64_sub(delta, &hwc->period_left);
}

/*
 * Setup the hardware configuration for a given hw_event_type
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
        struct perf_counter_hw_event *hw_event = &counter->hw_event;
        struct hw_perf_counter *hwc = &counter->hw;

        if (unlikely(!perf_counters_initialized))
                return -EINVAL;

        /*
         * Generate PMC IRQs:
         * (keep 'enabled' bit clear for now)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;

        /*
         * Count user and OS events unless requested not to.
         */
        if (!hw_event->exclude_user)
                hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
        if (!hw_event->exclude_kernel)
                hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

        /*
         * If privileged enough, allow NMI events:
         */
        hwc->nmi = 0;
        if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
                hwc->nmi = 1;

        hwc->irq_period         = hw_event->irq_period;
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
         * the generic counter period:
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
                        hwc->irq_period = 0x7FFFFFFF;

        atomic64_set(&hwc->period_left, hwc->irq_period);

        /*
         * Raw event types provide the config directly in the event structure
         */
        if (hw_event->raw) {
                hwc->config |= pmc_ops->raw_event(hw_event->type);
        } else {
                if (hw_event->type >= pmc_ops->max_events)
                        return -EINVAL;
                /*
                 * The generic map:
                 */
                hwc->config |= pmc_ops->event_map(hw_event->type);
        }
        counter->wakeup_pending = 0;

        return 0;
}

static u64 pmc_intel_save_disable_all(void)
{
        u64 ctrl;

        rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

        return ctrl;
}

static u64 pmc_amd_save_disable_all(void)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int enabled, idx;

        enabled = cpuc->enabled;
        cpuc->enabled = 0;
        /*
         * ensure we write the disable before we start disabling the
         * counters proper, so that pmc_amd_enable() does the right thing.
         */
        barrier();

        for (idx = 0; idx < nr_counters_generic; idx++) {
                u64 val;

                rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
                        val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }

        return enabled;
}

u64 hw_perf_save_disable(void)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->save_disable_all();
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_save_disable);

static void pmc_intel_restore_all(u64 ctrl)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
}

static void pmc_amd_restore_all(u64 ctrl)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int idx;

        cpuc->enabled = ctrl;
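        /*
         * Compiler barrier, mirroring pmc_amd_save_disable_all(): make
         * sure the updated ->enabled state is in place before we start
         * touching the individual counters, so that pmc_amd_enable()
         * sees a consistent view.
         */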
        barrier();
        if (!ctrl)
                return;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                if (test_bit(idx, cpuc->active_mask)) {
                        u64 val;

                        rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
                        val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                        wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
                }
        }
}

void hw_perf_restore(u64 ctrl)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->restore_all(ctrl);
}
/*
 * Exported because of ACPI idle
 */
EXPORT_SYMBOL_GPL(hw_perf_restore);

static u64 pmc_intel_get_status(u64 mask)
{
        u64 status;

        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

        return status;
}

static u64 pmc_amd_get_status(u64 mask)
{
        u64 status = 0;
        int idx;

        for (idx = 0; idx < nr_counters_generic; idx++) {
                s64 val;

                if (!(mask & (1 << idx)))
                        continue;

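                /*
                 * Shift the 48-bit counter value up so that its top bit
                 * becomes the sign bit: the counter was programmed with a
                 * negative start value, so a non-negative result here means
                 * it has wrapped, i.e. overflowed.
                 */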
                rdmsrl(MSR_K7_PERFCTR0 + idx, val);
                val <<= (64 - counter_value_bits);
                if (val >= 0)
                        status |= (1 << idx);
        }

        return status;
}

static u64 hw_perf_get_status(u64 mask)
{
        if (unlikely(!perf_counters_initialized))
                return 0;

        return pmc_ops->get_status(mask);
}

static void pmc_intel_ack_status(u64 ack)
{
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}

static void pmc_amd_ack_status(u64 ack)
{
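        /*
         * AMD has no global overflow status/ack MSR; the status is derived
         * per counter in pmc_amd_get_status(), so there is nothing to ack.
         */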
}

static void hw_perf_ack_status(u64 ack)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->ack_status(ack);
}

static void pmc_intel_enable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
                        config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

static void pmc_amd_enable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

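        /*
         * Mark the counter active; only set the hardware enable bit if
         * counters are currently enabled on this CPU, otherwise
         * pmc_amd_restore_all() will enable it later.
         */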
        set_bit(idx, cpuc->active_mask);
        if (cpuc->enabled)
                config |= ARCH_PERFMON_EVENTSEL0_ENABLE;

        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_enable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->enable(idx, config);
}

static void pmc_intel_disable(int idx, u64 config)
{
        wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
}

static void pmc_amd_disable(int idx, u64 config)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

        clear_bit(idx, cpuc->active_mask);
        wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
}

static void hw_perf_disable(int idx, u64 config)
{
        if (unlikely(!perf_counters_initialized))
                return;

        pmc_ops->disable(idx, config);
}

static inline void
__pmc_fixed_disable(struct perf_counter *counter,
                    struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
        int err;

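        /*
         * Each fixed counter owns a 4-bit control field in
         * MSR_ARCH_PERFMON_FIXED_CTR_CTRL; clearing it disables the counter.
         */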
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static inline void
__pmc_generic_disable(struct perf_counter *counter,
                           struct hw_perf_counter *hwc, unsigned int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_disable(counter, hwc, idx);
        else
                hw_perf_disable(idx, hwc->config);
}

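/*
 * Last period programmed into each counter, for perf_counter_print_debug():
 */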
static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 */
static void
__hw_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
{
        s64 left = atomic64_read(&hwc->period_left);
        s64 period = hwc->irq_period;
        int err;

        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (unlikely(left <= -period)) {
                left = period;
                atomic64_set(&hwc->period_left, left);
        }

        if (unlikely(left <= 0)) {
                left += period;
                atomic64_set(&hwc->period_left, left);
        }

        per_cpu(prev_left[idx], smp_processor_id()) = left;

        /*
         * The hw counter starts counting from this counter offset,
         * mark it to be able to extract future deltas:
         */
        atomic64_set(&hwc->prev_count, (u64)-left);

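        /*
         * Program the counter with -left (truncated to the counter width)
         * so that it overflows after another 'left' events.
         */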
        err = checking_wrmsrl(hwc->counter_base + idx,
                             (u64)(-left) & counter_value_mask);
}

static inline void
__pmc_fixed_enable(struct perf_counter *counter,
                   struct hw_perf_counter *hwc, unsigned int __idx)
{
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
        int err;

        /*
         * Enable IRQ generation (0x8),
         * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
         * if requested:
         */
        bits = 0x8ULL;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
                bits |= 0x2;
        if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
                bits |= 0x1;
        bits <<= (idx * 4);
        mask = 0xfULL << (idx * 4);

        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
        ctrl_val |= bits;
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

static void
__pmc_generic_enable(struct perf_counter *counter,
                          struct hw_perf_counter *hwc, int idx)
{
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
                __pmc_fixed_enable(counter, hwc, idx);
        else
                hw_perf_enable(idx, hwc->config);
}

static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
        unsigned int event;

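        /*
         * AMD CPUs have no fixed-purpose counters, so fixed mode is
         * Intel-only here; NMI counters also stay on generic counters.
         */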
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return -1;

        if (unlikely(hwc->nmi))
                return -1;

        event = hwc->config & ARCH_PERFMON_EVENT_MASK;

        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
                return X86_PMC_IDX_FIXED_CPU_CYCLES;
        if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
                return X86_PMC_IDX_FIXED_BUS_CYCLES;

        return -1;
}

/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 */
static int pmc_generic_enable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        int idx;

        idx = fixed_mode_idx(counter, hwc);
        if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
                 */
                if (test_and_set_bit(idx, cpuc->used))
                        goto try_generic;

                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
                /*
                 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
                 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
                 */
                hwc->counter_base =
                        MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
                hwc->idx = idx;
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic counter again */
                if (test_and_set_bit(idx, cpuc->used)) {
try_generic:
                        idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
                        if (idx == nr_counters_generic)
                                return -EAGAIN;

                        set_bit(idx, cpuc->used);
                        hwc->idx = idx;
                }
                hwc->config_base  = pmc_ops->eventsel;
                hwc->counter_base = pmc_ops->perfctr;
        }

        perf_counters_lapic_init(hwc->nmi);

        __pmc_generic_disable(counter, hwc, idx);

        cpuc->counters[idx] = counter;
        /*
         * Make it visible before enabling the hw:
         */
        smp_wmb();

        __hw_perf_counter_set_period(counter, hwc, idx);
        __pmc_generic_enable(counter, hwc, idx);

        return 0;
}

void perf_counter_print_debug(void)
{
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
        struct cpu_hw_counters *cpuc;
        int cpu, idx;

        if (!nr_counters_generic)
                return;

        local_irq_disable();

        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

                pr_info("\n");
                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
        }
        pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);

        for (idx = 0; idx < nr_counters_generic; idx++) {
                rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
                rdmsrl(pmc_ops->perfctr  + idx, pmc_count);

                prev_left = per_cpu(prev_left[idx], cpu);

                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
                        cpu, idx, prev_left);
        }
        for (idx = 0; idx < nr_counters_fixed; idx++) {
                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
        }
        local_irq_enable();
}

static void pmc_generic_disable(struct perf_counter *counter)
{
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;

        __pmc_generic_disable(counter, hwc, idx);

        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
        /*
         * Make sure the cleared pointer becomes visible before we
         * (potentially) free the counter:
         */
        smp_wmb();

        /*
         * Drain the remaining delta count out of a counter
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
}

static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
        struct perf_data *irqdata = counter->irqdata;

        if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
                irqdata->overrun++;
        } else {
                u64 *p = (u64 *) &irqdata->data[irqdata->len];

                *p = data;
                irqdata->len += sizeof(u64);
        }
}

/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 */
static void perf_save_and_restart(struct perf_counter *counter)
{
        struct hw_perf_counter *hwc = &counter->hw;
        int idx = hwc->idx;

        x86_perf_counter_update(counter, hwc, idx);
        __hw_perf_counter_set_period(counter, hwc, idx);

        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                __pmc_generic_enable(counter, hwc, idx);
}

static void
perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
{
        struct perf_counter *counter, *group_leader = sibling->group_leader;

        /*
         * Store the type and current count of each sibling counter (if any):
         */
        list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {

                x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
                perf_store_irq_data(sibling, counter->hw_event.type);
                perf_store_irq_data(sibling, atomic64_read(&counter->count));
        }
}

/*
 * Maximum interrupt frequency of 100KHz per CPU
 */
#define PERFMON_MAX_INTERRUPTS (100000/HZ)

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
        int bit, cpu = smp_processor_id();
        u64 ack, status;
        struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
        int ret = 0;

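        /*
         * Save the enable state and globally disable the counters; the
         * saved value doubles as the status mask below and as the value
         * that perf_counter_unthrottle() restores.
         */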
        cpuc->throttle_ctrl = hw_perf_save_disable();

        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (!status)
                goto out;

        ret = 1;
again:
        inc_irq_stat(apic_perf_irqs);
        ack = status;
        for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                clear_bit(bit, (unsigned long *) &status);
                if (!counter)
                        continue;

                perf_save_and_restart(counter);

                switch (counter->hw_event.record_type) {
                case PERF_RECORD_SIMPLE:
                        continue;
                case PERF_RECORD_IRQ:
                        perf_store_irq_data(counter, instruction_pointer(regs));
                        break;
                case PERF_RECORD_GROUP:
                        perf_handle_group(counter, &status, &ack);
                        break;
                }
                /*
                 * From NMI context we cannot call into the scheduler to
                 * do a task wakeup - but we mark the counter as
                 * wakeup_pending and initiate a wakeup callback:
                 */
                if (nmi) {
                        counter->wakeup_pending = 1;
                        set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
                } else {
                        wake_up(&counter->waitq);
                }
        }

        hw_perf_ack_status(ack);

        /*
         * Repeat if there is more work to be done:
         */
        status = hw_perf_get_status(cpuc->throttle_ctrl);
        if (status)
                goto again;
out:
        /*
         * Restore - do not reenable when global enable is off or throttled:
         */
        if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
                hw_perf_restore(cpuc->throttle_ctrl);

        return ret;
}

void perf_counter_unthrottle(void)
{
        struct cpu_hw_counters *cpuc;

        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        if (unlikely(!perf_counters_initialized))
                return;

        cpuc = &__get_cpu_var(cpu_hw_counters);
        if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
                if (printk_ratelimit())
                        printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
                hw_perf_restore(cpuc->throttle_ctrl);
        }
        cpuc->interrupts = 0;
}

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
        irq_enter();
        apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        ack_APIC_irq();
        __smp_perf_counter_interrupt(regs, 0);
        irq_exit();
}

/*
 * Perform the counter wakeups that had to be deferred from NMI context
 * (see __smp_perf_counter_interrupt()):
 */
void perf_counter_notify(struct pt_regs *regs)
{
        struct cpu_hw_counters *cpuc;
        unsigned long flags;
        int bit, cpu;

        local_irq_save(flags);
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);

        for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];

                if (!counter)
                        continue;

                if (counter->wakeup_pending) {
                        counter->wakeup_pending = 0;
                        wake_up(&counter->waitq);
                }
        }

        local_irq_restore(flags);
}

void perf_counters_lapic_init(int nmi)
{
        u32 apic_val;

        if (!perf_counters_initialized)
                return;
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
        apic_val = apic_read(APIC_LVTERR);

        apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
        if (nmi)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        else
                apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
        apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
{
        struct die_args *args = __args;
        struct pt_regs *regs;
        int ret;

        switch (cmd) {
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;

        default:
                return NOTIFY_DONE;
        }

        regs = args->regs;

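        /*
         * Re-write the LVTPC entry: the APIC masks it when a PMI is
         * delivered, so this re-enables NMI delivery for the next
         * counter overflow.
         */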
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        ret = __smp_perf_counter_interrupt(regs, 1);

        return ret ? NOTIFY_STOP : NOTIFY_OK;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
        .notifier_call          = perf_counter_nmi_handler,
        .next                   = NULL,
        .priority               = 1
};

static struct pmc_x86_ops pmc_intel_ops = {
        .save_disable_all       = pmc_intel_save_disable_all,
        .restore_all            = pmc_intel_restore_all,
        .get_status             = pmc_intel_get_status,
        .ack_status             = pmc_intel_ack_status,
        .enable                 = pmc_intel_enable,
        .disable                = pmc_intel_disable,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .perfctr                = MSR_ARCH_PERFMON_PERFCTR0,
        .event_map              = pmc_intel_event_map,
        .raw_event              = pmc_intel_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
};

static struct pmc_x86_ops pmc_amd_ops = {
        .save_disable_all       = pmc_amd_save_disable_all,
        .restore_all            = pmc_amd_restore_all,
        .get_status             = pmc_amd_get_status,
        .ack_status             = pmc_amd_ack_status,
        .enable                 = pmc_amd_enable,
        .disable                = pmc_amd_disable,
        .eventsel               = MSR_K7_EVNTSEL0,
        .perfctr                = MSR_K7_PERFCTR0,
        .event_map              = pmc_amd_event_map,
        .raw_event              = pmc_amd_raw_event,
        .max_events             = ARRAY_SIZE(amd_perfmon_event_map),
};

static struct pmc_x86_ops *pmc_intel_init(void)
{
        union cpuid10_eax eax;
        unsigned int ebx;
        unsigned int unused;
        union cpuid10_edx edx;

        /*
         * Check whether the Architectural PerfMon supports
         * the Branch Misses Retired event.
         */
        cpuid(10, &eax.full, &ebx, &unused, &edx.full);
        if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
                return NULL;

        pr_info("Intel Performance Monitoring support detected.\n");
        pr_info("... version:         %d\n", eax.split.version_id);
        pr_info("... bit width:       %d\n", eax.split.bit_width);
        pr_info("... mask length:     %d\n", eax.split.mask_length);

        nr_counters_generic = eax.split.num_counters;
        nr_counters_fixed = edx.split.num_counters_fixed;
        counter_value_mask = (1ULL << eax.split.bit_width) - 1;

        return &pmc_intel_ops;
}

static struct pmc_x86_ops *pmc_amd_init(void)
{
        nr_counters_generic = 4;
        nr_counters_fixed = 0;
        counter_value_mask = 0x0000FFFFFFFFFFFFULL;
        counter_value_bits = 48;

        pr_info("AMD Performance Monitoring support detected.\n");

        return &pmc_amd_ops;
}

void __init init_hw_perf_counters(void)
{
        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                return;

        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                pmc_ops = pmc_intel_init();
                break;
        case X86_VENDOR_AMD:
                pmc_ops = pmc_amd_init();
                break;
        }
        if (!pmc_ops)
                return;

        pr_info("... num counters:    %d\n", nr_counters_generic);
        if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
                nr_counters_generic = X86_PMC_MAX_GENERIC;
                WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
                        nr_counters_generic, X86_PMC_MAX_GENERIC);
        }
        perf_counter_mask = (1 << nr_counters_generic) - 1;
        perf_max_counters = nr_counters_generic;

        pr_info("... value mask:      %016Lx\n", counter_value_mask);

        if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
                nr_counters_fixed = X86_PMC_MAX_FIXED;
                WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
                        nr_counters_fixed, X86_PMC_MAX_FIXED);
        }
        pr_info("... fixed counters:  %d\n", nr_counters_fixed);

        perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;

        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
        perf_counters_initialized = true;

        perf_counters_lapic_init(0);
        register_die_notifier(&perf_counter_nmi_notifier);
}

static void pmc_generic_read(struct perf_counter *counter)
{
        x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

static const struct hw_perf_counter_ops x86_perf_counter_ops = {
        .enable         = pmc_generic_enable,
        .disable        = pmc_generic_disable,
        .read           = pmc_generic_read,
};

const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
        int err;

        err = __hw_perf_counter_init(counter);
        if (err)
                return NULL;

        return &x86_perf_counter_ops;
}