Merge branch 'linus' into perfcounters/core
1 /*
2  * Performance counter core code
3  *
4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6  *
7  *  For licensing details see kernel-base/COPYING
8  */
9
10 #include <linux/fs.h>
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/perf_counter.h>
23 #include <linux/mm.h>
24 #include <linux/vmstat.h>
25
26 /*
27  * Each CPU has a list of per CPU counters:
28  */
29 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
30
31 int perf_max_counters __read_mostly = 1;
32 static int perf_reserved_percpu __read_mostly;
33 static int perf_overcommit __read_mostly = 1;
34
35 /*
36  * Mutex for (sysadmin-configurable) counter reservations:
37  */
38 static DEFINE_MUTEX(perf_resource_mutex);
39
40 /*
41  * Architecture provided APIs - weak aliases:
42  */
43 extern __weak const struct hw_perf_counter_ops *
44 hw_perf_counter_init(struct perf_counter *counter)
45 {
46         return NULL;
47 }
48
49 u64 __weak hw_perf_save_disable(void)           { return 0; }
50 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
51 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
52 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
53                struct perf_cpu_context *cpuctx,
54                struct perf_counter_context *ctx, int cpu)
55 {
56         return 0;
57 }
58
59 void __weak perf_counter_print_debug(void)      { }
60
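/*
 * Illustrative sketch (not part of this file): an architecture with real
 * PMU support overrides the weak stubs above with strong definitions in
 * its own code, along the lines of the following (the helper and the
 * ops-structure names below are hypothetical):
 *
 *	const struct hw_perf_counter_ops *
 *	hw_perf_counter_init(struct perf_counter *counter)
 *	{
 *		if (arch_specific_setup(counter) < 0)
 *			return NULL;
 *		return &arch_hw_perf_counter_ops;
 *	}
 */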
61 static void
62 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
63 {
64         struct perf_counter *group_leader = counter->group_leader;
65
66         /*
67          * Depending on whether it is a standalone or sibling counter,
68          * add it straight to the context's counter list, or to the group
69          * leader's sibling list:
70          */
71         if (counter->group_leader == counter)
72                 list_add_tail(&counter->list_entry, &ctx->counter_list);
73         else
74                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
75 }
76
77 static void
78 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
79 {
80         struct perf_counter *sibling, *tmp;
81
82         list_del_init(&counter->list_entry);
83
84         /*
85          * If this was a group counter with sibling counters then
86          * upgrade the siblings to singleton counters by adding them
87          * to the context list directly:
88          */
89         list_for_each_entry_safe(sibling, tmp,
90                                  &counter->sibling_list, list_entry) {
91
92                 list_del_init(&sibling->list_entry);
93                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
94                 sibling->group_leader = sibling;
95         }
96 }
97
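/*
 * Resulting layout, for illustration: ctx->counter_list links group
 * leaders (and singleton counters, which lead themselves); each leader's
 * sibling_list links the remaining members of its group:
 *
 *	ctx->counter_list: leader A -> leader B -> singleton C
 *	                      |
 *	                      sibling_list: A1 -> A2
 *
 * list_del_counter() above promotes A1 and A2 to singletons on the
 * context list when A itself is removed.
 */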
98 static void
99 counter_sched_out(struct perf_counter *counter,
100                   struct perf_cpu_context *cpuctx,
101                   struct perf_counter_context *ctx)
102 {
103         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
104                 return;
105
106         counter->state = PERF_COUNTER_STATE_INACTIVE;
107         counter->hw_ops->disable(counter);
108         counter->oncpu = -1;
109
110         if (!is_software_counter(counter))
111                 cpuctx->active_oncpu--;
112         ctx->nr_active--;
113         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
114                 cpuctx->exclusive = 0;
115 }
116
117 static void
118 group_sched_out(struct perf_counter *group_counter,
119                 struct perf_cpu_context *cpuctx,
120                 struct perf_counter_context *ctx)
121 {
122         struct perf_counter *counter;
123
124         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
125                 return;
126
127         counter_sched_out(group_counter, cpuctx, ctx);
128
129         /*
130          * Schedule out siblings (if any):
131          */
132         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
133                 counter_sched_out(counter, cpuctx, ctx);
134
135         if (group_counter->hw_event.exclusive)
136                 cpuctx->exclusive = 0;
137 }
138
139 /*
140  * Cross CPU call to remove a performance counter
141  *
142  * We disable the counter on the hardware level first. After that we
143  * remove it from the context list.
144  */
145 static void __perf_counter_remove_from_context(void *info)
146 {
147         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
148         struct perf_counter *counter = info;
149         struct perf_counter_context *ctx = counter->ctx;
150         unsigned long flags;
151         u64 perf_flags;
152
153         /*
154          * If this is a task context, we need to check whether it is
155          * the current task context of this cpu. If not, it has been
156          * scheduled out before the smp call arrived.
157          */
158         if (ctx->task && cpuctx->task_ctx != ctx)
159                 return;
160
161         curr_rq_lock_irq_save(&flags);
162         spin_lock(&ctx->lock);
163
164         counter_sched_out(counter, cpuctx, ctx);
165
166         counter->task = NULL;
167         ctx->nr_counters--;
168
169         /*
170          * Protect the list operation against NMI by disabling the
171          * counters on a global level. NOP for non NMI based counters.
172          */
173         perf_flags = hw_perf_save_disable();
174         list_del_counter(counter, ctx);
175         hw_perf_restore(perf_flags);
176
177         if (!ctx->task) {
178                 /*
179                  * Allow more per task counters with respect to the
180                  * reservation:
181                  */
182                 cpuctx->max_pertask =
183                         min(perf_max_counters - ctx->nr_counters,
184                             perf_max_counters - perf_reserved_percpu);
185         }
186
187         spin_unlock(&ctx->lock);
188         curr_rq_unlock_irq_restore(&flags);
189 }
190
191
192 /*
193  * Remove the counter from a task's (or a CPU's) list of counters.
194  *
195  * Must be called with counter->mutex and ctx->mutex held.
196  *
197  * CPU counters are removed with a smp call. For task counters we only
198  * call when the task is on a CPU.
199  */
200 static void perf_counter_remove_from_context(struct perf_counter *counter)
201 {
202         struct perf_counter_context *ctx = counter->ctx;
203         struct task_struct *task = ctx->task;
204
205         if (!task) {
206                 /*
207                  * Per cpu counters are removed via an smp call and
208                  * the removal is always successful.
209                  */
210                 smp_call_function_single(counter->cpu,
211                                          __perf_counter_remove_from_context,
212                                          counter, 1);
213                 return;
214         }
215
216 retry:
217         task_oncpu_function_call(task, __perf_counter_remove_from_context,
218                                  counter);
219
220         spin_lock_irq(&ctx->lock);
221         /*
222          * If the context is active we need to retry the smp call.
223          */
224         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
225                 spin_unlock_irq(&ctx->lock);
226                 goto retry;
227         }
228
229         /*
230          * The lock prevents this context from being scheduled in, so we
231          * can remove the counter safely if the call above did not
232          * succeed.
233          */
234         if (!list_empty(&counter->list_entry)) {
235                 ctx->nr_counters--;
236                 list_del_counter(counter, ctx);
237                 counter->task = NULL;
238         }
239         spin_unlock_irq(&ctx->lock);
240 }
241
242 /*
243  * Cross CPU call to disable a performance counter
244  */
245 static void __perf_counter_disable(void *info)
246 {
247         struct perf_counter *counter = info;
248         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
249         struct perf_counter_context *ctx = counter->ctx;
250         unsigned long flags;
251
252         /*
253          * If this is a per-task counter, we need to check whether this
254          * counter's task is the current task on this cpu.
255          */
256         if (ctx->task && cpuctx->task_ctx != ctx)
257                 return;
258
259         curr_rq_lock_irq_save(&flags);
260         spin_lock(&ctx->lock);
261
262         /*
263          * If the counter is on, turn it off.
264          * If it is in error state, leave it in error state.
265          */
266         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
267                 if (counter == counter->group_leader)
268                         group_sched_out(counter, cpuctx, ctx);
269                 else
270                         counter_sched_out(counter, cpuctx, ctx);
271                 counter->state = PERF_COUNTER_STATE_OFF;
272         }
273
274         spin_unlock(&ctx->lock);
275         curr_rq_unlock_irq_restore(&flags);
276 }
277
278 /*
279  * Disable a counter.
280  */
281 static void perf_counter_disable(struct perf_counter *counter)
282 {
283         struct perf_counter_context *ctx = counter->ctx;
284         struct task_struct *task = ctx->task;
285
286         if (!task) {
287                 /*
288                  * Disable the counter on the cpu that it's on
289                  */
290                 smp_call_function_single(counter->cpu, __perf_counter_disable,
291                                          counter, 1);
292                 return;
293         }
294
295  retry:
296         task_oncpu_function_call(task, __perf_counter_disable, counter);
297
298         spin_lock_irq(&ctx->lock);
299         /*
300          * If the counter is still active, we need to retry the cross-call.
301          */
302         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
303                 spin_unlock_irq(&ctx->lock);
304                 goto retry;
305         }
306
307         /*
308          * Since we have the lock this context can't be scheduled
309          * in, so we can change the state safely.
310          */
311         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
312                 counter->state = PERF_COUNTER_STATE_OFF;
313
314         spin_unlock_irq(&ctx->lock);
315 }
316
317 /*
318  * Disable a counter and all its children.
319  */
320 static void perf_counter_disable_family(struct perf_counter *counter)
321 {
322         struct perf_counter *child;
323
324         perf_counter_disable(counter);
325
326         /*
327          * Lock the mutex to protect the list of children
328          */
329         mutex_lock(&counter->mutex);
330         list_for_each_entry(child, &counter->child_list, child_list)
331                 perf_counter_disable(child);
332         mutex_unlock(&counter->mutex);
333 }
334
335 static int
336 counter_sched_in(struct perf_counter *counter,
337                  struct perf_cpu_context *cpuctx,
338                  struct perf_counter_context *ctx,
339                  int cpu)
340 {
341         if (counter->state <= PERF_COUNTER_STATE_OFF)
342                 return 0;
343
344         counter->state = PERF_COUNTER_STATE_ACTIVE;
345         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
346         /*
347          * The new state must be visible before we turn it on in the hardware:
348          */
349         smp_wmb();
350
351         if (counter->hw_ops->enable(counter)) {
352                 counter->state = PERF_COUNTER_STATE_INACTIVE;
353                 counter->oncpu = -1;
354                 return -EAGAIN;
355         }
356
357         if (!is_software_counter(counter))
358                 cpuctx->active_oncpu++;
359         ctx->nr_active++;
360
361         if (counter->hw_event.exclusive)
362                 cpuctx->exclusive = 1;
363
364         return 0;
365 }
366
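/*
 * Note on the state comparisons used here and elsewhere in this file:
 * the code relies on the counter states (linux/perf_counter.h) being
 * numerically ordered as ERROR < OFF < INACTIVE < ACTIVE, so e.g.
 * "state <= PERF_COUNTER_STATE_OFF" means "off or in error state" and
 * "state >= PERF_COUNTER_STATE_INACTIVE" means "inactive or active".
 */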
367 /*
368  * Return 1 for a group consisting entirely of software counters,
369  * 0 if the group contains any hardware counters.
370  */
371 static int is_software_only_group(struct perf_counter *leader)
372 {
373         struct perf_counter *counter;
374
375         if (!is_software_counter(leader))
376                 return 0;
377         list_for_each_entry(counter, &leader->sibling_list, list_entry)
378                 if (!is_software_counter(counter))
379                         return 0;
380         return 1;
381 }
382
383 /*
384  * Work out whether we can put this counter group on the CPU now.
385  */
386 static int group_can_go_on(struct perf_counter *counter,
387                            struct perf_cpu_context *cpuctx,
388                            int can_add_hw)
389 {
390         /*
391          * Groups consisting entirely of software counters can always go on.
392          */
393         if (is_software_only_group(counter))
394                 return 1;
395         /*
396          * If an exclusive group is already on, no other hardware
397          * counters can go on.
398          */
399         if (cpuctx->exclusive)
400                 return 0;
401         /*
402          * If this group is exclusive and there are already
403          * counters on the CPU, it can't go on.
404          */
405         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
406                 return 0;
407         /*
408          * Otherwise, try to add it if all previous groups were able
409          * to go on.
410          */
411         return can_add_hw;
412 }
413
414 /*
415  * Cross CPU call to install and enable a performance counter
416  */
417 static void __perf_install_in_context(void *info)
418 {
419         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
420         struct perf_counter *counter = info;
421         struct perf_counter_context *ctx = counter->ctx;
422         struct perf_counter *leader = counter->group_leader;
423         int cpu = smp_processor_id();
424         unsigned long flags;
425         u64 perf_flags;
426         int err;
427
428         /*
429          * If this is a task context, we need to check whether it is
430          * the current task context of this cpu. If not, it has been
431          * scheduled out before the smp call arrived.
432          */
433         if (ctx->task && cpuctx->task_ctx != ctx)
434                 return;
435
436         curr_rq_lock_irq_save(&flags);
437         spin_lock(&ctx->lock);
438
439         /*
440          * Protect the list operation against NMI by disabling the
441          * counters on a global level. NOP for non NMI based counters.
442          */
443         perf_flags = hw_perf_save_disable();
444
445         list_add_counter(counter, ctx);
446         ctx->nr_counters++;
447
448         /*
449          * Don't put the counter on if it is disabled or if
450          * it is in a group and the group isn't on.
451          */
452         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
453             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
454                 goto unlock;
455
456         /*
457          * An exclusive counter can't go on if there are already active
458          * hardware counters, and no hardware counter can go on if there
459          * is already an exclusive counter on.
460          */
461         if (!group_can_go_on(counter, cpuctx, 1))
462                 err = -EEXIST;
463         else
464                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
465
466         if (err) {
467                 /*
468                  * This counter couldn't go on.  If it is in a group
469                  * then we have to pull the whole group off.
470                  * If the counter group is pinned then put it in error state.
471                  */
472                 if (leader != counter)
473                         group_sched_out(leader, cpuctx, ctx);
474                 if (leader->hw_event.pinned)
475                         leader->state = PERF_COUNTER_STATE_ERROR;
476         }
477
478         if (!err && !ctx->task && cpuctx->max_pertask)
479                 cpuctx->max_pertask--;
480
481  unlock:
482         hw_perf_restore(perf_flags);
483
484         spin_unlock(&ctx->lock);
485         curr_rq_unlock_irq_restore(&flags);
486 }
487
488 /*
489  * Attach a performance counter to a context
490  *
491  * First we add the counter to the list with the hardware enable bit
492  * in counter->hw_config cleared.
493  *
494  * If the counter is attached to a task which is on a CPU we use a smp
495  * call to enable it in the task context. The task might have been
496  * scheduled away, but we check this in the smp call again.
497  *
498  * Must be called with ctx->mutex held.
499  */
500 static void
501 perf_install_in_context(struct perf_counter_context *ctx,
502                         struct perf_counter *counter,
503                         int cpu)
504 {
505         struct task_struct *task = ctx->task;
506
507         if (!task) {
508                 /*
509                  * Per cpu counters are installed via an smp call and
510                  * the install is always successful.
511                  */
512                 smp_call_function_single(cpu, __perf_install_in_context,
513                                          counter, 1);
514                 return;
515         }
516
517         counter->task = task;
518 retry:
519         task_oncpu_function_call(task, __perf_install_in_context,
520                                  counter);
521
522         spin_lock_irq(&ctx->lock);
523         /*
524          * If the context is active we need to retry the smp call.
525          */
526         if (ctx->is_active && list_empty(&counter->list_entry)) {
527                 spin_unlock_irq(&ctx->lock);
528                 goto retry;
529         }
530
531         /*
532          * The lock prevents this context from being scheduled in, so we
533          * can add the counter safely if the call above did not
534          * succeed.
535          */
536         if (list_empty(&counter->list_entry)) {
537                 list_add_counter(counter, ctx);
538                 ctx->nr_counters++;
539         }
540         spin_unlock_irq(&ctx->lock);
541 }
542
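/*
 * Illustrative call (not shown in this excerpt): a freshly allocated
 * counter is typically attached with
 *
 *	perf_install_in_context(ctx, counter, cpu);
 *
 * with ctx->mutex held; the cross-call above then schedules it in on the
 * CPU the target task is currently running on, if any.
 */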
543 /*
544  * Cross CPU call to enable a performance counter
545  */
546 static void __perf_counter_enable(void *info)
547 {
548         struct perf_counter *counter = info;
549         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
550         struct perf_counter_context *ctx = counter->ctx;
551         struct perf_counter *leader = counter->group_leader;
552         unsigned long flags;
553         int err;
554
555         /*
556          * If this is a per-task counter, we need to check whether this
557          * counter's task is the current task on this cpu.
558          */
559         if (ctx->task && cpuctx->task_ctx != ctx)
560                 return;
561
562         curr_rq_lock_irq_save(&flags);
563         spin_lock(&ctx->lock);
564
565         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
566                 goto unlock;
567         counter->state = PERF_COUNTER_STATE_INACTIVE;
568
569         /*
570          * If the counter is in a group and isn't the group leader,
571          * then don't put it on unless the group is on.
572          */
573         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
574                 goto unlock;
575
576         if (!group_can_go_on(counter, cpuctx, 1))
577                 err = -EEXIST;
578         else
579                 err = counter_sched_in(counter, cpuctx, ctx,
580                                        smp_processor_id());
581
582         if (err) {
583                 /*
584                  * If this counter can't go on and it's part of a
585                  * group, then the whole group has to come off.
586                  */
587                 if (leader != counter)
588                         group_sched_out(leader, cpuctx, ctx);
589                 if (leader->hw_event.pinned)
590                         leader->state = PERF_COUNTER_STATE_ERROR;
591         }
592
593  unlock:
594         spin_unlock(&ctx->lock);
595         curr_rq_unlock_irq_restore(&flags);
596 }
597
598 /*
599  * Enable a counter.
600  */
601 static void perf_counter_enable(struct perf_counter *counter)
602 {
603         struct perf_counter_context *ctx = counter->ctx;
604         struct task_struct *task = ctx->task;
605
606         if (!task) {
607                 /*
608                  * Enable the counter on the cpu that it's on
609                  */
610                 smp_call_function_single(counter->cpu, __perf_counter_enable,
611                                          counter, 1);
612                 return;
613         }
614
615         spin_lock_irq(&ctx->lock);
616         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
617                 goto out;
618
619         /*
620          * If the counter is in error state, clear that first.
621          * That way, if we see the counter in error state below, we
622          * know that it has gone back into error state, as distinct
623          * from the task having been scheduled away before the
624          * cross-call arrived.
625          */
626         if (counter->state == PERF_COUNTER_STATE_ERROR)
627                 counter->state = PERF_COUNTER_STATE_OFF;
628
629  retry:
630         spin_unlock_irq(&ctx->lock);
631         task_oncpu_function_call(task, __perf_counter_enable, counter);
632
633         spin_lock_irq(&ctx->lock);
634
635         /*
636          * If the context is active and the counter is still off,
637          * we need to retry the cross-call.
638          */
639         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
640                 goto retry;
641
642         /*
643          * Since we have the lock this context can't be scheduled
644          * in, so we can change the state safely.
645          */
646         if (counter->state == PERF_COUNTER_STATE_OFF)
647                 counter->state = PERF_COUNTER_STATE_INACTIVE;
648  out:
649         spin_unlock_irq(&ctx->lock);
650 }
651
652 /*
653  * Enable a counter and all its children.
654  */
655 static void perf_counter_enable_family(struct perf_counter *counter)
656 {
657         struct perf_counter *child;
658
659         perf_counter_enable(counter);
660
661         /*
662          * Lock the mutex to protect the list of children
663          */
664         mutex_lock(&counter->mutex);
665         list_for_each_entry(child, &counter->child_list, child_list)
666                 perf_counter_enable(child);
667         mutex_unlock(&counter->mutex);
668 }
669
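/*
 * Illustrative sketch (userspace, not part of this file): the two
 * *_family operations above are reached through the counter fd via
 * perf_ioctl() further down, e.g.:
 *
 *	ioctl(counter_fd, PERF_COUNTER_IOC_DISABLE, 0);
 *	...
 *	ioctl(counter_fd, PERF_COUNTER_IOC_ENABLE, 0);
 */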
670 void __perf_counter_sched_out(struct perf_counter_context *ctx,
671                               struct perf_cpu_context *cpuctx)
672 {
673         struct perf_counter *counter;
674         u64 flags;
675
676         spin_lock(&ctx->lock);
677         ctx->is_active = 0;
678         if (likely(!ctx->nr_counters))
679                 goto out;
680
681         flags = hw_perf_save_disable();
682         if (ctx->nr_active) {
683                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
684                         group_sched_out(counter, cpuctx, ctx);
685         }
686         hw_perf_restore(flags);
687  out:
688         spin_unlock(&ctx->lock);
689 }
690
691 /*
692  * Called from scheduler to remove the counters of the current task,
693  * with interrupts disabled.
694  *
695  * We stop each counter and update the counter value in counter->count.
696  *
697  * This does not protect us against NMI, but disable()
698  * sets the disabled bit in the control field of counter _before_
699  * accessing the counter control register. If an NMI hits, then it will
700  * not restart the counter.
701  */
702 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
703 {
704         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
705         struct perf_counter_context *ctx = &task->perf_counter_ctx;
706
707         if (likely(!cpuctx->task_ctx))
708                 return;
709
710         __perf_counter_sched_out(ctx, cpuctx);
711
712         cpuctx->task_ctx = NULL;
713 }
714
715 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
716 {
717         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
718 }
719
720 static int
721 group_sched_in(struct perf_counter *group_counter,
722                struct perf_cpu_context *cpuctx,
723                struct perf_counter_context *ctx,
724                int cpu)
725 {
726         struct perf_counter *counter, *partial_group;
727         int ret;
728
729         if (group_counter->state == PERF_COUNTER_STATE_OFF)
730                 return 0;
731
732         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
733         if (ret)
734                 return ret < 0 ? ret : 0;
735
736         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
737                 return -EAGAIN;
738
739         /*
740          * Schedule in siblings as one group (if any):
741          */
742         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
743                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
744                         partial_group = counter;
745                         goto group_error;
746                 }
747         }
748
749         return 0;
750
751 group_error:
752         /*
753          * Groups can be scheduled in as one unit only, so undo any
754          * partial group before returning:
755          */
756         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
757                 if (counter == partial_group)
758                         break;
759                 counter_sched_out(counter, cpuctx, ctx);
760         }
761         counter_sched_out(group_counter, cpuctx, ctx);
762
763         return -EAGAIN;
764 }
765
766 static void
767 __perf_counter_sched_in(struct perf_counter_context *ctx,
768                         struct perf_cpu_context *cpuctx, int cpu)
769 {
770         struct perf_counter *counter;
771         u64 flags;
772         int can_add_hw = 1;
773
774         spin_lock(&ctx->lock);
775         ctx->is_active = 1;
776         if (likely(!ctx->nr_counters))
777                 goto out;
778
779         flags = hw_perf_save_disable();
780
781         /*
782          * First go through the list and put on any pinned groups
783          * in order to give them the best chance of going on.
784          */
785         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
786                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
787                     !counter->hw_event.pinned)
788                         continue;
789                 if (counter->cpu != -1 && counter->cpu != cpu)
790                         continue;
791
792                 if (group_can_go_on(counter, cpuctx, 1))
793                         group_sched_in(counter, cpuctx, ctx, cpu);
794
795                 /*
796                  * If this pinned group hasn't been scheduled,
797                  * put it in error state.
798                  */
799                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
800                         counter->state = PERF_COUNTER_STATE_ERROR;
801         }
802
803         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
804                 /*
805                  * Ignore counters in OFF or ERROR state, and
806                  * ignore pinned counters since we did them already.
807                  */
808                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
809                     counter->hw_event.pinned)
810                         continue;
811
812                 /*
813                  * Listen to the 'cpu' scheduling filter constraint
814                  * of counters:
815                  */
816                 if (counter->cpu != -1 && counter->cpu != cpu)
817                         continue;
818
819                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
820                         if (group_sched_in(counter, cpuctx, ctx, cpu))
821                                 can_add_hw = 0;
822                 }
823         }
824         hw_perf_restore(flags);
825  out:
826         spin_unlock(&ctx->lock);
827 }
828
829 /*
830  * Called from scheduler to add the counters of the current task
831  * with interrupts disabled.
832  *
833  * We restore the counter value and then enable it.
834  *
835  * This does not protect us against NMI, but enable()
836  * sets the enabled bit in the control field of counter _before_
837  * accessing the counter control register. If an NMI hits, then it will
838  * keep the counter running.
839  */
840 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
841 {
842         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
843         struct perf_counter_context *ctx = &task->perf_counter_ctx;
844
845         __perf_counter_sched_in(ctx, cpuctx, cpu);
846         cpuctx->task_ctx = ctx;
847 }
848
849 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
850 {
851         struct perf_counter_context *ctx = &cpuctx->ctx;
852
853         __perf_counter_sched_in(ctx, cpuctx, cpu);
854 }
855
856 int perf_counter_task_disable(void)
857 {
858         struct task_struct *curr = current;
859         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
860         struct perf_counter *counter;
861         unsigned long flags;
862         u64 perf_flags;
863         int cpu;
864
865         if (likely(!ctx->nr_counters))
866                 return 0;
867
868         curr_rq_lock_irq_save(&flags);
869         cpu = smp_processor_id();
870
871         /* force the update of the task clock: */
872         __task_delta_exec(curr, 1);
873
874         perf_counter_task_sched_out(curr, cpu);
875
876         spin_lock(&ctx->lock);
877
878         /*
879          * Disable all the counters:
880          */
881         perf_flags = hw_perf_save_disable();
882
883         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
884                 if (counter->state != PERF_COUNTER_STATE_ERROR)
885                         counter->state = PERF_COUNTER_STATE_OFF;
886         }
887
888         hw_perf_restore(perf_flags);
889
890         spin_unlock(&ctx->lock);
891
892         curr_rq_unlock_irq_restore(&flags);
893
894         return 0;
895 }
896
897 int perf_counter_task_enable(void)
898 {
899         struct task_struct *curr = current;
900         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
901         struct perf_counter *counter;
902         unsigned long flags;
903         u64 perf_flags;
904         int cpu;
905
906         if (likely(!ctx->nr_counters))
907                 return 0;
908
909         curr_rq_lock_irq_save(&flags);
910         cpu = smp_processor_id();
911
912         /* force the update of the task clock: */
913         __task_delta_exec(curr, 1);
914
915         perf_counter_task_sched_out(curr, cpu);
916
917         spin_lock(&ctx->lock);
918
919         /*
920          * Enable all the counters:
921          */
922         perf_flags = hw_perf_save_disable();
923
924         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
925                 if (counter->state > PERF_COUNTER_STATE_OFF)
926                         continue;
927                 counter->state = PERF_COUNTER_STATE_INACTIVE;
928                 counter->hw_event.disabled = 0;
929         }
930         hw_perf_restore(perf_flags);
931
932         spin_unlock(&ctx->lock);
933
934         perf_counter_task_sched_in(curr, cpu);
935
936         curr_rq_unlock_irq_restore(&flags);
937
938         return 0;
939 }
940
941 /*
942  * Round-robin a context's counters:
943  */
944 static void rotate_ctx(struct perf_counter_context *ctx)
945 {
946         struct perf_counter *counter;
947         u64 perf_flags;
948
949         if (!ctx->nr_counters)
950                 return;
951
952         spin_lock(&ctx->lock);
953         /*
954          * Rotate the first entry last (works just fine for group counters too):
955          */
956         perf_flags = hw_perf_save_disable();
957         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
958                 list_del(&counter->list_entry);
959                 list_add_tail(&counter->list_entry, &ctx->counter_list);
960                 break;
961         }
962         hw_perf_restore(perf_flags);
963
964         spin_unlock(&ctx->lock);
965 }
966
967 void perf_counter_task_tick(struct task_struct *curr, int cpu)
968 {
969         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
970         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
971         const int rotate_percpu = 0;
972
973         if (rotate_percpu)
974                 perf_counter_cpu_sched_out(cpuctx);
975         perf_counter_task_sched_out(curr, cpu);
976
977         if (rotate_percpu)
978                 rotate_ctx(&cpuctx->ctx);
979         rotate_ctx(ctx);
980
981         if (rotate_percpu)
982                 perf_counter_cpu_sched_in(cpuctx, cpu);
983         perf_counter_task_sched_in(curr, cpu);
984 }
985
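/*
 * Illustration: with counters A, B and C on the list, successive calls
 * to rotate_ctx() turn A B C into B C A, then C A B, and so on, so a
 * counter group that could not fit on the PMU in one round moves toward
 * the front for the next __perf_counter_sched_in() pass.
 */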
986 /*
987  * Cross CPU call to read the hardware counter
988  */
989 static void __read(void *info)
990 {
991         struct perf_counter *counter = info;
992         unsigned long flags;
993
994         curr_rq_lock_irq_save(&flags);
995         counter->hw_ops->read(counter);
996         curr_rq_unlock_irq_restore(&flags);
997 }
998
999 static u64 perf_counter_read(struct perf_counter *counter)
1000 {
1001         /*
1002          * If counter is enabled and currently active on a CPU, update the
1003          * value in the counter structure:
1004          */
1005         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1006                 smp_call_function_single(counter->oncpu,
1007                                          __read, counter, 1);
1008         }
1009
1010         return atomic64_read(&counter->count);
1011 }
1012
1013 /*
1014  * Cross CPU call to switch performance data pointers
1015  */
1016 static void __perf_switch_irq_data(void *info)
1017 {
1018         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1019         struct perf_counter *counter = info;
1020         struct perf_counter_context *ctx = counter->ctx;
1021         struct perf_data *oldirqdata = counter->irqdata;
1022
1023         /*
1024          * If this is a task context, we need to check whether it is
1025          * the current task context of this cpu. If not, it has been
1026          * scheduled out before the smp call arrived.
1027          */
1028         if (ctx->task) {
1029                 if (cpuctx->task_ctx != ctx)
1030                         return;
1031                 spin_lock(&ctx->lock);
1032         }
1033
1034         /* Change the pointer in an NMI-safe way */
1035         atomic_long_set((atomic_long_t *)&counter->irqdata,
1036                         (unsigned long) counter->usrdata);
1037         counter->usrdata = oldirqdata;
1038
1039         if (ctx->task)
1040                 spin_unlock(&ctx->lock);
1041 }
1042
1043 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1044 {
1045         struct perf_counter_context *ctx = counter->ctx;
1046         struct perf_data *oldirqdata = counter->irqdata;
1047         struct task_struct *task = ctx->task;
1048
1049         if (!task) {
1050                 smp_call_function_single(counter->cpu,
1051                                          __perf_switch_irq_data,
1052                                          counter, 1);
1053                 return counter->usrdata;
1054         }
1055
1056 retry:
1057         spin_lock_irq(&ctx->lock);
1058         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1059                 counter->irqdata = counter->usrdata;
1060                 counter->usrdata = oldirqdata;
1061                 spin_unlock_irq(&ctx->lock);
1062                 return oldirqdata;
1063         }
1064         spin_unlock_irq(&ctx->lock);
1065         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1066         /* Might have failed because the task was scheduled out */
1067         if (counter->irqdata == oldirqdata)
1068                 goto retry;
1069
1070         return counter->usrdata;
1071 }
1072
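/*
 * Note on the irqdata/usrdata pair: counter->data[0] and data[1] (set up
 * in perf_counter_alloc() below) form a simple double buffer.  New data
 * is collected into counter->irqdata while readers drain
 * counter->usrdata; perf_switch_irq_data() swaps the two pointers
 * (NMI-safely, via atomic_long_set() above) so that perf_read_irq_data()
 * below can drain what has accumulated on the IRQ side.
 */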
1073 static void put_context(struct perf_counter_context *ctx)
1074 {
1075         if (ctx->task)
1076                 put_task_struct(ctx->task);
1077 }
1078
1079 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1080 {
1081         struct perf_cpu_context *cpuctx;
1082         struct perf_counter_context *ctx;
1083         struct task_struct *task;
1084
1085         /*
1086          * If cpu is not a wildcard then this is a percpu counter:
1087          */
1088         if (cpu != -1) {
1089                 /* Must be root to operate on a CPU counter: */
1090                 if (!capable(CAP_SYS_ADMIN))
1091                         return ERR_PTR(-EACCES);
1092
1093                 if (cpu < 0 || cpu > num_possible_cpus())
1094                         return ERR_PTR(-EINVAL);
1095
1096                 /*
1097                  * We could be clever and allow a counter to be attached to an
1098                  * offline CPU, activating it when the CPU comes up, but
1099                  * that's for later.
1100                  */
1101                 if (!cpu_isset(cpu, cpu_online_map))
1102                         return ERR_PTR(-ENODEV);
1103
1104                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1105                 ctx = &cpuctx->ctx;
1106
1107                 return ctx;
1108         }
1109
1110         rcu_read_lock();
1111         if (!pid)
1112                 task = current;
1113         else
1114                 task = find_task_by_vpid(pid);
1115         if (task)
1116                 get_task_struct(task);
1117         rcu_read_unlock();
1118
1119         if (!task)
1120                 return ERR_PTR(-ESRCH);
1121
1122         ctx = &task->perf_counter_ctx;
1123         ctx->task = task;
1124
1125         /* Reuse ptrace permission checks for now. */
1126         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1127                 put_context(ctx);
1128                 return ERR_PTR(-EACCES);
1129         }
1130
1131         return ctx;
1132 }
1133
1134 /*
1135  * Called when the last reference to the file is gone.
1136  */
1137 static int perf_release(struct inode *inode, struct file *file)
1138 {
1139         struct perf_counter *counter = file->private_data;
1140         struct perf_counter_context *ctx = counter->ctx;
1141
1142         file->private_data = NULL;
1143
1144         mutex_lock(&ctx->mutex);
1145         mutex_lock(&counter->mutex);
1146
1147         perf_counter_remove_from_context(counter);
1148
1149         mutex_unlock(&counter->mutex);
1150         mutex_unlock(&ctx->mutex);
1151
1152         kfree(counter);
1153         put_context(ctx);
1154
1155         return 0;
1156 }
1157
1158 /*
1159  * Read the performance counter - simple non blocking version for now
1160  * Read the performance counter - simple non-blocking version for now
1161 static ssize_t
1162 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1163 {
1164         u64 cntval;
1165
1166         if (count != sizeof(cntval))
1167                 return -EINVAL;
1168
1169         /*
1170          * Return end-of-file for a read on a counter that is in
1171          * error state (i.e. because it was pinned but it couldn't be
1172          * scheduled on to the CPU at some point).
1173          */
1174         if (counter->state == PERF_COUNTER_STATE_ERROR)
1175                 return 0;
1176
1177         mutex_lock(&counter->mutex);
1178         cntval = perf_counter_read(counter);
1179         mutex_unlock(&counter->mutex);
1180
1181         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1182 }
1183
1184 static ssize_t
1185 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1186 {
1187         if (!usrdata->len)
1188                 return 0;
1189
1190         count = min(count, (size_t)usrdata->len);
1191         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1192                 return -EFAULT;
1193
1194         /* Adjust the counters */
1195         usrdata->len -= count;
1196         if (!usrdata->len)
1197                 usrdata->rd_idx = 0;
1198         else
1199                 usrdata->rd_idx += count;
1200
1201         return count;
1202 }
1203
1204 static ssize_t
1205 perf_read_irq_data(struct perf_counter  *counter,
1206                    char __user          *buf,
1207                    size_t               count,
1208                    int                  nonblocking)
1209 {
1210         struct perf_data *irqdata, *usrdata;
1211         DECLARE_WAITQUEUE(wait, current);
1212         ssize_t res, res2;
1213
1214         irqdata = counter->irqdata;
1215         usrdata = counter->usrdata;
1216
1217         if (usrdata->len + irqdata->len >= count)
1218                 goto read_pending;
1219
1220         if (nonblocking)
1221                 return -EAGAIN;
1222
1223         spin_lock_irq(&counter->waitq.lock);
1224         __add_wait_queue(&counter->waitq, &wait);
1225         for (;;) {
1226                 set_current_state(TASK_INTERRUPTIBLE);
1227                 if (usrdata->len + irqdata->len >= count)
1228                         break;
1229
1230                 if (signal_pending(current))
1231                         break;
1232
1233                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1234                         break;
1235
1236                 spin_unlock_irq(&counter->waitq.lock);
1237                 schedule();
1238                 spin_lock_irq(&counter->waitq.lock);
1239         }
1240         __remove_wait_queue(&counter->waitq, &wait);
1241         __set_current_state(TASK_RUNNING);
1242         spin_unlock_irq(&counter->waitq.lock);
1243
1244         if (usrdata->len + irqdata->len < count &&
1245             counter->state != PERF_COUNTER_STATE_ERROR)
1246                 return -ERESTARTSYS;
1247 read_pending:
1248         mutex_lock(&counter->mutex);
1249
1250         /* Drain pending data first: */
1251         res = perf_copy_usrdata(usrdata, buf, count);
1252         if (res < 0 || res == count)
1253                 goto out;
1254
1255         /* Switch irq buffer: */
1256         usrdata = perf_switch_irq_data(counter);
1257         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1258         if (res2 < 0) {
1259                 if (!res)
1260                         res = -EFAULT;
1261         } else {
1262                 res += res2;
1263         }
1264 out:
1265         mutex_unlock(&counter->mutex);
1266
1267         return res;
1268 }
1269
1270 static ssize_t
1271 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1272 {
1273         struct perf_counter *counter = file->private_data;
1274
1275         switch (counter->hw_event.record_type) {
1276         case PERF_RECORD_SIMPLE:
1277                 return perf_read_hw(counter, buf, count);
1278
1279         case PERF_RECORD_IRQ:
1280         case PERF_RECORD_GROUP:
1281                 return perf_read_irq_data(counter, buf, count,
1282                                           file->f_flags & O_NONBLOCK);
1283         }
1284         return -EINVAL;
1285 }
1286
1287 static unsigned int perf_poll(struct file *file, poll_table *wait)
1288 {
1289         struct perf_counter *counter = file->private_data;
1290         unsigned int events = 0;
1291         unsigned long flags;
1292
1293         poll_wait(file, &counter->waitq, wait);
1294
1295         spin_lock_irqsave(&counter->waitq.lock, flags);
1296         if (counter->usrdata->len || counter->irqdata->len)
1297                 events |= POLLIN;
1298         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1299
1300         return events;
1301 }
1302
1303 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1304 {
1305         struct perf_counter *counter = file->private_data;
1306         int err = 0;
1307
1308         switch (cmd) {
1309         case PERF_COUNTER_IOC_ENABLE:
1310                 perf_counter_enable_family(counter);
1311                 break;
1312         case PERF_COUNTER_IOC_DISABLE:
1313                 perf_counter_disable_family(counter);
1314                 break;
1315         default:
1316                 err = -ENOTTY;
1317         }
1318         return err;
1319 }
1320
1321 static const struct file_operations perf_fops = {
1322         .release                = perf_release,
1323         .read                   = perf_read,
1324         .poll                   = perf_poll,
1325         .unlocked_ioctl         = perf_ioctl,
1326         .compat_ioctl           = perf_ioctl,
1327 };
1328
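/*
 * Illustrative sketch (userspace, not part of this file): for a counter
 * opened with record_type == PERF_RECORD_SIMPLE, a read() of exactly
 * 8 bytes returns the current count (see perf_read_hw() above):
 *
 *	unsigned long long value;
 *
 *	if (read(counter_fd, &value, sizeof(value)) == sizeof(value))
 *		printf("count: %llu\n", value);
 *
 * A return value of 0 indicates the counter is in error state.
 */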
1329 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1330 {
1331         int cpu = raw_smp_processor_id();
1332
1333         atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1334         return 0;
1335 }
1336
1337 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1338 {
1339         int cpu = raw_smp_processor_id();
1340         s64 prev;
1341         u64 now;
1342
1343         now = cpu_clock(cpu);
1344         prev = atomic64_read(&counter->hw.prev_count);
1345         atomic64_set(&counter->hw.prev_count, now);
1346         atomic64_add(now - prev, &counter->count);
1347 }
1348
1349 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1350 {
1351         cpu_clock_perf_counter_update(counter);
1352 }
1353
1354 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1355 {
1356         cpu_clock_perf_counter_update(counter);
1357 }
1358
1359 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1360         .enable         = cpu_clock_perf_counter_enable,
1361         .disable        = cpu_clock_perf_counter_disable,
1362         .read           = cpu_clock_perf_counter_read,
1363 };
1364
1365 /*
1366  * Called from within the scheduler:
1367  */
1368 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1369 {
1370         struct task_struct *curr = counter->task;
1371         u64 delta;
1372
1373         delta = __task_delta_exec(curr, update);
1374
1375         return curr->se.sum_exec_runtime + delta;
1376 }
1377
1378 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1379 {
1380         u64 prev;
1381         s64 delta;
1382
1383         prev = atomic64_read(&counter->hw.prev_count);
1384
1385         atomic64_set(&counter->hw.prev_count, now);
1386
1387         delta = now - prev;
1388
1389         atomic64_add(delta, &counter->count);
1390 }
1391
1392 static void task_clock_perf_counter_read(struct perf_counter *counter)
1393 {
1394         u64 now = task_clock_perf_counter_val(counter, 1);
1395
1396         task_clock_perf_counter_update(counter, now);
1397 }
1398
1399 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1400 {
1401         u64 now = task_clock_perf_counter_val(counter, 0);
1402
1403         atomic64_set(&counter->hw.prev_count, now);
1404
1405         return 0;
1406 }
1407
1408 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1409 {
1410         u64 now = task_clock_perf_counter_val(counter, 0);
1411
1412         task_clock_perf_counter_update(counter, now);
1413 }
1414
1415 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1416         .enable         = task_clock_perf_counter_enable,
1417         .disable        = task_clock_perf_counter_disable,
1418         .read           = task_clock_perf_counter_read,
1419 };
1420
1421 #ifdef CONFIG_VM_EVENT_COUNTERS
1422 #define cpu_page_faults()       __get_cpu_var(vm_event_states).event[PGFAULT]
1423 #else
1424 #define cpu_page_faults()       0
1425 #endif
1426
1427 static u64 get_page_faults(struct perf_counter *counter)
1428 {
1429         struct task_struct *curr = counter->ctx->task;
1430
1431         if (curr)
1432                 return curr->maj_flt + curr->min_flt;
1433         return cpu_page_faults();
1434 }
1435
1436 static void page_faults_perf_counter_update(struct perf_counter *counter)
1437 {
1438         u64 prev, now;
1439         s64 delta;
1440
1441         prev = atomic64_read(&counter->hw.prev_count);
1442         now = get_page_faults(counter);
1443
1444         atomic64_set(&counter->hw.prev_count, now);
1445
1446         delta = now - prev;
1447
1448         atomic64_add(delta, &counter->count);
1449 }
1450
1451 static void page_faults_perf_counter_read(struct perf_counter *counter)
1452 {
1453         page_faults_perf_counter_update(counter);
1454 }
1455
1456 static int page_faults_perf_counter_enable(struct perf_counter *counter)
1457 {
1458         atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
1459         return 0;
1460 }
1461
1462 static void page_faults_perf_counter_disable(struct perf_counter *counter)
1463 {
1464         page_faults_perf_counter_update(counter);
1465 }
1466
1467 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1468         .enable         = page_faults_perf_counter_enable,
1469         .disable        = page_faults_perf_counter_disable,
1470         .read           = page_faults_perf_counter_read,
1471 };
1472
1473 static u64 get_context_switches(struct perf_counter *counter)
1474 {
1475         struct task_struct *curr = counter->ctx->task;
1476
1477         if (curr)
1478                 return curr->nvcsw + curr->nivcsw;
1479         return cpu_nr_switches(smp_processor_id());
1480 }
1481
1482 static void context_switches_perf_counter_update(struct perf_counter *counter)
1483 {
1484         u64 prev, now;
1485         s64 delta;
1486
1487         prev = atomic64_read(&counter->hw.prev_count);
1488         now = get_context_switches(counter);
1489
1490         atomic64_set(&counter->hw.prev_count, now);
1491
1492         delta = now - prev;
1493
1494         atomic64_add(delta, &counter->count);
1495 }
1496
1497 static void context_switches_perf_counter_read(struct perf_counter *counter)
1498 {
1499         context_switches_perf_counter_update(counter);
1500 }
1501
1502 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1503 {
1504         atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
1505         return 0;
1506 }
1507
1508 static void context_switches_perf_counter_disable(struct perf_counter *counter)
1509 {
1510         context_switches_perf_counter_update(counter);
1511 }
1512
1513 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1514         .enable         = context_switches_perf_counter_enable,
1515         .disable        = context_switches_perf_counter_disable,
1516         .read           = context_switches_perf_counter_read,
1517 };
1518
1519 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1520 {
1521         struct task_struct *curr = counter->ctx->task;
1522
1523         if (curr)
1524                 return curr->se.nr_migrations;
1525         return cpu_nr_migrations(smp_processor_id());
1526 }
1527
1528 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1529 {
1530         u64 prev, now;
1531         s64 delta;
1532
1533         prev = atomic64_read(&counter->hw.prev_count);
1534         now = get_cpu_migrations(counter);
1535
1536         atomic64_set(&counter->hw.prev_count, now);
1537
1538         delta = now - prev;
1539
1540         atomic64_add(delta, &counter->count);
1541 }
1542
1543 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1544 {
1545         cpu_migrations_perf_counter_update(counter);
1546 }
1547
1548 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1549 {
1550         atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
1551         return 0;
1552 }
1553
1554 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1555 {
1556         cpu_migrations_perf_counter_update(counter);
1557 }
1558
1559 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1560         .enable         = cpu_migrations_perf_counter_enable,
1561         .disable        = cpu_migrations_perf_counter_disable,
1562         .read           = cpu_migrations_perf_counter_read,
1563 };
1564
1565 static const struct hw_perf_counter_ops *
1566 sw_perf_counter_init(struct perf_counter *counter)
1567 {
1568         const struct hw_perf_counter_ops *hw_ops = NULL;
1569
1570         /*
1571          * Software counters (currently) can't in general distinguish
1572          * between user, kernel and hypervisor events.
1573          * However, context switches and cpu migrations are considered
1574          * to be kernel events, and page faults are never hypervisor
1575          * events.
1576          */
1577         switch (counter->hw_event.type) {
1578         case PERF_COUNT_CPU_CLOCK:
1579                 if (!(counter->hw_event.exclude_user ||
1580                       counter->hw_event.exclude_kernel ||
1581                       counter->hw_event.exclude_hv))
1582                         hw_ops = &perf_ops_cpu_clock;
1583                 break;
1584         case PERF_COUNT_TASK_CLOCK:
1585                 if (counter->hw_event.exclude_user ||
1586                     counter->hw_event.exclude_kernel ||
1587                     counter->hw_event.exclude_hv)
1588                         break;
1589                 /*
1590                  * If the user instantiates this as a per-cpu counter,
1591                  * use the cpu_clock counter instead.
1592                  */
1593                 if (counter->ctx->task)
1594                         hw_ops = &perf_ops_task_clock;
1595                 else
1596                         hw_ops = &perf_ops_cpu_clock;
1597                 break;
1598         case PERF_COUNT_PAGE_FAULTS:
1599                 if (!(counter->hw_event.exclude_user ||
1600                       counter->hw_event.exclude_kernel))
1601                         hw_ops = &perf_ops_page_faults;
1602                 break;
1603         case PERF_COUNT_CONTEXT_SWITCHES:
1604                 if (!counter->hw_event.exclude_kernel)
1605                         hw_ops = &perf_ops_context_switches;
1606                 break;
1607         case PERF_COUNT_CPU_MIGRATIONS:
1608                 if (!counter->hw_event.exclude_kernel)
1609                         hw_ops = &perf_ops_cpu_migrations;
1610                 break;
1611         default:
1612                 break;
1613         }
1614         return hw_ops;
1615 }
1616
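/*
 * Illustrative sketch (not part of this file): a hw_event configuration
 * that sw_perf_counter_init() above would route to perf_ops_task_clock
 * when the counter is opened against a task context:
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type		= PERF_COUNT_TASK_CLOCK,
 *		.disabled	= 1,
 *	};
 *
 * .disabled = 1 makes the counter start in the OFF state (see
 * perf_counter_alloc() below); setting any of the exclude_user,
 * exclude_kernel or exclude_hv bits would make the function above
 * return NULL for this event type.
 */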
1617 /*
1618  * Allocate and initialize a counter structure
1619  */
1620 static struct perf_counter *
1621 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1622                    int cpu,
1623                    struct perf_counter_context *ctx,
1624                    struct perf_counter *group_leader,
1625                    gfp_t gfpflags)
1626 {
1627         const struct hw_perf_counter_ops *hw_ops;
1628         struct perf_counter *counter;
1629
1630         counter = kzalloc(sizeof(*counter), gfpflags);
1631         if (!counter)
1632                 return NULL;
1633
1634         /*
1635          * Single counters are their own group leaders, with an
1636          * empty sibling list:
1637          */
1638         if (!group_leader)
1639                 group_leader = counter;
1640
1641         mutex_init(&counter->mutex);
1642         INIT_LIST_HEAD(&counter->list_entry);
1643         INIT_LIST_HEAD(&counter->sibling_list);
1644         init_waitqueue_head(&counter->waitq);
1645
1646         INIT_LIST_HEAD(&counter->child_list);
1647
1648         counter->irqdata                = &counter->data[0];
1649         counter->usrdata                = &counter->data[1];
1650         counter->cpu                    = cpu;
1651         counter->hw_event               = *hw_event;
1652         counter->wakeup_pending         = 0;
1653         counter->group_leader           = group_leader;
1654         counter->hw_ops                 = NULL;
1655         counter->ctx                    = ctx;
1656
1657         counter->state = PERF_COUNTER_STATE_INACTIVE;
1658         if (hw_event->disabled)
1659                 counter->state = PERF_COUNTER_STATE_OFF;
1660
1661         hw_ops = NULL;
1662         if (!hw_event->raw && hw_event->type < 0)
1663                 hw_ops = sw_perf_counter_init(counter);
1664         else
1665                 hw_ops = hw_perf_counter_init(counter);
1666
1667         if (!hw_ops) {
1668                 kfree(counter);
1669                 return NULL;
1670         }
1671         counter->hw_ops = hw_ops;
1672
1673         return counter;
1674 }
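/*
 * Sketch of the ops-selection rule above, for reference only: software
 * counter types are encoded as negative values (hence the type < 0 test),
 * so a non-raw event with a negative type is routed to
 * sw_perf_counter_init(), while raw events and non-negative types go to
 * the architecture's hw_perf_counter_init():
 *
 *	ev.raw = 0; ev.type = PERF_COUNT_CPU_CLOCK;	// software path
 *	ev.raw = 1; ev.type = 0x003c;			// hardware path
 *				// (the raw code is an arbitrary illustration)
 */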
1675
1676 /**
1677  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1678  *
1679  * @hw_event_uptr:      event type attributes for monitoring/sampling
1680  * @pid:                target pid
1681  * @cpu:                target cpu
1682  * @group_fd:           group leader counter fd
1683  */
1684 asmlinkage int
1685 sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1686                       pid_t pid, int cpu, int group_fd)
1687 {
1688         struct perf_counter *counter, *group_leader;
1689         struct perf_counter_hw_event hw_event;
1690         struct perf_counter_context *ctx;
1691         struct file *counter_file = NULL;
1692         struct file *group_file = NULL;
1693         int fput_needed = 0;
1694         int fput_needed2 = 0;
1695         int ret;
1696
1697         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1698                 return -EFAULT;
1699
1700         /*
1701          * Get the target context (task or percpu):
1702          */
1703         ctx = find_get_context(pid, cpu);
1704         if (IS_ERR(ctx))
1705                 return PTR_ERR(ctx);
1706
1707         /*
1708          * Look up the group leader (we will attach this counter to it):
1709          */
1710         group_leader = NULL;
1711         if (group_fd != -1) {
1712                 ret = -EINVAL;
1713                 group_file = fget_light(group_fd, &fput_needed);
1714                 if (!group_file)
1715                         goto err_put_context;
1716                 if (group_file->f_op != &perf_fops)
1717                         goto err_put_context;
1718
1719                 group_leader = group_file->private_data;
1720                 /*
1721                  * Do not allow a recursive hierarchy (this new sibling
1722                  * becoming part of another group-sibling):
1723                  */
1724                 if (group_leader->group_leader != group_leader)
1725                         goto err_put_context;
1726                 /*
1727                  * Do not allow to attach to a group in a different
1728                  * task or CPU context:
1729                  */
1730                 if (group_leader->ctx != ctx)
1731                         goto err_put_context;
1732                 /*
1733                  * Only a group leader can be exclusive or pinned
1734                  */
1735                 if (hw_event.exclusive || hw_event.pinned)
1736                         goto err_put_context;
1737         }
1738
1739         ret = -EINVAL;
1740         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1741                                      GFP_KERNEL);
1742         if (!counter)
1743                 goto err_put_context;
1744
1745         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1746         if (ret < 0)
1747                 goto err_free_put_context;
1748
1749         counter_file = fget_light(ret, &fput_needed2);
1750         if (!counter_file)
1751                 goto err_free_put_context;
1752
1753         counter->filp = counter_file;
1754         mutex_lock(&ctx->mutex);
1755         perf_install_in_context(ctx, counter, cpu);
1756         mutex_unlock(&ctx->mutex);
1757
1758         fput_light(counter_file, fput_needed2);
1759
1760 out_fput:
1761         fput_light(group_file, fput_needed);
1762
1763         return ret;
1764
1765 err_free_put_context:
1766         kfree(counter);
1767
1768 err_put_context:
1769         put_context(ctx);
1770
1771         goto out_fput;
1772 }
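/*
 * Hedged userspace sketch (an illustration, not part of this file): one
 * plausible way to exercise sys_perf_counter_open() above.  It assumes
 * the architecture defines __NR_perf_counter_open, that
 * <linux/perf_counter.h> is exported to userspace, and that read() on
 * the returned fd yields the raw 64-bit count:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_counter.h>
 *
 *	int main(void)
 *	{
 *		struct perf_counter_hw_event ev;
 *		unsigned long long count;
 *		int fd;
 *
 *		memset(&ev, 0, sizeof(ev));
 *		ev.type = PERF_COUNT_TASK_CLOCK;
 *
 *		// pid 0: current task, cpu -1: any cpu, group_fd -1: no group
 *		fd = syscall(__NR_perf_counter_open, &ev, 0, -1, -1);
 *		if (fd < 0)
 *			return 1;
 *
 *		// ... workload to be measured ...
 *
 *		if (read(fd, &count, sizeof(count)) == sizeof(count))
 *			printf("task clock: %llu\n", count);
 *		close(fd);
 *		return 0;
 *	}
 */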
1773
1774 /*
1775  * Initialize the perf_counter context in a task_struct:
1776  */
1777 static void
1778 __perf_counter_init_context(struct perf_counter_context *ctx,
1779                             struct task_struct *task)
1780 {
1781         memset(ctx, 0, sizeof(*ctx));
1782         spin_lock_init(&ctx->lock);
1783         mutex_init(&ctx->mutex);
1784         INIT_LIST_HEAD(&ctx->counter_list);
1785         ctx->task = task;
1786 }
1787
1788 /*
1789  * inherit a counter from parent task to child task:
1790  */
1791 static struct perf_counter *
1792 inherit_counter(struct perf_counter *parent_counter,
1793               struct task_struct *parent,
1794               struct perf_counter_context *parent_ctx,
1795               struct task_struct *child,
1796               struct perf_counter *group_leader,
1797               struct perf_counter_context *child_ctx)
1798 {
1799         struct perf_counter *child_counter;
1800
1801         /*
1802          * Instead of creating recursive hierarchies of counters,
1803          * we link inherited counters back to the original parent,
1804          * which is guaranteed to have a filp; we use that filp as
1805          * the reference count:
1806          */
1807         if (parent_counter->parent)
1808                 parent_counter = parent_counter->parent;
1809
1810         child_counter = perf_counter_alloc(&parent_counter->hw_event,
1811                                            parent_counter->cpu, child_ctx,
1812                                            group_leader, GFP_KERNEL);
1813         if (!child_counter)
1814                 return NULL;
1815
1816         /*
1817          * Link it up in the child's context:
1818          */
1819         child_counter->task = child;
1820         list_add_counter(child_counter, child_ctx);
1821         child_ctx->nr_counters++;
1822
1823         child_counter->parent = parent_counter;
1824         /*
1825          * inherit into child's child as well:
1826          */
1827         child_counter->hw_event.inherit = 1;
1828
1829         /*
1830          * Get a reference to the parent filp - we will fput it
1831          * when the child counter exits. This is safe to do because
1832          * we are in the parent and we know that the filp still
1833          * exists and has a nonzero count:
1834          */
1835         atomic_long_inc(&parent_counter->filp->f_count);
1836
1837         /*
1838          * Link this into the parent counter's child list
1839          */
1840         mutex_lock(&parent_counter->mutex);
1841         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
1842
1843         /*
1844          * Make the child state follow the state of the parent counter,
1845          * not its hw_event.disabled bit.  We hold the parent's mutex,
1846          * so we won't race with perf_counter_{en,dis}able_family.
1847          */
1848         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
1849                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1850         else
1851                 child_counter->state = PERF_COUNTER_STATE_OFF;
1852
1853         mutex_unlock(&parent_counter->mutex);
1854
1855         return child_counter;
1856 }
1857
1858 static int inherit_group(struct perf_counter *parent_counter,
1859               struct task_struct *parent,
1860               struct perf_counter_context *parent_ctx,
1861               struct task_struct *child,
1862               struct perf_counter_context *child_ctx)
1863 {
1864         struct perf_counter *leader;
1865         struct perf_counter *sub;
1866
1867         leader = inherit_counter(parent_counter, parent, parent_ctx,
1868                                  child, NULL, child_ctx);
1869         if (!leader)
1870                 return -ENOMEM;
1871         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
1872                 if (!inherit_counter(sub, parent, parent_ctx,
1873                                      child, leader, child_ctx))
1874                         return -ENOMEM;
1875         }
1876         return 0;
1877 }
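/*
 * Shape of the result, for reference: a parent group
 *
 *	leader { sib1, sib2 }
 *
 * is inherited as a new leader' in child_ctx (allocated with a NULL
 * group leader, so it leads itself), followed by sib1' and sib2'
 * allocated with leader' as their group leader - the child's group
 * layout mirrors the parent's.
 */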
1878
1879 static void sync_child_counter(struct perf_counter *child_counter,
1880                                struct perf_counter *parent_counter)
1881 {
1882         u64 parent_val, child_val;
1883
1884         parent_val = atomic64_read(&parent_counter->count);
1885         child_val = atomic64_read(&child_counter->count);
1886
1887         /*
1888          * Add back the child's count to the parent's count:
1889          */
1890         atomic64_add(child_val, &parent_counter->count);
1891
1892         /*
1893          * Remove this counter from the parent's list
1894          */
1895         mutex_lock(&parent_counter->mutex);
1896         list_del_init(&child_counter->child_list);
1897         mutex_unlock(&parent_counter->mutex);
1898
1899         /*
1900          * Release the parent counter, if this was the last
1901          * reference to it.
1902          */
1903         fput(parent_counter->filp);
1904 }
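/*
 * Worked example of the feedback above: if the parent counter currently
 * reads 1000 and the exiting child's inherited counter accumulated 250,
 * the atomic64_add() leaves the parent at 1250, so a later read() on the
 * parent's fd reflects both the parent's and the child's activity.
 */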
1905
1906 static void
1907 __perf_counter_exit_task(struct task_struct *child,
1908                          struct perf_counter *child_counter,
1909                          struct perf_counter_context *child_ctx)
1910 {
1911         struct perf_counter *parent_counter;
1912         struct perf_counter *sub, *tmp;
1913
1914         /*
1915          * If we do not self-reap then we have to wait for the
1916          * child task to unschedule (which is guaranteed to happen),
1917          * so that its counter has reached its final count.  (This
1918          * condition triggers rarely - child tasks usually get
1919          * off their CPU before the parent has a chance to get
1920          * this far into the reaping action.)
1921          */
1922         if (child != current) {
1923                 wait_task_inactive(child, 0);
1924                 list_del_init(&child_counter->list_entry);
1925         } else {
1926                 struct perf_cpu_context *cpuctx;
1927                 unsigned long flags;
1928                 u64 perf_flags;
1929
1930                 /*
1931                  * Disable and unlink this counter.
1932                  *
1933                  * Be careful about zapping the list - IRQ/NMI context
1934                  * could still be processing it:
1935                  */
1936                 curr_rq_lock_irq_save(&flags);
1937                 perf_flags = hw_perf_save_disable();
1938
1939                 cpuctx = &__get_cpu_var(perf_cpu_context);
1940
1941                 group_sched_out(child_counter, cpuctx, child_ctx);
1942
1943                 list_del_init(&child_counter->list_entry);
1944
1945                 child_ctx->nr_counters--;
1946
1947                 hw_perf_restore(perf_flags);
1948                 curr_rq_unlock_irq_restore(&flags);
1949         }
1950
1951         parent_counter = child_counter->parent;
1952         /*
1953          * It can happen that the parent exits first, and still has
1954          * counters around due to the child's reference.  These
1955          * counters need to be zapped - but otherwise they linger.
1956          */
1957         if (parent_counter) {
1958                 sync_child_counter(child_counter, parent_counter);
1959                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
1960                                          list_entry) {
1961                         if (sub->parent) {
1962                                 sync_child_counter(sub, sub->parent);
1963                                 kfree(sub);
1964                         }
1965                 }
1966                 kfree(child_counter);
1967         }
1968 }
1969
1970 /*
1971  * When a child task exits, feed back counter values to parent counters.
1972  *
1973  * Note: we may be running in child context, but the PID is not hashed
1974  * anymore so new counters will not be added.
1975  */
1976 void perf_counter_exit_task(struct task_struct *child)
1977 {
1978         struct perf_counter *child_counter, *tmp;
1979         struct perf_counter_context *child_ctx;
1980
1981         child_ctx = &child->perf_counter_ctx;
1982
1983         if (likely(!child_ctx->nr_counters))
1984                 return;
1985
1986         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1987                                  list_entry)
1988                 __perf_counter_exit_task(child, child_counter, child_ctx);
1989 }
1990
1991 /*
1992  * Initialize the perf_counter context in task_struct
1993  */
1994 void perf_counter_init_task(struct task_struct *child)
1995 {
1996         struct perf_counter_context *child_ctx, *parent_ctx;
1997         struct perf_counter *counter;
1998         struct task_struct *parent = current;
1999
2000         child_ctx  =  &child->perf_counter_ctx;
2001         parent_ctx = &parent->perf_counter_ctx;
2002
2003         __perf_counter_init_context(child_ctx, child);
2004
2005         /*
2006          * This is executed from the parent task context, so inherit
2007          * counters that have been marked for cloning:
2008          */
2009
2010         if (likely(!parent_ctx->nr_counters))
2011                 return;
2012
2013         /*
2014          * Lock the parent list. No need to lock the child - not PID
2015          * hashed yet and not running, so nobody can access it.
2016          */
2017         mutex_lock(&parent_ctx->mutex);
2018
2019         /*
2020          * We don't have to disable NMIs - we are only looking at
2021          * the list, not manipulating it:
2022          */
2023         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2024                 if (!counter->hw_event.inherit)
2025                         continue;
2026
2027                 if (inherit_group(counter, parent,
2028                                   parent_ctx, child, child_ctx))
2029                         break;
2030         }
2031
2032         mutex_unlock(&parent_ctx->mutex);
2033 }
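/*
 * Illustrative sketch, continuing the userspace assumptions of the
 * sys_perf_counter_open() example earlier (plus <sys/wait.h>): setting
 * hw_event.inherit is what routes a counter through the inheritance
 * path above on fork():
 *
 *	memset(&ev, 0, sizeof(ev));
 *	ev.type    = PERF_COUNT_TASK_CLOCK;
 *	ev.inherit = 1;			// clone the counter into children
 *	fd = syscall(__NR_perf_counter_open, &ev, 0, -1, -1);
 *
 *	if (fork() == 0) {
 *		// work done here is counted via the inherited counter and
 *		// fed back to the parent counter when the child exits
 *		_exit(0);
 *	}
 *	wait(NULL);
 *	read(fd, &count, sizeof(count));	// includes the child's time
 */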
2034
2035 static void __cpuinit perf_counter_init_cpu(int cpu)
2036 {
2037         struct perf_cpu_context *cpuctx;
2038
2039         cpuctx = &per_cpu(perf_cpu_context, cpu);
2040         __perf_counter_init_context(&cpuctx->ctx, NULL);
2041
2042         mutex_lock(&perf_resource_mutex);
2043         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2044         mutex_unlock(&perf_resource_mutex);
2045
2046         hw_perf_counter_setup(cpu);
2047 }
2048
2049 #ifdef CONFIG_HOTPLUG_CPU
2050 static void __perf_counter_exit_cpu(void *info)
2051 {
2052         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2053         struct perf_counter_context *ctx = &cpuctx->ctx;
2054         struct perf_counter *counter, *tmp;
2055
2056         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2057                 __perf_counter_remove_from_context(counter);
2058 }
2059 static void perf_counter_exit_cpu(int cpu)
2060 {
2061         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2062         struct perf_counter_context *ctx = &cpuctx->ctx;
2063
2064         mutex_lock(&ctx->mutex);
2065         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2066         mutex_unlock(&ctx->mutex);
2067 }
2068 #else
2069 static inline void perf_counter_exit_cpu(int cpu) { }
2070 #endif
2071
2072 static int __cpuinit
2073 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2074 {
2075         unsigned int cpu = (long)hcpu;
2076
2077         switch (action) {
2078
2079         case CPU_UP_PREPARE:
2080         case CPU_UP_PREPARE_FROZEN:
2081                 perf_counter_init_cpu(cpu);
2082                 break;
2083
2084         case CPU_DOWN_PREPARE:
2085         case CPU_DOWN_PREPARE_FROZEN:
2086                 perf_counter_exit_cpu(cpu);
2087                 break;
2088
2089         default:
2090                 break;
2091         }
2092
2093         return NOTIFY_OK;
2094 }
2095
2096 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2097         .notifier_call          = perf_cpu_notify,
2098 };
2099
2100 static int __init perf_counter_init(void)
2101 {
2102         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2103                         (void *)(long)smp_processor_id());
2104         register_cpu_notifier(&perf_cpu_nb);
2105
2106         return 0;
2107 }
2108 early_initcall(perf_counter_init);
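/*
 * Note on the init sequence above: the boot CPU never passes through the
 * hotplug notifier chain, so perf_counter_init() calls the notifier by
 * hand with CPU_UP_PREPARE for the current (boot) CPU before registering
 * it to handle CPUs that are hotplugged later.
 */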
2109
2110 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2111 {
2112         return sprintf(buf, "%d\n", perf_reserved_percpu);
2113 }
2114
2115 static ssize_t
2116 perf_set_reserve_percpu(struct sysdev_class *class,
2117                         const char *buf,
2118                         size_t count)
2119 {
2120         struct perf_cpu_context *cpuctx;
2121         unsigned long val;
2122         int err, cpu, mpt;
2123
2124         err = strict_strtoul(buf, 10, &val);
2125         if (err)
2126                 return err;
2127         if (val > perf_max_counters)
2128                 return -EINVAL;
2129
2130         mutex_lock(&perf_resource_mutex);
2131         perf_reserved_percpu = val;
2132         for_each_online_cpu(cpu) {
2133                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2134                 spin_lock_irq(&cpuctx->ctx.lock);
2135                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2136                           perf_max_counters - perf_reserved_percpu);
2137                 cpuctx->max_pertask = mpt;
2138                 spin_unlock_irq(&cpuctx->ctx.lock);
2139         }
2140         mutex_unlock(&perf_resource_mutex);
2141
2142         return count;
2143 }
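/*
 * Worked example of the arithmetic above, assuming perf_max_counters is 8:
 * after writing 2 to this attribute, a CPU whose per-cpu context already
 * holds 3 counters gets max_pertask = min(8 - 3, 8 - 2) = 5, i.e. at most
 * 5 per-task counters may still be scheduled there while 2 slots remain
 * reserved for per-cpu counters.
 */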
2144
2145 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2146 {
2147         return sprintf(buf, "%d\n", perf_overcommit);
2148 }
2149
2150 static ssize_t
2151 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2152 {
2153         unsigned long val;
2154         int err;
2155
2156         err = strict_strtoul(buf, 10, &val);
2157         if (err)
2158                 return err;
2159         if (val > 1)
2160                 return -EINVAL;
2161
2162         mutex_lock(&perf_resource_mutex);
2163         perf_overcommit = val;
2164         mutex_unlock(&perf_resource_mutex);
2165
2166         return count;
2167 }
2168
2169 static SYSDEV_CLASS_ATTR(
2170                                 reserve_percpu,
2171                                 0644,
2172                                 perf_show_reserve_percpu,
2173                                 perf_set_reserve_percpu
2174                         );
2175
2176 static SYSDEV_CLASS_ATTR(
2177                                 overcommit,
2178                                 0644,
2179                                 perf_show_overcommit,
2180                                 perf_set_overcommit
2181                         );
2182
2183 static struct attribute *perfclass_attrs[] = {
2184         &attr_reserve_percpu.attr,
2185         &attr_overcommit.attr,
2186         NULL
2187 };
2188
2189 static struct attribute_group perfclass_attr_group = {
2190         .attrs                  = perfclass_attrs,
2191         .name                   = "perf_counters",
2192 };
2193
2194 static int __init perf_counter_sysfs_init(void)
2195 {
2196         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2197                                   &perfclass_attr_group);
2198 }
2199 device_initcall(perf_counter_sysfs_init);
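/*
 * Usage note (hedged): with the "perf_counters" group created under the
 * cpu sysdev class, these attributes are expected to appear as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit; writing a decimal
 * value to either one takes effect under perf_resource_mutex as shown
 * in the store handlers above.
 */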