enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
+ EVENT_TIME = 0x4,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
/*
* Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
static bool is_orphaned_event(struct perf_event *event)
{
- return event->state == PERF_EVENT_STATE_EXIT;
+ return event->state == PERF_EVENT_STATE_DEAD;
}
static inline int pmu_filter_match(struct perf_event *event)
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
}
#define DETACH_GROUP 0x01UL
-#define DETACH_STATE 0x02UL
/*
* Cross CPU call to remove a performance event
if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_STATE)
- event->state = PERF_EVENT_STATE_EXIT;
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
event->tstamp_stopped = tstamp;
}
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
struct task_struct *task)
*/
raw_spin_lock_irq(&ctx->lock);
task = ctx->task;
+
/*
- * Worse, we cannot even rely on the ctx actually existing anymore. If
- * between find_get_context() and perf_install_in_context() the task
- * went through perf_event_exit_task() its dead and we should not be
- * adding new events.
+ * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
+ * ctx gets destroyed, we must not install an event into it.
+ *
+ * This is normally tested for after we acquire the mutex, so this is
+ * a sanity check.
*/
- if (task == TASK_TOMBSTONE) {
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
- update_context_time(ctx);
- /*
- * Update cgrp time only if current cgrp matches event->cgrp.
- * Must be done before calling add_event_to_ctx().
- */
- update_cgrp_time_from_event(event);
+
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+
add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
event->state <= PERF_EVENT_STATE_ERROR)
return;
- update_context_time(ctx);
+ if (ctx->is_active)
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+
__perf_event_mark_enabled(event);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
- if (is_cgroup_event(event)) {
- perf_cgroup_set_timestamp(current, ctx); // XXX ?
+ if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- }
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
+ }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
}
ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
cpuctx->task_ctx = NULL;
}
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- if (!ctx->nr_active)
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* update (and stop) ctx time */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
+ if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
perf_pmu_disable(ctx->pmu);
- if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ if (is_active & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
- if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
perf_cgroup_sched_out(task, next);
}
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- if (!cpuctx->task_ctx)
- return;
-
- if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
- return;
-
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-}
-
/*
* Called with IRQs disabled
*/
if (likely(!ctx->nr_events))
return;
- ctx->is_active |= event_type;
+ ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
cpuctx->task_ctx = ctx;
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* start ctx time */
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ }
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ if (is_active & EVENT_PINNED)
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
- if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ if (is_active & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);
}
cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
enabled |= event_enable_on_exec(event, ctx);
if (has_branch_stack(event))
dec = true;
- if (dec)
- static_key_slow_dec_deferred(&perf_sched_events);
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
unaccount_event_cpu(event, event->cpu);
}
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
*/
int perf_event_release_kernel(struct perf_event *event)
{
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ /*
+ * If we got here through err_file: fput(event_file); we will not have
+ * attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
- perf_event_ctx_unlock(event, ctx);
+ perf_remove_from_context(event, DETACH_GROUP);
+ raw_spin_lock_irq(&ctx->lock);
/*
- * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
- * either from the above perf_remove_from_context() or through
- * perf_event_exit_event().
+ * Mark this even as STATE_DEAD, there is no external reference to it
+ * anymore.
*
- * Therefore, anybody acquiring event->child_mutex after the below
- * loop _must_ also see this, most importantly inherit_event() which
- * will avoid placing more children on the list.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
+ perf_event_ctx_unlock(event, ctx);
again:
mutex_lock(&event->child_mutex);
}
mutex_unlock(&event->child_mutex);
- /* Must be the last reference */
- put_event(event);
+no_ctx:
+ put_event(event); /* Must be the 'last' reference */
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
{
bool no_children;
- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
if (is_cgroup_event(event))
inc = true;
- if (inc)
- static_key_slow_inc(&perf_sched_events.key);
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
account_event_cpu(event, event->cpu);
}
if (move_group) {
gctx = group_leader->ctx;
mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ if (gctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
if (!perf_event_validate_size(event)) {
err = -E2BIG;
goto err_locked;
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
err_cpus:
put_online_cpus();
err_task:
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_unlock;
+ }
+
if (!exclusive_event_installable(event, ctx)) {
- mutex_unlock(&ctx->mutex);
- perf_unpin_context(ctx);
- put_ctx(ctx);
err = -EBUSY;
- goto err_free;
+ goto err_unlock;
}
perf_install_in_context(ctx, event, cpu);
return event;
+err_unlock:
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_free:
free_event(event);
err:
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+ child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
- case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.