
Commit 17ef32a

Merge tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Borislav Petkov:

 - Avoid a crash on a heterogeneous machine where not all cores support
   the same hw events features

 - Avoid a deadlock when throttling events

 - Document the perf event states more

 - Make sure a number of perf paths switching off or rescheduling events
   call perf_cgroup_event_disable()

 - Make sure perf does task sampling before its userspace mapping is
   torn down, and not after

* tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/intel: Fix crash in icl_update_topdown_event()
  perf: Fix the throttle error of some clock events
  perf: Add comment to enum perf_event_state
  perf/core: Fix WARN in perf_cgroup_switch()
  perf: Fix dangling cgroup pointer in cpuctx
  perf: Fix cgroup state vs ERROR
  perf: Fix sample vs do_exit()
2 parents aff2a7e + b0823d5 commit 17ef32a

File tree: 4 files changed, +124 −53 lines


arch/x86/events/intel/core.c

Lines changed: 1 addition & 1 deletion
@@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event)
 		 * If the PEBS counters snapshotting is enabled,
 		 * the topdown event is available in PEBS records.
 		 */
-		if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
+		if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
 			static_call(intel_pmu_update_topdown_event)(event, NULL);
 		else
 			intel_pmu_drain_pebs_buffer();
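What changed here is which question the code asks. As I read the x86 helpers, is_topdown_event() answers "does the event's config describe a topdown (slots/metrics) event?", while is_topdown_count() answers "was this event actually set up as a topdown counter on the PMU it landed on?". On a heterogeneous machine only the second is safe to act on, because a core that lacks the feature never set the event up that way. The model below is a self-contained illustration of that distinction; the struct, flag, encoding, and function names are invented for the sketch and are not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model only; field names, the flag, and the encoding are
 * assumptions made for this sketch, not kernel definitions. */
#define MODEL_HWFLAG_TOPDOWN 0x1 /* set only if the PMU accepted the event as topdown */

struct model_event {
        unsigned long long config; /* what user space requested */
        unsigned int hw_flags;     /* what the PMU actually configured */
};

/* "The config asks for topdown" - can be true even on a core whose PMU
 * has no topdown support. */
static bool model_config_is_topdown(const struct model_event *e)
{
        return (e->config & 0xff00ULL) == 0x8000ULL; /* made-up encoding */
}

/* "The event is counted as topdown" - only true when the PMU set it up. */
static bool model_counted_as_topdown(const struct model_event *e)
{
        return e->hw_flags & MODEL_HWFLAG_TOPDOWN;
}

int main(void)
{
        /* A topdown-looking config that landed on a core without the feature. */
        struct model_event e = { .config = 0x8000ULL, .hw_flags = 0 };

        /* Branching on the config alone would call topdown update code that
         * has nothing valid to update; branching on the hw flag does not. */
        printf("config says topdown: %d, counted as topdown: %d\n",
               model_config_is_topdown(&e), model_counted_as_topdown(&e));
        return 0;
}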

include/linux/perf_event.h

Lines changed: 40 additions & 2 deletions
@@ -635,8 +635,46 @@ struct perf_addr_filter_range {
 	unsigned long	size;
 };
 
-/**
- * enum perf_event_state - the states of an event:
+/*
+ * The normal states are:
+ *
+ *            ACTIVE    --.
+ *               ^        |
+ *               |        |
+ *      sched_{in,out}()  |
+ *               |        |
+ *               v        |
+ *     ,---> INACTIVE   --+   <-.
+ *     |                  |      |
+ *     |             {dis,en}able()
+ *  sched_in()            |      |
+ *     |         OFF   <--'    --+
+ *     |                         |
+ *     `---> ERROR      ------'
+ *
+ * That is:
+ *
+ *   sched_in:  INACTIVE          -> {ACTIVE,ERROR}
+ *   sched_out: ACTIVE            -> INACTIVE
+ *   disable:   {ACTIVE,INACTIVE} -> OFF
+ *   enable:    {OFF,ERROR}       -> INACTIVE
+ *
+ * Where {OFF,ERROR} are disabled states.
+ *
+ * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
+ * defunct events:
+ *
+ * - EXIT means task that the event was assigned to died, but child events
+ *   still live, and further children can still be created. But the event
+ *   itself will never be active again. It can only transition to
+ *   {REVOKED,DEAD};
+ *
+ * - REVOKED means the PMU the event was associated with is gone; all
+ *   functionality is stopped but the event is still alive. Can only
+ *   transition to DEAD;
+ *
+ * - DEAD event really is DYING tearing down state and freeing bits.
+ *
  */
 enum perf_event_state {
 	PERF_EVENT_STATE_DEAD		= -5,
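The ASCII diagram reads as a small transition table, which the sketch below restates as compilable C. It is a self-contained model: only PERF_EVENT_STATE_DEAD = -5 is visible in this hunk, so the rest of the numbering and the helper names are assumptions made for illustration.

#include <stdbool.h>

/* Model of the documented states; numbering other than DEAD is assumed. */
enum model_event_state {
        MODEL_STATE_DEAD     = -5,
        MODEL_STATE_REVOKED  = -4,
        MODEL_STATE_EXIT     = -3,
        MODEL_STATE_ERROR    = -2,
        MODEL_STATE_OFF      = -1,
        MODEL_STATE_INACTIVE =  0,
        MODEL_STATE_ACTIVE   =  1,
};

/* The "normal" transitions from the comment:
 *   sched_in:  INACTIVE          -> {ACTIVE,ERROR}
 *   sched_out: ACTIVE            -> INACTIVE
 *   disable:   {ACTIVE,INACTIVE} -> OFF
 *   enable:    {OFF,ERROR}       -> INACTIVE
 */
static bool model_can_sched_in(enum model_event_state s)
{
        return s == MODEL_STATE_INACTIVE;
}

static bool model_can_enable(enum model_event_state s)
{
        return s == MODEL_STATE_OFF || s == MODEL_STATE_ERROR;
}

/* {EXIT,REVOKED,DEAD} are the defunct shades: once an event is here it can
 * only sink further (EXIT -> {REVOKED,DEAD}, REVOKED -> DEAD). */
static bool model_is_defunct(enum model_event_state s)
{
        return s <= MODEL_STATE_EXIT;
}

int main(void)
{
        return (model_can_enable(MODEL_STATE_ERROR) &&
                !model_can_sched_in(MODEL_STATE_OFF) &&
                model_is_defunct(MODEL_STATE_REVOKED)) ? 0 : 1;
}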

kernel/events/core.c

Lines changed: 74 additions & 42 deletions
@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 	__perf_ctx_unlock(&cpuctx->ctx);
 }
 
+typedef struct {
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+				struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
 #define TASK_TOMBSTONE ((void *)-1L)
 
 static bool is_kernel_event(struct perf_event *event)
@@ -944,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task)
 	if (READ_ONCE(cpuctx->cgrp) == cgrp)
 		return;
 
-	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+	guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+	/*
+	 * Re-check, could've raced vs perf_remove_from_context().
+	 */
+	if (READ_ONCE(cpuctx->cgrp) == NULL)
+		return;
+
 	perf_ctx_disable(&cpuctx->ctx, true);
 
 	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@
 	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
 	perf_ctx_enable(&cpuctx->ctx, true);
-	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 static int perf_cgroup_ensure_storage(struct perf_event *event,
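The constructor/destructor pair added at the top of the file is the shape the guard() helper from linux/cleanup.h expects: guard(perf_ctx_lock)(cpuctx, ctx) declares a scope-local object whose destructor drops the lock on every exit path, which is why the early return added to perf_cgroup_switch() needs no explicit unlock and why the trailing perf_ctx_unlock() call could be deleted. Below is a minimal user-space sketch of the same pattern built on the compiler's cleanup attribute; the lock type and macro name are invented for the sketch, not the kernel's.

#include <pthread.h>
#include <stdio.h>

/* Scope-guard object: the constructor takes the lock, the destructor drops it. */
typedef struct {
        pthread_mutex_t *lock;
} class_my_lock_t;

static inline void class_my_lock_destructor(class_my_lock_t *_T)
{ pthread_mutex_unlock(_T->lock); }

static inline class_my_lock_t class_my_lock_constructor(pthread_mutex_t *lock)
{ pthread_mutex_lock(lock); return (class_my_lock_t){ lock }; }

/* Rough stand-in for guard(): a cleanup-attributed local whose destructor
 * runs on every scope exit, including early returns. */
#define my_guard(var, lock)                                                       \
        class_my_lock_t var __attribute__((cleanup(class_my_lock_destructor))) =  \
                class_my_lock_constructor(lock)

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static int shared;

static void update(int v)
{
        my_guard(g, &m);

        if (v < 0)
                return;         /* unlock runs here too */

        shared = v;
}                               /* ...and here */

int main(void)
{
        update(-1);
        update(42);
        printf("%d\n", shared); /* prints 42; both paths released the lock */
        return 0;
}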
@@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader == event)
 		del_event_from_groups(event, ctx);
 
-	/*
-	 * If event was in error state, then keep it
-	 * that way, otherwise bogus counts will be
-	 * returned on read(). The only way to get out
-	 * of error state is by explicit re-enabling
-	 * of the event
-	 */
-	if (event->state > PERF_EVENT_STATE_OFF) {
-		perf_cgroup_event_disable(event, ctx);
-		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
-	}
-
 	ctx->generation++;
 	event->pmu_ctx->nr_events--;
 }
@@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
 }
 
 static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
-			    struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+			    struct perf_event_context *ctx,
+			    enum perf_event_state state);
 
 static void perf_put_aux_event(struct perf_event *event)
 {
@@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
 		 * state so that we don't try to schedule it again. Note
 		 * that perf_event_enable() will clear the ERROR status.
 		 */
-		event_sched_out(iter, ctx);
-		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
 	}
 }
 
@@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
 				    &event->pmu_ctx->flexible_active;
 }
 
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
-	event_sched_out(event, event->ctx);
-	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
 static void perf_group_detach(struct perf_event *event)
 {
 	struct perf_event *leader = event->group_leader;
@@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 
+		/*
+		 * Events that have PERF_EV_CAP_SIBLING require being part of
+		 * a group and cannot exist on their own, schedule them out
+		 * and move them into the ERROR state. Also see
+		 * _perf_event_enable(), it will not be able to recover this
+		 * ERROR state.
+		 */
 		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
-			perf_remove_sibling_event(sibling);
+			__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
 
 		sibling->group_leader = sibling;
 		list_del_init(&sibling->sibling_list);
@@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event,
 		state = PERF_EVENT_STATE_EXIT;
 	if (flags & DETACH_REVOKE)
 		state = PERF_EVENT_STATE_REVOKED;
-	if (flags & DETACH_DEAD) {
-		event->pending_disable = 1;
+	if (flags & DETACH_DEAD)
 		state = PERF_EVENT_STATE_DEAD;
-	}
+
 	event_sched_out(event, ctx);
+
+	if (event->state > PERF_EVENT_STATE_OFF)
+		perf_cgroup_event_disable(event, ctx);
+
 	perf_event_set_state(event, min(event->state, state));
 
 	if (flags & DETACH_GROUP)
@@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
 	event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
+static void __event_disable(struct perf_event *event,
+			    struct perf_event_context *ctx,
+			    enum perf_event_state state)
+{
+	event_sched_out(event, ctx);
+	perf_cgroup_event_disable(event, ctx);
+	perf_event_set_state(event, state);
+}
+
 /*
  * Cross CPU call to disable a performance event
  */
@@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
 	perf_pmu_disable(event->pmu_ctx->pmu);
 	ctx_time_update_event(ctx, event);
 
+	/*
+	 * When disabling a group leader, the whole group becomes ineligible
+	 * to run, so schedule out the full group.
+	 */
 	if (event == event->group_leader)
 		group_sched_out(event, ctx);
-	else
-		event_sched_out(event, ctx);
 
-	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
-	perf_cgroup_event_disable(event, ctx);
+	/*
+	 * But only mark the leader OFF; the siblings will remain
+	 * INACTIVE.
+	 */
+	__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
 
 	perf_pmu_enable(event->pmu_ctx->pmu);
 }
@@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
 
 static void perf_event_throttle(struct perf_event *event)
 {
-	event->pmu->stop(event, 0);
 	event->hw.interrupts = MAX_INTERRUPTS;
+	event->pmu->stop(event, 0);
 	if (event == event->group_leader)
 		perf_log_throttle(event, 0);
 }
@@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
 	if (!regs)
 		return 0;
 
+	/* No mm, no stack, no dump. */
+	if (!current->mm)
+		return 0;
+
 	/*
 	 * Check if we fit in with the requested stack size into the:
 	 * - TASK_SIZE
@@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	const u32 max_stack = event->attr.sample_max_stack;
 	struct perf_callchain_entry *callchain;
 
+	if (!current->mm)
+		user = false;
+
 	if (!kernel && !user)
 		return &__empty_callchain;
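Both this hunk and the perf_sample_ustack_size() one above apply the same guard: once the task's mm is gone there is no user stack to dump and no user frames to walk, so the user-space half of the sample is dropped instead of dereferenced. A self-contained model of that decision follows; the struct and function names are invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

struct model_task {
        void *mm;       /* NULL once the address space has been torn down */
};

struct model_sample {
        bool kernel_callchain;
        bool user_callchain;
        unsigned int user_stack_bytes;
};

/* Mirror of the two checks: no mm, no user stack, no user callchain. */
static void model_prepare_sample(const struct model_task *t, struct model_sample *s,
                                 bool want_kernel, bool want_user,
                                 unsigned int stack_request)
{
        s->kernel_callchain = want_kernel;
        s->user_callchain   = want_user && t->mm != NULL;
        s->user_stack_bytes = t->mm ? stack_request : 0;
}

int main(void)
{
        struct model_task exiting = { .mm = NULL };
        struct model_sample s;

        model_prepare_sample(&exiting, &s, true, true, 8192);
        printf("user callchain: %d, user stack bytes: %u\n",
               s.user_callchain, s.user_stack_bytes);
        return 0;
}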

@@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (is_sampling_event(event)) {
+	/*
+	 * The throttle can be triggered in the hrtimer handler.
+	 * The HRTIMER_NORESTART should be used to stop the timer,
+	 * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+	 */
+	if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 		local64_set(&hwc->period_left, ktime_to_ns(remaining));
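This hunk and the perf_event_throttle() reordering above are two halves of the same fix: the throttle path now sets hw.interrupts = MAX_INTERRUPTS before calling ->stop(), and the hrtimer-backed stop path uses that marker to skip hrtimer cancellation, because the throttle can fire from inside the hrtimer handler itself, where a synchronous cancel would wait on the running handler; the handler instead stops by returning HRTIMER_NORESTART. A self-contained model of that ordering contract, with names invented for the sketch:

#include <stdbool.h>
#include <stdio.h>

#define MODEL_MAX_INTERRUPTS (~0u)

struct model_event {
        unsigned int interrupts;        /* MODEL_MAX_INTERRUPTS == throttled */
        bool cancelled_synchronously;
};

/* Stop path: when stopping because of a throttle (possibly from inside the
 * timer handler itself), skip the synchronous cancel; the handler simply
 * does not re-arm the timer. */
static void model_stop(struct model_event *e)
{
        if (e->interrupts != MODEL_MAX_INTERRUPTS)
                e->cancelled_synchronously = true;      /* ordinary stop */
}

/* Throttle: mark the event *before* calling stop(), so stop() can tell the
 * two cases apart. The old order (stop first, mark second) could not. */
static void model_throttle(struct model_event *e)
{
        e->interrupts = MODEL_MAX_INTERRUPTS;
        model_stop(e);
}

int main(void)
{
        struct model_event e = { .interrupts = 0, .cancelled_synchronously = false };

        model_throttle(&e);
        printf("synchronous cancel on throttle: %d (expected 0)\n",
               e.cancelled_synchronously);
        return 0;
}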

@@ -11804,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
 static void cpu_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
-	cpu_clock_event_update(event);
+	if (flags & PERF_EF_UPDATE)
+		cpu_clock_event_update(event);
 }
 
 static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -11882,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags)
 static void task_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
-	task_clock_event_update(event, event->ctx->time);
+	if (flags & PERF_EF_UPDATE)
+		task_clock_event_update(event, event->ctx->time);
 }
 
 static int task_clock_event_add(struct perf_event *event, int flags)

kernel/exit.c

Lines changed: 9 additions & 8 deletions
@@ -940,6 +940,15 @@ void __noreturn do_exit(long code)
 	taskstats_exit(tsk, group_dead);
 	trace_sched_process_exit(tsk, group_dead);
 
+	/*
+	 * Since sampling can touch ->mm, make sure to stop everything before we
+	 * tear it down.
+	 *
+	 * Also flushes inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_event_exit_task(tsk);
+
 	exit_mm();
 
 	if (group_dead)
@@ -955,14 +964,6 @@
 	exit_task_work(tsk);
 	exit_thread(tsk);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 *
-	 * because of cgroup mode, must be called before cgroup_exit()
-	 */
-	perf_event_exit_task(tsk);
-
 	sched_autogroup_exit_task(tsk);
 	cgroup_exit(tsk);
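The move is an ordering fix: sampling can touch ->mm (see the !current->mm checks added in kernel/events/core.c), so perf_event_exit_task() must quiesce the task's events before exit_mm() tears the address space down; the new, earlier placement still satisfies the old comment's constraint of running before cgroup_exit(). The sketch below restates the rule - stop the observer before freeing what it observes - as a self-contained model with invented names.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct model_task {
        char *mm;               /* stand-in for the userspace mapping */
        int sampling_active;    /* stand-in for live perf events */
};

/* Observer: only safe while t->mm is still around. */
static void model_sample(const struct model_task *t)
{
        if (t->sampling_active && t->mm)
                printf("sampled: %s\n", t->mm);
}

static void model_perf_event_exit_task(struct model_task *t)
{
        t->sampling_active = 0;         /* quiesce sampling first... */
}

static void model_exit_mm(struct model_task *t)
{
        free(t->mm);                    /* ...then tear down what it reads */
        t->mm = NULL;
}

int main(void)
{
        struct model_task t = { .mm = strdup("user mappings"), .sampling_active = 1 };

        model_sample(&t);               /* fine: mm still present */
        model_perf_event_exit_task(&t); /* the ordering from the hunk above */
        model_exit_mm(&t);
        model_sample(&t);               /* no-op instead of a use-after-free */
        return 0;
}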
