Skip to content

Commit a8d757e

Browse files
Stephane Eranian authored and Ingo Molnar committed
perf events: Fix slow and broken cgroup context switch code
The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 <not counted> cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ ./pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 <not counted> cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for the non cgroup event. 
If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar <mingo@elte.hu>
1 parent c6a389f commit a8d757e

3 files changed

Lines changed: 69 additions & 20 deletions

File tree

include/linux/perf_event.h

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -944,8 +944,10 @@ extern void perf_pmu_unregister(struct pmu *pmu);
944944

945945
extern int perf_num_counters(void);
946946
extern const char *perf_pmu_name(void);
947-
extern void __perf_event_task_sched_in(struct task_struct *task);
948-
extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
947+
extern void __perf_event_task_sched_in(struct task_struct *prev,
948+
struct task_struct *task);
949+
extern void __perf_event_task_sched_out(struct task_struct *prev,
950+
struct task_struct *next);
949951
extern int perf_event_init_task(struct task_struct *child);
950952
extern void perf_event_exit_task(struct task_struct *child);
951953
extern void perf_event_free_task(struct task_struct *task);
@@ -1059,17 +1061,20 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
10591061

10601062
extern struct jump_label_key perf_sched_events;
10611063

1062-
static inline void perf_event_task_sched_in(struct task_struct *task)
1064+
static inline void perf_event_task_sched_in(struct task_struct *prev,
1065+
struct task_struct *task)
10631066
{
10641067
if (static_branch(&perf_sched_events))
1065-
__perf_event_task_sched_in(task);
1068+
__perf_event_task_sched_in(prev, task);
10661069
}
10671070

1068-
static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
1071+
static inline void perf_event_task_sched_out(struct task_struct *prev,
1072+
struct task_struct *next)
10691073
{
10701074
perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
10711075

1072-
__perf_event_task_sched_out(task, next);
1076+
if (static_branch(&perf_sched_events))
1077+
__perf_event_task_sched_out(prev, next);
10731078
}
10741079

10751080
extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1139,10 +1144,11 @@ extern void perf_event_disable(struct perf_event *event);
11391144
extern void perf_event_task_tick(void);
11401145
#else
11411146
static inline void
1142-
perf_event_task_sched_in(struct task_struct *task) { }
1147+
perf_event_task_sched_in(struct task_struct *prev,
1148+
struct task_struct *task) { }
11431149
static inline void
1144-
perf_event_task_sched_out(struct task_struct *task,
1145-
struct task_struct *next) { }
1150+
perf_event_task_sched_out(struct task_struct *prev,
1151+
struct task_struct *next) { }
11461152
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
11471153
static inline void perf_event_exit_task(struct task_struct *child) { }
11481154
static inline void perf_event_free_task(struct task_struct *task) { }

kernel/events/core.c

Lines changed: 53 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
399399
local_irq_restore(flags);
400400
}
401401

402-
static inline void perf_cgroup_sched_out(struct task_struct *task)
402+
static inline void perf_cgroup_sched_out(struct task_struct *task,
403+
struct task_struct *next)
403404
{
404-
perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
405+
struct perf_cgroup *cgrp1;
406+
struct perf_cgroup *cgrp2 = NULL;
407+
408+
/*
409+
* we come here when we know perf_cgroup_events > 0
410+
*/
411+
cgrp1 = perf_cgroup_from_task(task);
412+
413+
/*
414+
* next is NULL when called from perf_event_enable_on_exec()
415+
* that will systematically cause a cgroup_switch()
416+
*/
417+
if (next)
418+
cgrp2 = perf_cgroup_from_task(next);
419+
420+
/*
421+
* only schedule out current cgroup events if we know
422+
* that we are switching to a different cgroup. Otherwise,
423+
* do no touch the cgroup events.
424+
*/
425+
if (cgrp1 != cgrp2)
426+
perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
405427
}
406428

407-
static inline void perf_cgroup_sched_in(struct task_struct *task)
429+
static inline void perf_cgroup_sched_in(struct task_struct *prev,
430+
struct task_struct *task)
408431
{
409-
perf_cgroup_switch(task, PERF_CGROUP_SWIN);
432+
struct perf_cgroup *cgrp1;
433+
struct perf_cgroup *cgrp2 = NULL;
434+
435+
/*
436+
* we come here when we know perf_cgroup_events > 0
437+
*/
438+
cgrp1 = perf_cgroup_from_task(task);
439+
440+
/* prev can never be NULL */
441+
cgrp2 = perf_cgroup_from_task(prev);
442+
443+
/*
444+
* only need to schedule in cgroup events if we are changing
445+
* cgroup during ctxsw. Cgroup events were not scheduled
446+
* out of ctxsw out if that was not the case.
447+
*/
448+
if (cgrp1 != cgrp2)
449+
perf_cgroup_switch(task, PERF_CGROUP_SWIN);
410450
}
411451

412452
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
518558
{
519559
}
520560

521-
static inline void perf_cgroup_sched_out(struct task_struct *task)
561+
static inline void perf_cgroup_sched_out(struct task_struct *task,
562+
struct task_struct *next)
522563
{
523564
}
524565

525-
static inline void perf_cgroup_sched_in(struct task_struct *task)
566+
static inline void perf_cgroup_sched_in(struct task_struct *prev,
567+
struct task_struct *task)
526568
{
527569
}
528570

@@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
19882030
* cgroup event are system-wide mode only
19892031
*/
19902032
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1991-
perf_cgroup_sched_out(task);
2033+
perf_cgroup_sched_out(task, next);
19922034
}
19932035

19942036
static void task_ctx_sched_out(struct perf_event_context *ctx)
@@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
21532195
* accessing the event control register. If a NMI hits, then it will
21542196
* keep the event running.
21552197
*/
2156-
void __perf_event_task_sched_in(struct task_struct *task)
2198+
void __perf_event_task_sched_in(struct task_struct *prev,
2199+
struct task_struct *task)
21572200
{
21582201
struct perf_event_context *ctx;
21592202
int ctxn;
@@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
21712214
* cgroup event are system-wide mode only
21722215
*/
21732216
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2174-
perf_cgroup_sched_in(task);
2217+
perf_cgroup_sched_in(prev, task);
21752218
}
21762219

21772220
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
24272470
* ctxswin cgroup events which are already scheduled
24282471
* in.
24292472
*/
2430-
perf_cgroup_sched_out(current);
2473+
perf_cgroup_sched_out(current, NULL);
24312474

24322475
raw_spin_lock(&ctx->lock);
24332476
task_ctx_sched_out(ctx);

kernel/sched.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
30653065
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
30663066
local_irq_disable();
30673067
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3068-
perf_event_task_sched_in(current);
3068+
perf_event_task_sched_in(prev, current);
30693069
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
30703070
local_irq_enable();
30713071
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

0 commit comments

Comments
 (0)