Message ID | 20240801045907.4010984-10-mizhang@google.com (mailing list archive) |
---|---|
State | New, archived |
Series | Mediated Passthrough vPMU 3.0 for x86 |
On 8/1/2024 12:58 PM, Mingwei Zhang wrote: > From: Kan Liang <kan.liang@linux.intel.com> > > Current perf doesn't explicitly schedule out all exclude_guest events > while the guest is running. There is no problem with the current > emulated vPMU. Because perf owns all the PMU counters. It can mask the > counter which is assigned to an exclude_guest event when a guest is > running (Intel way), or set the corresponding HOSTONLY bit in evsentsel > (AMD way). The counter doesn't count when a guest is running. > > However, either way doesn't work with the introduced passthrough vPMU. > A guest owns all the PMU counters when it's running. The host should not > mask any counters. The counter may be used by the guest. The evsentsel > may be overwritten. > > Perf should explicitly schedule out all exclude_guest events to release > the PMU resources when entering a guest, and resume the counting when > exiting the guest. > > It's possible that an exclude_guest event is created when a guest is > running. The new event should not be scheduled in as well. > > The ctx time is shared among different PMUs. The time cannot be stopped > when a guest is running. It is required to calculate the time for events > from other PMUs, e.g., uncore events. Add timeguest to track the guest > run time. For an exclude_guest event, the elapsed time equals > the ctx time - guest time. > Cgroup has dedicated times. Use the same method to deduct the guest time > from the cgroup time as well. > > Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > Signed-off-by: Kan Liang <kan.liang@linux.intel.com> > Signed-off-by: Mingwei Zhang <mizhang@google.com> > --- > include/linux/perf_event.h | 6 ++ > kernel/events/core.c | 178 +++++++++++++++++++++++++++++++------ > 2 files changed, 155 insertions(+), 29 deletions(-) > > diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h > index e22cdb6486e6..81a5f8399cb8 100644 > --- a/include/linux/perf_event.h > +++ b/include/linux/perf_event.h > @@ -952,6 +952,11 @@ struct perf_event_context { > */ > struct perf_time_ctx time; > > + /* > + * Context clock, runs when in the guest mode. > + */ > + struct perf_time_ctx timeguest; > + > /* > * These fields let us detect when two contexts have both > * been cloned (inherited) from a common ancestor. 
> @@ -1044,6 +1049,7 @@ struct bpf_perf_event_data_kern { > */ > struct perf_cgroup_info { > struct perf_time_ctx time; > + struct perf_time_ctx timeguest; > int active; > }; > > diff --git a/kernel/events/core.c b/kernel/events/core.c > index c25e2bf27001..57648736e43e 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -376,7 +376,8 @@ enum event_type_t { > /* see ctx_resched() for details */ > EVENT_CPU = 0x8, > EVENT_CGROUP = 0x10, > - EVENT_FLAGS = EVENT_CGROUP, > + EVENT_GUEST = 0x20, > + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, > EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, > }; > > @@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly; > > static atomic_t nr_mediated_pmu_vms; > static DEFINE_MUTEX(perf_mediated_pmu_mutex); > +static DEFINE_PER_CPU(bool, perf_in_guest); > > /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */ > static inline bool is_include_guest_event(struct perf_event *event) > @@ -706,6 +708,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, > if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) > return true; > > + if ((event_type & EVENT_GUEST) && > + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)) > + return true; > + > return false; > } > > @@ -770,12 +776,21 @@ static inline int is_cgroup_event(struct perf_event *event) > return event->cgrp != NULL; > } > > +static inline u64 __perf_event_time_ctx(struct perf_event *event, > + struct perf_time_ctx *time, > + struct perf_time_ctx *timeguest); > + > +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, > + struct perf_time_ctx *time, > + struct perf_time_ctx *timeguest, > + u64 now); > + > static inline u64 perf_cgroup_event_time(struct perf_event *event) > { > struct perf_cgroup_info *t; > > t = per_cpu_ptr(event->cgrp->info, event->cpu); > - return t->time.time; > + return __perf_event_time_ctx(event, &t->time, &t->timeguest); > } > > static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) > @@ -784,9 +799,9 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) > > t = per_cpu_ptr(event->cgrp->info, event->cpu); > if (!__load_acquire(&t->active)) > - return t->time.time; > - now += READ_ONCE(t->time.offset); > - return now; > + return __perf_event_time_ctx(event, &t->time, &t->timeguest); > + > + return __perf_event_time_ctx_now(event, &t->time, &t->timeguest, now); > } > > static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv); > @@ -796,6 +811,18 @@ static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bo > update_perf_time_ctx(&info->time, now, adv); > } > > +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) > +{ > + update_perf_time_ctx(&info->timeguest, now, adv); > +} > + > +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) > +{ > + __update_cgrp_time(info, now, true); > + if (__this_cpu_read(perf_in_guest)) > + __update_cgrp_guest_time(info, now, true); > +} > + > static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) > { > struct perf_cgroup *cgrp = cpuctx->cgrp; > @@ -809,7 +836,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, > cgrp = container_of(css, struct perf_cgroup, css); > info = this_cpu_ptr(cgrp->info); > > - __update_cgrp_time(info, now, true); > + update_cgrp_time(info, now); > if (final) > __store_release(&info->active, 0); > 
} > @@ -832,11 +859,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) > * Do not update time when cgroup is not active > */ > if (info->active) > - __update_cgrp_time(info, perf_clock(), true); > + update_cgrp_time(info, perf_clock()); > } > > static inline void > -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) > +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) > { > struct perf_event_context *ctx = &cpuctx->ctx; > struct perf_cgroup *cgrp = cpuctx->cgrp; > @@ -856,8 +883,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) > for (css = &cgrp->css; css; css = css->parent) { > cgrp = container_of(css, struct perf_cgroup, css); > info = this_cpu_ptr(cgrp->info); > - __update_cgrp_time(info, ctx->time.stamp, false); > - __store_release(&info->active, 1); > + if (guest) { > + __update_cgrp_guest_time(info, ctx->time.stamp, false); > + } else { > + __update_cgrp_time(info, ctx->time.stamp, false); > + __store_release(&info->active, 1); > + } > } > } > > @@ -1061,7 +1092,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, > } > > static inline void > -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) > +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) > { > } > > @@ -1488,16 +1519,34 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo > */ > static void __update_context_time(struct perf_event_context *ctx, bool adv) > { > - u64 now = perf_clock(); > + lockdep_assert_held(&ctx->lock); > + > + update_perf_time_ctx(&ctx->time, perf_clock(), adv); > +} > > +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) > +{ > lockdep_assert_held(&ctx->lock); > > - update_perf_time_ctx(&ctx->time, now, adv); > + /* must be called after __update_context_time(); */ > + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); > } > > static void update_context_time(struct perf_event_context *ctx) > { > __update_context_time(ctx, true); > + if (__this_cpu_read(perf_in_guest)) > + __update_context_guest_time(ctx, true); > +} > + > +static inline u64 __perf_event_time_ctx(struct perf_event *event, > + struct perf_time_ctx *time, > + struct perf_time_ctx *timeguest) > +{ > + if (event->attr.exclude_guest) > + return time->time - timeguest->time; > + else > + return time->time; > } > > static u64 perf_event_time(struct perf_event *event) > @@ -1510,7 +1559,26 @@ static u64 perf_event_time(struct perf_event *event) > if (is_cgroup_event(event)) > return perf_cgroup_event_time(event); > > - return ctx->time.time; > + return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest); > +} > + > +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, > + struct perf_time_ctx *time, > + struct perf_time_ctx *timeguest, > + u64 now) > +{ > + /* > + * The exclude_guest event time should be calculated from > + * the ctx time - the guest time. > + * The ctx time is now + READ_ONCE(time->offset). > + * The guest time is now + READ_ONCE(timeguest->offset). > + * So the exclude_guest time is > + * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). > + */ > + if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) Hi Kan, we see the following the warning when run perf record command after enabling "CONFIG_DEBUG_PREEMPT" config item. 
[ 166.779208] BUG: using __this_cpu_read() in preemptible [00000000] code: perf/9494 [ 166.779234] caller is __this_cpu_preempt_check+0x13/0x20 [ 166.779241] CPU: 56 UID: 0 PID: 9494 Comm: perf Not tainted 6.11.0-rc4-perf-next-mediated-vpmu-v3+ #80 [ 166.779245] Hardware name: Quanta Cloud Technology Inc. QuantaGrid D54Q-2U/S6Q-MB-MPS, BIOS 3A11.uh 12/02/2022 [ 166.779248] Call Trace: [ 166.779250] <TASK> [ 166.779252] dump_stack_lvl+0x76/0xa0 [ 166.779260] dump_stack+0x10/0x20 [ 166.779267] check_preemption_disabled+0xd7/0xf0 [ 166.779273] __this_cpu_preempt_check+0x13/0x20 [ 166.779279] calc_timer_values+0x193/0x200 [ 166.779287] perf_event_update_userpage+0x4b/0x170 [ 166.779294] ? ring_buffer_attach+0x14c/0x200 [ 166.779301] perf_mmap+0x533/0x5d0 [ 166.779309] mmap_region+0x243/0xaa0 [ 166.779322] do_mmap+0x35b/0x640 [ 166.779333] vm_mmap_pgoff+0xf0/0x1c0 [ 166.779345] ksys_mmap_pgoff+0x17a/0x250 [ 166.779354] __x64_sys_mmap+0x33/0x70 [ 166.779362] x64_sys_call+0x1fa4/0x25f0 [ 166.779369] do_syscall_64+0x70/0x130 The season that kernel complains this is __perf_event_time_ctx_now() calls __this_cpu_read() in preemption enabled context. To eliminate the warning, we may need to use this_cpu_read() to replace __this_cpu_read(). diff --git a/kernel/events/core.c b/kernel/events/core.c index ccd61fd06e8d..1eb628f8b3a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1581,7 +1581,7 @@ static inline u64 __perf_event_time_ctx_now(struct perf_event *event, * So the exclude_guest time is * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). */ - if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) + if (event->attr.exclude_guest && this_cpu_read(perf_in_guest)) return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); else return now + READ_ONCE(time->offset); > + return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); > + else > + return now + READ_ONCE(time->offset); > } > > static u64 perf_event_time_now(struct perf_event *event, u64 now) > @@ -1524,10 +1592,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) > return perf_cgroup_event_time_now(event, now); > > if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) > - return ctx->time.time; > + return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest); > > - now += READ_ONCE(ctx->time.offset); > - return now; > + return __perf_event_time_ctx_now(event, &ctx->time, &ctx->timeguest, now); > } > > static enum event_type_t get_event_type(struct perf_event *event) > @@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) > * would only update time for the pinned events. > */ > if (is_active & EVENT_TIME) { > + bool stop; > + > + /* vPMU should not stop time */ > + stop = !(event_type & EVENT_GUEST) && > + ctx == &cpuctx->ctx; > + > /* update (and stop) ctx time */ > update_context_time(ctx); > - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); > + update_cgrp_time_from_cpuctx(cpuctx, stop); > /* > * CPU-release for the below ->is_active store, > * see __load_acquire() in perf_event_time_now() > @@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) > cpuctx->task_ctx = NULL; > } > > - is_active ^= ctx->is_active; /* changed bits */ > + if (event_type & EVENT_GUEST) { > + /* > + * Schedule out all !exclude_guest events of PMU > + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. 
> + */ > + is_active = EVENT_ALL; > + __update_context_guest_time(ctx, false); > + perf_cgroup_set_timestamp(cpuctx, true); > + barrier(); > + } else { > + is_active ^= ctx->is_active; /* changed bits */ > + } > > list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { > if (perf_skip_pmu_ctx(pmu_ctx, event_type)) > @@ -3853,10 +3937,15 @@ static inline void group_update_userpage(struct perf_event *group_event) > event_update_userpage(event); > } > > +struct merge_sched_data { > + int can_add_hw; > + enum event_type_t event_type; > +}; > + > static int merge_sched_in(struct perf_event *event, void *data) > { > struct perf_event_context *ctx = event->ctx; > - int *can_add_hw = data; > + struct merge_sched_data *msd = data; > > if (event->state <= PERF_EVENT_STATE_OFF) > return 0; > @@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data) > if (!event_filter_match(event)) > return 0; > > - if (group_can_go_on(event, *can_add_hw)) { > + /* > + * Don't schedule in any exclude_guest events of PMU with > + * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running. > + */ > + if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest && > + event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU && > + !(msd->event_type & EVENT_GUEST)) > + return 0; > + > + if (group_can_go_on(event, msd->can_add_hw)) { > if (!group_sched_in(event, ctx)) > list_add_tail(&event->active_list, get_event_list(event)); > } > > if (event->state == PERF_EVENT_STATE_INACTIVE) { > - *can_add_hw = 0; > + msd->can_add_hw = 0; > if (event->attr.pinned) { > perf_cgroup_event_disable(event, ctx); > perf_event_set_state(event, PERF_EVENT_STATE_ERROR); > @@ -3889,11 +3987,15 @@ static int merge_sched_in(struct perf_event *event, void *data) > > static void pmu_groups_sched_in(struct perf_event_context *ctx, > struct perf_event_groups *groups, > - struct pmu *pmu) > + struct pmu *pmu, > + enum event_type_t event_type) > { > - int can_add_hw = 1; > + struct merge_sched_data msd = { > + .can_add_hw = 1, > + .event_type = event_type, > + }; > visit_groups_merge(ctx, groups, smp_processor_id(), pmu, > - merge_sched_in, &can_add_hw); > + merge_sched_in, &msd); > } > > static void ctx_groups_sched_in(struct perf_event_context *ctx, > @@ -3905,14 +4007,14 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx, > list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { > if (perf_skip_pmu_ctx(pmu_ctx, event_type)) > continue; > - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); > + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type); > } > } > > static void __pmu_ctx_sched_in(struct perf_event_context *ctx, > struct pmu *pmu) > { > - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); > + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0); > } > > static void > @@ -3927,9 +4029,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) > return; > > if (!(is_active & EVENT_TIME)) { > + /* EVENT_TIME should be active while the guest runs */ > + WARN_ON_ONCE(event_type & EVENT_GUEST); > /* start ctx time */ > __update_context_time(ctx, false); > - perf_cgroup_set_timestamp(cpuctx); > + perf_cgroup_set_timestamp(cpuctx, false); > /* > * CPU-release for the below ->is_active store, > * see __load_acquire() in perf_event_time_now() > @@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) > WARN_ON_ONCE(cpuctx->task_ctx != ctx); > } > > - is_active ^= ctx->is_active; /* changed bits */ > + if 
(event_type & EVENT_GUEST) { > + /* > + * Schedule in all !exclude_guest events of PMU > + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. > + */ > + is_active = EVENT_ALL; > + > + /* > + * Update ctx time to set the new start time for > + * the exclude_guest events. > + */ > + update_context_time(ctx); > + update_cgrp_time_from_cpuctx(cpuctx, false); > + barrier(); > + } else { > + is_active ^= ctx->is_active; /* changed bits */ > + } > > /* > * First go through the list and put on any pinned groups
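For readers skimming the long quote above, the core of the new timeguest bookkeeping is simply "exclude_guest time = ctx time - guest time". Below is a tiny standalone sketch of that accounting (plain userspace C; the struct and helper names are made up to mirror the patch, this is not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Loosely mirrors struct perf_time_ctx; only the accumulated time matters here. */
struct time_ctx {
	uint64_t time;
};

/*
 * Mirrors __perf_event_time_ctx() from the patch: an exclude_guest event
 * is only credited with ctx time that was not spent in the guest. For a
 * running event the kernel uses the equivalent identity
 * (now + time.offset) - (now + timeguest.offset) = time.offset - timeguest.offset.
 */
static uint64_t event_time(int exclude_guest,
			   const struct time_ctx *time,
			   const struct time_ctx *timeguest)
{
	if (exclude_guest)
		return time->time - timeguest->time;
	return time->time;
}

int main(void)
{
	struct time_ctx ctx   = { .time = 100 };	/* total ctx time */
	struct time_ctx guest = { .time = 30 };		/* time spent in the guest */

	printf("include_guest event: %llu\n",
	       (unsigned long long)event_time(0, &ctx, &guest));	/* 100 */
	printf("exclude_guest event: %llu\n",
	       (unsigned long long)event_time(1, &ctx, &guest));	/* 70 */
	return 0;
}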
On 2024-08-21 1:27 a.m., Mi, Dapeng wrote: > > On 8/1/2024 12:58 PM, Mingwei Zhang wrote: >> From: Kan Liang <kan.liang@linux.intel.com> >> >> Current perf doesn't explicitly schedule out all exclude_guest events >> while the guest is running. There is no problem with the current >> emulated vPMU. Because perf owns all the PMU counters. It can mask the >> counter which is assigned to an exclude_guest event when a guest is >> running (Intel way), or set the corresponding HOSTONLY bit in evsentsel >> (AMD way). The counter doesn't count when a guest is running. >> >> However, either way doesn't work with the introduced passthrough vPMU. >> A guest owns all the PMU counters when it's running. The host should not >> mask any counters. The counter may be used by the guest. The evsentsel >> may be overwritten. >> >> Perf should explicitly schedule out all exclude_guest events to release >> the PMU resources when entering a guest, and resume the counting when >> exiting the guest. >> >> It's possible that an exclude_guest event is created when a guest is >> running. The new event should not be scheduled in as well. >> >> The ctx time is shared among different PMUs. The time cannot be stopped >> when a guest is running. It is required to calculate the time for events >> from other PMUs, e.g., uncore events. Add timeguest to track the guest >> run time. For an exclude_guest event, the elapsed time equals >> the ctx time - guest time. >> Cgroup has dedicated times. Use the same method to deduct the guest time >> from the cgroup time as well. >> >> Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org> >> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> >> Signed-off-by: Kan Liang <kan.liang@linux.intel.com> >> Signed-off-by: Mingwei Zhang <mizhang@google.com> >> --- >> include/linux/perf_event.h | 6 ++ >> kernel/events/core.c | 178 +++++++++++++++++++++++++++++++------ >> 2 files changed, 155 insertions(+), 29 deletions(-) >> >> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h >> index e22cdb6486e6..81a5f8399cb8 100644 >> --- a/include/linux/perf_event.h >> +++ b/include/linux/perf_event.h >> @@ -952,6 +952,11 @@ struct perf_event_context { >> */ >> struct perf_time_ctx time; >> >> + /* >> + * Context clock, runs when in the guest mode. >> + */ >> + struct perf_time_ctx timeguest; >> + >> /* >> * These fields let us detect when two contexts have both >> * been cloned (inherited) from a common ancestor. 
>> @@ -1044,6 +1049,7 @@ struct bpf_perf_event_data_kern { >> */ >> struct perf_cgroup_info { >> struct perf_time_ctx time; >> + struct perf_time_ctx timeguest; >> int active; >> }; >> >> diff --git a/kernel/events/core.c b/kernel/events/core.c >> index c25e2bf27001..57648736e43e 100644 >> --- a/kernel/events/core.c >> +++ b/kernel/events/core.c >> @@ -376,7 +376,8 @@ enum event_type_t { >> /* see ctx_resched() for details */ >> EVENT_CPU = 0x8, >> EVENT_CGROUP = 0x10, >> - EVENT_FLAGS = EVENT_CGROUP, >> + EVENT_GUEST = 0x20, >> + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, >> EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, >> }; >> >> @@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly; >> >> static atomic_t nr_mediated_pmu_vms; >> static DEFINE_MUTEX(perf_mediated_pmu_mutex); >> +static DEFINE_PER_CPU(bool, perf_in_guest); >> >> /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */ >> static inline bool is_include_guest_event(struct perf_event *event) >> @@ -706,6 +708,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, >> if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) >> return true; >> >> + if ((event_type & EVENT_GUEST) && >> + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)) >> + return true; >> + >> return false; >> } >> >> @@ -770,12 +776,21 @@ static inline int is_cgroup_event(struct perf_event *event) >> return event->cgrp != NULL; >> } >> >> +static inline u64 __perf_event_time_ctx(struct perf_event *event, >> + struct perf_time_ctx *time, >> + struct perf_time_ctx *timeguest); >> + >> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, >> + struct perf_time_ctx *time, >> + struct perf_time_ctx *timeguest, >> + u64 now); >> + >> static inline u64 perf_cgroup_event_time(struct perf_event *event) >> { >> struct perf_cgroup_info *t; >> >> t = per_cpu_ptr(event->cgrp->info, event->cpu); >> - return t->time.time; >> + return __perf_event_time_ctx(event, &t->time, &t->timeguest); >> } >> >> static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) >> @@ -784,9 +799,9 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) >> >> t = per_cpu_ptr(event->cgrp->info, event->cpu); >> if (!__load_acquire(&t->active)) >> - return t->time.time; >> - now += READ_ONCE(t->time.offset); >> - return now; >> + return __perf_event_time_ctx(event, &t->time, &t->timeguest); >> + >> + return __perf_event_time_ctx_now(event, &t->time, &t->timeguest, now); >> } >> >> static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv); >> @@ -796,6 +811,18 @@ static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bo >> update_perf_time_ctx(&info->time, now, adv); >> } >> >> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) >> +{ >> + update_perf_time_ctx(&info->timeguest, now, adv); >> +} >> + >> +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) >> +{ >> + __update_cgrp_time(info, now, true); >> + if (__this_cpu_read(perf_in_guest)) >> + __update_cgrp_guest_time(info, now, true); >> +} >> + >> static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) >> { >> struct perf_cgroup *cgrp = cpuctx->cgrp; >> @@ -809,7 +836,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, >> cgrp = container_of(css, struct perf_cgroup, css); >> info = this_cpu_ptr(cgrp->info); >> >> - 
__update_cgrp_time(info, now, true); >> + update_cgrp_time(info, now); >> if (final) >> __store_release(&info->active, 0); >> } >> @@ -832,11 +859,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) >> * Do not update time when cgroup is not active >> */ >> if (info->active) >> - __update_cgrp_time(info, perf_clock(), true); >> + update_cgrp_time(info, perf_clock()); >> } >> >> static inline void >> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) >> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) >> { >> struct perf_event_context *ctx = &cpuctx->ctx; >> struct perf_cgroup *cgrp = cpuctx->cgrp; >> @@ -856,8 +883,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) >> for (css = &cgrp->css; css; css = css->parent) { >> cgrp = container_of(css, struct perf_cgroup, css); >> info = this_cpu_ptr(cgrp->info); >> - __update_cgrp_time(info, ctx->time.stamp, false); >> - __store_release(&info->active, 1); >> + if (guest) { >> + __update_cgrp_guest_time(info, ctx->time.stamp, false); >> + } else { >> + __update_cgrp_time(info, ctx->time.stamp, false); >> + __store_release(&info->active, 1); >> + } >> } >> } >> >> @@ -1061,7 +1092,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, >> } >> >> static inline void >> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) >> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) >> { >> } >> >> @@ -1488,16 +1519,34 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo >> */ >> static void __update_context_time(struct perf_event_context *ctx, bool adv) >> { >> - u64 now = perf_clock(); >> + lockdep_assert_held(&ctx->lock); >> + >> + update_perf_time_ctx(&ctx->time, perf_clock(), adv); >> +} >> >> +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) >> +{ >> lockdep_assert_held(&ctx->lock); >> >> - update_perf_time_ctx(&ctx->time, now, adv); >> + /* must be called after __update_context_time(); */ >> + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); >> } >> >> static void update_context_time(struct perf_event_context *ctx) >> { >> __update_context_time(ctx, true); >> + if (__this_cpu_read(perf_in_guest)) >> + __update_context_guest_time(ctx, true); >> +} >> + >> +static inline u64 __perf_event_time_ctx(struct perf_event *event, >> + struct perf_time_ctx *time, >> + struct perf_time_ctx *timeguest) >> +{ >> + if (event->attr.exclude_guest) >> + return time->time - timeguest->time; >> + else >> + return time->time; >> } >> >> static u64 perf_event_time(struct perf_event *event) >> @@ -1510,7 +1559,26 @@ static u64 perf_event_time(struct perf_event *event) >> if (is_cgroup_event(event)) >> return perf_cgroup_event_time(event); >> >> - return ctx->time.time; >> + return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest); >> +} >> + >> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, >> + struct perf_time_ctx *time, >> + struct perf_time_ctx *timeguest, >> + u64 now) >> +{ >> + /* >> + * The exclude_guest event time should be calculated from >> + * the ctx time - the guest time. >> + * The ctx time is now + READ_ONCE(time->offset). >> + * The guest time is now + READ_ONCE(timeguest->offset). >> + * So the exclude_guest time is >> + * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). 
>> + */ >> + if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) > > Hi Kan, > > we see the following the warning when run perf record command after > enabling "CONFIG_DEBUG_PREEMPT" config item. > > [ 166.779208] BUG: using __this_cpu_read() in preemptible [00000000] code: > perf/9494 > [ 166.779234] caller is __this_cpu_preempt_check+0x13/0x20 > [ 166.779241] CPU: 56 UID: 0 PID: 9494 Comm: perf Not tainted > 6.11.0-rc4-perf-next-mediated-vpmu-v3+ #80 > [ 166.779245] Hardware name: Quanta Cloud Technology Inc. QuantaGrid > D54Q-2U/S6Q-MB-MPS, BIOS 3A11.uh 12/02/2022 > [ 166.779248] Call Trace: > [ 166.779250] <TASK> > [ 166.779252] dump_stack_lvl+0x76/0xa0 > [ 166.779260] dump_stack+0x10/0x20 > [ 166.779267] check_preemption_disabled+0xd7/0xf0 > [ 166.779273] __this_cpu_preempt_check+0x13/0x20 > [ 166.779279] calc_timer_values+0x193/0x200 > [ 166.779287] perf_event_update_userpage+0x4b/0x170 > [ 166.779294] ? ring_buffer_attach+0x14c/0x200 > [ 166.779301] perf_mmap+0x533/0x5d0 > [ 166.779309] mmap_region+0x243/0xaa0 > [ 166.779322] do_mmap+0x35b/0x640 > [ 166.779333] vm_mmap_pgoff+0xf0/0x1c0 > [ 166.779345] ksys_mmap_pgoff+0x17a/0x250 > [ 166.779354] __x64_sys_mmap+0x33/0x70 > [ 166.779362] x64_sys_call+0x1fa4/0x25f0 > [ 166.779369] do_syscall_64+0x70/0x130 > > The season that kernel complains this is __perf_event_time_ctx_now() calls > __this_cpu_read() in preemption enabled context. > > To eliminate the warning, we may need to use this_cpu_read() to replace > __this_cpu_read(). Sure. Besides this, we recently update the time related code in perf. https://lore.kernel.org/lkml/172311312757.2215.323044538405607858.tip-bot2@tip-bot2/ This patch probably have to be rebased on top of it. Thanks, Kan > > diff --git a/kernel/events/core.c b/kernel/events/core.c > index ccd61fd06e8d..1eb628f8b3a0 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -1581,7 +1581,7 @@ static inline u64 __perf_event_time_ctx_now(struct > perf_event *event, > * So the exclude_guest time is > * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). > */ > - if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) > + if (event->attr.exclude_guest && this_cpu_read(perf_in_guest)) > return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); > else > return now + READ_ONCE(time->offset); > >> + return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); >> + else >> + return now + READ_ONCE(time->offset); >> } >> >> static u64 perf_event_time_now(struct perf_event *event, u64 now) >> @@ -1524,10 +1592,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) >> return perf_cgroup_event_time_now(event, now); >> >> if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) >> - return ctx->time.time; >> + return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest); >> >> - now += READ_ONCE(ctx->time.offset); >> - return now; >> + return __perf_event_time_ctx_now(event, &ctx->time, &ctx->timeguest, now); >> } >> >> static enum event_type_t get_event_type(struct perf_event *event) >> @@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) >> * would only update time for the pinned events. 
>> */ >> if (is_active & EVENT_TIME) { >> + bool stop; >> + >> + /* vPMU should not stop time */ >> + stop = !(event_type & EVENT_GUEST) && >> + ctx == &cpuctx->ctx; >> + >> /* update (and stop) ctx time */ >> update_context_time(ctx); >> - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); >> + update_cgrp_time_from_cpuctx(cpuctx, stop); >> /* >> * CPU-release for the below ->is_active store, >> * see __load_acquire() in perf_event_time_now() >> @@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) >> cpuctx->task_ctx = NULL; >> } >> >> - is_active ^= ctx->is_active; /* changed bits */ >> + if (event_type & EVENT_GUEST) { >> + /* >> + * Schedule out all !exclude_guest events of PMU >> + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. >> + */ >> + is_active = EVENT_ALL; >> + __update_context_guest_time(ctx, false); >> + perf_cgroup_set_timestamp(cpuctx, true); >> + barrier(); >> + } else { >> + is_active ^= ctx->is_active; /* changed bits */ >> + } >> >> list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { >> if (perf_skip_pmu_ctx(pmu_ctx, event_type)) >> @@ -3853,10 +3937,15 @@ static inline void group_update_userpage(struct perf_event *group_event) >> event_update_userpage(event); >> } >> >> +struct merge_sched_data { >> + int can_add_hw; >> + enum event_type_t event_type; >> +}; >> + >> static int merge_sched_in(struct perf_event *event, void *data) >> { >> struct perf_event_context *ctx = event->ctx; >> - int *can_add_hw = data; >> + struct merge_sched_data *msd = data; >> >> if (event->state <= PERF_EVENT_STATE_OFF) >> return 0; >> @@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data) >> if (!event_filter_match(event)) >> return 0; >> >> - if (group_can_go_on(event, *can_add_hw)) { >> + /* >> + * Don't schedule in any exclude_guest events of PMU with >> + * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running. 
>> + */ >> + if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest && >> + event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU && >> + !(msd->event_type & EVENT_GUEST)) >> + return 0; >> + >> + if (group_can_go_on(event, msd->can_add_hw)) { >> if (!group_sched_in(event, ctx)) >> list_add_tail(&event->active_list, get_event_list(event)); >> } >> >> if (event->state == PERF_EVENT_STATE_INACTIVE) { >> - *can_add_hw = 0; >> + msd->can_add_hw = 0; >> if (event->attr.pinned) { >> perf_cgroup_event_disable(event, ctx); >> perf_event_set_state(event, PERF_EVENT_STATE_ERROR); >> @@ -3889,11 +3987,15 @@ static int merge_sched_in(struct perf_event *event, void *data) >> >> static void pmu_groups_sched_in(struct perf_event_context *ctx, >> struct perf_event_groups *groups, >> - struct pmu *pmu) >> + struct pmu *pmu, >> + enum event_type_t event_type) >> { >> - int can_add_hw = 1; >> + struct merge_sched_data msd = { >> + .can_add_hw = 1, >> + .event_type = event_type, >> + }; >> visit_groups_merge(ctx, groups, smp_processor_id(), pmu, >> - merge_sched_in, &can_add_hw); >> + merge_sched_in, &msd); >> } >> >> static void ctx_groups_sched_in(struct perf_event_context *ctx, >> @@ -3905,14 +4007,14 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx, >> list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { >> if (perf_skip_pmu_ctx(pmu_ctx, event_type)) >> continue; >> - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); >> + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type); >> } >> } >> >> static void __pmu_ctx_sched_in(struct perf_event_context *ctx, >> struct pmu *pmu) >> { >> - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); >> + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0); >> } >> >> static void >> @@ -3927,9 +4029,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) >> return; >> >> if (!(is_active & EVENT_TIME)) { >> + /* EVENT_TIME should be active while the guest runs */ >> + WARN_ON_ONCE(event_type & EVENT_GUEST); >> /* start ctx time */ >> __update_context_time(ctx, false); >> - perf_cgroup_set_timestamp(cpuctx); >> + perf_cgroup_set_timestamp(cpuctx, false); >> /* >> * CPU-release for the below ->is_active store, >> * see __load_acquire() in perf_event_time_now() >> @@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) >> WARN_ON_ONCE(cpuctx->task_ctx != ctx); >> } >> >> - is_active ^= ctx->is_active; /* changed bits */ >> + if (event_type & EVENT_GUEST) { >> + /* >> + * Schedule in all !exclude_guest events of PMU >> + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. >> + */ >> + is_active = EVENT_ALL; >> + >> + /* >> + * Update ctx time to set the new start time for >> + * the exclude_guest events. >> + */ >> + update_context_time(ctx); >> + update_cgrp_time_from_cpuctx(cpuctx, false); >> + barrier(); >> + } else { >> + is_active ^= ctx->is_active; /* changed bits */ >> + } >> >> /* >> * First go through the list and put on any pinned groups >
On Wed, Aug 21, 2024 at 01:27:17PM +0800, Mi, Dapeng wrote: > On 8/1/2024 12:58 PM, Mingwei Zhang wrote: > > +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, > > + struct perf_time_ctx *time, > > + struct perf_time_ctx *timeguest, > > + u64 now) > > +{ > > + /* > > + * The exclude_guest event time should be calculated from > > + * the ctx time - the guest time. > > + * The ctx time is now + READ_ONCE(time->offset). > > + * The guest time is now + READ_ONCE(timeguest->offset). > > + * So the exclude_guest time is > > + * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). > > + */ > > + if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) > > Hi Kan, > > we see the following the warning when run perf record command after > enabling "CONFIG_DEBUG_PREEMPT" config item. > > [ 166.779208] BUG: using __this_cpu_read() in preemptible [00000000] code: > perf/9494 > [ 166.779234] caller is __this_cpu_preempt_check+0x13/0x20 > [ 166.779241] CPU: 56 UID: 0 PID: 9494 Comm: perf Not tainted > 6.11.0-rc4-perf-next-mediated-vpmu-v3+ #80 > [ 166.779245] Hardware name: Quanta Cloud Technology Inc. QuantaGrid > D54Q-2U/S6Q-MB-MPS, BIOS 3A11.uh 12/02/2022 > [ 166.779248] Call Trace: > [ 166.779250] <TASK> > [ 166.779252] dump_stack_lvl+0x76/0xa0 > [ 166.779260] dump_stack+0x10/0x20 > [ 166.779267] check_preemption_disabled+0xd7/0xf0 > [ 166.779273] __this_cpu_preempt_check+0x13/0x20 > [ 166.779279] calc_timer_values+0x193/0x200 > [ 166.779287] perf_event_update_userpage+0x4b/0x170 > [ 166.779294] ? ring_buffer_attach+0x14c/0x200 > [ 166.779301] perf_mmap+0x533/0x5d0 > [ 166.779309] mmap_region+0x243/0xaa0 > [ 166.779322] do_mmap+0x35b/0x640 > [ 166.779333] vm_mmap_pgoff+0xf0/0x1c0 > [ 166.779345] ksys_mmap_pgoff+0x17a/0x250 > [ 166.779354] __x64_sys_mmap+0x33/0x70 > [ 166.779362] x64_sys_call+0x1fa4/0x25f0 > [ 166.779369] do_syscall_64+0x70/0x130 > > The season that kernel complains this is __perf_event_time_ctx_now() calls > __this_cpu_read() in preemption enabled context. > > To eliminate the warning, we may need to use this_cpu_read() to replace > __this_cpu_read(). > > diff --git a/kernel/events/core.c b/kernel/events/core.c > index ccd61fd06e8d..1eb628f8b3a0 100644 > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -1581,7 +1581,7 @@ static inline u64 __perf_event_time_ctx_now(struct > perf_event *event, > * So the exclude_guest time is > * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). > */ > - if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) > + if (event->attr.exclude_guest && this_cpu_read(perf_in_guest)) > return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); > else > return now + READ_ONCE(time->offset); The saner fix is moving the preempt_disable() in perf_event_update_userpage() up a few lines, no?
On 2024-10-11 7:41 a.m., Peter Zijlstra wrote: > On Wed, Aug 21, 2024 at 01:27:17PM +0800, Mi, Dapeng wrote: >> On 8/1/2024 12:58 PM, Mingwei Zhang wrote: > >>> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, >>> + struct perf_time_ctx *time, >>> + struct perf_time_ctx *timeguest, >>> + u64 now) >>> +{ >>> + /* >>> + * The exclude_guest event time should be calculated from >>> + * the ctx time - the guest time. >>> + * The ctx time is now + READ_ONCE(time->offset). >>> + * The guest time is now + READ_ONCE(timeguest->offset). >>> + * So the exclude_guest time is >>> + * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). >>> + */ >>> + if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) >> >> Hi Kan, >> >> we see the following the warning when run perf record command after >> enabling "CONFIG_DEBUG_PREEMPT" config item. >> >> [ 166.779208] BUG: using __this_cpu_read() in preemptible [00000000] code: >> perf/9494 >> [ 166.779234] caller is __this_cpu_preempt_check+0x13/0x20 >> [ 166.779241] CPU: 56 UID: 0 PID: 9494 Comm: perf Not tainted >> 6.11.0-rc4-perf-next-mediated-vpmu-v3+ #80 >> [ 166.779245] Hardware name: Quanta Cloud Technology Inc. QuantaGrid >> D54Q-2U/S6Q-MB-MPS, BIOS 3A11.uh 12/02/2022 >> [ 166.779248] Call Trace: >> [ 166.779250] <TASK> >> [ 166.779252] dump_stack_lvl+0x76/0xa0 >> [ 166.779260] dump_stack+0x10/0x20 >> [ 166.779267] check_preemption_disabled+0xd7/0xf0 >> [ 166.779273] __this_cpu_preempt_check+0x13/0x20 >> [ 166.779279] calc_timer_values+0x193/0x200 >> [ 166.779287] perf_event_update_userpage+0x4b/0x170 >> [ 166.779294] ? ring_buffer_attach+0x14c/0x200 >> [ 166.779301] perf_mmap+0x533/0x5d0 >> [ 166.779309] mmap_region+0x243/0xaa0 >> [ 166.779322] do_mmap+0x35b/0x640 >> [ 166.779333] vm_mmap_pgoff+0xf0/0x1c0 >> [ 166.779345] ksys_mmap_pgoff+0x17a/0x250 >> [ 166.779354] __x64_sys_mmap+0x33/0x70 >> [ 166.779362] x64_sys_call+0x1fa4/0x25f0 >> [ 166.779369] do_syscall_64+0x70/0x130 >> >> The season that kernel complains this is __perf_event_time_ctx_now() calls >> __this_cpu_read() in preemption enabled context. >> >> To eliminate the warning, we may need to use this_cpu_read() to replace >> __this_cpu_read(). >> >> diff --git a/kernel/events/core.c b/kernel/events/core.c >> index ccd61fd06e8d..1eb628f8b3a0 100644 >> --- a/kernel/events/core.c >> +++ b/kernel/events/core.c >> @@ -1581,7 +1581,7 @@ static inline u64 __perf_event_time_ctx_now(struct >> perf_event *event, >> * So the exclude_guest time is >> * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset). >> */ >> - if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) >> + if (event->attr.exclude_guest && this_cpu_read(perf_in_guest)) >> return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset); >> else >> return now + READ_ONCE(time->offset); > > The saner fix is moving the preempt_disable() in > perf_event_update_userpage() up a few lines, no? Yes, the preempt_disable() is to guarantee consistent time stamps. It makes sense to include the time calculation part in the preempt_disable(). Since the time-related code was updated recently, I will fold all the suggestions into the newly re-based code. Thanks, Kan
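For context, a rough sketch of the reordering Peter suggests, based on the general shape of perf_event_update_userpage(); the userpage field updates are elided and the change that actually gets folded into the rebased series may differ:

/*
 * Abbreviated sketch, not the merged patch: pull preempt_disable() up so
 * that calc_timer_values() -- and therefore the __this_cpu_read(perf_in_guest)
 * reached from the time helpers -- runs with preemption disabled, and the
 * time stamps written to the user page stay consistent with one CPU.
 */
void perf_event_update_userpage(struct perf_event *event)
{
	struct perf_event_mmap_page *userpg;
	struct perf_buffer *rb;
	u64 enabled, running, now;

	rcu_read_lock();
	rb = rcu_dereference(event->rb);
	if (!rb)
		goto unlock;

	preempt_disable();			/* moved up, per the suggestion */
	calc_timer_values(event, &now, &enabled, &running);

	userpg = rb->user_page;
	++userpg->lock;
	barrier();

	/* ... fill in userpg->index/offset/time_enabled/time_running ... */

	barrier();
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}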
Can you rework this one along these lines? --- --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -975,6 +975,7 @@ struct perf_event_context { * Context clock, runs when context enabled. */ struct perf_time_ctx time; + struct perf_time_ctx timeguest; /* * These fields let us detect when two contexts have both @@ -1066,6 +1067,7 @@ struct bpf_perf_event_data_kern { */ struct perf_cgroup_info { struct perf_time_ctx time; + struct perf_time_ctx timeguest; int active; }; --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -782,12 +782,44 @@ static inline int is_cgroup_event(struct return event->cgrp != NULL; } +static_assert(offsetof(struct perf_event, timeguest) - + offsetof(struct perf_event, time) == + sizeof(struct perf_time_ctx)); + +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[0].time; + if (event->attr.exclude_guest) + time -= times[1].time; + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (event->attr.exclude_guest) { + /* + * (now + times[0].offset) - (now + times[1].offset) := + * times[0].offset - times[1].offset + */ + return READ_ONCE(times[0].offset) - READ_ONCE(times[1].offset); + } + + return now + READ_ONCE(times[0].offset); +} + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time.time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -796,12 +828,12 @@ static inline u64 perf_cgroup_event_time t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time.time; + return __perf_event_time_ctx(event, &t->time); now += READ_ONCE(t->time.offset); - return now; + return __perf_event_time_ctx_now(event, &t->time, now); } -static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +static inline void __update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) { if (adv) time->time += now - time->stamp; @@ -819,6 +851,13 @@ static inline void update_perf_time_ctx( WRITE_ONCE(time->offset, time->time - time->stamp); } +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + __update_perf_time_ctx(time + 0, now, adv); + if (__this_cpu_read(perf_in_guest)) + __update_perf_time_ctx(time + 1, now, adv) +} + static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp;
On 2024-10-11 2:42 p.m., Peter Zijlstra wrote: > > Can you rework this one along these lines? Sure. I probably also add macros to replace the magic number 0 and 1. For example, #define T_TOTAL 0 #define T_GUEST 1 Thanks, Kan > > --- > --- a/include/linux/perf_event.h > +++ b/include/linux/perf_event.h > @@ -975,6 +975,7 @@ struct perf_event_context { > * Context clock, runs when context enabled. > */ > struct perf_time_ctx time; > + struct perf_time_ctx timeguest; > > /* > * These fields let us detect when two contexts have both > @@ -1066,6 +1067,7 @@ struct bpf_perf_event_data_kern { > */ > struct perf_cgroup_info { > struct perf_time_ctx time; > + struct perf_time_ctx timeguest; > int active; > }; > > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -782,12 +782,44 @@ static inline int is_cgroup_event(struct > return event->cgrp != NULL; > } > > +static_assert(offsetof(struct perf_event, timeguest) - > + offsetof(struct perf_event, time) == > + sizeof(struct perf_time_ctx)); > + > +static_assert(offsetof(struct perf_cgroup_info, timeguest) - > + offsetof(struct perf_cgroup_info, time) == > + sizeof(struct perf_time_ctx)); > + > +static inline u64 __perf_event_time_ctx(struct perf_event *event, > + struct perf_time_ctx *times) > +{ > + u64 time = times[0].time; > + if (event->attr.exclude_guest) > + time -= times[1].time; > + return time; > +} > + > +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, > + struct perf_time_ctx *times, > + u64 now) > +{ > + if (event->attr.exclude_guest) { > + /* > + * (now + times[0].offset) - (now + times[1].offset) := > + * times[0].offset - times[1].offset > + */ > + return READ_ONCE(times[0].offset) - READ_ONCE(times[1].offset); > + } > + > + return now + READ_ONCE(times[0].offset); > +} > + > static inline u64 perf_cgroup_event_time(struct perf_event *event) > { > struct perf_cgroup_info *t; > > t = per_cpu_ptr(event->cgrp->info, event->cpu); > - return t->time.time; > + return __perf_event_time_ctx(event, &t->time); > } > > static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) > @@ -796,12 +828,12 @@ static inline u64 perf_cgroup_event_time > > t = per_cpu_ptr(event->cgrp->info, event->cpu); > if (!__load_acquire(&t->active)) > - return t->time.time; > + return __perf_event_time_ctx(event, &t->time); > now += READ_ONCE(t->time.offset); > - return now; > + return __perf_event_time_ctx_now(event, &t->time, now); > } > > -static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) > +static inline void __update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) > { > if (adv) > time->time += now - time->stamp; > @@ -819,6 +851,13 @@ static inline void update_perf_time_ctx( > WRITE_ONCE(time->offset, time->time - time->stamp); > } > > +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) > +{ > + __update_perf_time_ctx(time + 0, now, adv); > + if (__this_cpu_read(perf_in_guest)) > + __update_perf_time_ctx(time + 1, now, adv) > +} > + > static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) > { > struct perf_cgroup *cgrp = cpuctx->cgrp; >
On Fri, Oct 11, 2024 at 03:49:44PM -0400, Liang, Kan wrote: > > > On 2024-10-11 2:42 p.m., Peter Zijlstra wrote: > > > > Can you rework this one along these lines? > > Sure. Thanks!
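To make the macro suggestion concrete, this is roughly how Peter's helpers would read with the constants Kan proposes (illustrative only, not the merged code; T_TOTAL/T_GUEST index the back-to-back {time, timeguest} pair that the static_asserts guarantee):

#define T_TOTAL	0
#define T_GUEST	1

static inline u64 __perf_event_time_ctx(struct perf_event *event,
					struct perf_time_ctx *times)
{
	u64 time = times[T_TOTAL].time;

	if (event->attr.exclude_guest)
		time -= times[T_GUEST].time;

	return time;
}

static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
					    struct perf_time_ctx *times,
					    u64 now)
{
	if (event->attr.exclude_guest) {
		/*
		 * (now + times[T_TOTAL].offset) - (now + times[T_GUEST].offset) :=
		 * times[T_TOTAL].offset - times[T_GUEST].offset
		 */
		return READ_ONCE(times[T_TOTAL].offset) -
		       READ_ONCE(times[T_GUEST].offset);
	}

	return now + READ_ONCE(times[T_TOTAL].offset);
}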
On Thu, Aug 01, 2024 at 04:58:18AM +0000, Mingwei Zhang wrote: > @@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) > * would only update time for the pinned events. > */ > if (is_active & EVENT_TIME) { > + bool stop; > + > + /* vPMU should not stop time */ > + stop = !(event_type & EVENT_GUEST) && > + ctx == &cpuctx->ctx; > + > /* update (and stop) ctx time */ > update_context_time(ctx); > - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); > + update_cgrp_time_from_cpuctx(cpuctx, stop); > /* > * CPU-release for the below ->is_active store, > * see __load_acquire() in perf_event_time_now() > @@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) > cpuctx->task_ctx = NULL; > } > > - is_active ^= ctx->is_active; /* changed bits */ > + if (event_type & EVENT_GUEST) { > + /* > + * Schedule out all !exclude_guest events of PMU > + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. > + */ I thought the premise was that if we have !exclude_guest events, we'll fail the creation of vPMU, and if we have vPMU we'll fail the creation of !exclude_guest events. As such, they're mutually exclusive, and the above comment doesn't make sense, if we have a vPMU, there are no !exclude_guest events, IOW schedule out the entire context. > + is_active = EVENT_ALL; > + __update_context_guest_time(ctx, false); > + perf_cgroup_set_timestamp(cpuctx, true); > + barrier(); > + } else { > + is_active ^= ctx->is_active; /* changed bits */ > + } > > list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { > if (perf_skip_pmu_ctx(pmu_ctx, event_type)) > @@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data) > if (!event_filter_match(event)) > return 0; > > - if (group_can_go_on(event, *can_add_hw)) { > + /* > + * Don't schedule in any exclude_guest events of PMU with > + * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running. > + */ More confusion; if we have vPMU there should not be !exclude_guest events, right? So the above then becomes 'Don't schedule in any events'. > + if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest && > + event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU && > + !(msd->event_type & EVENT_GUEST)) > + return 0; > + > + if (group_can_go_on(event, msd->can_add_hw)) { > if (!group_sched_in(event, ctx)) > list_add_tail(&event->active_list, get_event_list(event)); > @@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) > WARN_ON_ONCE(cpuctx->task_ctx != ctx); > } > > - is_active ^= ctx->is_active; /* changed bits */ > + if (event_type & EVENT_GUEST) { > + /* > + * Schedule in all !exclude_guest events of PMU > + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. > + */ Idem. > + is_active = EVENT_ALL; > + > + /* > + * Update ctx time to set the new start time for > + * the exclude_guest events. > + */ > + update_context_time(ctx); > + update_cgrp_time_from_cpuctx(cpuctx, false); > + barrier(); > + } else { > + is_active ^= ctx->is_active; /* changed bits */ > + }
On 2024-10-14 7:14 a.m., Peter Zijlstra wrote: > On Thu, Aug 01, 2024 at 04:58:18AM +0000, Mingwei Zhang wrote: > >> @@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) >> * would only update time for the pinned events. >> */ >> if (is_active & EVENT_TIME) { >> + bool stop; >> + >> + /* vPMU should not stop time */ >> + stop = !(event_type & EVENT_GUEST) && >> + ctx == &cpuctx->ctx; >> + >> /* update (and stop) ctx time */ >> update_context_time(ctx); >> - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); >> + update_cgrp_time_from_cpuctx(cpuctx, stop); >> /* >> * CPU-release for the below ->is_active store, >> * see __load_acquire() in perf_event_time_now() >> @@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) >> cpuctx->task_ctx = NULL; >> } >> >> - is_active ^= ctx->is_active; /* changed bits */ >> + if (event_type & EVENT_GUEST) { >> + /* >> + * Schedule out all !exclude_guest events of PMU >> + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. >> + */ > > I thought the premise was that if we have !exclude_guest events, we'll > fail the creation of vPMU, and if we have vPMU we'll fail the creation > of !exclude_guest events. > > As such, they're mutually exclusive, and the above comment doesn't make > sense, if we have a vPMU, there are no !exclude_guest events, IOW > schedule out the entire context. It's a typo. Right, it should schedule out all exclude_guest events of the passthrough PMUs. > >> + is_active = EVENT_ALL; >> + __update_context_guest_time(ctx, false); >> + perf_cgroup_set_timestamp(cpuctx, true); >> + barrier(); >> + } else { >> + is_active ^= ctx->is_active; /* changed bits */ >> + } >> >> list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { >> if (perf_skip_pmu_ctx(pmu_ctx, event_type)) > >> @@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data) >> if (!event_filter_match(event)) >> return 0; >> >> - if (group_can_go_on(event, *can_add_hw)) { >> + /* >> + * Don't schedule in any exclude_guest events of PMU with >> + * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running. >> + */ > > More confusion; if we have vPMU there should not be !exclude_guest > events, right? So the above then becomes 'Don't schedule in any events'. Right. I will correct the three comments. Thanks, Kan > >> + if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest && >> + event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU && >> + !(msd->event_type & EVENT_GUEST)) >> + return 0; >> + >> + if (group_can_go_on(event, msd->can_add_hw)) { >> if (!group_sched_in(event, ctx)) >> list_add_tail(&event->active_list, get_event_list(event)); > >> @@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) >> WARN_ON_ONCE(cpuctx->task_ctx != ctx); >> } >> >> - is_active ^= ctx->is_active; /* changed bits */ >> + if (event_type & EVENT_GUEST) { >> + /* >> + * Schedule in all !exclude_guest events of PMU >> + * with PERF_PMU_CAP_PASSTHROUGH_VPMU. >> + */ > > Idem. > >> + is_active = EVENT_ALL; >> + >> + /* >> + * Update ctx time to set the new start time for >> + * the exclude_guest events. >> + */ >> + update_context_time(ctx); >> + update_cgrp_time_from_cpuctx(cpuctx, false); >> + barrier(); >> + } else { >> + is_active ^= ctx->is_active; /* changed bits */ >> + } >
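For readers who do not have the rest of the series at hand: the invariant Peter relies on here -- a mediated (passthrough) vPMU and !exclude_guest events cannot coexist -- is presumably enforced at event/VM creation time via the nr_include_guest_events and nr_mediated_pmu_vms counters and perf_mediated_pmu_mutex visible in the diff context. A hypothetical sketch of that kind of check (function names invented; the real enforcement lives in an earlier patch of the series and may look different):

static int sketch_account_include_guest_event(void)
{
	int ret = 0;

	/* Creating a !exclude_guest event fails while any mediated vPMU exists. */
	mutex_lock(&perf_mediated_pmu_mutex);
	if (atomic_read(&nr_mediated_pmu_vms))
		ret = -EOPNOTSUPP;
	else
		atomic_inc(&nr_include_guest_events);
	mutex_unlock(&perf_mediated_pmu_mutex);

	return ret;
}

static int sketch_account_mediated_pmu_vm(void)
{
	int ret = 0;

	/* Creating a mediated vPMU fails while any !exclude_guest event exists. */
	mutex_lock(&perf_mediated_pmu_mutex);
	if (atomic_read(&nr_include_guest_events))
		ret = -EBUSY;
	else
		atomic_inc(&nr_mediated_pmu_vms);
	mutex_unlock(&perf_mediated_pmu_mutex);

	return ret;
}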
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e22cdb6486e6..81a5f8399cb8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -952,6 +952,11 @@ struct perf_event_context {
 	 */
 	struct perf_time_ctx		time;
 
+	/*
+	 * Context clock, runs when in the guest mode.
+	 */
+	struct perf_time_ctx		timeguest;
+
 	/*
 	 * These fields let us detect when two contexts have both
 	 * been cloned (inherited) from a common ancestor.
@@ -1044,6 +1049,7 @@ struct bpf_perf_event_data_kern {
  */
 struct perf_cgroup_info {
 	struct perf_time_ctx		time;
+	struct perf_time_ctx		timeguest;
 	int				active;
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c25e2bf27001..57648736e43e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -376,7 +376,8 @@ enum event_type_t {
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
 	EVENT_CGROUP = 0x10,
-	EVENT_FLAGS = EVENT_CGROUP,
+	EVENT_GUEST = 0x20,
+	EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly;
 
 static atomic_t nr_mediated_pmu_vms;
 static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+static DEFINE_PER_CPU(bool, perf_in_guest);
 
 /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
 static inline bool is_include_guest_event(struct perf_event *event)
@@ -706,6 +708,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
 	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
 		return true;
 
+	if ((event_type & EVENT_GUEST) &&
+	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+		return true;
+
 	return false;
 }
 
@@ -770,12 +776,21 @@ static inline int is_cgroup_event(struct perf_event *event)
 	return event->cgrp != NULL;
 }
 
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+					struct perf_time_ctx *time,
+					struct perf_time_ctx *timeguest);
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+					    struct perf_time_ctx *time,
+					    struct perf_time_ctx *timeguest,
+					    u64 now);
+
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	struct perf_cgroup_info *t;
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	return t->time.time;
+	return __perf_event_time_ctx(event, &t->time, &t->timeguest);
 }
 
 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
@@ -784,9 +799,9 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
 	if (!__load_acquire(&t->active))
-		return t->time.time;
-	now += READ_ONCE(t->time.offset);
-	return now;
+		return __perf_event_time_ctx(event, &t->time, &t->timeguest);
+
+	return __perf_event_time_ctx_now(event, &t->time, &t->timeguest, now);
 }
 
 static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv);
@@ -796,6 +811,18 @@ static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bo
 	update_perf_time_ctx(&info->time, now, adv);
 }
 
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+	update_perf_time_ctx(&info->timeguest, now, adv);
+}
+
+static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
+{
+	__update_cgrp_time(info, now, true);
+	if (__this_cpu_read(perf_in_guest))
+		__update_cgrp_guest_time(info, now, true);
+}
+
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
 {
 	struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -809,7 +836,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 		cgrp = container_of(css, struct perf_cgroup, css);
 		info = this_cpu_ptr(cgrp->info);
 
-		__update_cgrp_time(info, now, true);
+		update_cgrp_time(info, now);
 		if (final)
 			__store_release(&info->active, 0);
 	}
@@ -832,11 +859,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	 * Do not update time when cgroup is not active
 	 */
 	if (info->active)
-		__update_cgrp_time(info, perf_clock(), true);
+		update_cgrp_time(info, perf_clock());
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -856,8 +883,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 	for (css = &cgrp->css; css; css = css->parent) {
 		cgrp = container_of(css, struct perf_cgroup, css);
 		info = this_cpu_ptr(cgrp->info);
-		__update_cgrp_time(info, ctx->time.stamp, false);
-		__store_release(&info->active, 1);
+		if (guest) {
+			__update_cgrp_guest_time(info, ctx->time.stamp, false);
+		} else {
+			__update_cgrp_time(info, ctx->time.stamp, false);
+			__store_release(&info->active, 1);
+		}
 	}
 }
 
@@ -1061,7 +1092,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
 {
 }
 
@@ -1488,16 +1519,34 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo
  */
 static void __update_context_time(struct perf_event_context *ctx, bool adv)
 {
-	u64 now = perf_clock();
+	lockdep_assert_held(&ctx->lock);
+
+	update_perf_time_ctx(&ctx->time, perf_clock(), adv);
+}
 
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
 	lockdep_assert_held(&ctx->lock);
 
-	update_perf_time_ctx(&ctx->time, now, adv);
+	/* must be called after __update_context_time(); */
+	update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
 }
 
 static void update_context_time(struct perf_event_context *ctx)
 {
 	__update_context_time(ctx, true);
+	if (__this_cpu_read(perf_in_guest))
+		__update_context_guest_time(ctx, true);
+}
+
+static inline u64 __perf_event_time_ctx(struct perf_event *event,
+					struct perf_time_ctx *time,
+					struct perf_time_ctx *timeguest)
+{
+	if (event->attr.exclude_guest)
+		return time->time - timeguest->time;
+	else
+		return time->time;
 }
 
 static u64 perf_event_time(struct perf_event *event)
@@ -1510,7 +1559,26 @@ static u64 perf_event_time(struct perf_event *event)
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
-	return ctx->time.time;
+	return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest);
+}
+
+static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
+					    struct perf_time_ctx *time,
+					    struct perf_time_ctx *timeguest,
+					    u64 now)
+{
+	/*
+	 * The exclude_guest event time should be calculated from
+	 * the ctx time - the guest time.
+	 * The ctx time is now + READ_ONCE(time->offset).
+	 * The guest time is now + READ_ONCE(timeguest->offset).
+	 * So the exclude_guest time is
+	 * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset).
+	 */
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest))
+		return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset);
+	else
+		return now + READ_ONCE(time->offset);
 }
 
 static u64 perf_event_time_now(struct perf_event *event, u64 now)
@@ -1524,10 +1592,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
 		return perf_cgroup_event_time_now(event, now);
 
 	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
-		return ctx->time.time;
+		return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest);
 
-	now += READ_ONCE(ctx->time.offset);
-	return now;
+	return __perf_event_time_ctx_now(event, &ctx->time, &ctx->timeguest, now);
 }
 
 static enum event_type_t get_event_type(struct perf_event *event)
@@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * would only update time for the pinned events.
	 */
 	if (is_active & EVENT_TIME) {
+		bool stop;
+
+		/* vPMU should not stop time */
+		stop = !(event_type & EVENT_GUEST) &&
+		       ctx == &cpuctx->ctx;
+
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, stop);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 			cpuctx->task_ctx = NULL;
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+		__update_context_guest_time(ctx, false);
+		perf_cgroup_set_timestamp(cpuctx, true);
+		barrier();
+	} else {
+		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3853,10 +3937,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
 		event_update_userpage(event);
 }
 
+struct merge_sched_data {
+	int can_add_hw;
+	enum event_type_t event_type;
+};
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	int *can_add_hw = data;
+	struct merge_sched_data *msd = data;
 
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, *can_add_hw)) {
+	/*
+	 * Don't schedule in any exclude_guest events of PMU with
+	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
+	 */
+	if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
+	    !(msd->event_type & EVENT_GUEST))
+		return 0;
+
+	if (group_can_go_on(event, msd->can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		*can_add_hw = 0;
+		msd->can_add_hw = 0;
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -3889,11 +3987,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
 
 static void pmu_groups_sched_in(struct perf_event_context *ctx,
 				struct perf_event_groups *groups,
-				struct pmu *pmu)
+				struct pmu *pmu,
+				enum event_type_t event_type)
 {
-	int can_add_hw = 1;
+	struct merge_sched_data msd = {
+		.can_add_hw = 1,
+		.event_type = event_type,
+	};
 	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
-			   merge_sched_in, &can_add_hw);
+			   merge_sched_in, &msd);
 }
 
 static void ctx_groups_sched_in(struct perf_event_context *ctx,
@@ -3905,14 +4007,14 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
 	}
 }
 
 static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
 			       struct pmu *pmu)
 {
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
 }
 
 static void
@@ -3927,9 +4029,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 		return;
 
 	if (!(is_active & EVENT_TIME)) {
+		/* EVENT_TIME should be active while the guest runs */
+		WARN_ON_ONCE(event_type & EVENT_GUEST);
 		/* start ctx time */
 		__update_context_time(ctx, false);
-		perf_cgroup_set_timestamp(cpuctx);
+		perf_cgroup_set_timestamp(cpuctx, false);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, false);
+		barrier();
+	} else {
+		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	/*
 	 * First go through the list and put on any pinned groups
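To make the time accounting in the hunks above easier to follow, here is a small stand-alone sketch. It is not part of the patch; the struct and function names below are simplified stand-ins for the perf_time_ctx bookkeeping in kernel/events/core.c, and it only illustrates the arithmetic performed by __perf_event_time_ctx(): an exclude_guest event's elapsed time is the context time minus the accumulated guest time.

#include <stdio.h>

typedef unsigned long long u64;

/* simplified stand-in for struct perf_time_ctx: one accumulating clock */
struct time_ctx {
	u64 time;
};

/*
 * exclude_guest elapsed time = context time - guest time,
 * mirroring what __perf_event_time_ctx() computes in the patch.
 */
static u64 exclude_guest_time(const struct time_ctx *ctx_time,
			      const struct time_ctx *guest_time)
{
	return ctx_time->time - guest_time->time;
}

int main(void)
{
	struct time_ctx ctx   = { .time = 1000 };	/* total context time */
	struct time_ctx guest = { .time = 250 };	/* time spent in the guest */

	/* 1000 units of ctx time, 250 of them in the guest -> 750 counted */
	printf("%llu\n", exclude_guest_time(&ctx, &guest));
	return 0;
}

The cgroup clocks follow the same subtraction through the new timeguest field added to struct perf_cgroup_info.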