
[v2,07/54] perf: Add generic exclude_guest support

Message ID 20240506053020.3911940-8-mizhang@google.com (mailing list archive)
State New, archived
Series Mediated Passthrough vPMU 2.0 for x86

Commit Message

Mingwei Zhang May 6, 2024, 5:29 a.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Current perf doesn't explicitly schedule out all exclude_guest events
while the guest is running. This is not a problem for the current
emulated vPMU, because perf owns all the PMU counters. It can mask the
counter which is assigned to an exclude_guest event when a guest is
running (Intel way), or set the corresponding HOSTONLY bit in the
eventsel (AMD way). The counter doesn't count when a guest is running.

However, neither way works with the newly introduced passthrough vPMU.
A guest owns all the PMU counters when it's running. The host should not
mask any counters; a counter may be in use by the guest and its eventsel
may be overwritten.

Perf should explicitly schedule out all exclude_guest events to release
the PMU resources when entering a guest, and resume the counting when
exiting the guest.

Expose two interfaces to KVM. KVM should notify perf when
entering/exiting a guest.

It's possible that an exclude_guest event is created while a guest is
running. Such a new event should not be scheduled in either.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |   4 ++
 kernel/events/core.c       | 104 +++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
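
For reference, a minimal sketch of how a hypervisor is expected to use
the two interfaces added here. run_vcpu_once() and enter_guest() are
placeholder names, not part of this patch (the real KVM call sites come
later in the series); the point is that both calls are made with IRQs
disabled:

	static void run_vcpu_once(struct kvm_vcpu *vcpu)
	{
		local_irq_disable();
		/* schedule out all host exclude_guest events, free the counters */
		perf_guest_enter();

		enter_guest(vcpu);	/* VM-entry ... run ... VM-exit */

		/* resume counting of the host exclude_guest events */
		perf_guest_exit();
		local_irq_enable();
	}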

Comments

Peter Zijlstra May 7, 2024, 8:58 a.m. UTC | #1
On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:

> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>  
> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct perf_event *event, *tmp;
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +
> +		/*
> +		 * All active events must be exclude_guest events.
> +		 * See perf_get_mediated_pmu().
> +		 * Unconditionally remove all active events.
> +		 */
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		pmu_ctx->rotate_necessary = 0;
> +
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +/* When entering a guest, schedule out all exclude_guest events. */
> +void perf_guest_enter(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
> +
> +	__this_cpu_write(perf_in_guest, true);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +
> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	__this_cpu_write(perf_in_guest, false);
> +
> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}

Bah, this is a ton of copy-paste from the normal scheduling code with
random changes. Why ?

Why can't this use ctx_sched_{in,out}() ? Surely the whole
CAP_PASSTHROUGHT thing is but a flag away.
Liang, Kan June 10, 2024, 5:23 p.m. UTC | #2
On 2024-05-07 4:58 a.m., Peter Zijlstra wrote:
> On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:
> 
>> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>  
>> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct perf_event *event, *tmp;
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +
>> +		/*
>> +		 * All active events must be exclude_guest events.
>> +		 * See perf_get_mediated_pmu().
>> +		 * Unconditionally remove all active events.
>> +		 */
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		pmu_ctx->rotate_necessary = 0;
>> +
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +/* When entering a guest, schedule out all exclude_guest events. */
>> +void perf_guest_enter(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
>> +
>> +	__this_cpu_write(perf_in_guest, true);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>> +
>> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +void perf_guest_exit(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	__this_cpu_write(perf_in_guest, false);
>> +
>> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
> 
> Bah, this is a ton of copy-paste from the normal scheduling code with
> random changes. Why ?
> 
> Why can't this use ctx_sched_{in,out}() ? Surely the whole
> CAP_PASSTHROUGHT thing is but a flag away.
>

Not just a flag. The time has to be updated as well, since the ctx->time
is shared among PMUs. Perf shouldn't stop it while other PMUs are still
running.

A timeguest field will be introduced to track the start time of a guest.
The event->tstamp of an exclude_guest event should always hold
ctx->timeguest while a guest is running.
When the guest leaves, update the event->tstamp to now, so the guest
time can be deducted.
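
(A concrete example with made-up numbers: if ctx->time is 100 at guest
entry, ctx->timeguest becomes 100 and the exclude_guest event's tstamp
stays pinned at 100 while the guest runs. If the guest exits at
ctx->time = 150, the tstamp is simply reset to 150 rather than
accumulating the delta, so the 50 units spent in the guest never reach
total_time_enabled/running.)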

The patch below demonstrates how the timeguest works.
(It's an incomplete patch, just to show the idea.)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 22d3e56682c9..2134e6886e22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -953,6 +953,7 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				timeguest;

 	/*
 	 * These fields let us detect when two contexts have both
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 14fd881e3e1d..2aed56671a24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -690,12 +690,31 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin
 		*running += delta;
 }

+static void perf_event_update_time_guest(struct perf_event *event)
+{
+	/*
+	 * If a guest is running, use the timestamp while entering the guest.
+	 * If the guest is leaving, reset the event timestamp.
+	 */
+	if (!__this_cpu_read(perf_in_guest))
+		event->tstamp = event->ctx->time;
+	else
+		event->tstamp = event->ctx->timeguest;
+}
+
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now = perf_event_time(event);
+	u64 now;
+
+	/* Never count the time of an active guest into an exclude_guest event. */
+	if (event->ctx->timeguest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)
+		return perf_event_update_time_guest(event);

+	now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 					&event->total_time_running);
+
 	event->tstamp = now;
 }

@@ -3398,7 +3417,14 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 			cpuctx->task_ctx = NULL;
 	}

-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+	} else
+		is_active ^= ctx->is_active; /* changed bits */

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3998,7 +4024,20 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}

-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+	} else
+		is_active ^= ctx->is_active; /* changed bits */

 	/*
 	 * First go through the list and put on any pinned groups

@@ -5894,14 +5933,18 @@ void perf_guest_enter(u32 guest_lvtpc)
 	}

 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	/* Set the guest start time */
+	cpuctx->ctx.timeguest = cpuctx->ctx.time;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		task_ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}

 	__this_cpu_write(perf_in_guest, true);
@@ -5925,14 +5968,17 @@ void perf_guest_exit(void)

 	__this_cpu_write(perf_in_guest, false);

 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	cpuctx->ctx.timeguest = 0;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = 0;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}

Thanks
Kan
Peter Zijlstra June 11, 2024, 12:06 p.m. UTC | #3
On Mon, Jun 10, 2024 at 01:23:04PM -0400, Liang, Kan wrote:
> On 2024-05-07 4:58 a.m., Peter Zijlstra wrote:

> > Bah, this is a ton of copy-paste from the normal scheduling code with
> > random changes. Why ?
> > 
> > Why can't this use ctx_sched_{in,out}() ? Surely the whole
> > CAP_PASSTHROUGHT thing is but a flag away.
> >
> 
> Not just a flag. The time has to be updated as well, since the ctx->time
> is shared among PMUs. Perf shouldn't stop it while other PMUs is still
> running.

Obviously the original changelog didn't mention any of that.... :/

> A timeguest will be introduced to track the start time of a guest.
> The event->tstamp of an exclude_guest event should always keep
> ctx->timeguest while a guest is running.
> When a guest is leaving, update the event->tstamp to now, so the guest
> time can be deducted.
> 
> The below patch demonstrate how the timeguest works.
> (It's an incomplete patch. Just to show the idea.)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 22d3e56682c9..2134e6886e22 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -953,6 +953,7 @@ struct perf_event_context {
>  	u64				time;
>  	u64				timestamp;
>  	u64				timeoffset;
> +	u64				timeguest;
> 
>  	/*
>  	 * These fields let us detect when two contexts have both
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 14fd881e3e1d..2aed56671a24 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -690,12 +690,31 @@ __perf_update_times(struct perf_event *event, u64
> now, u64 *enabled, u64 *runnin
>  		*running += delta;
>  }
> 
> +static void perf_event_update_time_guest(struct perf_event *event)
> +{
> +	/*
> +	 * If a guest is running, use the timestamp while entering the guest.
> +	 * If the guest is leaving, reset the event timestamp.
> +	 */
> +	if (!__this_cpu_read(perf_in_guest))
> +		event->tstamp = event->ctx->time;
> +	else
> +		event->tstamp = event->ctx->timeguest;
> +}

This conditional seems inverted, without a good reason. Also, in another
thread you talk about some PMUs stopping time in a guest, while other
PMUs would keep ticking. I don't think the above captures that.

>  static void perf_event_update_time(struct perf_event *event)
>  {
> -	u64 now = perf_event_time(event);
> +	u64 now;
> +
> +	/* Never count the time of an active guest into an exclude_guest event. */
> +	if (event->ctx->timeguest &&
> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)
> +		return perf_event_update_time_guest(event);

Urgh, weird split. The PMU check is here. Please just inline the above
here, this seems to be the sole caller anyway.

> 
> +	now = perf_event_time(event);
>  	__perf_update_times(event, now, &event->total_time_enabled,
>  					&event->total_time_running);
> +
>  	event->tstamp = now;
>  }
> 
> @@ -3398,7 +3417,14 @@ ctx_sched_out(struct perf_event_context *ctx,
> enum event_type_t event_type)
>  			cpuctx->task_ctx = NULL;
>  	}
> 
> -	is_active ^= ctx->is_active; /* changed bits */
> +	if (event_type & EVENT_GUEST) {

Patch doesn't introduce EVENT_GUEST, lost a hunk somewhere?

> +		/*
> +		 * Schedule out all !exclude_guest events of PMU
> +		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
> +		 */
> +		is_active = EVENT_ALL;
> +	} else
> +		is_active ^= ctx->is_active; /* changed bits */
> 
>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))

> @@ -5894,14 +5933,18 @@ void perf_guest_enter(u32 guest_lvtpc)
>  	}
> 
>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> -	ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
> +	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
> +	/* Set the guest start time */
> +	cpuctx->ctx.timeguest = cpuctx->ctx.time;
>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>  	if (cpuctx->task_ctx) {
>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> -		task_ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
> +		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
> +		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>  	}
> 
>  	__this_cpu_write(perf_in_guest, true);
> @@ -5925,14 +5968,17 @@ void perf_guest_exit(void)
> 
>  	__this_cpu_write(perf_in_guest, false);
> 
>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> -	ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
> +	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
> +	cpuctx->ctx.timeguest = 0;
>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>  	if (cpuctx->task_ctx) {
>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> -		ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
> +		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
> +		cpuctx->task_ctx->timeguest = 0;
>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>  	}

I'm thinking EVENT_GUEST should cause the ->timeguest updates, no point
in having them explicitly duplicated here, hmm?
Liang, Kan June 11, 2024, 1:27 p.m. UTC | #4
On 2024-06-11 8:06 a.m., Peter Zijlstra wrote:
> On Mon, Jun 10, 2024 at 01:23:04PM -0400, Liang, Kan wrote:
>> On 2024-05-07 4:58 a.m., Peter Zijlstra wrote:
> 
>>> Bah, this is a ton of copy-paste from the normal scheduling code with
>>> random changes. Why ?
>>>
>>> Why can't this use ctx_sched_{in,out}() ? Surely the whole
>>> CAP_PASSTHROUGHT thing is but a flag away.
>>>
>>
>> Not just a flag. The time has to be updated as well, since the ctx->time
>> is shared among PMUs. Perf shouldn't stop it while other PMUs is still
>> running.
> 
> Obviously the original changelog didn't mention any of that.... :/

Yes, the time issue was a newly found bug when we tested the uncore PMUs.

> 
>> A timeguest will be introduced to track the start time of a guest.
>> The event->tstamp of an exclude_guest event should always keep
>> ctx->timeguest while a guest is running.
>> When a guest is leaving, update the event->tstamp to now, so the guest
>> time can be deducted.
>>
>> The below patch demonstrate how the timeguest works.
>> (It's an incomplete patch. Just to show the idea.)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 22d3e56682c9..2134e6886e22 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -953,6 +953,7 @@ struct perf_event_context {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> +	u64				timeguest;
>>
>>  	/*
>>  	 * These fields let us detect when two contexts have both
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 14fd881e3e1d..2aed56671a24 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -690,12 +690,31 @@ __perf_update_times(struct perf_event *event, u64
>> now, u64 *enabled, u64 *runnin
>>  		*running += delta;
>>  }
>>
>> +static void perf_event_update_time_guest(struct perf_event *event)
>> +{
>> +	/*
>> +	 * If a guest is running, use the timestamp while entering the guest.
>> +	 * If the guest is leaving, reset the event timestamp.
>> +	 */
>> +	if (!__this_cpu_read(perf_in_guest))
>> +		event->tstamp = event->ctx->time;
>> +	else
>> +		event->tstamp = event->ctx->timeguest;
>> +}
> 
> This conditional seems inverted, without a good reason. Also, in another
> thread you talk about some PMUs stopping time in a guest, while other
> PMUs would keep ticking. I don't think the above captures that.
> 
>>  static void perf_event_update_time(struct perf_event *event)
>>  {
>> -	u64 now = perf_event_time(event);
>> +	u64 now;
>> +
>> +	/* Never count the time of an active guest into an exclude_guest event. */
>> +	if (event->ctx->timeguest &&
>> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)
>> +		return perf_event_update_time_guest(event);
> 
> Urgh, weird split. The PMU check is here. Please just inline the above
> here, this seems to be the sole caller anyway.
>

Sure

>>
>> +	now = perf_event_time(event);
>>  	__perf_update_times(event, now, &event->total_time_enabled,
>>  					&event->total_time_running);
>> +
>>  	event->tstamp = now;
>>  }
>>
>> @@ -3398,7 +3417,14 @@ ctx_sched_out(struct perf_event_context *ctx,
>> enum event_type_t event_type)
>>  			cpuctx->task_ctx = NULL;
>>  	}
>>
>> -	is_active ^= ctx->is_active; /* changed bits */
>> +	if (event_type & EVENT_GUEST) {
> 
> Patch doesn't introduce EVENT_GUEST, lost a hunk somewhere?

Sorry, there is a prerequisite patch that factors out EVENT_CGROUP.
I thought it would be complex and confusing to paste both, so some
details were lost.
I will post both patches at the end.

> 
>> +		/*
>> +		 * Schedule out all !exclude_guest events of PMU
>> +		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
>> +		 */
>> +		is_active = EVENT_ALL;
>> +	} else
>> +		is_active ^= ctx->is_active; /* changed bits */
>>
>>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
> 
>> @@ -5894,14 +5933,18 @@ void perf_guest_enter(u32 guest_lvtpc)
>>  	}
>>
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>> -	ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
>> +	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
>> +	/* Set the guest start time */
>> +	cpuctx->ctx.timeguest = cpuctx->ctx.time;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>> -		task_ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
>> +		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
>> +		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
>>
>>  	__this_cpu_write(perf_in_guest, true);
>> @@ -5925,14 +5968,17 @@ void perf_guest_exit(void)
>>
>>  	__this_cpu_write(perf_in_guest, false);
>>
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>> -	ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
>> +	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
>> +	cpuctx->ctx.timeguest = 0;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>> -		ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
>> +		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
>> +		cpuctx->task_ctx->timeguest = 0;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
> 
> I'm thinking EVENT_GUEST should cause the ->timeguest updates, no point
> in having them explicitly duplicated here, hmm?

We have to add an EVENT_GUEST check and update ->timeguest at the end
of the ctx_sched_out/in() functions, after pmu_ctx_sched_out/in(),
because ->timeguest is also used in perf_event_update_time() to check
whether we are leaving the guest.

Since EVENT_GUEST is only used by perf_guest_enter/exit(), I thought it
might be better to keep the ->timeguest update where it is used rather
than in the generic ctx_sched_out/in(). That minimizes the impact on
non-virtualization users.
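
For comparison, folding it into the generic functions would look
roughly like the sketch below (hypothetical code, only to illustrate
the placement; the posted patches keep these updates in
perf_guest_enter/exit() instead):

	/* hypothetically, at the very end of ctx_sched_out() */
	if (event_type & EVENT_GUEST)
		ctx->timeguest = ctx->time;	/* guest entry: record start time */

	/* hypothetically, at the very end of ctx_sched_in() */
	if (event_type & EVENT_GUEST)
		ctx->timeguest = 0;		/* guest exit: clear the start time */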

Here are the two complete patches.

The first one is to factor out the EVENT_CGROUP.


From c508f2b0e11a2eea71fe3ff16d1d848359ede535 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Mon, 27 May 2024 06:58:29 -0700
Subject: [PATCH 1/2] perf: Skip pmu_ctx based on event_type

To optimize the cgroup context switch, the perf_event_pmu_context
iteration skips the PMUs without cgroup events. A bool cgroup was
introduced to indicate the case. It can work, but this way is hard to
extend for other cases, e.g. skipping non-passthrough PMUs. It doesn't
make sense to keep adding bool variables.

Pass the event_type instead of the specific bool variable. Check both
the event_type and the related pmu_ctx variables to decide whether to
skip a PMU.

Event flags, e.g. EVENT_CGROUP, should be cleared from ctx->is_active.
Add EVENT_FLAGS to denote such event flags.

No functional change.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 kernel/events/core.c | 70 +++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index abd4027e3859..95d1d5a5addc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -376,6 +376,7 @@ enum event_type_t {
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
 	EVENT_CGROUP = 0x10,
+	EVENT_FLAGS = EVENT_CGROUP,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };

@@ -699,23 +700,32 @@ do {									\
 	___p;								\
 })

-static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
+			      enum event_type_t event_type)
+{
+	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
+		return true;
+
+	return false;
+}
+
+static void perf_ctx_disable(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_event_pmu_context *pmu_ctx;

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
+		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
 		perf_pmu_disable(pmu_ctx->pmu);
 	}
 }

-static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+static void perf_ctx_enable(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_event_pmu_context *pmu_ctx;

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
+		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
 		perf_pmu_enable(pmu_ctx->pmu);
 	}
@@ -877,7 +887,7 @@ static void perf_cgroup_switch(struct task_struct *task)
 		return;

 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-	perf_ctx_disable(&cpuctx->ctx, true);
+	perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP);

 	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 	/*
@@ -893,7 +903,7 @@ static void perf_cgroup_switch(struct task_struct *task)
 	 */
 	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);

-	perf_ctx_enable(&cpuctx->ctx, true);
+	perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }

@@ -2729,9 +2739,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

 	event_type &= EVENT_ALL;

-	perf_ctx_disable(&cpuctx->ctx, false);
+	perf_ctx_disable(&cpuctx->ctx, 0);
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx, false);
+		perf_ctx_disable(task_ctx, 0);
 		task_ctx_sched_out(task_ctx, event_type);
 	}

@@ -2749,9 +2759,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,

 	perf_event_sched_in(cpuctx, task_ctx);

-	perf_ctx_enable(&cpuctx->ctx, false);
+	perf_ctx_enable(&cpuctx->ctx, 0);
 	if (task_ctx)
-		perf_ctx_enable(task_ctx, false);
+		perf_ctx_enable(task_ctx, 0);
 }

 void perf_pmu_resched(struct pmu *pmu)
@@ -3296,9 +3306,6 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
-	bool cgroup = event_type & EVENT_CGROUP;
-
-	event_type &= ~EVENT_CGROUP;

 	lockdep_assert_held(&ctx->lock);

@@ -3333,7 +3340,7 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 		barrier();
 	}

-	ctx->is_active &= ~event_type;
+	ctx->is_active &= ~(event_type & ~EVENT_FLAGS);
 	if (!(ctx->is_active & EVENT_ALL))
 		ctx->is_active = 0;

@@ -3346,7 +3353,7 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	is_active ^= ctx->is_active; /* changed bits */

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
+		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
 	}
@@ -3540,7 +3547,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {

-			perf_ctx_disable(ctx, false);
+			perf_ctx_disable(ctx, 0);

 			/* PMIs are disabled; ctx->nr_pending is stable. */
 			if (local_read(&ctx->nr_pending) ||
@@ -3560,7 +3567,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 			perf_ctx_sched_task_cb(ctx, false);
 			perf_event_swap_task_ctx_data(ctx, next_ctx);

-			perf_ctx_enable(ctx, false);
+			perf_ctx_enable(ctx, 0);

 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
@@ -3584,13 +3591,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)

 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		perf_ctx_disable(ctx, false);
+		perf_ctx_disable(ctx, 0);

 inside_switch:
 		perf_ctx_sched_task_cb(ctx, false);
 		task_ctx_sched_out(ctx, EVENT_ALL);

-		perf_ctx_enable(ctx, false);
+		perf_ctx_enable(ctx, 0);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -3887,12 +3894,12 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,

 static void ctx_groups_sched_in(struct perf_event_context *ctx,
 				struct perf_event_groups *groups,
-				bool cgroup)
+				enum event_type_t event_type)
 {
 	struct perf_event_pmu_context *pmu_ctx;

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
+		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
 		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
 	}
@@ -3909,9 +3916,6 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	int is_active = ctx->is_active;
-	bool cgroup = event_type & EVENT_CGROUP;
-
-	event_type &= ~EVENT_CGROUP;

 	lockdep_assert_held(&ctx->lock);

@@ -3929,7 +3933,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 		barrier();
 	}

-	ctx->is_active |= (event_type | EVENT_TIME);
+	ctx->is_active |= ((event_type & ~EVENT_FLAGS) | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
 			cpuctx->task_ctx = ctx;
@@ -3944,11 +3948,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+		ctx_groups_sched_in(ctx, &ctx->pinned_groups, event_type);

 	/* Then walk through the lower prio flexible groups */
 	if (is_active & EVENT_FLEXIBLE)
-		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+		ctx_groups_sched_in(ctx, &ctx->flexible_groups, event_type);
 }

 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3963,11 +3967,11 @@ static void perf_event_context_sched_in(struct task_struct *task)

 	if (cpuctx->task_ctx == ctx) {
 		perf_ctx_lock(cpuctx, ctx);
-		perf_ctx_disable(ctx, false);
+		perf_ctx_disable(ctx, 0);

 		perf_ctx_sched_task_cb(ctx, true);

-		perf_ctx_enable(ctx, false);
+		perf_ctx_enable(ctx, 0);
 		perf_ctx_unlock(cpuctx, ctx);
 		goto rcu_unlock;
 	}
@@ -3980,7 +3984,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	if (!ctx->nr_events)
 		goto unlock;

-	perf_ctx_disable(ctx, false);
+	perf_ctx_disable(ctx, 0);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3990,7 +3994,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	 * events, no need to flip the cpuctx's events around.
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
-		perf_ctx_disable(&cpuctx->ctx, false);
+		perf_ctx_disable(&cpuctx->ctx, 0);
 		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
 	}

@@ -3999,9 +4003,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-		perf_ctx_enable(&cpuctx->ctx, false);
+		perf_ctx_enable(&cpuctx->ctx, 0);

-	perf_ctx_enable(ctx, false);
+	perf_ctx_enable(ctx, 0);

 unlock:
 	perf_ctx_unlock(cpuctx, ctx);


Here are the second patch to introduce EVENT_GUEST and
perf_guest_exit/enter().


From c052e673dc3e0e7ebacdce23f2b0d50ec98401b3 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Tue, 11 Jun 2024 06:04:20 -0700
Subject: [PATCH 2/2] perf: Add generic exclude_guest support

Current perf doesn't explicitly schedule out all exclude_guest events
while the guest is running. This is not a problem with the current
emulated vPMU, because perf owns all the PMU counters. It can mask the
counter which is assigned to an exclude_guest event when a guest is
running (Intel way), or set the corresponding HOSTONLY bit in the
eventsel (AMD way). The counter doesn't count when a guest is running.

However, neither way works with the newly introduced passthrough vPMU.
A guest owns all the PMU counters when it's running. The host should not
mask any counters; a counter may be in use by the guest and its eventsel
may be overwritten.

Perf should explicitly schedule out all exclude_guest events to release
the PMU resources when entering a guest, and resume the counting when
exiting the guest.

Expose two interfaces to KVM. KVM should notify perf when
entering/exiting a guest.

Introduce a new event type EVENT_GUEST to indicate that perf should
check and skip the PMUs which don't support the passthrough mode.

It's possible that an exclude_guest event is created while a guest is
running. Such a new event should not be scheduled in either.

The ctx->time is used to calculate the running/enabling time of an
event, and it is shared among PMUs. ctx_sched_in/out() with
EVENT_GUEST doesn't stop the ctx->time. A timeguest is introduced to
track the start time of a guest. For an exclude_guest event, the time
spent in guest mode is deducted.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |   5 ++
 kernel/events/core.c       | 119 +++++++++++++++++++++++++++++++++++--
 2 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dd4920bf3d1b..68c8b93c4e5c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -945,6 +945,7 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				timeguest;

 	/*
 	 * These fields let us detect when two contexts have both
@@ -1734,6 +1735,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 extern int perf_get_mediated_pmu(void);
 extern void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1826,6 +1829,8 @@ static inline int perf_get_mediated_pmu(void)
 }

 static inline void perf_put_mediated_pmu(void)			{ }
+static inline void perf_guest_enter(void)			{ }
+static inline void perf_guest_exit(void)			{ }
 #endif

 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95d1d5a5addc..cd3a89672b14 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -376,7 +376,8 @@ enum event_type_t {
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
 	EVENT_CGROUP = 0x10,
-	EVENT_FLAGS = EVENT_CGROUP,
+	EVENT_GUEST = 0x20,
+	EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };

@@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly;

 static atomic_t nr_mediated_pmu_vms;
 static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+static DEFINE_PER_CPU(bool, perf_in_guest);

 /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
 static inline bool is_include_guest_event(struct perf_event *event)
@@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin

 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now = perf_event_time(event);
+	u64 now;
+
+	/* Never count the time of an active guest into an exclude_guest event. */
+	if (event->ctx->timeguest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
+		/*
+		 * If a guest is running, use the timestamp while entering the guest.
+		 * If the guest is leaving, reset the event timestamp.
+		 */
+		if (__this_cpu_read(perf_in_guest))
+			event->tstamp = event->ctx->timeguest;
+		else
+			event->tstamp = event->ctx->time;
+		return;
+	}

+	now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 					&event->total_time_running);
+
 	event->tstamp = now;
 }

@@ -706,6 +724,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
 	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
 		return true;

+	if ((event_type & EVENT_GUEST) &&
+	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+		return true;
+
 	return false;
 }

@@ -3350,7 +3372,14 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 			cpuctx->task_ctx = NULL;
 	}

-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+	} else
+		is_active ^= ctx->is_active; /* changed bits */

 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3860,6 +3889,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;

+	/*
+	 * Don't schedule in any exclude_guest events of PMU with
+	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
+	 */
+	if (__this_cpu_read(perf_in_guest) &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
+	    event->attr.exclude_guest)
+		return 0;
+
 	if (group_can_go_on(event, *can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
@@ -3941,7 +3979,20 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}

-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+	} else
+		is_active ^= ctx->is_active; /* changed bits */

 	/*
 	 * First go through the list and put on any pinned groups
@@ -5788,6 +5839,66 @@ void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);

+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	/* Set the guest start time */
+	cpuctx->ctx.timeguest = cpuctx->ctx.time;
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	__this_cpu_write(perf_in_guest, true);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	__this_cpu_write(perf_in_guest, false);
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	cpuctx->ctx.timeguest = 0;
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = 0;
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block


Thanks,
Kan
Peter Zijlstra June 12, 2024, 11:17 a.m. UTC | #5
On Tue, Jun 11, 2024 at 09:27:46AM -0400, Liang, Kan wrote:
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index dd4920bf3d1b..68c8b93c4e5c 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -945,6 +945,7 @@ struct perf_event_context {
>  	u64				time;
>  	u64				timestamp;
>  	u64				timeoffset;
> +	u64				timeguest;
> 
>  	/*
>  	 * These fields let us detect when two contexts have both

> @@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64
> now, u64 *enabled, u64 *runnin
> 
>  static void perf_event_update_time(struct perf_event *event)
>  {
> -	u64 now = perf_event_time(event);
> +	u64 now;
> +
> +	/* Never count the time of an active guest into an exclude_guest event. */
> +	if (event->ctx->timeguest &&
> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
> +		/*
> +		 * If a guest is running, use the timestamp while entering the guest.
> +		 * If the guest is leaving, reset the event timestamp.
> +		 */
> +		if (__this_cpu_read(perf_in_guest))
> +			event->tstamp = event->ctx->timeguest;
> +		else
> +			event->tstamp = event->ctx->time;
> +		return;
> +	}
> 
> +	now = perf_event_time(event);
>  	__perf_update_times(event, now, &event->total_time_enabled,
>  					&event->total_time_running);
> +
>  	event->tstamp = now;
>  }

So I really don't like this much, and AFAICT this is broken. At the very
least this doesn't work right for cgroup events, because they have their
own timeline.

Let me have a poke...
Liang, Kan June 12, 2024, 1:38 p.m. UTC | #6
On 2024-06-12 7:17 a.m., Peter Zijlstra wrote:
> On Tue, Jun 11, 2024 at 09:27:46AM -0400, Liang, Kan wrote:
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index dd4920bf3d1b..68c8b93c4e5c 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -945,6 +945,7 @@ struct perf_event_context {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> +	u64				timeguest;
>>
>>  	/*
>>  	 * These fields let us detect when two contexts have both
> 
>> @@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64
>> now, u64 *enabled, u64 *runnin
>>
>>  static void perf_event_update_time(struct perf_event *event)
>>  {
>> -	u64 now = perf_event_time(event);
>> +	u64 now;
>> +
>> +	/* Never count the time of an active guest into an exclude_guest event. */
>> +	if (event->ctx->timeguest &&
>> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
>> +		/*
>> +		 * If a guest is running, use the timestamp while entering the guest.
>> +		 * If the guest is leaving, reset the event timestamp.
>> +		 */
>> +		if (__this_cpu_read(perf_in_guest))
>> +			event->tstamp = event->ctx->timeguest;
>> +		else
>> +			event->tstamp = event->ctx->time;
>> +		return;
>> +	}
>>
>> +	now = perf_event_time(event);
>>  	__perf_update_times(event, now, &event->total_time_enabled,
>>  					&event->total_time_running);
>> +
>>  	event->tstamp = now;
>>  }
> 
> So I really don't like this much, 

An alternative I can imagine is to maintain a dedicated timeline for
the PASSTHROUGH PMUs. For that, we probably need two new timelines, one
for the normal events and one for the cgroup events. That sounds too
complex.

> and AFAICT this is broken. At the very
> least this doesn't work right for cgroup events, because they have their
> own timeline.

I think we just need a new start time for an event. So we may use
perf_event_time() to replace the ctx->time for the cgroup events.

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 019c237dd456..6c46699c6752 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -665,7 +665,7 @@ static void perf_event_update_time(struct perf_event *event)
 		if (__this_cpu_read(perf_in_guest))
 			event->tstamp = event->ctx->timeguest;
 		else
-			event->tstamp = event->ctx->time;
+			event->tstamp = perf_event_time(event);
 		return;
 	}


> 
> Let me have a poke...

Sure.

Thanks,
Kan
Peter Zijlstra June 13, 2024, 9:15 a.m. UTC | #7
On Wed, Jun 12, 2024 at 09:38:06AM -0400, Liang, Kan wrote:
> On 2024-06-12 7:17 a.m., Peter Zijlstra wrote:
> > On Tue, Jun 11, 2024 at 09:27:46AM -0400, Liang, Kan wrote:
> >> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> >> index dd4920bf3d1b..68c8b93c4e5c 100644
> >> --- a/include/linux/perf_event.h
> >> +++ b/include/linux/perf_event.h
> >> @@ -945,6 +945,7 @@ struct perf_event_context {
> >>  	u64				time;
> >>  	u64				timestamp;
> >>  	u64				timeoffset;
> >> +	u64				timeguest;
> >>
> >>  	/*
> >>  	 * These fields let us detect when two contexts have both
> > 
> >> @@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64
> >> now, u64 *enabled, u64 *runnin
> >>
> >>  static void perf_event_update_time(struct perf_event *event)
> >>  {
> >> -	u64 now = perf_event_time(event);
> >> +	u64 now;
> >> +
> >> +	/* Never count the time of an active guest into an exclude_guest event. */
> >> +	if (event->ctx->timeguest &&
> >> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
> >> +		/*
> >> +		 * If a guest is running, use the timestamp while entering the guest.
> >> +		 * If the guest is leaving, reset the event timestamp.
> >> +		 */
> >> +		if (__this_cpu_read(perf_in_guest))
> >> +			event->tstamp = event->ctx->timeguest;
> >> +		else
> >> +			event->tstamp = event->ctx->time;
> >> +		return;
> >> +	}
> >>
> >> +	now = perf_event_time(event);
> >>  	__perf_update_times(event, now, &event->total_time_enabled,
> >>  					&event->total_time_running);
> >> +
> >>  	event->tstamp = now;
> >>  }
> > 
> > So I really don't like this much, 
> 
> An alternative way I can imagine may maintain a dedicated timeline for
> the PASSTHROUGH PMUs. For that, we probably need two new timelines for
> the normal events and the cgroup events. That sounds too complex.

I'm afraid we might have to. Specifically, the below:

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 019c237dd456..6c46699c6752 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -665,7 +665,7 @@ static void perf_event_update_time(struct perf_event
> *event)
>  		if (__this_cpu_read(perf_in_guest))
>  			event->tstamp = event->ctx->timeguest;
>  		else
> -			event->tstamp = event->ctx->time;
> +			event->tstamp = perf_event_time(event);
>  		return;
>  	}

is still broken in that it (ab)uses event state to track time, and this
goes sideways in case of event overcommit, because then
ctx_sched_{out,in}() will not visit all events.

We've run into that before. Time-keeping really should be per context or
we'll get a ton of pain.

I've ended up with the (uncompiled) below. Yes, it is unfortunate, but
aside from a few cleanups (we could introduce a struct time_ctx { u64
time, stamp, offset }; and fold a bunch of code), this is more or less
the best we can do I'm afraid.

---

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -947,7 +947,9 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
-	u64				timeguest;
+	u64				guest_time;
+	u64				guest_timestamp;
+	u64				guest_timeoffset;
 
 	/*
 	 * These fields let us detect when two contexts have both
@@ -1043,6 +1045,9 @@ struct perf_cgroup_info {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				guest_time;
+	u64				guest_timestamp;
+	u64				guest_timeoffset;
 	int				active;
 };
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -638,26 +638,9 @@ __perf_update_times(struct perf_event *e
 
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now;
-
-	/* Never count the time of an active guest into an exclude_guest event. */
-	if (event->ctx->timeguest &&
-	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
-		/*
-		 * If a guest is running, use the timestamp while entering the guest.
-		 * If the guest is leaving, reset the event timestamp.
-		 */
-		if (__this_cpu_read(perf_in_guest))
-			event->tstamp = event->ctx->timeguest;
-		else
-			event->tstamp = event->ctx->time;
-		return;
-	}
-
-	now = perf_event_time(event);
+	u64 now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 					&event->total_time_running);
-
 	event->tstamp = now;
 }
 
@@ -780,19 +763,33 @@ static inline int is_cgroup_event(struct
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	struct perf_cgroup_info *t;
+	u64 time;
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	return t->time;
+	time = t->time;
+	if (event->attr.exclude_guest)
+		time -= t->guest_time;
+	return time;
 }
 
 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
 {
 	struct perf_cgroup_info *t;
+	u64 time, guest_time;
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	if (!__load_acquire(&t->active))
-		return t->time;
-	now += READ_ONCE(t->timeoffset);
+	if (!__load_acquire(&t->active)) {
+		time = t->time;
+		if (event->attr.exclude_guest)
+			time -= t->guest_time;
+		return time;
+	}
+
+	time = now + READ_ONCE(t->timeoffset);
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
+		guest_time = now + READ_ONCE(t->guest_timeoffset);
+		time -= guest_time;
+	}
 	return now;
 }
 
@@ -807,6 +804,17 @@ static inline void __update_cgrp_time(st
 	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 }
 
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+	if (adv)
+		info->guest_time += now - info->guest_timestamp;
+	info->guest_timestamp = now;
+	/*
+	 * see update_context_time()
+	 */
+	WRITE_ONCE(info->guest_timeoffset, info->guest_time - info->guest_timestamp);
+}
+
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
 {
 	struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -821,6 +829,8 @@ static inline void update_cgrp_time_from
 			info = this_cpu_ptr(cgrp->info);
 
 			__update_cgrp_time(info, now, true);
+			if (__this_cpu_read(perf_in_guest))
+				__update_cgrp_guest_time(info, now, true);
 			if (final)
 				__store_release(&info->active, 0);
 		}
@@ -1501,14 +1511,39 @@ static void __update_context_time(struct
 	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
 }
 
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
+	u64 now = ctx->timestamp; /* must be called after __update_context_time(); */
+
+	lockdep_assert_held(&ctx->lock);
+
+	if (adv)
+		ctx->guest_time += now - ctx->guest_timestamp;
+	ctx->guest_timestamp = now;
+
+	/*
+	 * The above: time' = time + (now - timestamp), can be re-arranged
+	 * into: time` = now + (time - timestamp), which gives a single value
+	 * offset to compute future time without locks on.
+	 *
+	 * See perf_event_time_now(), which can be used from NMI context where
+	 * it's (obviously) not possible to acquire ctx->lock in order to read
+	 * both the above values in a consistent manner.
+	 */
+	WRITE_ONCE(ctx->guest_timeoffset, ctx->guest_time - ctx->guest_timestamp);
+}
+
 static void update_context_time(struct perf_event_context *ctx)
 {
 	__update_context_time(ctx, true);
+	if (__this_cpu_read(perf_in_guest))
+		__update_context_guest_time(ctx, true);
 }
 
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+	u64 time;
 
 	if (unlikely(!ctx))
 		return 0;
@@ -1516,12 +1551,17 @@ static u64 perf_event_time(struct perf_e
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
-	return ctx->time;
+	time = ctx->time;
+	if (event->attr.exclude_guest)
+		time -= ctx->guest_time;
+
+	return time;
 }
 
 static u64 perf_event_time_now(struct perf_event *event, u64 now)
 {
 	struct perf_event_context *ctx = event->ctx;
+	u64 time, guest_time;
 
 	if (unlikely(!ctx))
 		return 0;
@@ -1529,11 +1569,19 @@ static u64 perf_event_time_now(struct pe
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time_now(event, now);
 
-	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
-		return ctx->time;
+	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) {
+		time = ctx->time;
+		if (event->attr.exclude_guest)
+			time -= ctx->guest_time;
+		return time;
+	}
 
-	now += READ_ONCE(ctx->timeoffset);
-	return now;
+	time = now + READ_ONCE(ctx->timeoffset);
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
+		guest_time = now + READ_ONCE(ctx->guest_timeoffset);
+		time -= guest_time;
+	}
+	return time;
 }
 
 static enum event_type_t get_event_type(struct perf_event *event)
@@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
 	 * would only update time for the pinned events.
 	 */
 	if (is_active & EVENT_TIME) {
+		bool stop;
+
+		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
+		       ctx == &cpuctx->ctx;
+			
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, stop);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3366,8 +3419,12 @@ ctx_sched_out(struct perf_event_context
 		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
 		 */
 		is_active = EVENT_ALL;
-	} else
+		__update_context_guest_time(ctx, false);
+		perf_cgroup_set_guest_timestamp(cpuctx);
+		barrier();
+	} else {
 		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3866,10 +3923,15 @@ static inline void group_update_userpage
 		event_update_userpage(event);
 }
 
+struct merge_sched_data {
+	int can_add_hw;
+	enum event_type_t event_type;
+};
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	int *can_add_hw = data;
+	struct merge_sched_data *msd = data;
 
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -3881,18 +3943,18 @@ static int merge_sched_in(struct perf_ev
 	 * Don't schedule in any exclude_guest events of PMU with
 	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
 	 */
-	if (__this_cpu_read(perf_in_guest) &&
-	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
-	    event->attr.exclude_guest)
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest) &&
+	    (event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
+	    !(msd->event_type & EVENT_GUEST))
 		return 0;
 
-	if (group_can_go_on(event, *can_add_hw)) {
+	if (group_can_go_on(event, msd->can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		*can_add_hw = 0;
+		msd->can_add_hw = 0;
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -3911,11 +3973,15 @@ static int merge_sched_in(struct perf_ev
 
 static void pmu_groups_sched_in(struct perf_event_context *ctx,
 				struct perf_event_groups *groups,
-				struct pmu *pmu)
+				struct pmu *pmu,
+				enum event_type_t event_type)
 {
-	int can_add_hw = 1;
+	struct merge_sched_data msd = {
+		.can_add_hw = 1,
+		.event_type = event_type,
+	};
 	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
-			   merge_sched_in, &can_add_hw);
+			   merge_sched_in, &msd);
 }
 
 static void ctx_groups_sched_in(struct perf_event_context *ctx,
@@ -3927,14 +3993,14 @@ static void ctx_groups_sched_in(struct p
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
 	}
 }
 
 static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
 			       struct pmu *pmu)
 {
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
 }
 
 static void
@@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
 		return;
 
 	if (!(is_active & EVENT_TIME)) {
+		/* EVENT_TIME should be active while the guest runs */
+		WARN_ON_ONCE(event_type & EVENT_GUEST);
 		/* start ctx time */
 		__update_context_time(ctx, false);
 		perf_cgroup_set_timestamp(cpuctx);
@@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
 		 * the exclude_guest events.
 		 */
 		update_context_time(ctx);
-	} else
+		update_cgrp_time_from_cpuctx(cpuctx, false);
+		barrier();
+	} else {
 		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -5832,25 +5903,20 @@ void perf_guest_enter(void)
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
-	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		return;
-	}
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
+		goto unlock;
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
 	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
-	/* Set the guest start time */
-	cpuctx->ctx.timeguest = cpuctx->ctx.time;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
 		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
-		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
 
 	__this_cpu_write(perf_in_guest, true);
-
+unlock:
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
@@ -5862,24 +5928,21 @@ void perf_guest_exit(void)
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
-	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		return;
-	}
-
-	__this_cpu_write(perf_in_guest, false);
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
+		goto unlock;
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
 	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
-	cpuctx->ctx.timeguest = 0;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
 		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
-		cpuctx->task_ctx->timeguest = 0;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
 
+	__this_cpu_write(perf_in_guest, false);
+
+unlock:
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 }
Liang, Kan June 13, 2024, 1:37 p.m. UTC | #8
On 2024-06-13 5:15 a.m., Peter Zijlstra wrote:
> On Wed, Jun 12, 2024 at 09:38:06AM -0400, Liang, Kan wrote:
>> On 2024-06-12 7:17 a.m., Peter Zijlstra wrote:
>>> On Tue, Jun 11, 2024 at 09:27:46AM -0400, Liang, Kan wrote:
>>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>>> index dd4920bf3d1b..68c8b93c4e5c 100644
>>>> --- a/include/linux/perf_event.h
>>>> +++ b/include/linux/perf_event.h
>>>> @@ -945,6 +945,7 @@ struct perf_event_context {
>>>>  	u64				time;
>>>>  	u64				timestamp;
>>>>  	u64				timeoffset;
>>>> +	u64				timeguest;
>>>>
>>>>  	/*
>>>>  	 * These fields let us detect when two contexts have both
>>>
>>>> @@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64
>>>> now, u64 *enabled, u64 *runnin
>>>>
>>>>  static void perf_event_update_time(struct perf_event *event)
>>>>  {
>>>> -	u64 now = perf_event_time(event);
>>>> +	u64 now;
>>>> +
>>>> +	/* Never count the time of an active guest into an exclude_guest event. */
>>>> +	if (event->ctx->timeguest &&
>>>> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
>>>> +		/*
>>>> +		 * If a guest is running, use the timestamp while entering the guest.
>>>> +		 * If the guest is leaving, reset the event timestamp.
>>>> +		 */
>>>> +		if (__this_cpu_read(perf_in_guest))
>>>> +			event->tstamp = event->ctx->timeguest;
>>>> +		else
>>>> +			event->tstamp = event->ctx->time;
>>>> +		return;
>>>> +	}
>>>>
>>>> +	now = perf_event_time(event);
>>>>  	__perf_update_times(event, now, &event->total_time_enabled,
>>>>  					&event->total_time_running);
>>>> +
>>>>  	event->tstamp = now;
>>>>  }
>>>
>>> So I really don't like this much, 
>>
>> An alternative way I can imagine may maintain a dedicated timeline for
>> the PASSTHROUGH PMUs. For that, we probably need two new timelines for
>> the normal events and the cgroup events. That sounds too complex.
> 
> I'm afraid we might have to. Specifically, the below:
> 
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 019c237dd456..6c46699c6752 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -665,7 +665,7 @@ static void perf_event_update_time(struct perf_event
>> *event)
>>  		if (__this_cpu_read(perf_in_guest))
>>  			event->tstamp = event->ctx->timeguest;
>>  		else
>> -			event->tstamp = event->ctx->time;
>> +			event->tstamp = perf_event_time(event);
>>  		return;
>>  	}
> 
> is still broken in that it (ab)uses event state to track time, and this
> goes sideways in case of event overcommit, because then
> ctx_sched_{out,in}() will not visit all events.
> 
> We've ran into that before. Time-keeping really should be per context or
> we'll get a ton of pain.
> 
> I've ended up with the (uncompiled) below. Yes, it is unfortunate, but
> aside from a few cleanups (we could introduce a struct time_ctx { u64
> time, stamp, offset }; and fold a bunch of code, this is more or less
> the best we can do I'm afraid.

Sure. I will try the code below and implement the cleanup patch as well.

Thanks,
Kan

> 
> ---
> 
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -947,7 +947,9 @@ struct perf_event_context {
>  	u64				time;
>  	u64				timestamp;
>  	u64				timeoffset;
> -	u64				timeguest;
> +	u64				guest_time;
> +	u64				guest_timestamp;
> +	u64				guest_timeoffset;
>  
>  	/*
>  	 * These fields let us detect when two contexts have both
> @@ -1043,6 +1045,9 @@ struct perf_cgroup_info {
>  	u64				time;
>  	u64				timestamp;
>  	u64				timeoffset;
> +	u64				guest_time;
> +	u64				guest_timestamp;
> +	u64				guest_timeoffset;
>  	int				active;
>  };
>  
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -638,26 +638,9 @@ __perf_update_times(struct perf_event *e
>  
>  static void perf_event_update_time(struct perf_event *event)
>  {
> -	u64 now;
> -
> -	/* Never count the time of an active guest into an exclude_guest event. */
> -	if (event->ctx->timeguest &&
> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
> -		/*
> -		 * If a guest is running, use the timestamp while entering the guest.
> -		 * If the guest is leaving, reset the event timestamp.
> -		 */
> -		if (__this_cpu_read(perf_in_guest))
> -			event->tstamp = event->ctx->timeguest;
> -		else
> -			event->tstamp = event->ctx->time;
> -		return;
> -	}
> -
> -	now = perf_event_time(event);
> +	u64 now = perf_event_time(event);
>  	__perf_update_times(event, now, &event->total_time_enabled,
>  					&event->total_time_running);
> -
>  	event->tstamp = now;
>  }
>  
> @@ -780,19 +763,33 @@ static inline int is_cgroup_event(struct
>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>  {
>  	struct perf_cgroup_info *t;
> +	u64 time;
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
> -	return t->time;
> +	time = t->time;
> +	if (event->attr.exclude_guest)
> +		time -= t->guest_time;
> +	return time;
>  }
>  
>  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
>  {
>  	struct perf_cgroup_info *t;
> +	u64 time, guest_time;
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
> -	if (!__load_acquire(&t->active))
> -		return t->time;
> -	now += READ_ONCE(t->timeoffset);
> +	if (!__load_acquire(&t->active)) {
> +		time = t->time;
> +		if (event->attr.exclude_guest)
> +			time -= t->guest_time;
> +		return time;
> +	}
> +
> +	time = now + READ_ONCE(t->timeoffset);
> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
> +		guest_time = now + READ_ONCE(t->guest_offset);
> +		time -= guest_time;
> +	}
>  	return now;
>  }
>  
> @@ -807,6 +804,17 @@ static inline void __update_cgrp_time(st
>  	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
>  }
>  
> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
> +{
> +	if (adv)
> +		info->guest_time += now - info->guest_timestamp;
> +	info->guest_timestamp = now;
> +	/*
> +	 * see update_context_time()
> +	 */
> +	WRITE_ONCE(info->guest_timeoffset, info->guest_time - info->guest_timestamp);
> +}
> +
>  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
>  {
>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
> @@ -821,6 +829,8 @@ static inline void update_cgrp_time_from
>  			info = this_cpu_ptr(cgrp->info);
>  
>  			__update_cgrp_time(info, now, true);
> +			if (__this_cpu_read(perf_in_guest))
> +				__update_cgrp_guest_time(info, now, true);
>  			if (final)
>  				__store_release(&info->active, 0);
>  		}
> @@ -1501,14 +1511,39 @@ static void __update_context_time(struct
>  	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
>  }
>  
> +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
> +{
> +	u64 now = ctx->timestamp; /* must be called after __update_context_time(); */
> +
> +	lockdep_assert_held(&ctx->lock);
> +
> +	if (adv)
> +		ctx->guest_time += now - ctx->guest_timestamp;
> +	ctx->guest_timestamp = now;
> +
> +	/*
> +	 * The above: time' = time + (now - timestamp), can be re-arranged
> +	 * into: time` = now + (time - timestamp), which gives a single value
> +	 * offset to compute future time without locks on.
> +	 *
> +	 * See perf_event_time_now(), which can be used from NMI context where
> +	 * it's (obviously) not possible to acquire ctx->lock in order to read
> +	 * both the above values in a consistent manner.
> +	 */
> +	WRITE_ONCE(ctx->guest_timeoffset, ctx->guest_time - ctx->guest_timestamp);
> +}
> +
>  static void update_context_time(struct perf_event_context *ctx)
>  {
>  	__update_context_time(ctx, true);
> +	if (__this_cpu_read(perf_in_guest))
> +		__update_context_guest_time(ctx, true);
>  }
>  
>  static u64 perf_event_time(struct perf_event *event)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> +	u64 time;
>  
>  	if (unlikely(!ctx))
>  		return 0;
> @@ -1516,12 +1551,17 @@ static u64 perf_event_time(struct perf_e
>  	if (is_cgroup_event(event))
>  		return perf_cgroup_event_time(event);
>  
> -	return ctx->time;
> +	time = ctx->time;
> +	if (event->attr.exclude_guest)
> +		time -= ctx->guest_time;
> +
> +	return time;
>  }
>  
>  static u64 perf_event_time_now(struct perf_event *event, u64 now)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> +	u64 time, guest_time;
>  
>  	if (unlikely(!ctx))
>  		return 0;
> @@ -1529,11 +1569,19 @@ static u64 perf_event_time_now(struct pe
>  	if (is_cgroup_event(event))
>  		return perf_cgroup_event_time_now(event, now);
>  
> -	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
> -		return ctx->time;
> +	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) {
> +		time = ctx->time;
> +		if (event->attr.exclude_guest)
> +			time -= ctx->guest_time;
> +		return time;
> +	}
>  
> -	now += READ_ONCE(ctx->timeoffset);
> -	return now;
> +	time = now + READ_ONCE(ctx->timeoffset);
> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
> +		guest_time = now + READ_ONCE(ctx->guest_timeoffset);
> +		time -= guest_time;
> +	}
> +	return time;
>  }
>  
>  static enum event_type_t get_event_type(struct perf_event *event)
> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
>  	 * would only update time for the pinned events.
>  	 */
>  	if (is_active & EVENT_TIME) {
> +		bool stop;
> +
> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
> +		       ctx == &cpuctx->ctx;
> +			
>  		/* update (and stop) ctx time */
>  		update_context_time(ctx);
> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> +		update_cgrp_time_from_cpuctx(cpuctx, stop);
>  		/*
>  		 * CPU-release for the below ->is_active store,
>  		 * see __load_acquire() in perf_event_time_now()
> @@ -3366,8 +3419,12 @@ ctx_sched_out(struct perf_event_context
>  		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
>  		 */
>  		is_active = EVENT_ALL;
> -	} else
> +		__update_context_guest_time(ctx, false);
> +		perf_cgroup_set_guest_timestamp(cpuctx);
> +		barrier();
> +	} else {
>  		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
> @@ -3866,10 +3923,15 @@ static inline void group_update_userpage
>  		event_update_userpage(event);
>  }
>  
> +struct merge_sched_data {
> +	int can_add_hw;
> +	enum event_type_t event_type;
> +};
> +
>  static int merge_sched_in(struct perf_event *event, void *data)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> -	int *can_add_hw = data;
> +	struct merge_sched_data *msd = data;
>  
>  	if (event->state <= PERF_EVENT_STATE_OFF)
>  		return 0;
> @@ -3881,18 +3943,18 @@ static int merge_sched_in(struct perf_ev
>  	 * Don't schedule in any exclude_guest events of PMU with
>  	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
>  	 */
> -	if (__this_cpu_read(perf_in_guest) &&
> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
> -	    event->attr.exclude_guest)
> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest) &&
> +	    (event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
> +	    !(msd->event_type & EVENT_GUEST))
>  		return 0;
>  
> -	if (group_can_go_on(event, *can_add_hw)) {
> +	if (group_can_go_on(event, msd->can_add_hw)) {
>  		if (!group_sched_in(event, ctx))
>  			list_add_tail(&event->active_list, get_event_list(event));
>  	}
>  
>  	if (event->state == PERF_EVENT_STATE_INACTIVE) {
> -		*can_add_hw = 0;
> +		msd->can_add_hw = 0;
>  		if (event->attr.pinned) {
>  			perf_cgroup_event_disable(event, ctx);
>  			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> @@ -3911,11 +3973,15 @@ static int merge_sched_in(struct perf_ev
>  
>  static void pmu_groups_sched_in(struct perf_event_context *ctx,
>  				struct perf_event_groups *groups,
> -				struct pmu *pmu)
> +				struct pmu *pmu,
> +				enum even_type_t event_type)
>  {
> -	int can_add_hw = 1;
> +	struct merge_sched_data msd = {
> +		.can_add_hw = 1,
> +		.event_type = event_type,
> +	};
>  	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
> -			   merge_sched_in, &can_add_hw);
> +			   merge_sched_in, &msd);
>  }
>  
>  static void ctx_groups_sched_in(struct perf_event_context *ctx,
> @@ -3927,14 +3993,14 @@ static void ctx_groups_sched_in(struct p
>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>  			continue;
> -		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
> +		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
>  	}
>  }
>  
>  static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
>  			       struct pmu *pmu)
>  {
> -	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
>  }
>  
>  static void
> @@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
>  		return;
>  
>  	if (!(is_active & EVENT_TIME)) {
> +		/* EVENT_TIME should be active while the guest runs */
> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>  		/* start ctx time */
>  		__update_context_time(ctx, false);
>  		perf_cgroup_set_timestamp(cpuctx);
> @@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
>  		 * the exclude_guest events.
>  		 */
>  		update_context_time(ctx);
> -	} else
> +		update_cgrp_time_from_cpuctx(cpuctx, false);
> +		barrier();
> +	} else {
>  		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	/*
>  	 * First go through the list and put on any pinned groups
> @@ -5832,25 +5903,20 @@ void perf_guest_enter(void)
>  
>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>  
> -	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> -		return;
> -	}
> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
> +		goto unlock;
>  
>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>  	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
> -	/* Set the guest start time */
> -	cpuctx->ctx.timeguest = cpuctx->ctx.time;
>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>  	if (cpuctx->task_ctx) {
>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>  		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
> -		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>  	}
>  
>  	__this_cpu_write(perf_in_guest, true);
> -
> +unlock:
>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>  }
>  
> @@ -5862,24 +5928,21 @@ void perf_guest_exit(void)
>  
>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>  
> -	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> -		return;
> -	}
> -
> -	__this_cpu_write(perf_in_guest, false);
> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
> +		goto unlock;
>  
>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>  	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
> -	cpuctx->ctx.timeguest = 0;
>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>  	if (cpuctx->task_ctx) {
>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>  		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
> -		cpuctx->task_ctx->timeguest = 0;
>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>  	}
>  
> +	__this_cpu_write(perf_in_guest, false);
> +
> +unlock:
>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>  }
>
Liang, Kan June 13, 2024, 6:04 p.m. UTC | #9
On 2024-06-13 9:37 a.m., Liang, Kan wrote:
>> ---
>>
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -947,7 +947,9 @@ struct perf_event_context {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> -	u64				timeguest;
>> +	u64				guest_time;
>> +	u64				guest_timestamp;
>> +	u64				guest_timeoffset;
>>  
>>  	/*
>>  	 * These fields let us detect when two contexts have both
>> @@ -1043,6 +1045,9 @@ struct perf_cgroup_info {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> +	u64				guest_time;
>> +	u64				guest_timestamp;
>> +	u64				guest_timeoffset;
>>  	int				active;
>>  };
>>  
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -638,26 +638,9 @@ __perf_update_times(struct perf_event *e
>>  
>>  static void perf_event_update_time(struct perf_event *event)
>>  {
>> -	u64 now;
>> -
>> -	/* Never count the time of an active guest into an exclude_guest event. */
>> -	if (event->ctx->timeguest &&
>> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
>> -		/*
>> -		 * If a guest is running, use the timestamp while entering the guest.
>> -		 * If the guest is leaving, reset the event timestamp.
>> -		 */
>> -		if (__this_cpu_read(perf_in_guest))
>> -			event->tstamp = event->ctx->timeguest;
>> -		else
>> -			event->tstamp = event->ctx->time;
>> -		return;
>> -	}
>> -
>> -	now = perf_event_time(event);
>> +	u64 now = perf_event_time(event);
>>  	__perf_update_times(event, now, &event->total_time_enabled,
>>  					&event->total_time_running);
>> -
>>  	event->tstamp = now;
>>  }
>>  
>> @@ -780,19 +763,33 @@ static inline int is_cgroup_event(struct
>>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>>  {
>>  	struct perf_cgroup_info *t;
>> +	u64 time;
>>  
>>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>> -	return t->time;
>> +	time = t->time;
>> +	if (event->attr.exclude_guest)
>> +		time -= t->guest_time;
>> +	return time;
>>  }
>>  
>>  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
>>  {
>>  	struct perf_cgroup_info *t;
>> +	u64 time, guest_time;
>>  
>>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>> -	if (!__load_acquire(&t->active))
>> -		return t->time;
>> -	now += READ_ONCE(t->timeoffset);
>> +	if (!__load_acquire(&t->active)) {
>> +		time = t->time;
>> +		if (event->attr.exclude_guest)
>> +			time -= t->guest_time;
>> +		return time;
>> +	}
>> +
>> +	time = now + READ_ONCE(t->timeoffset);
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
>> +		guest_time = now + READ_ONCE(t->guest_offset);
>> +		time -= guest_time;
>> +	}
>>  	return now;
>>  }
>>  
>> @@ -807,6 +804,17 @@ static inline void __update_cgrp_time(st
>>  	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
>>  }
>>  
>> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
>> +{
>> +	if (adv)
>> +		info->guest_time += now - info->guest_timestamp;
>> +	info->guest_timestamp = now;
>> +	/*
>> +	 * see update_context_time()
>> +	 */
>> +	WRITE_ONCE(info->guest_timeoffset, info->guest_time - info->guest_timestamp);
>> +}
>> +
>>  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
>>  {
>>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
>> @@ -821,6 +829,8 @@ static inline void update_cgrp_time_from
>>  			info = this_cpu_ptr(cgrp->info);
>>  
>>  			__update_cgrp_time(info, now, true);
>> +			if (__this_cpu_read(perf_in_guest))
>> +				__update_cgrp_guest_time(info, now, true);
>>  			if (final)
>>  				__store_release(&info->active, 0);
>>  		}
>> @@ -1501,14 +1511,39 @@ static void __update_context_time(struct
>>  	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
>>  }
>>  
>> +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
>> +{
>> +	u64 now = ctx->timestamp; /* must be called after __update_context_time(); */
>> +
>> +	lockdep_assert_held(&ctx->lock);
>> +
>> +	if (adv)
>> +		ctx->guest_time += now - ctx->guest_timestamp;
>> +	ctx->guest_timestamp = now;
>> +
>> +	/*
>> +	 * The above: time' = time + (now - timestamp), can be re-arranged
>> +	 * into: time` = now + (time - timestamp), which gives a single value
>> +	 * offset to compute future time without locks on.
>> +	 *
>> +	 * See perf_event_time_now(), which can be used from NMI context where
>> +	 * it's (obviously) not possible to acquire ctx->lock in order to read
>> +	 * both the above values in a consistent manner.
>> +	 */
>> +	WRITE_ONCE(ctx->guest_timeoffset, ctx->guest_time - ctx->guest_timestamp);
>> +}
>> +
>>  static void update_context_time(struct perf_event_context *ctx)
>>  {
>>  	__update_context_time(ctx, true);
>> +	if (__this_cpu_read(perf_in_guest))
>> +		__update_context_guest_time(ctx, true);
>>  }
>>  
>>  static u64 perf_event_time(struct perf_event *event)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> +	u64 time;
>>  
>>  	if (unlikely(!ctx))
>>  		return 0;
>> @@ -1516,12 +1551,17 @@ static u64 perf_event_time(struct perf_e
>>  	if (is_cgroup_event(event))
>>  		return perf_cgroup_event_time(event);
>>  
>> -	return ctx->time;
>> +	time = ctx->time;
>> +	if (event->attr.exclude_guest)
>> +		time -= ctx->guest_time;
>> +
>> +	return time;
>>  }
>>  
>>  static u64 perf_event_time_now(struct perf_event *event, u64 now)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> +	u64 time, guest_time;
>>  
>>  	if (unlikely(!ctx))
>>  		return 0;
>> @@ -1529,11 +1569,19 @@ static u64 perf_event_time_now(struct pe
>>  	if (is_cgroup_event(event))
>>  		return perf_cgroup_event_time_now(event, now);
>>  
>> -	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
>> -		return ctx->time;
>> +	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) {
>> +		time = ctx->time;
>> +		if (event->attr.exclude_guest)
>> +			time -= ctx->guest_time;
>> +		return time;
>> +	}
>>  
>> -	now += READ_ONCE(ctx->timeoffset);
>> -	return now;
>> +	time = now + READ_ONCE(ctx->timeoffset);
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
>> +		guest_time = now + READ_ONCE(ctx->guest_timeoffset);
>> +		time -= guest_time;
>> +	}
>> +	return time;
>>  }
>>  
>>  static enum event_type_t get_event_type(struct perf_event *event)
>> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
>>  	 * would only update time for the pinned events.
>>  	 */
>>  	if (is_active & EVENT_TIME) {
>> +		bool stop;
>> +
>> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
>> +		       ctx == &cpuctx->ctx;
>> +			
>>  		/* update (and stop) ctx time */
>>  		update_context_time(ctx);
>> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
>> +		update_cgrp_time_from_cpuctx(cpuctx, stop);

For event_type == EVENT_GUEST, "stop" should always evaluate to the same
value as "ctx == &cpuctx->ctx", because ctx->is_active never has the
EVENT_GUEST bit set.
Why is the "stop" variable introduced?


>>  		/*
>>  		 * CPU-release for the below ->is_active store,
>>  		 * see __load_acquire() in perf_event_time_now()
>> @@ -3366,8 +3419,12 @@ ctx_sched_out(struct perf_event_context
>>  		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
>>  		 */
>>  		is_active = EVENT_ALL;
>> -	} else
>> +		__update_context_guest_time(ctx, false);
>> +		perf_cgroup_set_guest_timestamp(cpuctx);
>> +		barrier();
>> +	} else {
>>  		is_active ^= ctx->is_active; /* changed bits */
>> +	}
>>  
>>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>> @@ -3866,10 +3923,15 @@ static inline void group_update_userpage
>>  		event_update_userpage(event);
>>  }
>>  
>> +struct merge_sched_data {
>> +	int can_add_hw;
>> +	enum event_type_t event_type;
>> +};
>> +
>>  static int merge_sched_in(struct perf_event *event, void *data)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> -	int *can_add_hw = data;
>> +	struct merge_sched_data *msd = data;
>>  
>>  	if (event->state <= PERF_EVENT_STATE_OFF)
>>  		return 0;
>> @@ -3881,18 +3943,18 @@ static int merge_sched_in(struct perf_ev
>>  	 * Don't schedule in any exclude_guest events of PMU with
>>  	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
>>  	 */
>> -	if (__this_cpu_read(perf_in_guest) &&
>> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
>> -	    event->attr.exclude_guest)
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest) &&
>> +	    (event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
>> +	    !(msd->event_type & EVENT_GUEST))
>>  		return 0;
>>  
>> -	if (group_can_go_on(event, *can_add_hw)) {
>> +	if (group_can_go_on(event, msd->can_add_hw)) {
>>  		if (!group_sched_in(event, ctx))
>>  			list_add_tail(&event->active_list, get_event_list(event));
>>  	}
>>  
>>  	if (event->state == PERF_EVENT_STATE_INACTIVE) {
>> -		*can_add_hw = 0;
>> +		msd->can_add_hw = 0;
>>  		if (event->attr.pinned) {
>>  			perf_cgroup_event_disable(event, ctx);
>>  			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
>> @@ -3911,11 +3973,15 @@ static int merge_sched_in(struct perf_ev
>>  
>>  static void pmu_groups_sched_in(struct perf_event_context *ctx,
>>  				struct perf_event_groups *groups,
>> -				struct pmu *pmu)
>> +				struct pmu *pmu,
>> +				enum even_type_t event_type)
>>  {
>> -	int can_add_hw = 1;
>> +	struct merge_sched_data msd = {
>> +		.can_add_hw = 1,
>> +		.event_type = event_type,
>> +	};
>>  	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
>> -			   merge_sched_in, &can_add_hw);
>> +			   merge_sched_in, &msd);
>>  }
>>  
>>  static void ctx_groups_sched_in(struct perf_event_context *ctx,
>> @@ -3927,14 +3993,14 @@ static void ctx_groups_sched_in(struct p
>>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>>  			continue;
>> -		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
>> +		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
>>  	}
>>  }
>>  
>>  static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
>>  			       struct pmu *pmu)
>>  {
>> -	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
>>  }
>>  
>>  static void
>> @@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
>>  		return;
>>  
>>  	if (!(is_active & EVENT_TIME)) {
>> +		/* EVENT_TIME should be active while the guest runs */
>> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>>  		/* start ctx time */
>>  		__update_context_time(ctx, false);
>>  		perf_cgroup_set_timestamp(cpuctx);
>> @@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
>>  		 * the exclude_guest events.
>>  		 */
>>  		update_context_time(ctx);
>> -	} else
>> +		update_cgrp_time_from_cpuctx(cpuctx, false);


In the ctx_sched_out() above, the cgrp_time is stopped and the cgrp has
been set to inactive.
I think we need a perf_cgroup_set_timestamp(cpuctx) here to restart the
cgrp_time, right?

Also, I think the cgrp_time is different from the normal ctx->time. When
a guest is running, there should be no cgroup active, so it's OK to
disable the cgrp_time. If so, I don't think we need to track the
guest_time for the cgrp.

Thanks,
Kan

>> +		barrier();
>> +	} else {
>>  		is_active ^= ctx->is_active; /* changed bits */
>> +	}
>>  
>>  	/*
>>  	 * First go through the list and put on any pinned groups
>> @@ -5832,25 +5903,20 @@ void perf_guest_enter(void)
>>  
>>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>  
>> -	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
>> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> -		return;
>> -	}
>> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
>> +		goto unlock;
>>  
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>  	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
>> -	/* Set the guest start time */
>> -	cpuctx->ctx.timeguest = cpuctx->ctx.time;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>  		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
>> -		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
>>  
>>  	__this_cpu_write(perf_in_guest, true);
>> -
>> +unlock:
>>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>  }
>>  
>> @@ -5862,24 +5928,21 @@ void perf_guest_exit(void)
>>  
>>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>  
>> -	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
>> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> -		return;
>> -	}
>> -
>> -	__this_cpu_write(perf_in_guest, false);
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
>> +		goto unlock;
>>  
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>  	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
>> -	cpuctx->ctx.timeguest = 0;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>  		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
>> -		cpuctx->task_ctx->timeguest = 0;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
>>  
>> +	__this_cpu_write(perf_in_guest, false);
>> +
>> +unlock:
>>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>  }
>>
Peter Zijlstra June 17, 2024, 7:51 a.m. UTC | #10
On Thu, Jun 13, 2024 at 02:04:36PM -0400, Liang, Kan wrote:
> >>  static enum event_type_t get_event_type(struct perf_event *event)
> >> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
> >>  	 * would only update time for the pinned events.
> >>  	 */
> >>  	if (is_active & EVENT_TIME) {
> >> +		bool stop;
> >> +
> >> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
> >> +		       ctx == &cpuctx->ctx;
> >> +			
> >>  		/* update (and stop) ctx time */
> >>  		update_context_time(ctx);
> >> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> >> +		update_cgrp_time_from_cpuctx(cpuctx, stop);
> 
> For the event_type == EVENT_GUEST, the "stop" should always be the same
> as "ctx == &cpuctx->ctx". Because the ctx->is_active never set the
> EVENT_GUEST bit.
> Why the stop is introduced?

Because the ctx_sched_out() for vPMU should not stop time, only the
'normal' sched-out should stop time.


> >> @@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
> >>  		return;
> >>  
> >>  	if (!(is_active & EVENT_TIME)) {
> >> +		/* EVENT_TIME should be active while the guest runs */
> >> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
> >>  		/* start ctx time */
> >>  		__update_context_time(ctx, false);
> >>  		perf_cgroup_set_timestamp(cpuctx);
> >> @@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
> >>  		 * the exclude_guest events.
> >>  		 */
> >>  		update_context_time(ctx);
> >> -	} else
> >> +		update_cgrp_time_from_cpuctx(cpuctx, false);
> 
> 
> In the above ctx_sched_out(), the cgrp_time is stopped and the cgrp has
> been set to inactive.
> I think we need a perf_cgroup_set_timestamp(cpuctx) here to restart the
> cgrp_time, Right?

So the idea was to not stop time when we schedule out for the vPMU, as
per the above.

> Also, I think the cgrp_time is different from the normal ctx->time. When
> a guest is running, there must be no cgroup. It's OK to disable the
> cgrp_time. If so, I don't think we need to track the guest_time for the
> cgrp.

Uh, the vCPU thread is (or can be) part of a cgroup, and different guests
can be part of different cgroups. The CPU-wide 'guest' time is all time
spent in guests, but the cgroup view of things might differ, depending on
how the guests are arranged in cgroups, no?

As such, we need per-cgroup guest tracking.
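
For concreteness, a worked example of what per-cgroup tracking implies
(illustrative numbers only, using the guest_time fields from the sketch
above): suppose the vCPU threads of guest A live in cgroup cgA, those of
guest B in cgroup cgB, and on one CPU guest A runs for 3ms while guest B
runs for 5ms.

	CPU-wide:    ctx->guest_time                  = 3ms + 5ms = 8ms
	cgroup cgA:  cgA's perf_cgroup_info guest_time = 3ms
	cgroup cgB:  cgB's perf_cgroup_info guest_time = 5ms

An exclude_guest cgroup event in cgA should have only 3ms subtracted from
its time, not the CPU-wide 8ms, which is why perf_cgroup_info carries its
own guest_time/guest_timestamp/guest_timeoffset.
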
Liang, Kan June 17, 2024, 1:34 p.m. UTC | #11
On 2024-06-17 3:51 a.m., Peter Zijlstra wrote:
> On Thu, Jun 13, 2024 at 02:04:36PM -0400, Liang, Kan wrote:
>>>>  static enum event_type_t get_event_type(struct perf_event *event)
>>>> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
>>>>  	 * would only update time for the pinned events.
>>>>  	 */
>>>>  	if (is_active & EVENT_TIME) {
>>>> +		bool stop;
>>>> +
>>>> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
>>>> +		       ctx == &cpuctx->ctx;
>>>> +			
>>>>  		/* update (and stop) ctx time */
>>>>  		update_context_time(ctx);
>>>> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
>>>> +		update_cgrp_time_from_cpuctx(cpuctx, stop);
>>
>> For the event_type == EVENT_GUEST, the "stop" should always be the same
>> as "ctx == &cpuctx->ctx". Because the ctx->is_active never set the
>> EVENT_GUEST bit.
>> Why the stop is introduced?
> 
> Because the ctx_sched_out() for vPMU should not stop time, 

But the implementation does seem to stop the time.

The ctx->is_active should be (EVENT_ALL | EVENT_TIME) in most cases.

When a vPMU is being scheduled in (which invokes ctx_sched_out()), the
event_type is only EVENT_GUEST.

!((ctx->is_active & event_type) & EVENT_ALL) should be TRUE.

For a CPU context, ctx == &cpuctx->ctx is TRUE as well.

The update_cgrp_time_from_cpuctx(cpuctx, TRUE) then stops the time by
deactivating the cgroup: __store_release(&info->active, 0).

If a user tries to read a cgroup event while a guest is running,
update_cgrp_time_from_event() doesn't update the cgrp time, so both the
time and the counter appear stopped.
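
Spelled out with those values (per the hunk above):

	event_type     = EVENT_GUEST
	ctx->is_active = EVENT_ALL | EVENT_TIME   /* EVENT_GUEST never set */

	(ctx->is_active & event_type) = 0
	!((0) & EVENT_ALL)            = true
	ctx == &cpuctx->ctx           = true   (CPU context)
	=> stop == true, so the cgroup clock is released on guest entry.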

> only the
> 'normal' sched-out should stop time.

If the guest is the only case for which we want to keep the time
running, I think we can use a more straightforward check, as below.

	stop = !(event_type & EVENT_GUEST) && ctx == &cpuctx->ctx;
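
Applied to the hunk above, the time update would then read (sketch only,
not a posted patch):

	if (is_active & EVENT_TIME) {
		/* guest entry/exit must not release the clock */
		bool stop = !(event_type & EVENT_GUEST) &&
			    ctx == &cpuctx->ctx;

		/* update (and possibly stop) ctx time */
		update_context_time(ctx);
		update_cgrp_time_from_cpuctx(cpuctx, stop);
		...
	}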

> 
> 
>>>> @@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
>>>>  		return;
>>>>  
>>>>  	if (!(is_active & EVENT_TIME)) {
>>>> +		/* EVENT_TIME should be active while the guest runs */
>>>> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>>>>  		/* start ctx time */
>>>>  		__update_context_time(ctx, false);
>>>>  		perf_cgroup_set_timestamp(cpuctx);
>>>> @@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
>>>>  		 * the exclude_guest events.
>>>>  		 */
>>>>  		update_context_time(ctx);
>>>> -	} else
>>>> +		update_cgrp_time_from_cpuctx(cpuctx, false);
>>
>>
>> In the above ctx_sched_out(), the cgrp_time is stopped and the cgrp has
>> been set to inactive.
>> I think we need a perf_cgroup_set_timestamp(cpuctx) here to restart the
>> cgrp_time, Right?
> 
> So the idea was to not stop time when we schedule out for the vPMU, as
> per the above.
> 
>> Also, I think the cgrp_time is different from the normal ctx->time. When
>> a guest is running, there must be no cgroup. It's OK to disable the
>> cgrp_time. If so, I don't think we need to track the guest_time for the
>> cgrp.
> 
> Uh, the vCPU thread is/can-be part of a cgroup, and different guests
> part of different cgroups. The CPU wide 'guest' time is all time spend
> in guets, but the cgroup view of things might differ, depending on how
> the guets are arranged in cgroups, no?
> 
> As such, we need per cgroup guest tracking.

Got it.

Thanks,
Kan
Peter Zijlstra June 17, 2024, 3 p.m. UTC | #12
On Mon, Jun 17, 2024 at 09:34:15AM -0400, Liang, Kan wrote:
> 
> 
> On 2024-06-17 3:51 a.m., Peter Zijlstra wrote:
> > On Thu, Jun 13, 2024 at 02:04:36PM -0400, Liang, Kan wrote:
> >>>>  static enum event_type_t get_event_type(struct perf_event *event)
> >>>> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
> >>>>  	 * would only update time for the pinned events.
> >>>>  	 */
> >>>>  	if (is_active & EVENT_TIME) {
> >>>> +		bool stop;
> >>>> +
> >>>> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
> >>>> +		       ctx == &cpuctx->ctx;
> >>>> +			
> >>>>  		/* update (and stop) ctx time */
> >>>>  		update_context_time(ctx);
> >>>> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> >>>> +		update_cgrp_time_from_cpuctx(cpuctx, stop);
> >>
> >> For the event_type == EVENT_GUEST, the "stop" should always be the same
> >> as "ctx == &cpuctx->ctx". Because the ctx->is_active never set the
> >> EVENT_GUEST bit.
> >> Why the stop is introduced?
> > 
> > Because the ctx_sched_out() for vPMU should not stop time, 
> 
> But the implementation seems stop the time.
> 
> The ctx->is_active should be (EVENT_ALL | EVENT_TIME) for most of cases.
> 
> When a vPMU is scheduling in (invoke ctx_sched_out()), the event_type
> should only be EVENT_GUEST.
> 
> !((ctx->is_active & event_type) & EVENT_ALL) should be TRUE.

Hmm.. yeah, I think I might've gotten that wrong.

> > only the
> > 'normal' sched-out should stop time.
> 
> If the guest is the only case which we want to keep the time for, I
> think we may use a straightforward check as below.
> 
> 	stop = !(event_type & EVENT_GUEST) && ctx == &cpuctx->ctx;

So I think I was trying to get "stop" to be true when there weren't in
fact any events on, that is, when we got here without any EVENT_ALL bits
set, but perhaps that case isn't relevant.
Liang, Kan June 17, 2024, 3:45 p.m. UTC | #13
On 2024-06-17 11:00 a.m., Peter Zijlstra wrote:
>>> only the
>>> 'normal' sched-out should stop time.
>> If the guest is the only case which we want to keep the time for, I
>> think we may use a straightforward check as below.
>>
>> 	stop = !(event_type & EVENT_GUEST) && ctx == &cpuctx->ctx;
> So I think I was trying to get stop true when there weren't in fact
> events on, that is when we got in without EVENT_ALL bits, but perhaps
> that case isn't relevant.

It should be irrelevant. Here I think we just need to make sure that the
guest is never the reason the time is stopped while the time is active;
nothing changes for the other cases.
The __this_cpu_read(perf_in_guest) check guarantees that the guest time
and the host time stay in sync whenever an update occurs in guest mode.
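
In short (a sketch, using the guest_time fields from the earlier proposal
in this thread):

	/* update_context_time(), while perf_in_guest is set: */
	ctx->time       += now - ctx->timestamp;
	ctx->guest_time += now - ctx->guest_timestamp;

	/* what an exclude_guest event then reads, see perf_event_time(): */
	time = ctx->time - ctx->guest_time;

so an update taken in guest mode advances both clocks together and leaves
the host-only delta unchanged.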

Thanks,
Kan
diff mbox series

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dd4920bf3d1b..acf16676401a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1734,6 +1734,8 @@  extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 extern int perf_get_mediated_pmu(void);
 extern void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1826,6 +1828,8 @@  static inline int perf_get_mediated_pmu(void)
 }
 
 static inline void perf_put_mediated_pmu(void)			{ }
+static inline void perf_guest_enter(void)			{ }
+static inline void perf_guest_exit(void)			{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 701b622c670e..4c6daf5cc923 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -406,6 +406,7 @@  static atomic_t nr_include_guest_events __read_mostly;
 
 static refcount_t nr_mediated_pmu_vms = REFCOUNT_INIT(0);
 static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+static DEFINE_PER_CPU(bool, perf_in_guest);
 
 /* !exclude_guest system wide event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
 static inline bool is_include_guest_event(struct perf_event *event)
@@ -3854,6 +3855,15 @@  static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
+	/*
+	 * Don't schedule in any exclude_guest events of PMU with
+	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
+	 */
+	if (__this_cpu_read(perf_in_guest) &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
+	    event->attr.exclude_guest)
+		return 0;
+
 	if (group_can_go_on(event, *can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
@@ -5791,6 +5801,100 @@  void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
 
+static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		struct perf_event *event, *tmp;
+		struct pmu *pmu = pmu_ctx->pmu;
+
+		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+			continue;
+
+		perf_pmu_disable(pmu);
+
+		/*
+		 * All active events must be exclude_guest events.
+		 * See perf_get_mediated_pmu().
+		 * Unconditionally remove all active events.
+		 */
+		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
+			group_sched_out(event, pmu_ctx->ctx);
+
+		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
+			group_sched_out(event, pmu_ctx->ctx);
+
+		pmu_ctx->rotate_necessary = 0;
+
+		perf_pmu_enable(pmu);
+	}
+}
+
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	perf_sched_out_exclude_guest(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_sched_out_exclude_guest(cpuctx->task_ctx);
+
+	__this_cpu_write(perf_in_guest, true);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
+static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	update_context_time(ctx);
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		struct pmu *pmu = pmu_ctx->pmu;
+
+		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+			continue;
+
+		perf_pmu_disable(pmu);
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+		perf_pmu_enable(pmu);
+	}
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	__this_cpu_write(perf_in_guest, false);
+
+	perf_sched_in_exclude_guest(&cpuctx->ctx);
+	if (cpuctx->task_ctx)
+		perf_sched_in_exclude_guest(cpuctx->task_ctx);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block
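
For illustration, a minimal sketch (not part of this series) of how a
hypervisor-side caller is expected to bracket guest entry and exit with
the two new hooks; run_guest() below is a hypothetical placeholder, not
an API from this patch:

	static void vcpu_run_example(void)
	{
		/*
		 * Both hooks assert that interrupts are disabled, see the
		 * lockdep_assert_irqs_disabled() calls above.
		 */
		local_irq_disable();

		perf_guest_enter();	/* schedule out exclude_guest events */
		run_guest();		/* hypothetical VM-entry path */
		perf_guest_exit();	/* schedule exclude_guest events back in */

		local_irq_enable();
	}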