[RFC,v3,10/58] perf: Add generic exclude_guest support

Message ID 20240801045907.4010984-11-mizhang@google.com (mailing list archive)
State New, archived
Series: Mediated Passthrough vPMU 3.0 for x86

Commit Message

Mingwei Zhang Aug. 1, 2024, 4:58 a.m. UTC
From: Kan Liang <kan.liang@linux.intel.com>

Only KVM knows exactly when a guest is entering or exiting. Expose
two interfaces that allow KVM to switch ownership of the PMU resources
at those points.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Tested-by: Yongwei Ma <yongwei.ma@intel.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
---
 include/linux/perf_event.h |  4 +++
 kernel/events/core.c       | 54 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
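
[Editor's note: not part of the patch. Below is a minimal sketch of the call
pattern the commit message implies: KVM's vcpu run path brackets VM entry/exit
with the new interfaces while interrupts are disabled, matching the
lockdep_assert_irqs_disabled() checks in the implementation. The function name
and the enter_guest() stand-in are hypothetical, for illustration only.]

	/*
	 * Hypothetical illustration only -- not from this series.
	 * The interfaces are expected to be called on the local CPU,
	 * with IRQs disabled, around the actual VM entry/exit.
	 */
	static void vcpu_run_once_example(struct kvm_vcpu *vcpu)
	{
		local_irq_disable();

		/* Schedule out host exclude_guest events; guest owns the PMU. */
		perf_guest_enter();

		enter_guest(vcpu);	/* stand-in for the real VM-entry sequence */

		/* Schedule host exclude_guest events back in after VM-exit. */
		perf_guest_exit();

		local_irq_enable();
	}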

Comments

Peter Zijlstra Oct. 14, 2024, 11:20 a.m. UTC | #1
On Thu, Aug 01, 2024 at 04:58:19AM +0000, Mingwei Zhang wrote:
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
> +		goto unlock;
> +
> +	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> +	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
> +	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> +	if (cpuctx->task_ctx) {
> +		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> +		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
> +		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> +	}

Does this not violate the scheduling order of events? AFAICT this will
do:

  cpu pinned
  cpu flexible
  task pinned
  task flexible

as opposed to:

  cpu pinned
  task pinned
  cpu flexible
  task flexible

We have the perf_event_sched_in() helper for this.
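
[Editor's note: for reference, the existing helper in kernel/events/core.c
looks roughly like the sketch below (paraphrased from the upstream code around
the time of this series; the exact body may differ in the tree the patch is
based on). It is what encodes the cpu-pinned, task-pinned, cpu-flexible,
task-flexible order.]

	/*
	 * Roughly the upstream helper: pinned events in both contexts are
	 * scheduled in before any flexible events, and the CPU context is
	 * handled before the task context at each priority level.
	 */
	static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
					struct perf_event_context *ctx)
	{
		ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
		if (ctx)
			ctx_sched_in(ctx, EVENT_PINNED);
		ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
		if (ctx)
			ctx_sched_in(ctx, EVENT_FLEXIBLE);
	}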

> +
> +	__this_cpu_write(perf_in_guest, false);
> +unlock:
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
Liang, Kan Oct. 14, 2024, 3:27 p.m. UTC | #2
On 2024-10-14 7:20 a.m., Peter Zijlstra wrote:
> On Thu, Aug 01, 2024 at 04:58:19AM +0000, Mingwei Zhang wrote:
>> +void perf_guest_exit(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
>> +		goto unlock;
>> +
>> +	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>> +	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
>> +	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>> +	if (cpuctx->task_ctx) {
>> +		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>> +		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
>> +		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>> +	}
> 
> Does this not violate the scheduling order of events? AFAICT this will
> do:
> 
>   cpu pinned
>   cpu flexible
>   task pinned
>   task flexible
> 
> as opposed to:
> 
>   cpu pinned
>   task pinned
>   cpu flexible
>   task flexible
> 
> We have the perf_event_sched_in() helper for this.

Yes, we can avoid calling ctx_sched_in() directly with the EVENT_GUEST
flag and instead invoke the perf_event_sched_in() helper to do the real
scheduling. I will do more tests to double-check.

Thanks,
Kan
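
[Editor's note: a rough sketch of the direction described above, assuming
perf_event_sched_in() is extended to take an event_type argument so the
EVENT_GUEST filter can be passed down; that extension is an assumption here,
not something in this patch, and the final rework may look different.]

	/*
	 * Sketch only: assumes perf_event_sched_in() grows an event_type
	 * parameter so EVENT_GUEST can be passed through to ctx_sched_in().
	 */
	void perf_guest_exit(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		lockdep_assert_irqs_disabled();

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);

		if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
			goto unlock;

		perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
		if (cpuctx->task_ctx)
			perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);

		/* cpu pinned -> task pinned -> cpu flexible -> task flexible */
		perf_event_sched_in(cpuctx, cpuctx->task_ctx, EVENT_GUEST);

		if (cpuctx->task_ctx)
			perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
		perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);

		__this_cpu_write(perf_in_guest, false);
	unlock:
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}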
> 
>> +
>> +	__this_cpu_write(perf_in_guest, false);
>> +unlock:
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>> +EXPORT_SYMBOL_GPL(perf_guest_exit);
>

Patch

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 81a5f8399cb8..75773f9890cc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1738,6 +1738,8 @@  extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 int perf_get_mediated_pmu(void);
 void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1831,6 +1833,8 @@  static inline int perf_get_mediated_pmu(void)
 }
 
 static inline void perf_put_mediated_pmu(void)			{ }
+static inline void perf_guest_enter(void)			{ }
+static inline void perf_guest_exit(void)			{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57648736e43e..57ff737b922b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5941,6 +5941,60 @@  void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
 
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
+		goto unlock;
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	__this_cpu_write(perf_in_guest, true);
+
+unlock:
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_enter);
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
+		goto unlock;
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	__this_cpu_write(perf_in_guest, false);
+unlock:
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_exit);
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block