
[9/9] drm/i915/pmu: Enable legacy PMU events for MTL

Message ID 20230330004103.1295413-10-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Series: Add MTL PMU support for multi-gt

Commit Message

Umesh Nerlige Ramappa March 30, 2023, 12:41 a.m. UTC
MTL introduces separate GTs for render and media. This complicates the
definition of frequency and rc6 counters for the GPU as a whole since
each GT has an independent counter. The best way to support this change
is to deprecate the GPU-specific counters and create GT-specific
counters; however, that would break the ABI. Since perf tools and scripts
are decentralized across many users, it is hard to deprecate the legacy
counters and get all of those users to switch over.

Re-introduce the legacy counters and support them as min/max of
GT-specific counters as necessary to ensure backwards compatibility.

I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
I915_PMU_INTERRUPTS - no changes since it is GPU specific on all platforms
I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters

Note:
- For deeper debugging of performance issues, tools must be upgraded to
  read the GT-specific counters.
- This patch deserves to be separate from the other PMU features so that
  it can be easily dropped if legacy events are ever deprecated.
- Internal implementation relies on creating an extra entry in the
  arrays used for GT specific counters. Index 0 is empty.
  Index 1 through N are mapped to GTs 0 through N - 1.
- User interface will use GT numbers indexed from 0 to specify the GT of
  interest.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c | 134 +++++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_pmu.h |   2 +-
 include/uapi/drm/i915_drm.h     |  14 ++--
 3 files changed, 125 insertions(+), 25 deletions(-)
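
For illustration, a minimal userspace sketch of reading one of the new
GT-specific counters through perf_event_open() could look as follows
(assuming the patched uapi header is installed as <drm/i915_drm.h>; the
GT index 1 and the missing error handling are for brevity only):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <drm/i915_drm.h>

/* The i915 PMU is a dynamic PMU, so its type id comes from sysfs. */
static int i915_pmu_type(void)
{
        int type = -1;
        FILE *f = fopen("/sys/bus/event_source/devices/i915/type", "r");

        if (f) {
                if (fscanf(f, "%d", &type) != 1)
                        type = -1;
                fclose(f);
        }
        return type;
}

int main(void)
{
        struct perf_event_attr attr = { .size = sizeof(attr) };
        uint64_t val = 0;
        int fd;

        attr.type = i915_pmu_type();
        /* RC6 residency of GT1 (e.g. the MTL media GT) via the new macro. */
        attr.config = __I915_PMU_RC6_RESIDENCY(1);

        /* The i915 PMU is system-wide: pid = -1, bound to a single CPU. */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0)
                return 1;

        if (read(fd, &val, sizeof(val)) == sizeof(val))
                printf("gt1 rc6: %llu ns\n", (unsigned long long)val);

        close(fd);
        return 0;
}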

Comments

Tvrtko Ursulin March 30, 2023, 1:38 p.m. UTC | #1
On 30/03/2023 01:41, Umesh Nerlige Ramappa wrote:
> MTL introduces separate GTs for render and media. This complicates the
> definition of frequency and rc6 counters for the GPU as a whole since
> each GT has an independent counter. The best way to support this change
> is to deprecate the GPU-specific counters and create GT-specific
> counters, however that just breaks ABI. Since perf tools and scripts may
> be decentralized with probably many users, it's hard to deprecate the
> legacy counters and have all the users on board with that.
> 
> Re-introduce the legacy counters and support them as min/max of
> GT-specific counters as necessary to ensure backwards compatibility.
> 
> I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
> I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
> I915_PMU_INTERRUPTS - no changes since it is GPU specific on all platforms
> I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
> I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters

IMO max/min games are _very_ low value and probably just confusing.

I am not convinced we need to burden the kernel with this. New platform, 
new counters.. userspace can just deal with it.

In intel_gpu_top we can do the smarts, maybe in the default aggregated view
(piggy-back on/extend the engine aggregation toggled via the command-line
'-p' switch or '1' at runtime). But then it's not min/max, rather probably
normalized by the number of gts.
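
Something like this rough sketch for the default view (hypothetical, not
actual intel_gpu_top code):

/*
 * Aggregate the per-GT frequency samples into a single line item by
 * normalizing over the number of GTs, rather than asking the kernel
 * for a min/max. gt_freq_mhz holds one sample per GT, however read.
 */
static double aggregate_freq_mhz(const double *gt_freq_mhz, int num_gts)
{
        double sum = 0.0;
        int i;

        if (num_gts <= 0)
                return 0.0;

        for (i = 0; i < num_gts; i++)
                sum += gt_freq_mhz[i];

        return sum / num_gts;
}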

Regards,

Tvrtko

> 
> Note:
> - For deeper debugging of performance issues, tools must be upgraded to
>    read the GT-specific counters.
> - This patch deserves to be separate from the other PMU features so that
>    it can be easily dropped if legacy events are ever deprecated.
> - Internal implementation relies on creating an extra entry in the
>    arrays used for GT specific counters. Index 0 is empty.
>    Index 1 through N are mapped to GTs 0 through N - 1.
> - User interface will use GT numbers indexed from 0 to specify the GT of
>    interest.
> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 134 +++++++++++++++++++++++++++-----
>   drivers/gpu/drm/i915/i915_pmu.h |   2 +-
>   include/uapi/drm/i915_drm.h     |  14 ++--
>   3 files changed, 125 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 9bd9605d2662..0dc7711c3b4b 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -221,7 +221,7 @@ add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val,
>   static u64 get_rc6(struct intel_gt *gt)
>   {
>   	struct drm_i915_private *i915 = gt->i915;
> -	const unsigned int gt_id = gt->info.id;
> +	const unsigned int gt_id = gt->info.id + 1;
>   	struct i915_pmu *pmu = &i915->pmu;
>   	unsigned long flags;
>   	bool awake = false;
> @@ -267,24 +267,26 @@ static void init_rc6(struct i915_pmu *pmu)
>   
>   	for_each_gt(gt, i915, i) {
>   		intel_wakeref_t wakeref;
> +		const unsigned int gt_id = i + 1;
>   
>   		with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
>   			u64 val = __get_rc6(gt);
>   
> -			store_sample(pmu, i, __I915_SAMPLE_RC6, val);
> -			store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
> +			store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
> +			store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED,
>   				     val);
> -			pmu->sleep_last[i] = ktime_get_raw();
> +			pmu->sleep_last[gt_id] = ktime_get_raw();
>   		}
>   	}
>   }
>   
>   static void park_rc6(struct intel_gt *gt)
>   {
> +	const unsigned int gt_id = gt->info.id + 1;
>   	struct i915_pmu *pmu = &gt->i915->pmu;
>   
> -	store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
> -	pmu->sleep_last[gt->info.id] = ktime_get_raw();
> +	store_sample(pmu, gt_id, __I915_SAMPLE_RC6, __get_rc6(gt));
> +	pmu->sleep_last[gt_id] = ktime_get_raw();
>   }
>   
>   static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
> @@ -436,18 +438,18 @@ static void
>   frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>   {
>   	struct drm_i915_private *i915 = gt->i915;
> -	const unsigned int gt_id = gt->info.id;
> +	const unsigned int gt_id = gt->info.id + 1;
>   	struct i915_pmu *pmu = &i915->pmu;
>   	struct intel_rps *rps = &gt->rps;
>   
> -	if (!frequency_sampling_enabled(pmu, gt_id))
> +	if (!frequency_sampling_enabled(pmu, gt->info.id))
>   		return;
>   
>   	/* Report 0/0 (actual/requested) frequency while parked. */
>   	if (!intel_gt_pm_get_if_awake(gt))
>   		return;
>   
> -	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
> +	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt->info.id))) {
>   		u32 val;
>   
>   		/*
> @@ -467,7 +469,7 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>   				val, period_ns / 1000);
>   	}
>   
> -	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
> +	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt->info.id))) {
>   		add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
>   				intel_rps_get_requested_frequency(rps),
>   				period_ns / 1000);
> @@ -545,14 +547,15 @@ engine_event_status(struct intel_engine_cs *engine,
>   static int
>   config_status(struct drm_i915_private *i915, u64 config)
>   {
> -	struct intel_gt *gt = to_gt(i915);
> -
>   	unsigned int gt_id = config_gt_id(config);
> -	unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 1 : 0;
> +	unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 2 : 1;
> +	struct intel_gt *gt;
>   
>   	if (gt_id > max_gt_id)
>   		return -ENOENT;
>   
> +	gt = !gt_id ? to_gt(i915) : i915->gt[gt_id - 1];
> +
>   	switch (config_counter(config)) {
>   	case I915_PMU_ACTUAL_FREQUENCY:
>   		if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
> @@ -673,23 +676,58 @@ static u64 __i915_pmu_event_read_other(struct perf_event *event)
>   	const unsigned int gt_id = config_gt_id(event->attr.config);
>   	const u64 config = config_counter(event->attr.config);
>   	struct i915_pmu *pmu = &i915->pmu;
> +	struct intel_gt *gt;
>   	u64 val = 0;
> +	int i;
>   
>   	switch (config) {
>   	case I915_PMU_ACTUAL_FREQUENCY:
> -		val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
> +		if (gt_id)
> +			return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
> +
> +		if (!HAS_EXTRA_GT_LIST(i915))
> +			return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_ACT);
> +
> +		for_each_gt(gt, i915, i)
> +			val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_ACT));
> +
>   		break;
>   	case I915_PMU_REQUESTED_FREQUENCY:
> -		val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
> +		if (gt_id)
> +			return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
> +
> +		if (!HAS_EXTRA_GT_LIST(i915))
> +			return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_REQ);
> +
> +		for_each_gt(gt, i915, i)
> +			val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_REQ));
> +
>   		break;
>   	case I915_PMU_INTERRUPTS:
>   		val = READ_ONCE(pmu->irq_count);
>   		break;
>   	case I915_PMU_RC6_RESIDENCY:
> -		val = get_rc6(i915->gt[gt_id]);
> +		if (gt_id)
> +			return get_rc6(i915->gt[gt_id - 1]);
> +
> +		if (!HAS_EXTRA_GT_LIST(i915))
> +			return get_rc6(i915->gt[0]);
> +
> +		val = U64_MAX;
> +		for_each_gt(gt, i915, i)
> +			val = min(val, get_rc6(gt));
> +
>   		break;
>   	case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
> -		val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
> +		if (gt_id)
> +			return ktime_to_ns(intel_gt_get_awake_time(i915->gt[gt_id - 1]));
> +
> +		if (!HAS_EXTRA_GT_LIST(i915))
> +			return ktime_to_ns(intel_gt_get_awake_time(i915->gt[0]));
> +
> +		val = 0;
> +		for_each_gt(gt, i915, i)
> +			val = max((s64)val, ktime_to_ns(intel_gt_get_awake_time(gt)));
>   		break;
>   	}
>   
> @@ -728,11 +766,14 @@ static void i915_pmu_event_read(struct perf_event *event)
>   
>   static void i915_pmu_enable(struct perf_event *event)
>   {
> +	const unsigned int gt_id = config_gt_id(event->attr.config);
>   	struct drm_i915_private *i915 =
>   		container_of(event->pmu, typeof(*i915), pmu.base);
>   	struct i915_pmu *pmu = &i915->pmu;
> +	struct intel_gt *gt;
>   	unsigned long flags;
>   	unsigned int bit;
> +	u64 i;
>   
>   	bit = event_bit(event);
>   	if (bit == -1)
> @@ -745,12 +786,42 @@ static void i915_pmu_enable(struct perf_event *event)
>   	 * the event reference counter.
>   	 */
>   	BUILD_BUG_ON(ARRAY_SIZE(pmu->enable_count) != I915_PMU_MASK_BITS);
> +	BUILD_BUG_ON(BITS_PER_TYPE(pmu->enable) < I915_PMU_MASK_BITS);
>   	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>   	GEM_BUG_ON(pmu->enable_count[bit] == ~0);
>   
>   	pmu->enable |= BIT_ULL(bit);
>   	pmu->enable_count[bit]++;
>   
> +	/*
> +	 * The arrays that i915_pmu maintains are now indexed as
> +	 *
> +	 * 0 - aggregate events (a.k.a !gt_id)
> +	 * 1 - gt0
> +	 * 2 - gt1
> +	 *
> +	 * The same logic applies to event_bit masks. The first set of mask are
> +	 * for aggregate, followed by gt0 and gt1 masks. The idea here is to
> +	 * enable the event on all gts if the aggregate event bit is set. This
> +	 * applies only to the non-engine-events.
> +	 */
> +	if (!gt_id && !is_engine_event(event)) {
> +		for_each_gt(gt, i915, i) {
> +			u64 counter = config_counter(event->attr.config);
> +			u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
> +			unsigned int bit = config_bit(config);
> +
> +			if (bit == -1)
> +				continue;
> +
> +			GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
> +			GEM_BUG_ON(pmu->enable_count[bit] == ~0);
> +
> +			pmu->enable |= BIT_ULL(bit);
> +			pmu->enable_count[bit]++;
> +		}
> +	}
> +
>   	/*
>   	 * Start the sampling timer if needed and not already enabled.
>   	 */
> @@ -793,6 +864,7 @@ static void i915_pmu_enable(struct perf_event *event)
>   
>   static void i915_pmu_disable(struct perf_event *event)
>   {
> +	const unsigned int gt_id = config_gt_id(event->attr.config);
>   	struct drm_i915_private *i915 =
>   		container_of(event->pmu, typeof(*i915), pmu.base);
>   	unsigned int bit = event_bit(event);
> @@ -822,6 +894,26 @@ static void i915_pmu_disable(struct perf_event *event)
>   		 */
>   		if (--engine->pmu.enable_count[sample] == 0)
>   			engine->pmu.enable &= ~BIT(sample);
> +	} else if (!gt_id) {
> +		struct intel_gt *gt;
> +		u64 i;
> +
> +		for_each_gt(gt, i915, i) {
> +			u64 counter = config_counter(event->attr.config);
> +			u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
> +			unsigned int bit = config_bit(config);
> +
> +			if (bit == -1)
> +				continue;
> +
> +			GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
> +			GEM_BUG_ON(pmu->enable_count[bit] == 0);
> +
> +			if (--pmu->enable_count[bit] == 0) {
> +				pmu->enable &= ~BIT_ULL(bit);
> +				pmu->timer_enabled &= pmu_needs_timer(pmu, true);
> +			}
> +		}
>   	}
>   
>   	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
> @@ -1002,7 +1094,11 @@ create_event_attributes(struct i915_pmu *pmu)
>   		const char *name;
>   		const char *unit;
>   	} global_events[] = {
> +		__event(0, "actual-frequency", "M"),
> +		__event(1, "requested-frequency", "M"),
>   		__event(2, "interrupts", NULL),
> +		__event(3, "rc6-residency", "ns"),
> +		__event(4, "software-gt-awake-time", "ns"),
>   	};
>   	static const struct {
>   		enum drm_i915_pmu_engine_sample sample;
> @@ -1024,7 +1120,7 @@ create_event_attributes(struct i915_pmu *pmu)
>   	/* per gt counters */
>   	for_each_gt(gt, i915, j) {
>   		for (i = 0; i < ARRAY_SIZE(events); i++) {
> -			u64 config = ___I915_PMU_OTHER(j, events[i].counter);
> +			u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
>   
>   			if (!config_status(i915, config))
>   				count++;
> @@ -1070,7 +1166,7 @@ create_event_attributes(struct i915_pmu *pmu)
>   	/* per gt counters */
>   	for_each_gt(gt, i915, j) {
>   		for (i = 0; i < ARRAY_SIZE(events); i++) {
> -			u64 config = ___I915_PMU_OTHER(j, events[i].counter);
> +			u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
>   			char *str;
>   
>   			if (config_status(i915, config))
> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
> index a708e44a227e..a4cc1eb218fc 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.h
> +++ b/drivers/gpu/drm/i915/i915_pmu.h
> @@ -38,7 +38,7 @@ enum {
>   	__I915_NUM_PMU_SAMPLERS
>   };
>   
> -#define I915_PMU_MAX_GTS (4) /* FIXME */
> +#define I915_PMU_MAX_GTS (4 + 1) /* FIXME */
>   
>   /**
>    * How many different events we track in the global PMU mask.
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index bbab7f3dbeb4..18794c30027f 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -290,6 +290,7 @@ enum drm_i915_pmu_engine_sample {
>   	(((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
>   	((__u64)(gt) << __I915_PMU_GT_SHIFT))
>   
> +/* Aggregate from all gts */
>   #define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
>   
>   #define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)
> @@ -300,11 +301,14 @@ enum drm_i915_pmu_engine_sample {
>   
>   #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
>   
> -#define __I915_PMU_ACTUAL_FREQUENCY(gt)		___I915_PMU_OTHER(gt, 0)
> -#define __I915_PMU_REQUESTED_FREQUENCY(gt)	___I915_PMU_OTHER(gt, 1)
> -#define __I915_PMU_INTERRUPTS(gt)		___I915_PMU_OTHER(gt, 2)
> -#define __I915_PMU_RC6_RESIDENCY(gt)		___I915_PMU_OTHER(gt, 3)
> -#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt)	___I915_PMU_OTHER(gt, 4)
> +/* GT specific counters */
> +#define ____I915_PMU_OTHER(gt, x) ___I915_PMU_OTHER(((gt) + 1), x)
> +
> +#define __I915_PMU_ACTUAL_FREQUENCY(gt)		____I915_PMU_OTHER(gt, 0)
> +#define __I915_PMU_REQUESTED_FREQUENCY(gt)	____I915_PMU_OTHER(gt, 1)
> +#define __I915_PMU_INTERRUPTS(gt)		____I915_PMU_OTHER(gt, 2)
> +#define __I915_PMU_RC6_RESIDENCY(gt)		____I915_PMU_OTHER(gt, 3)
> +#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt)	____I915_PMU_OTHER(gt, 4)
>   
>   /* Each region is a minimum of 16k, and there are at most 255 of them.
>    */
Umesh Nerlige Ramappa March 30, 2023, 6:31 p.m. UTC | #2
+ Joonas for comments on this

On Thu, Mar 30, 2023 at 02:38:03PM +0100, Tvrtko Ursulin wrote:
>
>On 30/03/2023 01:41, Umesh Nerlige Ramappa wrote:
>>MTL introduces separate GTs for render and media. This complicates the
>>definition of frequency and rc6 counters for the GPU as a whole since
>>each GT has an independent counter. The best way to support this change
>>is to deprecate the GPU-specific counters and create GT-specific
>>counters, however that just breaks ABI. Since perf tools and scripts may
>>be decentralized with probably many users, it's hard to deprecate the
>>legacy counters and have all the users on board with that.
>>
>>Re-introduce the legacy counters and support them as min/max of
>>GT-specific counters as necessary to ensure backwards compatibility.
>>
>>I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
>>I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
>>I915_PMU_INTERRUPTS - no changes since it is GPU specific on all platforms
>>I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
>>I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters
>
>IMO max/min games are _very_ low value and probably just confusing.

By value, do you mean ROI or actually that the values would be 
incorrect?

>
>I am not convinced we need to burden the kernel with this. New 
>platform, new counters.. userspace can just deal with it.

I agree and would prefer to drop this patch. There are some counter 
arguments, I have added Joonas here for comments.

1) an app/script hard-coded with the legacy events would be used on a 
new platform and fail and we should maintain backwards compatibility.

2) the sysfs attributes for rc6/frequency have already adopted an 
aggregate vs gt0/gt1 approach to address that and pmu should have a 
similar solution (or rather, PMU and the sysfs approaches should match 
based on whatever is the approach)
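
For reference, the difference for such a hard-coded script is only in the
event config: the legacy macros carry no GT, while the new
____I915_PMU_OTHER() based macros encode gt + 1 above __I915_PMU_GT_SHIFT.
A hypothetical helper that picks between the two:

#include <stdint.h>
#include <drm/i915_drm.h>

/* gt < 0 selects the legacy GT-less config that old scripts hard-code. */
static uint64_t rc6_config(int gt)
{
        return gt < 0 ? I915_PMU_RC6_RESIDENCY : __I915_PMU_RC6_RESIDENCY(gt);
}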

Regards,
Umesh

>[snip]
Tvrtko Ursulin March 31, 2023, 1:02 p.m. UTC | #3
On 30/03/2023 19:31, Umesh Nerlige Ramappa wrote:
> + Joonas for comments on this
> 
> On Thu, Mar 30, 2023 at 02:38:03PM +0100, Tvrtko Ursulin wrote:
>>
>> On 30/03/2023 01:41, Umesh Nerlige Ramappa wrote:
>>> MTL introduces separate GTs for render and media. This complicates the
>>> definition of frequency and rc6 counters for the GPU as a whole since
>>> each GT has an independent counter. The best way to support this change
>>> is to deprecate the GPU-specific counters and create GT-specific
>>> counters, however that just breaks ABI. Since perf tools and scripts may
>>> be decentralized with probably many users, it's hard to deprecate the
>>> legacy counters and have all the users on board with that.
>>>
>>> Re-introduce the legacy counters and support them as min/max of
>>> GT-specific counters as necessary to ensure backwards compatibility.
>>>
>>> I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
>>> I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
>>> I915_PMU_INTERRUPTS - no changes since it is GPU specific on all 
>>> platforms
>>> I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
>>> I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters
>>
>> IMO max/min games are _very_ low value and probably just confusing.
> 
> By value, do you mean ROI or actually that the values would be incorrect?

Both really.

>> I am not convinced we need to burden the kernel with this. New 
>> platform, new counters.. userspace can just deal with it.
> 
> I agree and would prefer to drop this patch. There are some counter 
> arguments, I have added Joonas here for comments.
> 
> 1) an app/script hard-coded with the legacy events would be used on a 
> new platform and fail and we should maintain backwards compatibility.

I thought we pretty much agreed multiple times in the past (on different 
topics) that a new platform can require new userspace.

PMU is probably even a more clear cut case since it is exposing hardware 
counters (or close) so sometimes it is not even theoretically possible 
to preserve "backward" compatibility.

(I double quote backward because I think real backward compatibility 
does not apply on a new platform. And MTL is under force probe still.)

So for me it all comes under the "would be nice" category. But since we 
need to add kernel code to do it, code which, say, intel_gpu_top could run 
in userspace, I am not at all convinced it wouldn't be a bad idea.

The aggregated counters wouldn't even be giving the full picture.

So I'd simply add tiles/gt support to intel_gpu_top. Same as it 
currently can do "-p" on the command line, or '1' in the interactive 
mode, to aggregate the engine classes into one line item, I'd extend 
that concept into frequencies and RC6.

By default we start with normalized values and in physical mode we show 
separate counters per tile/gt.

Someone running old intel_gpu_top on MTL gets to see nothing since the 
counter names are different. Which is IMO fine - better than showing 
tile 0 data, or some minimums/maximums from one tile only.

> 2) the sysfs attributes for rc6/frequency have already adopted an 
> aggregate vs gt0/gt1 approach to address that and pmu should have a 
> similar solution (or rather, PMU and the sysfs approaches should match 
> based on whatever is the approach)

Yeah I disagreed with min/max reads in sysfs too and am pretty sure I 
expressed that at the time. :shrug:

But I don't think there is a strong argument that PMU needs to follow.

Only impact is to people who access perf_event_open directly so yeah, if 
there are such users, they will need to add multi-tile support.

Regards,

Tvrtko
Umesh Nerlige Ramappa April 3, 2023, 7:16 p.m. UTC | #4
+ Joonas for inputs

On Thu, Mar 30, 2023 at 11:31:53AM -0700, Umesh Nerlige Ramappa wrote:
>[snip]
Umesh Nerlige Ramappa April 20, 2023, 8:12 p.m. UTC | #5
On Fri, Mar 31, 2023 at 02:02:40PM +0100, Tvrtko Ursulin wrote:
>
>On 30/03/2023 19:31, Umesh Nerlige Ramappa wrote:
>>+ Joonas for comments on this
>>
>>On Thu, Mar 30, 2023 at 02:38:03PM +0100, Tvrtko Ursulin wrote:
>>>
>>>On 30/03/2023 01:41, Umesh Nerlige Ramappa wrote:
>>>>MTL introduces separate GTs for render and media. This complicates the
>>>>definition of frequency and rc6 counters for the GPU as a whole since
>>>>each GT has an independent counter. The best way to support this change
>>>>is to deprecate the GPU-specific counters and create GT-specific
>>>>counters, however that just breaks ABI. Since perf tools and scripts may
>>>>be decentralized with probably many users, it's hard to deprecate the
>>>>legacy counters and have all the users on board with that.
>>>>
>>>>Re-introduce the legacy counters and support them as min/max of
>>>>GT-specific counters as necessary to ensure backwards compatibility.
>>>>
>>>>I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
>>>>I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
>>>>I915_PMU_INTERRUPTS - no changes since it is GPU specific on all 
>>>>platforms
>>>>I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
>>>>I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters
>>>
>>>IMO max/min games are _very_ low value and probably just confusing.
>>
>>By value, do you mean ROI or actually that the values would be incorrect?
>
>Both really.
>
>>>I am not convinced we need to burden the kernel with this. New 
>>>platform, new counters.. userspace can just deal with it.
>>
>>I agree and would prefer to drop this patch. There are some counter 
>>arguments, I have added Joonas here for comments.
>>
>>1) an app/script hard-coded with the legacy events would be used on 
>>a new platform and fail and we should maintain backwards 
>>compatibility.
>
>I thought we pretty much agreed multiple times in the past (on 
>different topics) that a new platform can require new userspace.
>
>PMU is probably even a more clear cut case since it is exposing 
>hardware counters (or close) so sometimes it is not even theoretically 
>possible to preserve "backward" compatibility.
>
>(I double quote backward because I think real backward compatibility 
>does not apply on a new platform. And MTL is under force probe still.)
>
>So for me it all comes under the "would be nice" category. But since 
>we need to add kernel code to do it, code which asy intel_gpu_top 
>could run in userspace, I am not at all convinced it wouldn't be a bad 
>idea.
>
>The aggregated counters wouldn't even be giving the full picture.
>
>So I'd simply add tiles/gt support to intel_gpu_top. Same as it 
>currently can do "-p" on the command line, or '1' in the interactive 
>mode, to aggregate the engine classes into one line item, I'd extend 
>that concept into frequencies and RC6.
>
>By default we start with normalized values and in physical mode we 
>show separate counters per tile/gt.
>
>Someone running old intel_gpu_top on MTL gets to see nothing since the 
>counter names are different. Which is IMO fine - better than showing 
>tile 0 data, or some minimums/maximums from one tile only.
>
>>2) the sysfs attributes for rc6/frequency have already adopted an 
>>aggregate vs gt0/gt1 approach to address that and pmu should have a 
>>similar solution (or rather, PMU and the sysfs approaches should 
>>match based on whatever is the approach)
>
>Yeah I disagreed with min/max reads in sysfs too and am pretty sure I 
>expressed that at the time. :shrug:
>
>But I don't think there is a strong argument that PMU needs to follow.
>
>Only impact is to people who access perf_event_open directly so yeah, 
>if there are such users, they will need to add multi-tile support.

I discussed with Joonas offline and I guess I had a wrong idea regarding 
ABI. Looks like ABI is only broken if we remove something that existed 
for a platform, so this does not break ABI for MTL. The motivation was 
to keep per-platform differences small for applications/UMDs.

The other aspect he pointed out was that we should not push anything 
that does not have an IGT. For aggregate events, we do not have any 
plans to support IGTs (large effort as well as no clear way to support 
such aggregation).

In short, I will drop this patch and post what you originally had for 
multi-gt PMU support.

Thanks,
Umesh
>
>Regards,
>
>Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 9bd9605d2662..0dc7711c3b4b 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -221,7 +221,7 @@  add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val,
 static u64 get_rc6(struct intel_gt *gt)
 {
 	struct drm_i915_private *i915 = gt->i915;
-	const unsigned int gt_id = gt->info.id;
+	const unsigned int gt_id = gt->info.id + 1;
 	struct i915_pmu *pmu = &i915->pmu;
 	unsigned long flags;
 	bool awake = false;
@@ -267,24 +267,26 @@  static void init_rc6(struct i915_pmu *pmu)
 
 	for_each_gt(gt, i915, i) {
 		intel_wakeref_t wakeref;
+		const unsigned int gt_id = i + 1;
 
 		with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
 			u64 val = __get_rc6(gt);
 
-			store_sample(pmu, i, __I915_SAMPLE_RC6, val);
-			store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
+			store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
+			store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED,
 				     val);
-			pmu->sleep_last[i] = ktime_get_raw();
+			pmu->sleep_last[gt_id] = ktime_get_raw();
 		}
 	}
 }
 
 static void park_rc6(struct intel_gt *gt)
 {
+	const unsigned int gt_id = gt->info.id + 1;
 	struct i915_pmu *pmu = &gt->i915->pmu;
 
-	store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
-	pmu->sleep_last[gt->info.id] = ktime_get_raw();
+	store_sample(pmu, gt_id, __I915_SAMPLE_RC6, __get_rc6(gt));
+	pmu->sleep_last[gt_id] = ktime_get_raw();
 }
 
 static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
@@ -436,18 +438,18 @@  static void
 frequency_sample(struct intel_gt *gt, unsigned int period_ns)
 {
 	struct drm_i915_private *i915 = gt->i915;
-	const unsigned int gt_id = gt->info.id;
+	const unsigned int gt_id = gt->info.id + 1;
 	struct i915_pmu *pmu = &i915->pmu;
 	struct intel_rps *rps = &gt->rps;
 
-	if (!frequency_sampling_enabled(pmu, gt_id))
+	if (!frequency_sampling_enabled(pmu, gt->info.id))
 		return;
 
 	/* Report 0/0 (actual/requested) frequency while parked. */
 	if (!intel_gt_pm_get_if_awake(gt))
 		return;
 
-	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
+	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt->info.id))) {
 		u32 val;
 
 		/*
@@ -467,7 +469,7 @@  frequency_sample(struct intel_gt *gt, unsigned int period_ns)
 				val, period_ns / 1000);
 	}
 
-	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
+	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt->info.id))) {
 		add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
 				intel_rps_get_requested_frequency(rps),
 				period_ns / 1000);
@@ -545,14 +547,15 @@  engine_event_status(struct intel_engine_cs *engine,
 static int
 config_status(struct drm_i915_private *i915, u64 config)
 {
-	struct intel_gt *gt = to_gt(i915);
-
 	unsigned int gt_id = config_gt_id(config);
-	unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 1 : 0;
+	unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 2 : 1;
+	struct intel_gt *gt;
 
 	if (gt_id > max_gt_id)
 		return -ENOENT;
 
+	gt = !gt_id ? to_gt(i915) : i915->gt[gt_id - 1];
+
 	switch (config_counter(config)) {
 	case I915_PMU_ACTUAL_FREQUENCY:
 		if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
@@ -673,23 +676,58 @@  static u64 __i915_pmu_event_read_other(struct perf_event *event)
 	const unsigned int gt_id = config_gt_id(event->attr.config);
 	const u64 config = config_counter(event->attr.config);
 	struct i915_pmu *pmu = &i915->pmu;
+	struct intel_gt *gt;
 	u64 val = 0;
+	int i;
 
 	switch (config) {
 	case I915_PMU_ACTUAL_FREQUENCY:
-		val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
+		if (gt_id)
+			return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
+
+		if (!HAS_EXTRA_GT_LIST(i915))
+			return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_ACT);
+
+		for_each_gt(gt, i915, i)
+			val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_ACT));
+
 		break;
 	case I915_PMU_REQUESTED_FREQUENCY:
-		val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
+		if (gt_id)
+			return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
+
+		if (!HAS_EXTRA_GT_LIST(i915))
+			return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_REQ);
+
+		for_each_gt(gt, i915, i)
+			val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_REQ));
+
 		break;
 	case I915_PMU_INTERRUPTS:
 		val = READ_ONCE(pmu->irq_count);
 		break;
 	case I915_PMU_RC6_RESIDENCY:
-		val = get_rc6(i915->gt[gt_id]);
+		if (gt_id)
+			return get_rc6(i915->gt[gt_id - 1]);
+
+		if (!HAS_EXTRA_GT_LIST(i915))
+			return get_rc6(i915->gt[0]);
+
+		val = U64_MAX;
+		for_each_gt(gt, i915, i)
+			val = min(val, get_rc6(gt));
+
 		break;
 	case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
-		val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
+		if (gt_id)
+			return ktime_to_ns(intel_gt_get_awake_time(i915->gt[gt_id - 1]));
+
+		if (!HAS_EXTRA_GT_LIST(i915))
+			return ktime_to_ns(intel_gt_get_awake_time(i915->gt[0]));
+
+		val = 0;
+		for_each_gt(gt, i915, i)
+			val = max((s64)val, ktime_to_ns(intel_gt_get_awake_time(gt)));
 		break;
 	}
 
@@ -728,11 +766,14 @@  static void i915_pmu_event_read(struct perf_event *event)
 
 static void i915_pmu_enable(struct perf_event *event)
 {
+	const unsigned int gt_id = config_gt_id(event->attr.config);
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), pmu.base);
 	struct i915_pmu *pmu = &i915->pmu;
+	struct intel_gt *gt;
 	unsigned long flags;
 	unsigned int bit;
+	u64 i;
 
 	bit = event_bit(event);
 	if (bit == -1)
@@ -745,12 +786,42 @@  static void i915_pmu_enable(struct perf_event *event)
 	 * the event reference counter.
 	 */
 	BUILD_BUG_ON(ARRAY_SIZE(pmu->enable_count) != I915_PMU_MASK_BITS);
+	BUILD_BUG_ON(BITS_PER_TYPE(pmu->enable) < I915_PMU_MASK_BITS);
 	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
 	GEM_BUG_ON(pmu->enable_count[bit] == ~0);
 
 	pmu->enable |= BIT_ULL(bit);
 	pmu->enable_count[bit]++;
 
+	/*
+	 * The arrays that i915_pmu maintains are now indexed as
+	 *
+	 * 0 - aggregate events (a.k.a !gt_id)
+	 * 1 - gt0
+	 * 2 - gt1
+	 *
+	 * The same logic applies to event_bit masks. The first set of mask are
+	 * for aggregate, followed by gt0 and gt1 masks. The idea here is to
+	 * enable the event on all gts if the aggregate event bit is set. This
+	 * applies only to the non-engine-events.
+	 */
+	if (!gt_id && !is_engine_event(event)) {
+		for_each_gt(gt, i915, i) {
+			u64 counter = config_counter(event->attr.config);
+			u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
+			unsigned int bit = config_bit(config);
+
+			if (bit == -1)
+				continue;
+
+			GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
+			GEM_BUG_ON(pmu->enable_count[bit] == ~0);
+
+			pmu->enable |= BIT_ULL(bit);
+			pmu->enable_count[bit]++;
+		}
+	}
+
 	/*
 	 * Start the sampling timer if needed and not already enabled.
 	 */
@@ -793,6 +864,7 @@  static void i915_pmu_enable(struct perf_event *event)
 
 static void i915_pmu_disable(struct perf_event *event)
 {
+	const unsigned int gt_id = config_gt_id(event->attr.config);
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), pmu.base);
 	unsigned int bit = event_bit(event);
@@ -822,6 +894,26 @@  static void i915_pmu_disable(struct perf_event *event)
 		 */
 		if (--engine->pmu.enable_count[sample] == 0)
 			engine->pmu.enable &= ~BIT(sample);
+	} else if (!gt_id) {
+		struct intel_gt *gt;
+		u64 i;
+
+		for_each_gt(gt, i915, i) {
+			u64 counter = config_counter(event->attr.config);
+			u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
+			unsigned int bit = config_bit(config);
+
+			if (bit == -1)
+				continue;
+
+			GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
+			GEM_BUG_ON(pmu->enable_count[bit] == 0);
+
+			if (--pmu->enable_count[bit] == 0) {
+				pmu->enable &= ~BIT_ULL(bit);
+				pmu->timer_enabled &= pmu_needs_timer(pmu, true);
+			}
+		}
 	}
 
 	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
@@ -1002,7 +1094,11 @@  create_event_attributes(struct i915_pmu *pmu)
 		const char *name;
 		const char *unit;
 	} global_events[] = {
+		__event(0, "actual-frequency", "M"),
+		__event(1, "requested-frequency", "M"),
 		__event(2, "interrupts", NULL),
+		__event(3, "rc6-residency", "ns"),
+		__event(4, "software-gt-awake-time", "ns"),
 	};
 	static const struct {
 		enum drm_i915_pmu_engine_sample sample;
@@ -1024,7 +1120,7 @@  create_event_attributes(struct i915_pmu *pmu)
 	/* per gt counters */
 	for_each_gt(gt, i915, j) {
 		for (i = 0; i < ARRAY_SIZE(events); i++) {
-			u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+			u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
 
 			if (!config_status(i915, config))
 				count++;
@@ -1070,7 +1166,7 @@  create_event_attributes(struct i915_pmu *pmu)
 	/* per gt counters */
 	for_each_gt(gt, i915, j) {
 		for (i = 0; i < ARRAY_SIZE(events); i++) {
-			u64 config = ___I915_PMU_OTHER(j, events[i].counter);
+			u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
 			char *str;
 
 			if (config_status(i915, config))
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index a708e44a227e..a4cc1eb218fc 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -38,7 +38,7 @@  enum {
 	__I915_NUM_PMU_SAMPLERS
 };
 
-#define I915_PMU_MAX_GTS (4) /* FIXME */
+#define I915_PMU_MAX_GTS (4 + 1) /* FIXME */
 
 /**
  * How many different events we track in the global PMU mask.
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index bbab7f3dbeb4..18794c30027f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -290,6 +290,7 @@  enum drm_i915_pmu_engine_sample {
 	(((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
 	((__u64)(gt) << __I915_PMU_GT_SHIFT))
 
+/* Aggregate from all gts */
 #define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
 
 #define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)
@@ -300,11 +301,14 @@  enum drm_i915_pmu_engine_sample {
 
 #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
 
-#define __I915_PMU_ACTUAL_FREQUENCY(gt)		___I915_PMU_OTHER(gt, 0)
-#define __I915_PMU_REQUESTED_FREQUENCY(gt)	___I915_PMU_OTHER(gt, 1)
-#define __I915_PMU_INTERRUPTS(gt)		___I915_PMU_OTHER(gt, 2)
-#define __I915_PMU_RC6_RESIDENCY(gt)		___I915_PMU_OTHER(gt, 3)
-#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt)	___I915_PMU_OTHER(gt, 4)
+/* GT specific counters */
+#define ____I915_PMU_OTHER(gt, x) ___I915_PMU_OTHER(((gt) + 1), x)
+
+#define __I915_PMU_ACTUAL_FREQUENCY(gt)		____I915_PMU_OTHER(gt, 0)
+#define __I915_PMU_REQUESTED_FREQUENCY(gt)	____I915_PMU_OTHER(gt, 1)
+#define __I915_PMU_INTERRUPTS(gt)		____I915_PMU_OTHER(gt, 2)
+#define __I915_PMU_RC6_RESIDENCY(gt)		____I915_PMU_OTHER(gt, 3)
+#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt)	____I915_PMU_OTHER(gt, 4)
 
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */