diff mbox series

[06/19] drm/i915/perf: Use helpers to process reports w.r.t. OA buffer size

Message ID 20220823204155.8178-7-umesh.nerlige.ramappa@intel.com (mailing list archive)
State New, archived
Headers show
Series Add DG2 OA support | expand

Commit Message

Umesh Nerlige Ramappa Aug. 23, 2022, 8:41 p.m. UTC
DG2 has a new feature to supports OA buffer sizes up to 128Mb by
toggling a bit in OA_DEBUG. This would eventually be a user configurable
parameter. Use OA buffer vma size in all calculations with some helpers.

v2: Let compiler decide inline (Jani)

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 46 +++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 15 deletions(-)

Comments

Dixit, Ashutosh Sept. 14, 2022, 4:04 p.m. UTC | #1
On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 6fc4f0d8fc5a..bbf1c574f393 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header;
>
>  static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
>
> +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head)

nit: no space between * and stream.

> +{
> +	u32 size = stream->oa_buffer.vma->size;
> +
> +	return tail >= head ? tail - head : size - (head - tail);
> +}

If we are doing this we should probably eliminate references to OA_TAKEN
which serves an identical purpose (I think there is one remaining
reference) and also delete OA_TAKEN #define.

> +
> +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail,
> +			u32 rewind_delta)
> +{
> +	return rewind_delta > relative_hw_tail ?
> +	       stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) :
> +	       relative_hw_tail - rewind_delta;
> +}

Also are we really saying here that we are supporting non-power-of-2 OA
buffer sizes? Because if we stayed with power-of-2 sizes the expression
above are nice and elegant and actually closer to the previous code being
changed in this patch. For example:

#include <linux/circ_buf.h>

static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head)
{
	return CIRC_CNT(tail, head, stream->oa_buffer.vma->size);
}

static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail,
       			u32 rewind_delta)
{
	return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size);
}

Note that for power-of-2 sizes the two functions above are identical but we
should keep them separate for clarity (as is done in the patch) since they
are serving two different functions in the OA code.

Also another assumption in the code seems to be:

	stream->oa_buffer.vma->size == OA_BUFFER_SIZE

which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer
sizes? So we might as well stick with power-of-2 sizes and change later in
a separate patch only if needed?

Thanks.
--
Ashutosh

> +
>  void i915_oa_config_release(struct kref *ref)
>  {
>	struct i915_oa_config *oa_config =
> @@ -487,12 +502,14 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>	 * sizes need not be integral multiples or 64 or powers of 2.
>	 * Compute potentially partially landed report in the OA buffer
>	 */
> -	partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail);
> +	partial_report_size =
> +		_oa_taken(stream, hw_tail, stream->oa_buffer.tail);
>	partial_report_size %= report_size;
>
>	/* Subtract partial amount off the tail */
> -	hw_tail = gtt_offset + ((hw_tail - partial_report_size) &
> -				(stream->oa_buffer.vma->size - 1));
> +	hw_tail = gtt_offset + _rewind_tail(stream,
> +					    hw_tail - gtt_offset,
> +					    partial_report_size);
>
>	now = ktime_get_mono_fast_ns();
>
> @@ -527,16 +544,16 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>		 * memory in the order they were written to.
>		 * If not : (╯°□°)╯︵ ┻━┻
>		 */
> -		while (OA_TAKEN(tail, aged_tail) >= report_size) {
> +		while (_oa_taken(stream, tail, aged_tail) >= report_size) {
>			u32 *report32 = (void *)(stream->oa_buffer.vaddr + tail);
>
>			if (report32[0] != 0 || report32[1] != 0)
>				break;
>
> -			tail = (tail - report_size) & (OA_BUFFER_SIZE - 1);
> +			tail = _rewind_tail(stream, tail, report_size);
>		}
>
> -		if (OA_TAKEN(hw_tail, tail) > report_size &&
> +		if (_oa_taken(stream, hw_tail, tail) > report_size &&
>		    __ratelimit(&stream->perf->tail_pointer_race))
>			DRM_NOTE("unlanded report(s) head=0x%x "
>				 "tail=0x%x hw_tail=0x%x\n",
> @@ -547,8 +564,9 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
>		stream->oa_buffer.aging_timestamp = now;
>	}
>
> -	pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
> -			  stream->oa_buffer.head - gtt_offset) >= report_size;
> +	pollin = _oa_taken(stream,
> +			   stream->oa_buffer.tail,
> +			   stream->oa_buffer.head) >= report_size;
>
>	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
>
> @@ -679,11 +697,9 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>	int report_size = stream->oa_buffer.format_size;
>	u8 *oa_buf_base = stream->oa_buffer.vaddr;
>	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
> -	u32 mask = (OA_BUFFER_SIZE - 1);
>	size_t start_offset = *offset;
>	unsigned long flags;
> -	u32 head, tail;
> -	u32 taken;
> +	u32 head, tail, size;
>	int ret = 0;
>
>	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
> @@ -693,6 +709,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>
>	head = stream->oa_buffer.head;
>	tail = stream->oa_buffer.tail;
> +	size = stream->oa_buffer.vma->size;
>
>	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
>
> @@ -711,16 +728,15 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>	 * all a power of two).
>	 */
>	if (drm_WARN_ONCE(&uncore->i915->drm,
> -			  head > stream->oa_buffer.vma->size ||
> -			  tail > stream->oa_buffer.vma->size,
> +			  head > size || tail > size,
>			  "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>			  head, tail))
>		return -EIO;
>
>
>	for (/* none */;
> -	     (taken = OA_TAKEN(tail, head));
> -	     head = (head + report_size) & mask) {
> +	     _oa_taken(stream, tail, head);
> +	     head = (head + report_size) % size) {
>		u8 *report = oa_buf_base + head;
>		u32 *report32 = (void *)report;
>		u32 ctx_id;
> --
> 2.25.1
>
Umesh Nerlige Ramappa Sept. 14, 2022, 6:19 p.m. UTC | #2
On Wed, Sep 14, 2022 at 09:04:10AM -0700, Dixit, Ashutosh wrote:
>On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote:
>>
>
>Hi Umesh,
>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 6fc4f0d8fc5a..bbf1c574f393 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header;
>>
>>  static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
>>
>> +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head)
>
>nit: no space between * and stream.
>
>> +{
>> +	u32 size = stream->oa_buffer.vma->size;
>> +
>> +	return tail >= head ? tail - head : size - (head - tail);
>> +}
>
>If we are doing this we should probably eliminate references to OA_TAKEN
>which serves an identical purpose (I think there is one remaining
>reference) and also delete OA_TAKEN #define.
>
>> +
>> +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail,
>> +			u32 rewind_delta)
>> +{
>> +	return rewind_delta > relative_hw_tail ?
>> +	       stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) :
>> +	       relative_hw_tail - rewind_delta;
>> +}
>
>Also are we really saying here that we are supporting non-power-of-2 OA
>buffer sizes? Because if we stayed with power-of-2 sizes the expression
>above are nice and elegant and actually closer to the previous code being
>changed in this patch. For example:
>
>#include <linux/circ_buf.h>
>
>static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head)
>{
>	return CIRC_CNT(tail, head, stream->oa_buffer.vma->size);
>}
>
>static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail,
>       			u32 rewind_delta)
>{
>	return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size);
>}
>
>Note that for power-of-2 sizes the two functions above are identical but we
>should keep them separate for clarity (as is done in the patch) since they
>are serving two different functions in the OA code.
>
>Also another assumption in the code seems to be:
>
>	stream->oa_buffer.vma->size == OA_BUFFER_SIZE
>
>which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer
>sizes? So we might as well stick with power-of-2 sizes and change later in
>a separate patch only if needed?

Most changes here are related to the OA buffer size issue and that is 
specific to xehpsdv where the size is not a power of 2. I am thinking of 
dropping these changes in the next revision since DG2 is fixed and OA 
buffer sizes are power of 2.

Thanks,
Umesh

>
>Thanks.
>--
>Ashutosh
Dixit, Ashutosh Sept. 14, 2022, 7:07 p.m. UTC | #3
On Wed, 14 Sep 2022 11:19:30 -0700, Umesh Nerlige Ramappa wrote:
>
> On Wed, Sep 14, 2022 at 09:04:10AM -0700, Dixit, Ashutosh wrote:
> > On Tue, 23 Aug 2022 13:41:42 -0700, Umesh Nerlige Ramappa wrote:
> >>
> >
> > Hi Umesh,
> >
> >> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> >> index 6fc4f0d8fc5a..bbf1c574f393 100644
> >> --- a/drivers/gpu/drm/i915/i915_perf.c
> >> +++ b/drivers/gpu/drm/i915/i915_perf.c
> >> @@ -385,6 +385,21 @@ static struct ctl_table_header *sysctl_header;
> >>
> >>  static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
> >>
> >> +static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head)
> >
> > nit: no space between * and stream.
> >
> >> +{
> >> +	u32 size = stream->oa_buffer.vma->size;
> >> +
> >> +	return tail >= head ? tail - head : size - (head - tail);
> >> +}
> >
> > If we are doing this we should probably eliminate references to OA_TAKEN
> > which serves an identical purpose (I think there is one remaining
> > reference) and also delete OA_TAKEN #define.
> >
> >> +
> >> +static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail,
> >> +			u32 rewind_delta)
> >> +{
> >> +	return rewind_delta > relative_hw_tail ?
> >> +	       stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) :
> >> +	       relative_hw_tail - rewind_delta;
> >> +}
> >
> > Also are we really saying here that we are supporting non-power-of-2 OA
> > buffer sizes? Because if we stayed with power-of-2 sizes the expression
> > above are nice and elegant and actually closer to the previous code being
> > changed in this patch. For example:
> >
> > #include <linux/circ_buf.h>
> >
> > static u32 _oa_taken(struct i915_perf_stream *stream, u32 tail, u32 head)
> > {
> >	return CIRC_CNT(tail, head, stream->oa_buffer.vma->size);
> > }
> >
> > static u32 _rewind_tail(struct i915_perf_stream *stream, u32 relative_hw_tail,
> >				u32 rewind_delta)
> > {
> >	return CIRC_CNT(relative_hw_tail, rewind_delta, stream->oa_buffer.vma->size);
> > }
> >
> > Note that for power-of-2 sizes the two functions above are identical but we
> > should keep them separate for clarity (as is done in the patch) since they
> > are serving two different functions in the OA code.
> >
> > Also another assumption in the code seems to be:
> >
> >	stream->oa_buffer.vma->size == OA_BUFFER_SIZE
> >
> > which I am pretty sure will not hold for arbitrary non-power-of-2 OA buffer
> > sizes? So we might as well stick with power-of-2 sizes and change later in
> > a separate patch only if needed?
>
> Most changes here are related to the OA buffer size issue and that is
> specific to xehpsdv where the size is not a power of 2. I am thinking of
> dropping these changes in the next revision since DG2 is fixed and OA
> buffer sizes are power of 2.

In the code stream->oa_buffer.vma->size and OA_BUFFER_SIZE are both used,
if we want to clean that up and only use stream->oa_buffer.vma->size, we
could still do soemthing like I suggested with just power-of-2 sizes and
keep this patch. If we ever have to support non-power-of-2 sizes in the
future we'll just need to change _oa_taken and _rewind_tail
functions. Anyway your call.

Thanks.
--
Ashutosh
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 6fc4f0d8fc5a..bbf1c574f393 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -385,6 +385,21 @@  static struct ctl_table_header *sysctl_header;
 
 static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
 
+static u32 _oa_taken(struct i915_perf_stream * stream, u32 tail, u32 head)
+{
+	u32 size = stream->oa_buffer.vma->size;
+
+	return tail >= head ? tail - head : size - (head - tail);
+}
+
+static u32 _rewind_tail(struct i915_perf_stream * stream, u32 relative_hw_tail,
+			u32 rewind_delta)
+{
+	return rewind_delta > relative_hw_tail ?
+	       stream->oa_buffer.vma->size - (rewind_delta - relative_hw_tail) :
+	       relative_hw_tail - rewind_delta;
+}
+
 void i915_oa_config_release(struct kref *ref)
 {
 	struct i915_oa_config *oa_config =
@@ -487,12 +502,14 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 	 * sizes need not be integral multiples or 64 or powers of 2.
 	 * Compute potentially partially landed report in the OA buffer
 	 */
-	partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail);
+	partial_report_size =
+		_oa_taken(stream, hw_tail, stream->oa_buffer.tail);
 	partial_report_size %= report_size;
 
 	/* Subtract partial amount off the tail */
-	hw_tail = gtt_offset + ((hw_tail - partial_report_size) &
-				(stream->oa_buffer.vma->size - 1));
+	hw_tail = gtt_offset + _rewind_tail(stream,
+					    hw_tail - gtt_offset,
+					    partial_report_size);
 
 	now = ktime_get_mono_fast_ns();
 
@@ -527,16 +544,16 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 		 * memory in the order they were written to.
 		 * If not : (╯°□°)╯︵ ┻━┻
 		 */
-		while (OA_TAKEN(tail, aged_tail) >= report_size) {
+		while (_oa_taken(stream, tail, aged_tail) >= report_size) {
 			u32 *report32 = (void *)(stream->oa_buffer.vaddr + tail);
 
 			if (report32[0] != 0 || report32[1] != 0)
 				break;
 
-			tail = (tail - report_size) & (OA_BUFFER_SIZE - 1);
+			tail = _rewind_tail(stream, tail, report_size);
 		}
 
-		if (OA_TAKEN(hw_tail, tail) > report_size &&
+		if (_oa_taken(stream, hw_tail, tail) > report_size &&
 		    __ratelimit(&stream->perf->tail_pointer_race))
 			DRM_NOTE("unlanded report(s) head=0x%x "
 				 "tail=0x%x hw_tail=0x%x\n",
@@ -547,8 +564,9 @@  static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 		stream->oa_buffer.aging_timestamp = now;
 	}
 
-	pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset,
-			  stream->oa_buffer.head - gtt_offset) >= report_size;
+	pollin = _oa_taken(stream,
+			   stream->oa_buffer.tail,
+			   stream->oa_buffer.head) >= report_size;
 
 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 
@@ -679,11 +697,9 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	int report_size = stream->oa_buffer.format_size;
 	u8 *oa_buf_base = stream->oa_buffer.vaddr;
 	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
-	u32 mask = (OA_BUFFER_SIZE - 1);
 	size_t start_offset = *offset;
 	unsigned long flags;
-	u32 head, tail;
-	u32 taken;
+	u32 head, tail, size;
 	int ret = 0;
 
 	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
@@ -693,6 +709,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 
 	head = stream->oa_buffer.head;
 	tail = stream->oa_buffer.tail;
+	size = stream->oa_buffer.vma->size;
 
 	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
 
@@ -711,16 +728,15 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 * all a power of two).
 	 */
 	if (drm_WARN_ONCE(&uncore->i915->drm,
-			  head > stream->oa_buffer.vma->size ||
-			  tail > stream->oa_buffer.vma->size,
+			  head > size || tail > size,
 			  "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
 			  head, tail))
 		return -EIO;
 
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
-	     head = (head + report_size) & mask) {
+	     _oa_taken(stream, tail, head);
+	     head = (head + report_size) % size) {
 		u8 *report = oa_buf_base + head;
 		u32 *report32 = (void *)report;
 		u32 ctx_id;