[1/1] drm/i915/guc: Fix GuC error capture sizing estimation and reporting

Message ID 20220930010507.108296-2-alan.previn.teres.alexis@intel.com (mailing list archive)
State New, archived
Series Fix Guc-Err-Capture sizing warning

Commit Message

Teres Alexis, Alan Previn Sept. 30, 2022, 1:05 a.m. UTC
During GuC error capture initialization, we estimate the size we
need for the error-capture region of the shared GuC log buffer.
This calculation was incorrect, so fix it. Additionally, if the size
is insufficient, don't drm_warn or drm_notice, just drm_debug, since
actually running out of space based on that estimation is a corner
case. It can only occur if all engine instances get reset all at once
and i915 isn't able to extract the capture data fast enough within the
G2H handler worker.

Fixes: d7c15d76a5547 ("drm/i915/guc: Check sizing of guc_capture output")

Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)
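
For context, a minimal self-contained sketch of the estimation bug being fixed (illustrative only:
PAGE_SIZE and the two struct sizes below are stand-in assumptions, not the driver's definitions).
The key point, visible in the diff, is that intel_guc_capture_getlistsize() already returns a
page-aligned byte count, but the old estimator accumulated those byte counts into a variable named
num_regs and then multiplied the total by sizeof(struct guc_mmio_reg) a second time:

#include <stdio.h>

/* Stand-ins for the driver's macros/structs; the sizes are assumptions. */
#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define SIZEOF_CAPTURE_LIST_HDR 4UL  /* ~sizeof(struct guc_debug_capture_list) */
#define SIZEOF_MMIO_REG 16UL         /* ~sizeof(struct guc_mmio_reg) */

/* Models intel_guc_capture_getlistsize(): the result is already a
 * byte count (list header + N register entries, page aligned). */
static unsigned long list_size(unsigned long num_regs)
{
	return PAGE_ALIGN(SIZEOF_CAPTURE_LIST_HDR +
			  num_regs * SIZEOF_MMIO_REG);
}

int main(void)
{
	unsigned long bytes = list_size(100);

	/* Old estimator: treated the byte count as a register count and
	 * multiplied by the register-struct size once more. */
	unsigned long old_est = bytes * SIZEOF_MMIO_REG;

	/* Fixed estimator: sums the byte counts directly. */
	unsigned long new_est = bytes;

	printf("old=%lu new=%lu (inflated %lux)\n",
	       old_est, new_est, old_est / new_est);
	return 0;
}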

Comments

John Harrison Sept. 30, 2022, 5:48 p.m. UTC | #1
On 9/29/2022 18:05, Alan Previn wrote:
> During GuC error capture initialization, we estimate the size we
> need for the error-capture region of the shared GuC log buffer.
> This calculation was incorrect, so fix it. Additionally, if the size
> is insufficient, don't drm_warn or drm_notice, just drm_debug, since
> actually running out of space based on that estimation is a corner
> case. It can only occur if all engine instances get reset all at once
> and i915 isn't able to extract the capture data fast enough within the
> G2H handler worker.
>
> Fixes: d7c15d76a5547 ("drm/i915/guc: Check sizing of guc_capture output")
>
> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
> ---
>   .../gpu/drm/i915/gt/uc/intel_guc_capture.c    | 31 ++++++++++++-------
>   1 file changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index 8f1165146013..cb62507c91ce 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> @@ -502,8 +502,9 @@ intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 cl
>   	if (!num_regs)
>   		return -ENODATA;
>   
> -	*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> -			   (num_regs * sizeof(struct guc_mmio_reg)));
> +	if (size)
> +		*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
> +				   (num_regs * sizeof(struct guc_mmio_reg)));
>   
>   	return 0;
>   }
> @@ -606,7 +607,7 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
> -	int worst_min_size = 0, num_regs = 0;
> +	int worst_min_size = 0;
>   	size_t tmp = 0;
>   
>   	if (!guc->capture)
> @@ -628,20 +629,18 @@ guc_capture_output_min_size_est(struct intel_guc *guc)
>   					 (3 * sizeof(struct guc_state_capture_header_t));
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
>   						   engine->class, &tmp)) {
> -			num_regs += tmp;
> +			worst_min_size += tmp;
>   		}
>   	}
>   
> -	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
> -
>   	return worst_min_size;
>   }
>   
> @@ -658,15 +657,23 @@ static void check_guc_capture_size(struct intel_guc *guc)
>   	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
>   	u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
>   
> +	/*
> +	 * Don't drm_warn or drm_err here on "possible" insufficient size because we only run out
> +	 * of space if all engines were to suffer resets all at once AND the driver is not able to
> +	 * extract that data fast enough in the interrupt handler worker (moving them to the
> +	 * larger pool of pre-allocated capture nodes). If GuC does run out of space, we will
> +	 * print an error when processing the G2H event capture-notification (search for
> +	 * "INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE").
> +	 */
>   	if (min_size < 0)
>   		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
>   			 min_size);
>   	else if (min_size > buffer_size)
> -		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
> -			 buffer_size, min_size);
> +		drm_dbg(&i915->drm, "GuC error state capture buffer may be too small: %d < %d\n",
> +			buffer_size, min_size);
Isn't min_size the bare minimum to get a valid capture? Surely this 
still needs to be a warning not a debug. If we can't manage a basic 
working error capture then there is a problem. This needs to be caught 
by CI and logged as a bug if it is ever hit. And that means an end user 
should never see it fire because we won't let a driver out the door 
unless the default buffer size is sufficient.

John.

>   	else if (spare_size > buffer_size)
> -		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
> -			   buffer_size, spare_size, min_size);
> +		drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
> +			buffer_size, spare_size, min_size);
>   }
>   
>   /*
Teres Alexis, Alan Previn Sept. 30, 2022, 9:08 p.m. UTC | #2
I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
would just discard the ones after the first one, so it wouldn't even matter if we did catch it.

But I'll go ahead and re-rev this.
...alan

On Fri, 2022-09-30 at 10:48 -0700, Harrison, John C wrote:
> Isn't min_size the bare minimum to get a valid capture? Surely this 
> still needs to be a warning not a debug. If we can't manage a basic 
> working error capture then there is a problem. This needs to be caught 
> by CI and logged as a bug if it is ever hit. And that means an end user 
> should never see it fire because we won't let a driver out the door 
> unless the default buffer size is sufficient.
John Harrison Sept. 30, 2022, 10:35 p.m. UTC | #3
On 9/30/2022 14:08, Teres Alexis, Alan Previn wrote:
> I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
> point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
> would just discard the ones after the first one, so it wouldn't even matter if we did catch it.
So min_size is not actually the minimal size for a meaningful capture? 
So what is? And remember that for compute class engines, there is 
dependent engine reset. So a reset of CCS2 also means a reset of RCS, 
CCS0, CCS1 and CCS3. So having at least four engines per capture is not 
unreasonable.

It seems pointless to go through a lot of effort to calculate the 
minimum and recommended sizes only to basically ignore them by just 
whispering very, very quietly that there might be a problem. It also 
seems pointless to complain about a minimum size that actually isn't the 
minimum size. That's sort of worse - now you are telling the user there 
is a problem when really there isn't.

IMHO, the min_size check should be meaningful and should be visible to 
the user if it fails.

Also, are we still hitting the minimum size failure message? Now that 
the calculation has been fixed, what sizes does it come up with for min 
and spare? Are they within the allocation now or not?

John.


> But I'll go ahead and re-rev this.
> ...alan
>
> On Fri, 2022-09-30 at 10:48 -0700, Harrison, John C wrote:
>> Isn't min_size the bare minimum to get a valid capture? Surely this
>> still needs to be a warning not a debug. If we can't manage a basic
>> working error capture then there is a problem. This needs to be caught
>> by CI and logged as a bug if it is ever hit. And that means an end user
>> should never see it fire because we won't let a driver out the door
>> unless the default buffer size is sufficient.
Teres Alexis, Alan Previn Oct. 3, 2022, 6:28 p.m. UTC | #4
On Fri, 2022-09-30 at 15:35 -0700, Harrison, John C wrote:
> On 9/30/2022 14:08, Teres Alexis, Alan Previn wrote:
> > I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
> > point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
> > would just discard the ones after the first one, so it wouldn't even matter if we did catch it.
> So min_size is not actually the minimal size for a meaningful capture? 
> So what is? And remember that for compute class engines, there is 
> dependent engine reset. So a reset of CCS2 also means a reset of RCS, 
> CCS0, CCS1 and CCS3. So having at least four engines per capture is not 
> unreasonable.
> 
Alan: min_size is a meaningful size for the worst-case scenario of collecting the guc-report. However, due to
gpu-core-dump, it's not meaningful in terms of reporting that information out to the user. That's not a limitation of
this subsystem.

> It seems pointless to go through a lot of effort to calculate the 
> minimum and recommended sizes only to basically ignore them by just 
> whispering very, very quietly that there might be a problem.
> 
Alan: I already responded that I will re-rev as per your recommendation and switch back to drm_notice.

> It also 
> seems pointless to complain about a minimum size that actually isn't the 
> minimum size. That's sort of worse - now you are telling the user there 
> is a problem when really there isn't.
> 
Alan: the min size is accurate - but whether we report to the user about them possibly facing a problem is where it
gets a bit unclear because of the limitation in the gpu-core-dump framework. We could drop the message completely if
you like - but then we'd have to remember to re-add it if we fix gpu-core-dump in the future. For now, as I mentioned
in the last reply, I have already changed it back to "notice" as per your last comment. Perhaps you should have looked
at rev2, which was posted before your responses above. As mentioned in my last reply, I disagreed but I am committing
to your request, which was fixed in rev2.

> IMHO, the min_size check should be meaningful and should be visible to 
> the user if it fails.
> 
> Also, are we still hitting the minimum size failure message? Now that 
> the calculation has been fixed, what sizes does it come up with for min 
> and spare? Are they within the allocation now or not?
> 
Yes - we are within the allocation with this patch (the fix of removing the redundant register-struct-size
multiplication brought the number down significantly).

> John.

So how would you like to proceed? Could we reply on rev2 btw?
John Harrison Oct. 3, 2022, 7:47 p.m. UTC | #5
On 10/3/2022 11:28, Teres Alexis, Alan Previn wrote:
> On Fri, 2022-09-30 at 15:35 -0700, Harrison, John C wrote:
>> On 9/30/2022 14:08, Teres Alexis, Alan Previn wrote:
>>> I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
>>> point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
>>> would just discard the ones after the first one, so it wouldn't even matter if we did catch it.
>> So min_size is not actually the minimal size for a meaningful capture?
>> So what is? And remember that for compute class engines, there is
>> dependent engine reset. So a reset of CCS2 also means a reset of RCS,
>> CCS0, CCS1 and CCS3. So having at least four engines per capture is not
>> unreasonable.
>>
> Alan: min_size is a meaningful size for the worst-case scenario of collecting the guc-report. However, due to
> gpu-core-dump, it's not meaningful in terms of reporting that information out to the user. That's not a limitation of
> this subsystem.
I'm not following what you mean about gpu-core-dump. The point of the 
warning is to let the user know that the interface they want to use 
(error capture via sysfs) might not be complete in the majority of 
cases. If there is some catastrophic worst case scenario that overflows 
but 99% of instances are fine then that's what the drm_notice about 
'spare_size' is for. But if 99% of instances of a capture are going 
to overflow then that is what the drm_warn about 'min_size' should be for.

>> It seems pointless to go through a lot of effort to calculate the
>> minimum and recommended sizes only to basically ignore them by just
>> whispering very, very quietly that there might be a problem.
>>
> Alan: I already responded that I will re-rev as per your recommendation and switch back to drm_notice.
>
>> It also
>> seems pointless to complain about a minimum size that actually isn't the
>> minimum size. That's sort of worse - now you are telling the user there
>> is a problem when really there isn't.
>>
> Alan: the min size is accurate - but whether we report to the user about them possibly facing a problem is where it
> gets a bit unclear because of the limitation in the gpu-core-dump framework. We could drop the message completely if
> you like - but then we'd have to remember to re-add it if we fix gpu-core-dump in the future. For now, as I mentioned
What limitation in the 'gpu-core-dump framework'? What is the fix required?

> in the last reply, I have already changed it back to "notice" as per your last comment. Perhaps you should have looked
> at rev2, which was posted before your responses above. As mentioned in my last reply, I disagreed but I am committing
> to your request, which was fixed in rev2.
The point of a code review is not "I say X so you must do X, immediately 
post a new revision now". I asked some questions. I stated my opinion 
about what the end user should see. If you think your implementation 
already matches that or you disagree because you think I am basing my 
comments on incorrect information, or even just that you disagree with 
my reasoning, then you should not blindly post a new revision saying 
"here are your changes, I don't like it because Y but just r-b it and 
I'll merge". You should reply to the comments with your thoughts and 
suggestions. Maybe the reviewer is wrong!

>> IMHO, the min_size check should be meaningful and should be visible to
>> the user if it fails.
>>
>> Also, are we still hitting the minimum size failure message? Now that
>> the calculation has been fixed, what sizes does it come up with for min
>> and spare? Are they within the allocation now or not?
>>
> Yes - we are within the allocation with this patch (the fix of removing the redundant register-struct-size
> multiplication brought the number down significantly).
Bringing in comment from other thread:
> Some context: with the calculation fix we are allocating 4MB but we
> only need 78k as min-est.

Wow! That is quite a bit smaller. And much more plausible! So if 
min_size is 78KB, what is spare_size now?

Sounds like we can drop it down to a 1MB allocation. And even if 
min_size is not the absolute minimum but quite a bit more worst case 
kind of size, it still seems reasonable to keep it as a warning rather 
than a notice. Given that it is so much smaller than the allocation 
size, if it does somehow overflow on some platform/configuration then 
that sounds like something we should know about because it likely means 
something is broken.


>> John.
> So how would you like to proceed? Could we reply on rev2 btw?
I would like to answer the questions/concerns before jumping into 
writing/re-writing code.

Why split the email thread? The discussion is already happening here. 
There is no point splitting a single discussion across multiple patch 
sets just because a new patch has been posted if that patch does not 
actually change the discussion.

John.
Teres Alexis, Alan Previn Oct. 3, 2022, 9:10 p.m. UTC | #6
On Mon, 2022-10-03 at 12:47 -0700, Harrison, John C wrote:
> On 10/3/2022 11:28, Teres Alexis, Alan Previn wrote:
> > On Fri, 2022-09-30 at 15:35 -0700, Harrison, John C wrote:
> > > On 9/30/2022 14:08, Teres Alexis, Alan Previn wrote:
> > > > I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
> > > > point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
> > > > would just discard the ones after the first one, so it wouldn't even matter if we did catch it.
> > > So min_size is not actually the minimal size for a meaningful capture?
> > > So what is? And remember that for compute class engines, there is
> > > dependent engine reset. So a reset of CCS2 also means a reset of RCS,
> > > CCS0, CCS1 and CCS3. So having at least four engines per capture is not
> > > unreasonable.
> > > 
> > Alan: min_size is a meaningful size for the worst-case scenario of collecting the guc-report. However, due to
> > gpu-core-dump, it's not meaningful in terms of reporting that information out to the user. That's not a limitation of
> > this subsystem.
> I'm not following what you mean about gpu-core-dump. The point of the 
> warning is to let the user know that the interface they want to use 
> (error capture via sysfs) might not be complete in the majority of 
> cases. If there is some catastrophic worst case scenario that overflows 
> but 99% of instances are fine then that's what the drm_notice about 
> 'spare_size' is for. But if 99% of instances of a capture are going 
> to overflow then that is what the drm_warn about 'min_size' should be for.
> 
> > > It seems pointless to go through a lot of effort to calculate the
> > > minimum and recommended sizes only to basically ignore them by just
> > > whispering very, very quietly that there might be a problem.
> > > 
> > Alan: I already responded that I will re-rev as per your recommendation and switch back to drm_notice.
> > 
> > > It also
> > > seems pointless to complain about a minimum size that actually isn't the
> > > minimum size. That's sort of worse - now you are telling the user there
> > > is a problem when really there isn't.
> > > 
> > Alan: the min size is accurate - but whether we report to the user about them possibly facing a problem is where it
> > gets a bit unclear because of the limitation in the gpu-core-dump framework. We could drop the message completely if
> > you like - but then we'd have to remember to re-add it if we fix gpu-core-dump in the future. For now, as I mentioned
> What limitation in the 'gpu-core-dump framework'? What is the fix required?
> 
Unless it has changed since I last looked at it, the gpu_coredump framework will store the first error-state captured and
wait until the user extracts it via sysfs. Any other error-state capture that gets created before that prior one was
cleared by the user gets dropped. I'm not sure if that limitation needs to be "fixed". I am not sure of its history -
maybe it was designed that way for a reason. ATM I don't have the bandwidth to chase that history down.

> > in the last reply, I have already changed it back to "notice" as per your last comment. Perhaps you should have looked
> > at rev2, which was posted before your responses above. As mentioned in my last reply, I disagreed but I am committing
> > to your request, which was fixed in rev2.
> The point of a code review is not "I say X so you must do X, immediately 
> post a new revision now". I asked some questions. I stated my opinion 
> about what the end user should see. If you think your implementation 
> already matches that or you disagree because you think I am basing my 
> comments on incorrect information, or even just that you disagree with 
> my reasoning, then you should not blindly post a new revision saying 
> "here are your changes, I don't like it because Y but just r-b it and 
> I'll merge". You should reply to the comments with your thoughts and 
> suggestions. Maybe the reviewer is wrong!
> 
I think this comes down to preference as there is no clear rule about what is right vs what is wrong in this case:

	- We have a possible issue, but it's a corner case when all engine instances suffer a reset all at once.
	- In the event of that corner case occurring, the end-user could go back and increase the guc-log-capture-region
size, and thus the warning will go away and GuC will have space to report the error-state dump for all engine
instances if they got reset all at once (within the same G2H IRQ event).
	- Assuming the issue is reproducible, the user, now happy that the warning is resolved, reruns the workload. The
user dumps the first error-state capture via sysfs, which only contains info for the first engine that was reset (as it
appeared in the G2H-CTB). The user inspects the same sysfs, and none of the other dmesg-reported engine-hang
error-state dumps seem to be made available in the sysfs after clearing that first one.

So the preference here becomes "should we warn the user about this possible event of running out of guc-error-state
capture region when resolving it still doesn't allow the user to get it anyway"... OR "should we just make a quiet debug
message about it, since an i915 developer knows that the gpu_coredump framework design can only ever store one
error-state capture dump and throws the rest away until it's cleared by the user via sysfs".

My preference is the latter. However, just to be clear, with the fixed calculation, we'll probably never see the
warning (or debug) message with the HW that we have today or in the foreseeable future.

> > > IMHO, the min_size check should be meaningful and should be visible to
> > > the user if it fails.
> > > 
> > > Also, are we still hitting the minimum size failure message? Now that
> > > the calculation has been fixed, what sizes does it come up with for min
> > > and spare? Are they within the allocation now or not?
> > > 
> > Yes - we are within the allocation with this patch (the fix of removing the redundant register-struct-size
> > multiplication brought the number down significantly).
> Bringing in comment from other thread:
> > Some context: with the calculation fix we are allocating 4MB but we
> > only need 78k as min-est.
> 
> Wow! That is quite a bit smaller. And much more plausible! So if 
> min_size is 78KB, what is spare_size now?
> 
Around 230K. (The min_est was exactly 78504 bytes, or 76.664 KiB - that was based on a DG2 - I think it was a 384, can't
recall for sure.)
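
(As a cross-check: check_guc_capture_size() derives spare_size as min_size times GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
assuming that multiplier is 3, 78504 * 3 = 235512 bytes, i.e. roughly 230K, which matches the figure above.)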

> Sounds like we can drop it down to a 1MB allocation.
> 
Okay - will do a 3rd rev with a 1MB change.

> And even if 
> min_size is not the absolute minimum but quite a bit more worst case 
> kind of size, it still seems reasonable to keep it as a warning rather 
> than a notice. Given that it is so much smaller than the allocation 
> size, if it does somehow overflow on some platform/configuration then 
> that sounds like something we should know about because it likely means 
> something is broken.

Okay - this works fine too since we will probably never trigger that message until we have significantly more
engine instances than today's hw.
John Harrison Oct. 3, 2022, 11:51 p.m. UTC | #7
On 10/3/2022 14:10, Teres Alexis, Alan Previn wrote:
> On Mon, 2022-10-03 at 12:47 -0700, Harrison, John C wrote:
>> On 10/3/2022 11:28, Teres Alexis, Alan Previn wrote:
>>> On Fri, 2022-09-30 at 15:35 -0700, Harrison, John C wrote:
>>>> On 9/30/2022 14:08, Teres Alexis, Alan Previn wrote:
>>>>> I disagree because it's unlikely that all engines can reset all at once (we probably have bigger problems at that
>>>>> point), and if they all occurred within the same G2H handler scheduled worker run, our current gpu_coredump framework
>>>>> would just discard the ones after the first one, so it wouldn't even matter if we did catch it.
>>>> So min_size is not actually the minimal size for a meaningful capture?
>>>> So what is? And remember that for compute class engines, there is
>>>> dependent engine reset. So a reset of CCS2 also means a reset of RCS,
>>>> CCS0, CCS1 and CCS3. So having at least four engines per capture is not
>>>> unreasonable.
>>>>
>>> Alan: min_size is a meaningful size for the worst-case scenario of collecting the guc-report. However, due to
>>> gpu-core-dump, it's not meaningful in terms of reporting that information out to the user. That's not a limitation of
>>> this subsystem.
>> I'm not following what you mean about gpu-core-dump. The point of the
>> warning is to let the user know that the interface they want to use
>> (error capture via sysfs) might not be complete in the majority of
>> cases. If there is some catastrophic worst case scenario that overflows
>> but 99% of instances are fine then that's what the drm_notice about
>> 'spare_size' is for. But if 99% of instances of a capture are going
>> to overflow then that is what the drm_warn about 'min_size' should be for.
>>
>>>> It seems pointless to go through a lot of effort to calculate the
>>>> minimum and recommended sizes only to basically ignore them by just
>>>> whispering very, very quietly that there might be a problem.
>>>>
>>> Alan: I already responded that I will re-rev as per your recommendation and switch back to drm_notice.
>>>
>>>> It also
>>>> seems pointless to complain about a minimum size that actually isn't the
>>>> minimum size. That's sort of worse - now you are telling the user there
>>>> is a problem when really there isn't.
>>>>
>>> Alan: the min size is accurate - but whether we report to the user about them possibly facing a problem is where it
>>> gets a bit unclear because of the limitation in the gpu-core-dump framework. We could drop the message completely if
>>> you like - but then we'd have to remember to re-add it if we fix gpu-core-dump in the future. For now, as I mentioned
>> What limitation in the 'gpu-core-dump framework'? What is the fix required?
>>
> Unless it has changed since I last looked at it, the gpu_coredump framework will store the first error-state captured and
> wait until the user extracts it via sysfs. Any other error-state capture that gets created before that prior one was
> cleared by the user gets dropped. I'm not sure if that limitation needs to be "fixed". I am not sure of its history -
> maybe it was designed that way for a reason. ATM I don't have the bandwidth to chase that history down.
That is by design. Generally speaking, you want to see the first error 
that occurred. If there were a bunch back to back then likely the first 
one is the real problem and the rest are either symptoms or repeat 
offences of the first.

Also, in the case where a user doesn't know what an error capture is but 
they are running (or developing) some app that keeps dying and produces 
core dumps, you can't just swallow all system memory in a linked list of 
error captures that won't ever be read out and freed up.

Hence the design is to save the first and hold on to it until read out. 
And if any further hangs happen before the current one is cleared, they 
are discarded as uninteresting if not plain old spam.
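
For illustration, a minimal userspace model of that keep-first policy (the names and structure here
are hypothetical stand-ins, not the actual i915 gpu_coredump code):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for a saved error-state capture. */
struct capture {
	char desc[64];
};

static struct capture *first_error; /* at most one capture is held */

/* A new hang is recorded only if no capture is pending read-out;
 * later hangs are discarded, mirroring the design described above. */
static void report_hang(const char *desc)
{
	if (first_error) {
		fprintf(stderr, "capture pending, discarding: %s\n", desc);
		return;
	}
	first_error = malloc(sizeof(*first_error));
	if (!first_error)
		return;
	snprintf(first_error->desc, sizeof(first_error->desc), "%s", desc);
}

/* Models the sysfs read + clear: hand the capture to the user and
 * free the slot so the next hang can be recorded. */
static void read_and_clear(void)
{
	if (!first_error)
		return;
	printf("user read capture: %s\n", first_error->desc);
	free(first_error);
	first_error = NULL;
}

int main(void)
{
	report_hang("rcs0 hang"); /* stored */
	report_hang("ccs0 hang"); /* discarded: first not yet read out */
	read_and_clear();         /* slot cleared, as via sysfs */
	report_hang("vcs0 hang"); /* stored again */
	read_and_clear();
	return 0;
}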

>
>>> in the last reply, I have already changed it back to "notice" as per your last comment. Perhaps you should have looked
>>> at rev2, which was posted before your responses above. As mentioned in my last reply, I disagreed but I am committing
>>> to your request, which was fixed in rev2.
>> The point of a code review is not "I say X so you must do X, immediately
>> post a new revision now". I asked some questions. I stated my opinion
>> about what the end user should see. If you think your implementation
>> already matches that or you disagree because you think I am basing my
>> comments on incorrect information, or even just that you disagree with
>> my reasoning, then you should not blindly post a new revision saying
>> "here are your changes, I don't like it because Y but just r-b it and
>> I'll merge". You should reply to the comments with your thoughts and
>> suggestions. Maybe the reviewer is wrong!
>>
> I think this comes down to preference as there is no clear rule about what is right vs what is wrong in this case:
>
> 	- We have a possible issue, but it's a corner case when all engine instances suffer a reset all at once.
> 	- In the event of that corner case occurring, the end-user could go back and increase the guc-log-capture-region
> size, and thus the warning will go away and GuC will have space to report the error-state dump for all engine
> instances if they got reset all at once (within the same G2H IRQ event).
> 	- Assuming the issue is reproducible, the user, now happy that the warning is resolved, reruns the workload. The
> user dumps the first error-state capture via sysfs, which only contains info for the first engine that was reset (as it
> appeared in the G2H-CTB). The user inspects the same sysfs, and none of the other dmesg-reported engine-hang
> error-state dumps seem to be made available in the sysfs after clearing that first one.
>
> So the preference here becomes "should we warn the user about this possible event of running out of guc-error-state
> capture region when resolving it still doesn't allow the user to get it anyway"... OR "should we just make a quiet debug
> message about it, since an i915 developer knows that the gpu_coredump framework design can only ever store one
> error-state capture dump and throws the rest away until it's cleared by the user via sysfs".
>
> My preference is the latter. However, just to be clear, with the fixed calculation, we'll probably never see the
> warning (or debug) message with the HW that we have today or in the foreseeable future.

The warning isn't meant to be for i915 developers who understand bizarre 
internal workings of the driver. The warning is meant to be for end 
users who might not be able to debug their applications.

Given that the error capture interface is fundamentally only really 
concerned with obtaining a single valid capture at a time, I would 
argue that supporting a dozen back-to-back captures is no more than a 
nice-to-have convenience for a very rare situation (such as the user 
having explicitly modified their kernel to save all the captures in a 
list). And that back-to-back scenario is the 'spare_size', not 
'min_size', isn't it? So there is no need for a warning about that. A 
notice is fine but not a warning.

The thing that really, really, really matters to an end user is that the 
first capture of a session is complete and useful. So my view is that 
any warning message should be focused on that. Can we get one complete 
error capture for one complete set of engines? If not, that is something 
we should warn about. And that is what 'min_size' should be.

Note that 'one complete set of engines' really only means five on 
current hardware - RCS + CCS0-3 is the most that can reset at one time 
(due to dependent engine resets) given that GuC-managed resets are 
context based and a context can only run on one engine. Although I think 
it is technically possible for more engines to be involved, that 
would also count as a hard-to-hit corner case. And given the small size 
involved, it might be simplest just to base min_size on all engines anyway.



>>>> IMHO, the min_size check should be meaningful and should be visible to
>>>> the user if it fails.
>>>>
>>>> Also, are we still hitting the minimum size failure message? Now that
>>>> the calculation has been fixed, what sizes does it come up with for min
>>>> and spare? Are they within the allocation now or not?
>>>>
>>> Yes - we are within the allocation with this patch (the fix of removing the redundant register-struct-size
>>> multiplication brought the number down significantly).
>> Bringing in comment from other thread:
>> > Some context: with the calculation fix we are allocating 4MB but we
>> > only need 78k as min-est.
>>
>> Wow! That is quite a bit smaller. And much more plausible! So if
>> min_size is 78KB, what is spare_size now?
>>
> Around 230K. (The min_est was exactly 78504 bytes, or 76.664 KiB - that was based on a DG2 - I think it was a 384, can't
> recall for sure.)
Might be worth checking on PVC. That could be getting close to 1MB with 
all the extra blitter engines.

>
>> Sounds like we can drop it down to a 1MB allocation.
>>
> Okay - will do a 3rd rev with a 1MB change.
Don't start writing anything yet. Let's get to a final decision on the 
plan first.

>
>> And even if
>> min_size is not the absolute minimum but quite a bit more worst case
>> kind of size, it still seems reasonable to keep it as a warning rather
>> than a notice. Given that it is so much smaller than the allocation
>> size, if it does somehow overflow on some platform/configuration then
>> that sounds like something we should know about because it likely means
>> something is broken.
> Okay - this works fine too since we will probably never trigger that message until we have significantly more
> engine instances than today's hw.
See above about PVC.

John.
Teres Alexis, Alan Previn Oct. 4, 2022, 12:46 a.m. UTC | #8
So as per the last response and the offline conversation we had, we agreed that:

1. We shall stick with drm_warn( ... maybe too small...) if the allocation didn't meet min_size.
2. I'll model for PVC, since it's better to look at the spec as opposed to trying to hunt for a free machine with the
most engines and DSS (for those steering registers that are counted multiple times).
3. If #2 yields substantial headroom (i.e. a modeled PVC would be less than 700K min_size), then let's drop to a
1MB allocation.

...alan
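
For reference, a rough sketch of the severity mapping that point 1 implies for
check_guc_capture_size(), modeled as plain C (the multiplier value of 3 and the message text are
assumptions based on the code quoted above; the actual rev3 patch may differ):

#include <stdio.h>

#define MULTIPLIER 3 /* assumed GUC_CAPTURE_OVERBUFFER_MULTIPLIER */

/* Warn (not dbg) when the buffer is below the worst-case minimum;
 * notice when it merely lacks the recommended spare headroom. */
static void check_capture_size(int min_size, unsigned int buffer_size)
{
	int spare_size = min_size * MULTIPLIER;

	if (min_size < 0)
		printf("warn: failed to calculate min size: %d\n", min_size);
	else if ((unsigned int)min_size > buffer_size)
		printf("warn: capture buffer maybe too small: %u < %d\n",
		       buffer_size, min_size);
	else if ((unsigned int)spare_size > buffer_size)
		printf("notice: buffer lacks spare headroom: %u < %d (min=%d)\n",
		       buffer_size, spare_size, min_size);
}

int main(void)
{
	check_capture_size(78504, 1024 * 1024);  /* DG2-like: silent */
	check_capture_size(115000, 1024 * 1024); /* PVC-like: silent */
	check_capture_size(78504, 64 * 1024);    /* undersized: warns */
	return 0;
}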


John Harrison Oct. 4, 2022, 1:51 a.m. UTC | #9
On 10/3/2022 17:46, Teres Alexis, Alan Previn wrote:
> So as per the last response and the offline conversation we had, we agreed that:
>
> 1. We shall stick with drm_warn( ... maybe too small...) if the allocation didn't meet min_size.
> 2. I'll model for PVC, since it's better to look at the spec as opposed to trying to hunt for a free machine with the
> most engines and DSS (for those steering registers that are counted multiple times).
> 3. If #2 yields substantial headroom (i.e. a modeled PVC would be less than 700K min_size), then let's drop to a
> 1MB allocation.
>
> ...alan
Sounds good to me.

John.

Teres Alexis, Alan Previn Oct. 5, 2022, 11:31 p.m. UTC | #10
Hi John, I tested on a real PVC and the estimated min size needed was ~115K. Even without modelling, we can safely say
that an imaginary device with a tile that is 4x bigger would still be less than half of 1 MB. That said, I shall proceed
with a re-rev that shall include dropping the size of the guc-log-error-capture region down to 1MB, in addition to
sticking with the "drm_warn(... maybe too small)" and of course rebasing to latest drm-tip. I shall rerun some tests
just in case - so will get it out later this week.
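
(Cross-check: with the same assumed GUC_CAPTURE_OVERBUFFER_MULTIPLIER of 3, a ~115K min_size implies
a spare_size of ~345K, still comfortably inside the proposed 1MB region.)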

...alan

On Mon, 2022-10-03 at 18:51 -0700, Harrison, John C wrote:
> On 10/3/2022 17:46, Teres Alexis, Alan Previn wrote:
> > So as per the last response and the offline conversation we had, we agreed that:
> > 
> > 1. We shall stick with drm_warn( ... maybe too small...) if the allocation didn't meet min_size.
> > 2. I'll model for PVC, since it's better to look at the spec as opposed to trying to hunt for a free machine with the
> > most engines and DSS (for those steering registers that are counted multiple times).
> > 3. If #2 yields substantial headroom (i.e. a modeled PVC would be less than 700K min_size), then let's drop to a
> > 1MB allocation.
> > 
> > ...alan
> Sounds good to me.
> 
> John.
> 

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 8f1165146013..cb62507c91ce 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -502,8 +502,9 @@  intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, u32 cl
 	if (!num_regs)
 		return -ENODATA;
 
-	*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
-			   (num_regs * sizeof(struct guc_mmio_reg)));
+	if (size)
+		*size = PAGE_ALIGN((sizeof(struct guc_debug_capture_list)) +
+				   (num_regs * sizeof(struct guc_mmio_reg)));
 
 	return 0;
 }
@@ -606,7 +607,7 @@  guc_capture_output_min_size_est(struct intel_guc *guc)
 	struct intel_gt *gt = guc_to_gt(guc);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	int worst_min_size = 0, num_regs = 0;
+	int worst_min_size = 0;
 	size_t tmp = 0;
 
 	if (!guc->capture)
@@ -628,20 +629,18 @@  guc_capture_output_min_size_est(struct intel_guc *guc)
 					 (3 * sizeof(struct guc_state_capture_header_t));
 
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp))
-			num_regs += tmp;
+			worst_min_size += tmp;
 
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
 						   engine->class, &tmp)) {
-			num_regs += tmp;
+			worst_min_size += tmp;
 		}
 		if (!intel_guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
 						   engine->class, &tmp)) {
-			num_regs += tmp;
+			worst_min_size += tmp;
 		}
 	}
 
-	worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
-
 	return worst_min_size;
 }
 
@@ -658,15 +657,23 @@  static void check_guc_capture_size(struct intel_guc *guc)
 	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
 	u32 buffer_size = intel_guc_log_section_size_capture(&guc->log);
 
+	/*
+	 * Don't drm_warn or drm_err here on "possible" insufficient size because we only run out
+	 * of space if all engines were to suffer resets all at once AND the driver is not able to
+	 * extract that data fast enough in the interrupt handler worker (moving them to the
+	 * larger pool of pre-allocated capture nodes). If GuC does run out of space, we will
+	 * print an error when processing the G2H event capture-notification (search for
+	 * "INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE").
+	 */
 	if (min_size < 0)
 		drm_warn(&i915->drm, "Failed to calculate GuC error state capture buffer minimum size: %d!\n",
 			 min_size);
 	else if (min_size > buffer_size)
-		drm_warn(&i915->drm, "GuC error state capture buffer is too small: %d < %d\n",
-			 buffer_size, min_size);
+		drm_dbg(&i915->drm, "GuC error state capture buffer may be too small: %d < %d\n",
+			buffer_size, min_size);
 	else if (spare_size > buffer_size)
-		drm_notice(&i915->drm, "GuC error state capture buffer maybe too small: %d < %d (min = %d)\n",
-			   buffer_size, spare_size, min_size);
+		drm_dbg(&i915->drm, "GuC error state capture buffer lacks spare size: %d < %d (min = %d)\n",
+			buffer_size, spare_size, min_size);
 }
 
 /*