iommu/arm-smmu: Add stall implementation hook

Message ID 20200421202004.11686-1-saiprakash.ranjan@codeaurora.org
State New, archived
Series iommu/arm-smmu: Add stall implementation hook

Commit Message

Sai Prakash Ranjan April 21, 2020, 8:20 p.m. UTC
Add stall implementation hook to enable stalling
faults on QCOM platforms which support it without
causing any kind of hardware mishaps. Without this
on QCOM platforms, GPU faults can cause unrelated
GPU memory accesses to return zeroes. This has the
unfortunate result of command-stream reads from CP
getting invalid data, causing a cascade of failures.

Suggested-by: Rob Clark <robdclark@gmail.com>
Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
---
This has been attempted previously by Rob Clark in 2017, 2018.
Hopefully we can get something concluded in 2020.
 * https://patchwork.kernel.org/patch/9953803/
 * https://patchwork.kernel.org/patch/10618713/
---
 drivers/iommu/arm-smmu-qcom.c | 1 +
 drivers/iommu/arm-smmu.c      | 7 +++++++
 drivers/iommu/arm-smmu.h      | 1 +
 3 files changed, 9 insertions(+)

Comments

Sai Prakash Ranjan May 7, 2020, 10:14 a.m. UTC | #1
Hi Will, Robin

On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> Add stall implementation hook to enable stalling
> faults on QCOM platforms which support it without
> causing any kind of hardware mishaps. Without this
> on QCOM platforms, GPU faults can cause unrelated
> GPU memory accesses to return zeroes. This has the
> unfortunate result of command-stream reads from CP
> getting invalid data, causing a cascade of failures.
> 
> Suggested-by: Rob Clark <robdclark@gmail.com>
> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
> ---
> This has been attempted previously by Rob Clark in 2017, 2018.
> Hopefully we can get something concluded in 2020.
>  * https://patchwork.kernel.org/patch/9953803/
>  * https://patchwork.kernel.org/patch/10618713/
> ---
>  drivers/iommu/arm-smmu-qcom.c | 1 +
>  drivers/iommu/arm-smmu.c      | 7 +++++++
>  drivers/iommu/arm-smmu.h      | 1 +
>  3 files changed, 9 insertions(+)
> 
> diff --git a/drivers/iommu/arm-smmu-qcom.c 
> b/drivers/iommu/arm-smmu-qcom.c
> index 24c071c1d8b0..a13b229389d4 100644
> --- a/drivers/iommu/arm-smmu-qcom.c
> +++ b/drivers/iommu/arm-smmu-qcom.c
> @@ -32,6 +32,7 @@ static int qcom_sdm845_smmu500_reset(struct
> arm_smmu_device *smmu)
> 
>  static const struct arm_smmu_impl qcom_smmu_impl = {
>  	.reset = qcom_sdm845_smmu500_reset,
> +	.stall = true,
>  };
> 
>  struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device 
> *smmu)
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index e622f4e33379..16b03fca9966 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -488,6 +488,11 @@ static irqreturn_t arm_smmu_context_fault(int
> irq, void *dev)
>  			    fsr, iova, fsynr, cbfrsynra, idx);
> 
>  	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
> +
> +	if (smmu->impl && smmu->impl->stall && (fsr & ARM_SMMU_FSR_SS))
> +		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
> +				  ARM_SMMU_RESUME_TERMINATE);
> +
>  	return IRQ_HANDLED;
>  }
> 
> @@ -659,6 +664,8 @@ static void arm_smmu_write_context_bank(struct
> arm_smmu_device *smmu, int idx)
>  		reg |= ARM_SMMU_SCTLR_S1_ASIDPNE;
>  	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
>  		reg |= ARM_SMMU_SCTLR_E;
> +	if (smmu->impl && smmu->impl->stall)
> +		reg |= ARM_SMMU_SCTLR_CFCFG;
> 
>  	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
>  }
> diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> index 8d1cd54d82a6..d5134e0d5cce 100644
> --- a/drivers/iommu/arm-smmu.h
> +++ b/drivers/iommu/arm-smmu.h
> @@ -386,6 +386,7 @@ struct arm_smmu_impl {
>  	int (*init_context)(struct arm_smmu_domain *smmu_domain);
>  	void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
>  			 int status);
> +	bool stall;
>  };
> 
>  static inline void __iomem *arm_smmu_page(struct arm_smmu_device 
> *smmu, int n)

Any comments on this patch?

Thanks,
Sai
Robin Murphy May 7, 2020, 10:55 a.m. UTC | #2
On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> Hi Will, Robin
> 
> On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
>> Add stall implementation hook to enable stalling
>> faults on QCOM platforms which support it without
>> causing any kind of hardware mishaps. Without this
>> on QCOM platforms, GPU faults can cause unrelated
>> GPU memory accesses to return zeroes. This has the
>> unfortunate result of command-stream reads from CP
>> getting invalid data, causing a cascade of failures.

I think this came up before, but something about this rationale doesn't 
add up - we're not *using* stalls at all, we're still terminating 
faulting transactions unconditionally; we're just using CFCFG to 
terminate them with a slight delay, rather than immediately. It's really 
not clear how or why that makes a difference. Is it a GPU bug? Or an 
SMMU bug? Is this reliable (or even a documented workaround for 
something), or might things start blowing up again if any other 
behaviour subtly changes? I'm not dead set against adding this, but I'd 
*really* like to have a lot more confidence in it.

>> Suggested-by: Rob Clark <robdclark@gmail.com>
>> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
>> ---
>> This has been attempted previously by Rob Clark in 2017, 2018.
>> Hopefully we can get something concluded in 2020.
>>  * https://patchwork.kernel.org/patch/9953803/
>>  * https://patchwork.kernel.org/patch/10618713/
>> ---
>>  drivers/iommu/arm-smmu-qcom.c | 1 +
>>  drivers/iommu/arm-smmu.c      | 7 +++++++
>>  drivers/iommu/arm-smmu.h      | 1 +
>>  3 files changed, 9 insertions(+)
>>
>> diff --git a/drivers/iommu/arm-smmu-qcom.c 
>> b/drivers/iommu/arm-smmu-qcom.c
>> index 24c071c1d8b0..a13b229389d4 100644
>> --- a/drivers/iommu/arm-smmu-qcom.c
>> +++ b/drivers/iommu/arm-smmu-qcom.c
>> @@ -32,6 +32,7 @@ static int qcom_sdm845_smmu500_reset(struct
>> arm_smmu_device *smmu)
>>
>>  static const struct arm_smmu_impl qcom_smmu_impl = {
>>      .reset = qcom_sdm845_smmu500_reset,
>> +    .stall = true,
>>  };
>>
>>  struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device 
>> *smmu)
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index e622f4e33379..16b03fca9966 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -488,6 +488,11 @@ static irqreturn_t arm_smmu_context_fault(int
>> irq, void *dev)
>>                  fsr, iova, fsynr, cbfrsynra, idx);
>>
>>      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
>> +
>> +    if (smmu->impl && smmu->impl->stall && (fsr & ARM_SMMU_FSR_SS))
>> +        arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
>> +                  ARM_SMMU_RESUME_TERMINATE);

Shouldn't this be *before* the write to FSR, in case the outstanding 
fault causes that to be immediately reasserted before we write CB_RESUME 
and we end up immediately taking the IRQ a second time?
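I.e. something like this (sketch only, reusing the existing locals in 
arm_smmu_context_fault()):

	if (smmu->impl && smmu->impl->stall && (fsr & ARM_SMMU_FSR_SS))
		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
				  ARM_SMMU_RESUME_TERMINATE);

	/* only ack the fault in FSR once the stalled transaction is resolved */
	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);

	return IRQ_HANDLED;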

(The overall enablement being in impl is sound, but you still don't get 
to play "works on my machine" in the architectural code :P)

Robin.

>> +
>>      return IRQ_HANDLED;
>>  }
>>
>> @@ -659,6 +664,8 @@ static void arm_smmu_write_context_bank(struct
>> arm_smmu_device *smmu, int idx)
>>          reg |= ARM_SMMU_SCTLR_S1_ASIDPNE;
>>      if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
>>          reg |= ARM_SMMU_SCTLR_E;
>> +    if (smmu->impl && smmu->impl->stall)
>> +        reg |= ARM_SMMU_SCTLR_CFCFG;
>>
>>      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
>>  }
>> diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
>> index 8d1cd54d82a6..d5134e0d5cce 100644
>> --- a/drivers/iommu/arm-smmu.h
>> +++ b/drivers/iommu/arm-smmu.h
>> @@ -386,6 +386,7 @@ struct arm_smmu_impl {
>>      int (*init_context)(struct arm_smmu_domain *smmu_domain);
>>      void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
>>               int status);
>> +    bool stall;
>>  };
>>
>>  static inline void __iomem *arm_smmu_page(struct arm_smmu_device 
>> *smmu, int n)
> 
> Any comments on this patch?
> 
> Thanks,
> Sai
>
Sai Prakash Ranjan May 7, 2020, 12:06 p.m. UTC | #3
Hi Robin,

On 2020-05-07 16:25, Robin Murphy wrote:
> On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
>> Hi Will, Robin
>> 
>> On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
>>> Add stall implementation hook to enable stalling
>>> faults on QCOM platforms which support it without
>>> causing any kind of hardware mishaps. Without this
>>> on QCOM platforms, GPU faults can cause unrelated
>>> GPU memory accesses to return zeroes. This has the
>>> unfortunate result of command-stream reads from CP
>>> getting invalid data, causing a cascade of failures.
> 
> I think this came up before, but something about this rationale
> doesn't add up - we're not *using* stalls at all, we're still
> terminating faulting transactions unconditionally; we're just using
> CFCFG to terminate them with a slight delay, rather than immediately.
> It's really not clear how or why that makes a difference. Is it a GPU
> bug? Or an SMMU bug? Is this reliable (or even a documented workaround
> for something), or might things start blowing up again if any other
> behaviour subtly changes? I'm not dead set against adding this, but
> I'd *really* like to have a lot more confidence in it.
> 

Yes, it has come up before; you can find details in the links below.
  - https://patchwork.kernel.org/patch/9953803/
  - https://patchwork.kernel.org/patch/10618713/

Rob Clark can probably add more details on the GPU faults.
As for reliability, the downstream kernel (i.e. the kernels shipped on
Android devices with QCOM chipsets) has had stalling enabled for a long
time now and has been stable in the field, so we can say that we are
safe with this enabled in the QCOM implementation.

>>> Suggested-by: Rob Clark <robdclark@gmail.com>
>>> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
>>> ---
>>> This has been attempted previously by Rob Clark in 2017, 2018.
>>> Hopefully we can get something concluded in 2020.
>>>  * https://patchwork.kernel.org/patch/9953803/
>>>  * https://patchwork.kernel.org/patch/10618713/
>>> ---
>>>  drivers/iommu/arm-smmu-qcom.c | 1 +
>>>  drivers/iommu/arm-smmu.c      | 7 +++++++
>>>  drivers/iommu/arm-smmu.h      | 1 +
>>>  3 files changed, 9 insertions(+)
>>> 
>>> diff --git a/drivers/iommu/arm-smmu-qcom.c 
>>> b/drivers/iommu/arm-smmu-qcom.c
>>> index 24c071c1d8b0..a13b229389d4 100644
>>> --- a/drivers/iommu/arm-smmu-qcom.c
>>> +++ b/drivers/iommu/arm-smmu-qcom.c
>>> @@ -32,6 +32,7 @@ static int qcom_sdm845_smmu500_reset(struct
>>> arm_smmu_device *smmu)
>>> 
>>>  static const struct arm_smmu_impl qcom_smmu_impl = {
>>>      .reset = qcom_sdm845_smmu500_reset,
>>> +    .stall = true,
>>>  };
>>> 
>>>  struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device 
>>> *smmu)
>>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>>> index e622f4e33379..16b03fca9966 100644
>>> --- a/drivers/iommu/arm-smmu.c
>>> +++ b/drivers/iommu/arm-smmu.c
>>> @@ -488,6 +488,11 @@ static irqreturn_t arm_smmu_context_fault(int
>>> irq, void *dev)
>>>                  fsr, iova, fsynr, cbfrsynra, idx);
>>> 
>>>      arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
>>> +
>>> +    if (smmu->impl && smmu->impl->stall && (fsr & ARM_SMMU_FSR_SS))
>>> +        arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
>>> +                  ARM_SMMU_RESUME_TERMINATE);
> 
> Shouldn't this be *before* the write to FSR, in case the outstanding
> fault causes that to be immediately reasserted before we write
> CB_RESUME and we end up immediately taking the IRQ a second time?
> 

Yes, I will fix this up in the next version.

> (The overall enablement being in impl is sound, but you still don't
> get to play "works on my machine" in the architectural code :P)
> 

We could have our own context fault handler in the QCOM implementation,
but that would just duplicate things from the arm-smmu context fault
handler, so I did not think it made much sense to have our own
fault handler in the qcom impl just for enabling the stall model.

Thanks,
Sai
Will Deacon May 7, 2020, 12:53 p.m. UTC | #4
On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> > > Add stall implementation hook to enable stalling
> > > faults on QCOM platforms which support it without
> > > causing any kind of hardware mishaps. Without this
> > > on QCOM platforms, GPU faults can cause unrelated
> > > GPU memory accesses to return zeroes. This has the
> > > unfortunate result of command-stream reads from CP
> > > getting invalid data, causing a cascade of failures.
> 
> I think this came up before, but something about this rationale doesn't add
> up - we're not *using* stalls at all, we're still terminating faulting
> transactions unconditionally; we're just using CFCFG to terminate them with
> a slight delay, rather than immediately. It's really not clear how or why
> that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> (or even a documented workaround for something), or might things start
> blowing up again if any other behaviour subtly changes? I'm not dead set
> against adding this, but I'd *really* like to have a lot more confidence in
> it.

Rob mentioned something about the "bus returning zeroes" before, but I agree
that we need more information so that we can reason about this and maintain
the code as the driver continues to change. That needs to be a comment in
the driver, and I don't think "but android seems to work" is a good enough
justification. There was some interaction with HUPCF as well.

As a template, I'd suggest:

> > > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> > > index 8d1cd54d82a6..d5134e0d5cce 100644
> > > --- a/drivers/iommu/arm-smmu.h
> > > +++ b/drivers/iommu/arm-smmu.h
> > > @@ -386,6 +386,7 @@ struct arm_smmu_impl {
> > >      int (*init_context)(struct arm_smmu_domain *smmu_domain);
> > >      void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> > >               int status);

/*
 * Stall transactions on a context fault, where they will be terminated
 * in response to the resulting IRQ rather than immediately. This should
 * pretty much always be set to "false" as stalling can introduce the
 * potential for deadlock in most SoCs; however, it is needed on Qualcomm
 * XXXX because YYYY.
 */

> > > +    bool stall;

Hmm, the more I think about this, the more I think this is an erratum
workaround in disguise, in which case this could be better named...

Will
Robin Murphy May 7, 2020, 2:56 p.m. UTC | #5
On 2020-05-07 1:06 pm, Sai Prakash Ranjan wrote:
[...]
> We could have our own context fault handler in the QCOM implementation,
> but that would just duplicate things from the arm-smmu context fault
> handler, so I did not think it made much sense to have our own
> fault handler in the qcom impl just for enabling the stall model.

Hmm, it's probably worth thinking ahead a bit here, to the "actually 
doing things with stalls" plan. I don't have a clear picture off-hand of 
how well the new device fault handler API might fit into arm-smmu - at 
the very least trying to make it truly generic implies having to play 
nasty tricks with disable_irq() for the general case given the "IRQ may 
remain asserted while SS is active" possibility, and that isn't 
particularly inviting. Not to mention tying it into the 
pretend-auxdomain stuff that *is* rather dependent on the qcom impl. If 
it turns out that you'll eventually have to reimplement the IRQ handler 
anyway for all that, then starting off down that route *might* work out 
cleaner and less hassle overall.
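The disable_irq() dance would be something along these lines (sketch 
only, eliding the handoff to the actual consumer):

	/* the IRQ may remain asserted while SS is active, so mask it
	 * before handing the fault off to a driver-level handler... */
	disable_irq_nosync(irq);

	/* ...and only unmask it with enable_irq() once that handler has
	 * resolved the fault by writing CB_RESUME */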

Robin.
Rob Clark May 8, 2020, 3:32 p.m. UTC | #6
On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
>
> On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> > > > Add stall implementation hook to enable stalling
> > > > faults on QCOM platforms which support it without
> > > > causing any kind of hardware mishaps. Without this
> > > > on QCOM platforms, GPU faults can cause unrelated
> > > > GPU memory accesses to return zeroes. This has the
> > > > unfortunate result of command-stream reads from CP
> > > > getting invalid data, causing a cascade of failures.
> >
> > I think this came up before, but something about this rationale doesn't add
> > up - we're not *using* stalls at all, we're still terminating faulting
> > transactions unconditionally; we're just using CFCFG to terminate them with
> > a slight delay, rather than immediately. It's really not clear how or why
> > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> > (or even a documented workaround for something), or might things start
> > blowing up again if any other behaviour subtly changes? I'm not dead set
> > against adding this, but I'd *really* like to have a lot more confidence in
> > it.
>
> Rob mentioned something about the "bus returning zeroes" before, but I agree
> that we need more information so that we can reason about this and maintain
> the code as the driver continues to change. That needs to be a comment in
> the driver, and I don't think "but android seems to work" is a good enough
> justification. There was some interaction with HUPCF as well.

The issue is that there are multiple parallel memory accesses
happening at the same time, for example CP (the cmdstream processor)
will be reading ahead and setting things up for the next draw or
compute grid, in parallel with some memory accesses from the shader
which could trigger a fault.  (And with faults triggered by something
in the shader, there are *many* shader threads running in parallel so
those tend to generate a big number of faults at the same time.)

We need either CFCFG or HUPCF, otherwise what I have observed is that
while the fault happens, CP's memory access will start returning
zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
can't say whether this is something unique to qcom's implementation of
the smmu spec or not.

*Often* a fault is the result of the usermode gl/vk/cl driver bug,
although I don't think that is an argument against fixing this in the
smmu driver.. I've been carrying around a local patch to set HUPCF for
*years* because debugging usermode driver issues is so much harder
without.  But there are some APIs where faults can be caused by the
user's app on top of the usermode driver.


BR,
-R

>
> As a template, I'd suggest:
>
> > > > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> > > > index 8d1cd54d82a6..d5134e0d5cce 100644
> > > > --- a/drivers/iommu/arm-smmu.h
> > > > +++ b/drivers/iommu/arm-smmu.h
> > > > @@ -386,6 +386,7 @@ struct arm_smmu_impl {
> > > >      int (*init_context)(struct arm_smmu_domain *smmu_domain);
> > > >      void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> > > >               int status);
>
> /*
>  * Stall transactions on a context fault, where they will be terminated
>  * in response to the resulting IRQ rather than immediately. This should
>  * pretty much always be set to "false" as stalling can introduce the
>  * potential for deadlock in most SoCs; however, it is needed on Qualcomm
>  * XXXX because YYYY.
>  */
>
> > > > +    bool stall;
>
> Hmm, the more I think about this, the more I think this is an erratum
> workaround in disguise, in which case this could be better named...
>
> Will
Rob Clark May 8, 2020, 3:40 p.m. UTC | #7
On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
>
> On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
> >
> > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> > > > > Add stall implementation hook to enable stalling
> > > > > faults on QCOM platforms which support it without
> > > > > causing any kind of hardware mishaps. Without this
> > > > > on QCOM platforms, GPU faults can cause unrelated
> > > > > GPU memory accesses to return zeroes. This has the
> > > > > unfortunate result of command-stream reads from CP
> > > > > getting invalid data, causing a cascade of failures.
> > >
> > > I think this came up before, but something about this rationale doesn't add
> > > up - we're not *using* stalls at all, we're still terminating faulting
> > > transactions unconditionally; we're just using CFCFG to terminate them with
> > > a slight delay, rather than immediately. It's really not clear how or why
> > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> > > (or even a documented workaround for something), or might things start
> > > blowing up again if any other behaviour subtly changes? I'm not dead set
> > > against adding this, but I'd *really* like to have a lot more confidence in
> > > it.
> >
> > Rob mentioned something about the "bus returning zeroes" before, but I agree
> > that we need more information so that we can reason about this and maintain
> > the code as the driver continues to change. That needs to be a comment in
> > the driver, and I don't think "but android seems to work" is a good enough
> > justification. There was some interaction with HUPCF as well.
>
> The issue is that there are multiple parallel memory accesses
> happening at the same time, for example CP (the cmdstream processor)
> will be reading ahead and setting things up for the next draw or
> compute grid, in parallel with some memory accesses from the shader
> which could trigger a fault.  (And with faults triggered by something
> in the shader, there are *many* shader threads running in parallel so
> those tend to generate a big number of faults at the same time.)
>
> We need either CFCFG or HUPCF, otherwise what I have observed is that
> while the fault happens, CP's memory access will start returning
> zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
> can't say whether this is something unique to qcom's implementation of
> the smmu spec or not.
>
> *Often* a fault is the result of the usermode gl/vk/cl driver bug,
> although I don't think that is an argument against fixing this in the
> smmu driver.. I've been carrying around a local patch to set HUPCF for
> *years* because debugging usermode driver issues is so much harder
> without.  But there are some APIs where faults can be caused by the
> user's app on top of the usermode driver.
>

Also, I'll add to that, a big wish of mine is to have stall with the
ability to resume later from a wq context.  That would enable me to
hook in the gpu crash dump handling for faults, which would make
debugging these sorts of issues much easier.  I think I posted a
prototype of this quite some time back, which would schedule a worker
on the first fault (since there are cases where you see 1000's of
faults at once), which grabbed some information about the currently
executing submit and some gpu registers to indicate *where* in the
submit (a single submit could have 100's or 1000's of draws), and then
resumed the iommu cb.

(This would ofc eventually be useful for svm type things.. I expect
we'll eventually care about that too.)
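Very roughly, that prototype had this shape (sketch only - the gpu
struct, worker and snapshot helper below are illustrative names, not the
code I posted):

	static void gpu_fault_worker(struct work_struct *work)
	{
		struct my_gpu *gpu = container_of(work, struct my_gpu, fault_work);

		/* clients are still stalled, so GPU state is intact */
		snapshot_fault_state(gpu);	/* submit info + gpu registers */

		arm_smmu_cb_write(gpu->smmu, gpu->cb_idx, ARM_SMMU_CB_RESUME,
				  ARM_SMMU_RESUME_TERMINATE);
	}

	/* in the context fault handler, defer the resume instead of
	 * terminating immediately; schedule_work() is a no-op if the
	 * worker is already queued, which covers the 1000's-of-faults
	 * case */
	if (fsr & ARM_SMMU_FSR_SS)
		schedule_work(&gpu->fault_work);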

BR,
-R

>
> BR,
> -R
>
> >
> > As a template, I'd suggest:
> >
> > > > > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> > > > > index 8d1cd54d82a6..d5134e0d5cce 100644
> > > > > --- a/drivers/iommu/arm-smmu.h
> > > > > +++ b/drivers/iommu/arm-smmu.h
> > > > > @@ -386,6 +386,7 @@ struct arm_smmu_impl {
> > > > >      int (*init_context)(struct arm_smmu_domain *smmu_domain);
> > > > >      void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> > > > >               int status);
> >
> > /*
> >  * Stall transactions on a context fault, where they will be terminated
> >  * in response to the resulting IRQ rather than immediately. This should
> >  * pretty much always be set to "false" as stalling can introduce the
> >  * potential for deadlock in most SoCs; however, it is needed on Qualcomm
> >  * XXXX because YYYY.
> >  */
> >
> > > > > +    bool stall;
> >
> > Hmm, the more I think about this, the more I think this is an erratum
> > workaround in disguise, in which case this could be better named...
> >
> > Will
Jordan Crouse May 11, 2020, 5:30 p.m. UTC | #8
On Fri, May 08, 2020 at 08:40:40AM -0700, Rob Clark wrote:
> On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
> >
> > On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
> > >
> > > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> > > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> > > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> > > > > > Add stall implementation hook to enable stalling
> > > > > > faults on QCOM platforms which support it without
> > > > > > causing any kind of hardware mishaps. Without this
> > > > > > on QCOM platforms, GPU faults can cause unrelated
> > > > > > GPU memory accesses to return zeroes. This has the
> > > > > > unfortunate result of command-stream reads from CP
> > > > > > getting invalid data, causing a cascade of failures.
> > > >
> > > > I think this came up before, but something about this rationale doesn't add
> > > > up - we're not *using* stalls at all, we're still terminating faulting
> > > > transactions unconditionally; we're just using CFCFG to terminate them with
> > > > a slight delay, rather than immediately. It's really not clear how or why
> > > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> > > > (or even a documented workaround for something), or might things start
> > > > blowing up again if any other behaviour subtly changes? I'm not dead set
> > > > against adding this, but I'd *really* like to have a lot more confidence in
> > > > it.
> > >
> > > Rob mentioned something about the "bus returning zeroes" before, but I agree
> > > that we need more information so that we can reason about this and maintain
> > > the code as the driver continues to change. That needs to be a comment in
> > > the driver, and I don't think "but android seems to work" is a good enough
> > > justification. There was some interaction with HUPCF as well.
> >
> > The issue is that there are multiple parallel memory accesses
> > happening at the same time, for example CP (the cmdstream processor)
> > will be reading ahead and setting things up for the next draw or
> > compute grid, in parallel with some memory accesses from the shader
> > which could trigger a fault.  (And with faults triggered by something
> > in the shader, there are *many* shader threads running in parallel so
> > those tend to generate a big number of faults at the same time.)
> >
> > We need either CFCFG or HUPCF, otherwise what I have observed is that
> > while the fault happens, CP's memory access will start returning
> > zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
> > can't say whether this is something unique to qcom's implementation of
> > the smmu spec or not.
> >
> > *Often* a fault is the result of the usermode gl/vk/cl driver bug,
> > although I don't think that is an argument against fixing this in the
> > smmu driver.. I've been carrying around a local patch to set HUPCF for
> > *years* because debugging usermode driver issues is so much harder
> > without.  But there are some APIs where faults can be caused by the
> > user's app on top of the usermode driver.
> >
> 
> Also, I'll add to that, a big wish of mine is to have stall with the
> ability to resume later from a wq context.  That would enable me to
> hook in the gpu crash dump handling for faults, which would make
> debugging these sorts of issues much easier.  I think I posted a
> prototype of this quite some time back, which would schedule a worker
> on the first fault (since there are cases where you see 1000's of
> faults at once), which grabbed some information about the currently
> executing submit and some gpu registers to indicate *where* in the
> submit (a single submit could have 100's or 1000's of draws), and then
> resumed the iommu cb.
> 
> (This would ofc eventually be useful for svm type things.. I expect
> we'll eventually care about that too.)

Rob is right about HUPCF. Due to the parallel nature of the command processor
there is always a very good chance that a CP access is somewhere in the bus so
any pagefault is usually a death sentence. The GPU context bank would always
want HUPCF set to 1.

Downstream also uses CFCFG for the stall-on-fault debug case. You wouldn't
want this on all the time in production, since bringing down the world for
every user pagefault is less than desirable, so it needs to be modifiable at
run time (or at the very least be kernel-command-line selectable).

Jordan

PS: Interestingly, the GMU does not want HUPCF set to 1 because it wants to
crash immediately on all invalid accesses, so ideally this combination of
bits would be configurable on a per-context basis.
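For illustration, a per-context version could look something like this in
arm_smmu_write_context_bank() (the domain flag and the ARM_SMMU_SCTLR_HUPCF
define - SCTLR bit 8 in the SMMU architecture - are hypothetical here):

	/* the GPU context bank wants HUPCF, the GMU does not */
	if (smmu_domain->hupcf)
		reg |= ARM_SMMU_SCTLR_HUPCF;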

> > >
> > > As a template, I'd suggest:
> > >
> > > > > > diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> > > > > > index 8d1cd54d82a6..d5134e0d5cce 100644
> > > > > > --- a/drivers/iommu/arm-smmu.h
> > > > > > +++ b/drivers/iommu/arm-smmu.h
> > > > > > @@ -386,6 +386,7 @@ struct arm_smmu_impl {
> > > > > >      int (*init_context)(struct arm_smmu_domain *smmu_domain);
> > > > > >      void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> > > > > >               int status);
> > >
> > > /*
> > >  * Stall transactions on a context fault, where they will be terminated
> > >  * in response to the resulting IRQ rather than immediately. This should
> > >  * pretty much always be set to "false" as stalling can introduce the
> > >  * potential for deadlock in most SoCs; however, it is needed on Qualcomm
> > >  * XXXX because YYYY.
> > >  */
> > >
> > > > > > +    bool stall;
> > >
> > > Hmm, the more I think about this, the more I think this is an erratum
> > > workaround in disguise, in which case this could be better named...
> > >
> > > Will
Will Deacon May 18, 2020, 3:45 p.m. UTC | #9
On Mon, May 11, 2020 at 11:30:08AM -0600, Jordan Crouse wrote:
> On Fri, May 08, 2020 at 08:40:40AM -0700, Rob Clark wrote:
> > On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
> > >
> > > On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
> > > >
> > > > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> > > > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> > > > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> > > > > > > Add stall implementation hook to enable stalling
> > > > > > > faults on QCOM platforms which support it without
> > > > > > > causing any kind of hardware mishaps. Without this
> > > > > > > on QCOM platforms, GPU faults can cause unrelated
> > > > > > > GPU memory accesses to return zeroes. This has the
> > > > > > > unfortunate result of command-stream reads from CP
> > > > > > > getting invalid data, causing a cascade of failures.
> > > > >
> > > > > I think this came up before, but something about this rationale doesn't add
> > > > > up - we're not *using* stalls at all, we're still terminating faulting
> > > > > transactions unconditionally; we're just using CFCFG to terminate them with
> > > > > a slight delay, rather than immediately. It's really not clear how or why
> > > > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> > > > > (or even a documented workaround for something), or might things start
> > > > > blowing up again if any other behaviour subtly changes? I'm not dead set
> > > > > against adding this, but I'd *really* like to have a lot more confidence in
> > > > > it.
> > > >
> > > > Rob mentioned something about the "bus returning zeroes" before, but I agree
> > > > that we need more information so that we can reason about this and maintain
> > > > the code as the driver continues to change. That needs to be a comment in
> > > > the driver, and I don't think "but android seems to work" is a good enough
> > > > justification. There was some interaction with HUPCF as well.
> > >
> > > The issue is that there are multiple parallel memory accesses
> > > happening at the same time, for example CP (the cmdstream processor)
> > > will be reading ahead and setting things up for the next draw or
> > > compute grid, in parallel with some memory accesses from the shader
> > > which could trigger a fault.  (And with faults triggered by something
> > > in the shader, there are *many* shader threads running in parallel so
> > > those tend to generate a big number of faults at the same time.)
> > >
> > > We need either CFCFG or HUPCF, otherwise what I have observed is that
> > > while the fault happens, CP's memory access will start returning
> > > zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
> > > can't say whether this is something unique to qcom's implementation of
> > > the smmu spec or not.
> > >
> > > *Often* a fault is the result of the usermode gl/vk/cl driver bug,
> > > although I don't think that is an argument against fixing this in the
> > > smmu driver.. I've been carrying around a local patch to set HUPCF for
> > > *years* because debugging usermode driver issues is so much harder
> > > without.  But there are some APIs where faults can be caused by the
> > > user's app on top of the usermode driver.
> > >
> > 
> > Also, I'll add to that, a big wish of mine is to have stall with the
> > ability to resume later from a wq context.  That would enable me to
> > hook in the gpu crash dump handling for faults, which would make
> > debugging these sorts of issues much easier.  I think I posted a
> > prototype of this quite some time back, which would schedule a worker
> > on the first fault (since there are cases where you see 1000's of
> > faults at once), which grabbed some information about the currently
> > executing submit and some gpu registers to indicate *where* in the
> > submit (a single submit could have 100's or 1000's of draws), and then
> > resumed the iommu cb.
> > 
> > (This would ofc eventually be useful for svm type things.. I expect
> > we'll eventually care about that too.)
> 
> Rob is right about HUPCF. Due to the parallel nature of the command processor
> there is always a very good chance that a CP access is somewhere in the bus so
> any pagefault is usually a death sentence. The GPU context bank would always
> want HUPCF set to 1.

So this sounds like an erratum to me, and I'm happy to set HUPCF if we
detect the broken implementation. However, it will need an entry in
Documentation/arm64/silicon-errata.rst and a decent comment in the driver
to explain what we're doing and why.
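e.g. following the pattern this patch already uses for .stall (the flag
and define below are hypothetical until we agree on a name):

	static const struct arm_smmu_impl qcom_smmu_impl = {
		.reset = qcom_sdm845_smmu500_reset,
		.hupcf = true,	/* see the silicon-errata.rst entry */
	};

	/* in arm_smmu_write_context_bank() */
	if (smmu->impl && smmu->impl->hupcf)
		reg |= ARM_SMMU_SCTLR_HUPCF;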

Thanks,

Will
Sai Prakash Ranjan May 19, 2020, 9:26 a.m. UTC | #10
Hi Will,

On 2020-05-18 21:15, Will Deacon wrote:
> On Mon, May 11, 2020 at 11:30:08AM -0600, Jordan Crouse wrote:
>> On Fri, May 08, 2020 at 08:40:40AM -0700, Rob Clark wrote:
>> > On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
>> > >
>> > > On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
>> > > >
>> > > > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
>> > > > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
>> > > > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
>> > > > > > > Add stall implementation hook to enable stalling
>> > > > > > > faults on QCOM platforms which support it without
>> > > > > > > causing any kind of hardware mishaps. Without this
>> > > > > > > on QCOM platforms, GPU faults can cause unrelated
>> > > > > > > GPU memory accesses to return zeroes. This has the
>> > > > > > > unfortunate result of command-stream reads from CP
>> > > > > > > getting invalid data, causing a cascade of failures.
>> > > > >
>> > > > > I think this came up before, but something about this rationale doesn't add
>> > > > > up - we're not *using* stalls at all, we're still terminating faulting
>> > > > > transactions unconditionally; we're just using CFCFG to terminate them with
>> > > > > a slight delay, rather than immediately. It's really not clear how or why
>> > > > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
>> > > > > (or even a documented workaround for something), or might things start
>> > > > > blowing up again if any other behaviour subtly changes? I'm not dead set
>> > > > > against adding this, but I'd *really* like to have a lot more confidence in
>> > > > > it.
>> > > >
>> > > > Rob mentioned something about the "bus returning zeroes" before, but I agree
>> > > > that we need more information so that we can reason about this and maintain
>> > > > the code as the driver continues to change. That needs to be a comment in
>> > > > the driver, and I don't think "but android seems to work" is a good enough
>> > > > justification. There was some interaction with HUPCF as well.
>> > >
>> > > The issue is that there are multiple parallel memory accesses
>> > > happening at the same time, for example CP (the cmdstream processor)
>> > > will be reading ahead and setting things up for the next draw or
>> > > compute grid, in parallel with some memory accesses from the shader
>> > > which could trigger a fault.  (And with faults triggered by something
>> > > in the shader, there are *many* shader threads running in parallel so
>> > > those tend to generate a big number of faults at the same time.)
>> > >
>> > > We need either CFCFG or HUPCF, otherwise what I have observed is that
>> > > while the fault happens, CP's memory access will start returning
>> > > zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
>> > > can't say whether this is something unique to qcom's implementation of
>> > > the smmu spec or not.
>> > >
>> > > *Often* a fault is the result of the usermode gl/vk/cl driver bug,
>> > > although I don't think that is an argument against fixing this in the
>> > > smmu driver.. I've been carrying around a local patch to set HUPCF for
>> > > *years* because debugging usermode driver issues is so much harder
>> > > without.  But there are some APIs where faults can be caused by the
>> > > user's app on top of the usermode driver.
>> > >
>> >
>> > Also, I'll add to that, a big wish of mine is to have stall with the
>> > ability to resume later from a wq context.  That would enable me to
>> > hook in the gpu crash dump handling for faults, which would make
>> > debugging these sorts of issues much easier.  I think I posted a
>> > prototype of this quite some time back, which would schedule a worker
>> > on the first fault (since there are cases where you see 1000's of
>> > faults at once), which grabbed some information about the currently
>> > executing submit and some gpu registers to indicate *where* in the
>> > submit (a single submit could have 100's or 1000's of draws), and then
>> > resumed the iommu cb.
>> >
>> > (This would ofc eventually be useful for svm type things.. I expect
>> > we'll eventually care about that too.)
>> 
>> Rob is right about HUPCF. Due to the parallel nature of the command 
>> processor
>> there is always a very good chance that a CP access is somewhere in 
>> the bus so
>> any pagefault is usually a death sentence. The GPU context bank would 
>> always
>> want HUPCF set to 1.
> 
> So this sounds like an erratum to me, and I'm happy to set HUPCF if we
> detect the broken implementation. However, it will need an entry in
> Documentation/arm64/silicon-errata.rst and a decent comment in the 
> driver
> to explain what we're doing and why.
> 

AFAIK there is no erratum documented internally for this behaviour, and it
has existed since the MSM8996 SoC; errata usually don't survive this long
across generations of SoCs, and there is no point for us in disguising it.

Is it OK if we clearly mention it as a "design limitation", or some other
term we can agree upon, along with the description Rob and Jordan provided
for setting HUPCF in the driver, when we add the set_hupcf callback?

Thanks,
Sai
Rob Clark May 19, 2020, 3:11 p.m. UTC | #11
On Tue, May 19, 2020 at 2:26 AM Sai Prakash Ranjan
<saiprakash.ranjan@codeaurora.org> wrote:
>
> Hi Will,
>
> On 2020-05-18 21:15, Will Deacon wrote:
> > On Mon, May 11, 2020 at 11:30:08AM -0600, Jordan Crouse wrote:
> >> On Fri, May 08, 2020 at 08:40:40AM -0700, Rob Clark wrote:
> >> > On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
> >> > >
> >> > > On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
> >> > > >
> >> > > > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
> >> > > > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
> >> > > > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
> >> > > > > > > Add stall implementation hook to enable stalling
> >> > > > > > > faults on QCOM platforms which support it without
> >> > > > > > > causing any kind of hardware mishaps. Without this
> >> > > > > > > on QCOM platforms, GPU faults can cause unrelated
> >> > > > > > > GPU memory accesses to return zeroes. This has the
> >> > > > > > > unfortunate result of command-stream reads from CP
> >> > > > > > > getting invalid data, causing a cascade of failures.
> >> > > > >
> >> > > > > I think this came up before, but something about this rationale doesn't add
> >> > > > > up - we're not *using* stalls at all, we're still terminating faulting
> >> > > > > transactions unconditionally; we're just using CFCFG to terminate them with
> >> > > > > a slight delay, rather than immediately. It's really not clear how or why
> >> > > > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
> >> > > > > (or even a documented workaround for something), or might things start
> >> > > > > blowing up again if any other behaviour subtly changes? I'm not dead set
> >> > > > > against adding this, but I'd *really* like to have a lot more confidence in
> >> > > > > it.
> >> > > >
> >> > > > Rob mentioned something about the "bus returning zeroes" before, but I agree
> >> > > > that we need more information so that we can reason about this and maintain
> >> > > > the code as the driver continues to change. That needs to be a comment in
> >> > > > the driver, and I don't think "but android seems to work" is a good enough
> >> > > > justification. There was some interaction with HUPCF as well.
> >> > >
> >> > > The issue is that there are multiple parallel memory accesses
> >> > > happening at the same time, for example CP (the cmdstream processor)
> >> > > will be reading ahead and setting things up for the next draw or
> >> > > compute grid, in parallel with some memory accesses from the shader
> >> > > which could trigger a fault.  (And with faults triggered by something
> >> > > in the shader, there are *many* shader threads running in parallel so
> >> > > those tend to generate a big number of faults at the same time.)
> >> > >
> >> > > We need either CFCFG or HUPCF, otherwise what I have observed is that
> >> > > while the fault happens, CP's memory access will start returning
> >> > > zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
> >> > > can't say whether this is something unique to qcom's implementation of
> >> > > the smmu spec or not.
> >> > >
> >> > > *Often* a fault is the result of the usermode gl/vk/cl driver bug,
> >> > > although I don't think that is an argument against fixing this in the
> >> > > smmu driver.. I've been carrying around a local patch to set HUPCF for
> >> > > *years* because debugging usermode driver issues is so much harder
> >> > > without.  But there are some APIs where faults can be caused by the
> >> > > user's app on top of the usermode driver.
> >> > >
> >> >
> >> > Also, I'll add to that, a big wish of mine is to have stall with the
> >> > ability to resume later from a wq context.  That would enable me to
> >> > hook in the gpu crash dump handling for faults, which would make
> >> > debugging these sorts of issues much easier.  I think I posted a
> >> > prototype of this quite some time back, which would schedule a worker
> >> > on the first fault (since there are cases where you see 1000's of
> >> > faults at once), which grabbed some information about the currently
> >> > executing submit and some gpu registers to indicate *where* in the
> >> > submit (a single submit could have 100's or 1000's of draws), and then
> >> > resumed the iommu cb.
> >> >
> >> > (This would ofc eventually be useful for svm type things.. I expect
> >> > we'll eventually care about that too.)
> >>
> >> Rob is right about HUPCF. Due to the parallel nature of the command
> >> processor
> >> there is always a very good chance that a CP access is somewhere in
> >> the bus so
> >> any pagefault is usually a death sentence. The GPU context bank would
> >> always
> >> want HUPCF set to 1.
> >
> > So this sounds like an erratum to me, and I'm happy to set HUPCF if we
> > detect the broken implementation. However, it will need an entry in
> > Documentation/arm64/silicon-errata.rst and a decent comment in the
> > driver
> > to explain what we're doing and why.
> >
>
> AFAIK there is no erratum documented internally for this behaviour, and it
> has existed since the MSM8996 SoC; errata usually don't survive this long
> across generations of SoCs, and there is no point for us in disguising it.

possibly longer, qcom_iommu sets CFCFG..

> Is it OK if we clearly mention it as a "design limitation", or some other
> term we can agree upon, along with the description Rob and Jordan provided
> for setting HUPCF in the driver, when we add the set_hupcf callback?

I'm not too picky on what we call it, but afaict it has been this way
since the beginning of time, rather than specific to a certain SoC or
generation of SoCs.  So it doesn't seem like the hw designers consider
it a bug.

(I'm not sure what the expected behavior is.. nor if any other SMMU
implementation encounters this sort of situation..)

BR,
-R

>
> Thanks,
> Sai
>
> --
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
> member
> of Code Aurora Forum, hosted by The Linux Foundation
Sai Prakash Ranjan May 20, 2020, 9:32 a.m. UTC | #12
On 2020-05-19 20:41, Rob Clark wrote:
> On Tue, May 19, 2020 at 2:26 AM Sai Prakash Ranjan
> <saiprakash.ranjan@codeaurora.org> wrote:
>> 
>> Hi Will,
>> 
>> On 2020-05-18 21:15, Will Deacon wrote:
>> > On Mon, May 11, 2020 at 11:30:08AM -0600, Jordan Crouse wrote:
>> >> On Fri, May 08, 2020 at 08:40:40AM -0700, Rob Clark wrote:
>> >> > On Fri, May 8, 2020 at 8:32 AM Rob Clark <robdclark@gmail.com> wrote:
>> >> > >
>> >> > > On Thu, May 7, 2020 at 5:54 AM Will Deacon <will@kernel.org> wrote:
>> >> > > >
>> >> > > > On Thu, May 07, 2020 at 11:55:54AM +0100, Robin Murphy wrote:
>> >> > > > > On 2020-05-07 11:14 am, Sai Prakash Ranjan wrote:
>> >> > > > > > On 2020-04-22 01:50, Sai Prakash Ranjan wrote:
>> >> > > > > > > Add stall implementation hook to enable stalling
>> >> > > > > > > faults on QCOM platforms which support it without
>> >> > > > > > > causing any kind of hardware mishaps. Without this
>> >> > > > > > > on QCOM platforms, GPU faults can cause unrelated
>> >> > > > > > > GPU memory accesses to return zeroes. This has the
>> >> > > > > > > unfortunate result of command-stream reads from CP
>> >> > > > > > > getting invalid data, causing a cascade of failures.
>> >> > > > >
>> >> > > > > I think this came up before, but something about this rationale doesn't add
>> >> > > > > up - we're not *using* stalls at all, we're still terminating faulting
>> >> > > > > transactions unconditionally; we're just using CFCFG to terminate them with
>> >> > > > > a slight delay, rather than immediately. It's really not clear how or why
>> >> > > > > that makes a difference. Is it a GPU bug? Or an SMMU bug? Is this reliable
>> >> > > > > (or even a documented workaround for something), or might things start
>> >> > > > > blowing up again if any other behaviour subtly changes? I'm not dead set
>> >> > > > > against adding this, but I'd *really* like to have a lot more confidence in
>> >> > > > > it.
>> >> > > >
>> >> > > > Rob mentioned something about the "bus returning zeroes" before, but I agree
>> >> > > > that we need more information so that we can reason about this and maintain
>> >> > > > the code as the driver continues to change. That needs to be a comment in
>> >> > > > the driver, and I don't think "but android seems to work" is a good enough
>> >> > > > justification. There was some interaction with HUPCF as well.
>> >> > >
>> >> > > The issue is that there are multiple parallel memory accesses
>> >> > > happening at the same time, for example CP (the cmdstream processor)
>> >> > > will be reading ahead and setting things up for the next draw or
>> >> > > compute grid, in parallel with some memory accesses from the shader
>> >> > > which could trigger a fault.  (And with faults triggered by something
>> >> > > in the shader, there are *many* shader threads running in parallel so
>> >> > > those tend to generate a big number of faults at the same time.)
>> >> > >
>> >> > > We need either CFCFG or HUPCF, otherwise what I have observed is that
>> >> > > while the fault happens, CP's memory access will start returning
>> >> > > zeroes instead of valid cmdstream data, which triggers a GPU hang.  I
>> >> > > can't say whether this is something unique to qcom's implementation of
>> >> > > the smmu spec or not.
>> >> > >
>> >> > > *Often* a fault is the result of the usermode gl/vk/cl driver bug,
>> >> > > although I don't think that is an argument against fixing this in the
>> >> > > smmu driver.. I've been carrying around a local patch to set HUPCF for
>> >> > > *years* because debugging usermode driver issues is so much harder
>> >> > > without.  But there are some APIs where faults can be caused by the
>> >> > > user's app on top of the usermode driver.
>> >> > >
>> >> >
>> >> > Also, I'll add to that, a big wish of mine is to have stall with the
>> >> > ability to resume later from a wq context.  That would enable me to
>> >> > hook in the gpu crash dump handling for faults, which would make
>> >> > debugging these sorts of issues much easier.  I think I posted a
>> >> > prototype of this quite some time back, which would schedule a worker
>> >> > on the first fault (since there are cases where you see 1000's of
>> >> > faults at once), which grabbed some information about the currently
>> >> > executing submit and some gpu registers to indicate *where* in the
>> >> > submit (a single submit could have 100's or 1000's of draws), and then
>> >> > resumed the iommu cb.
>> >> >
>> >> > (This would ofc eventually be useful for svm type things.. I expect
>> >> > we'll eventually care about that too.)
>> >>
>> >> Rob is right about HUPCF. Due to the parallel nature of the command
>> >> processor
>> >> there is always a very good chance that a CP access is somewhere in
>> >> the bus so
>> >> any pagefault is usually a death sentence. The GPU context bank would
>> >> always
>> >> want HUPCF set to 1.
>> >
>> > So this sounds like an erratum to me, and I'm happy to set HUPCF if we
>> > detect the broken implementation. However, it will need an entry in
>> > Documentation/arm64/silicon-errata.rst and a decent comment in the
>> > driver
>> > to explain what we're doing and why.
>> >
>> 
>> AFAIK there is no erratum documented internally for this behaviour and
>> this
>> exists from MSM8996 SoC time and errata usually don't survive this 
>> long
>> across generation of SoCs and there is no point for us in disguising 
>> it.
> 
> possibly longer, qcom_iommu sets CFCFG..
> 

Oh right, I was still in college when those SoCs were released ;)

>> Is it OK if we clearly mention it as the "design limitation" or some
>> other
>> term which we can agree upon along with the description which Rob and
>> Jordan
>> provided for setting HUPCF in the driver when we add the set_hupcf
>> callback?
> 
> I'm not too picky on what we call it, but afaict it has been this way
> since the beginning of time, rather than specific to a certain SoC or
> generation of SoCs.  So it doesn't seem like the hw designers consider
> it a bug.
> 
> (I'm not sure what the expected behavior is.. nor if any other SMMU
> implementation encounters this sort of situation..)
> 

Yes, that was my point as well: it's probably not counted as a bug
by the hw designers. So I'm going to post setting HUPCF in the QCOM
implementation with clear comments based on yours and Jordan's
description of this problem, but I wanted a way to set this only for
the GPU context bank and not the GMU, as Jordan mentioned earlier that
the GMU doesn't need HUPCF set. I was checking how we map a cb to a
device; if that is possible then we can have a compatibility thing
like we did for identity mapping. Any ideas, Robin?
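Something like the identity-mapping client match, purely as a sketch
(the table name and the "qcom,adreno" entry are placeholders):

	static const struct of_device_id qcom_smmu_hupcf_clients[] = {
		{ .compatible = "qcom,adreno" },
		{ }
	};

	/* consult this while initialising a context bank and set HUPCF
	 * only when the master device matches */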

Thanks,
Sai
Will Deacon May 20, 2020, 12:59 p.m. UTC | #13
On Wed, May 20, 2020 at 03:02:45PM +0530, Sai Prakash Ranjan wrote:
> On 2020-05-19 20:41, Rob Clark wrote:
> > On Tue, May 19, 2020 at 2:26 AM Sai Prakash Ranjan
> > <saiprakash.ranjan@codeaurora.org> wrote:
> > > On 2020-05-18 21:15, Will Deacon wrote:
> > > > So this sounds like an erratum to me, and I'm happy to set HUPCF if we
> > > > detect the broken implementation. However, it will need an entry in
> > > > Documentation/arm64/silicon-errata.rst and a decent comment in the
> > > > driver
> > > > to explain what we're doing and why.
> > > >
> > > 
> > > AFAIK there is no erratum documented internally for this behaviour, and
> > > it has existed since the MSM8996 SoC; errata usually don't survive this
> > > long across generations of SoCs, and there is no point for us in
> > > disguising it.
> > 
> > possibly longer, qcom_iommu sets CFCFG..
> > 
> 
> Oh right, I was still in college when those SoCs were released ;)
> 
> > > Is it OK if we clearly mention it as a "design limitation", or some
> > > other term we can agree upon, along with the description Rob and Jordan
> > > provided for setting HUPCF in the driver, when we add the set_hupcf
> > > callback?
> > 
> > I'm not too picky on what we call it, but afaict it has been this way
> > since the beginning of time, rather than specific to a certain SoC or
> > generation of SoCs.  So it doesn't seem like the hw designers consider
> > it a bug.
> > 
> > (I'm not sure what the expected behavior is.. nor if any other SMMU
> > implementation encounters this sort of situation..)
> 
> Yes, that was my point as well: it's probably not counted as a bug
> by the hw designers. So I'm going to post setting HUPCF in the QCOM
> implementation with clear comments based on yours and Jordan's
> description of this problem, but I wanted a way to set this only for
> the GPU context bank and not the GMU, as Jordan mentioned earlier that
> the GMU doesn't need HUPCF set. I was checking how we map a cb to a
> device; if that is possible then we can have a compatibility thing
> like we did for identity mapping. Any ideas, Robin?

Right, see my reply over at:

https://lore.kernel.org/r/20200520125700.GD25815@willie-the-truck

Hopefully something like that can be made to work.

Will

Patch

diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c
index 24c071c1d8b0..a13b229389d4 100644
--- a/drivers/iommu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm-smmu-qcom.c
@@ -32,6 +32,7 @@  static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu)
 
 static const struct arm_smmu_impl qcom_smmu_impl = {
 	.reset = qcom_sdm845_smmu500_reset,
+	.stall = true,
 };
 
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index e622f4e33379..16b03fca9966 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -488,6 +488,11 @@  static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
 			    fsr, iova, fsynr, cbfrsynra, idx);
 
 	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+
+	if (smmu->impl && smmu->impl->stall && (fsr & ARM_SMMU_FSR_SS))
+		arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
+				  ARM_SMMU_RESUME_TERMINATE);
+
 	return IRQ_HANDLED;
 }
 
@@ -659,6 +664,8 @@  static void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx)
 		reg |= ARM_SMMU_SCTLR_S1_ASIDPNE;
 	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
 		reg |= ARM_SMMU_SCTLR_E;
+	if (smmu->impl && smmu->impl->stall)
+		reg |= ARM_SMMU_SCTLR_CFCFG;
 
 	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
 }
diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
index 8d1cd54d82a6..d5134e0d5cce 100644
--- a/drivers/iommu/arm-smmu.h
+++ b/drivers/iommu/arm-smmu.h
@@ -386,6 +386,7 @@  struct arm_smmu_impl {
 	int (*init_context)(struct arm_smmu_domain *smmu_domain);
 	void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
 			 int status);
+	bool stall;
 };
 
 static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)