diff mbox

[1/1] iommu/arm-smmu: Add support to use Last level cache

Message ID 20180615105329.26800-1-vivek.gautam@codeaurora.org (mailing list archive)
State Not Applicable, archived
Delegated to: Andy Gross
Headers show

Commit Message

Vivek Gautam June 15, 2018, 10:53 a.m. UTC
Qualcomm SoCs have an additional level of cache called as
System cache or Last level cache[1]. This cache sits right
before the DDR, and is tightly coupled with the memory
controller.
The cache is available to all the clients present in the
SoC system. The clients request their slices from this system
cache, make it active, and can then start using it. For these
clients with smmu, to start using the system cache for
dma buffers and related page tables [2], few of the memory
attributes need to be set accordingly.
This change makes the related memory Outer-Shareable, and
updates the MAIR with necessary protection.

The MAIR attribute requirements are:
    Inner Cacheablity = 0
    Outer Cacheablity = 1, Write-Back Write Allocate
    Outer Shareablity = 1

This change is a realisation of following changes
from downstream msm-4.9:
iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT
iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT

[1] https://patchwork.kernel.org/patch/10422531/
[2] https://patchwork.kernel.org/patch/10302791/

Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
---
 drivers/iommu/arm-smmu.c       | 14 ++++++++++++++
 drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++-----
 drivers/iommu/io-pgtable.h     |  4 ++++
 include/linux/iommu.h          |  4 ++++
 4 files changed, 41 insertions(+), 5 deletions(-)

Comments

Will Deacon June 15, 2018, 4:52 p.m. UTC | #1
Hi Vivek,

On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> Qualcomm SoCs have an additional level of cache called as
> System cache or Last level cache[1]. This cache sits right
> before the DDR, and is tightly coupled with the memory
> controller.
> The cache is available to all the clients present in the
> SoC system. The clients request their slices from this system
> cache, make it active, and can then start using it. For these
> clients with smmu, to start using the system cache for
> dma buffers and related page tables [2], few of the memory
> attributes need to be set accordingly.
> This change makes the related memory Outer-Shareable, and
> updates the MAIR with necessary protection.
> 
> The MAIR attribute requirements are:
>     Inner Cacheablity = 0
>     Outer Cacheablity = 1, Write-Back Write Allocate
>     Outer Shareablity = 1

Hmm, so is this cache coherent with the CPU or not? Why don't normal
non-cacheable mappings allocated in the LLC by default?

> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index f7a96bcf94a6..8058e7205034 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -249,6 +249,7 @@ struct arm_smmu_domain {
>  	struct mutex			init_mutex; /* Protects smmu pointer */
>  	spinlock_t			cb_lock; /* Serialises ATS1* ops and TLB syncs */
>  	struct iommu_domain		domain;
> +	bool				has_sys_cache;
>  };
>  
>  struct arm_smmu_option_prop {
> @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
>  
>  	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
>  		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
> +	if (smmu_domain->has_sys_cache)
> +		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
>  
>  	smmu_domain->smmu = smmu;
>  	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
> @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  	case DOMAIN_ATTR_NESTING:
>  		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
>  		return 0;
> +	case DOMAIN_ATTR_USE_SYS_CACHE:
> +		*((int *)data) = smmu_domain->has_sys_cache;
> +		return 0;

I really don't like exposing this to clients directly like this,
particularly as there aren't any in-tree users. I would prefer that we
provide a way for the io-pgtable code to have its MAIR values overridden
so that all non-coherent DMA ends up using the system cache.

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jordan Crouse June 15, 2018, 5:12 p.m. UTC | #2
On Fri, Jun 15, 2018 at 05:52:32PM +0100, Will Deacon wrote:
> Hi Vivek,
> 
> On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> > Qualcomm SoCs have an additional level of cache called as
> > System cache or Last level cache[1]. This cache sits right
> > before the DDR, and is tightly coupled with the memory
> > controller.
> > The cache is available to all the clients present in the
> > SoC system. The clients request their slices from this system
> > cache, make it active, and can then start using it. For these
> > clients with smmu, to start using the system cache for
> > dma buffers and related page tables [2], few of the memory
> > attributes need to be set accordingly.
> > This change makes the related memory Outer-Shareable, and
> > updates the MAIR with necessary protection.
> > 
> > The MAIR attribute requirements are:
> >     Inner Cacheablity = 0
> >     Outer Cacheablity = 1, Write-Back Write Allocate
> >     Outer Shareablity = 1
> 
> Hmm, so is this cache coherent with the CPU or not? Why don't normal
> non-cacheable mappings allocated in the LLC by default?
> 
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> > index f7a96bcf94a6..8058e7205034 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -249,6 +249,7 @@ struct arm_smmu_domain {
> >  	struct mutex			init_mutex; /* Protects smmu pointer */
> >  	spinlock_t			cb_lock; /* Serialises ATS1* ops and TLB syncs */
> >  	struct iommu_domain		domain;
> > +	bool				has_sys_cache;
> >  };
> >  
> >  struct arm_smmu_option_prop {
> > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
> >  
> >  	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
> >  		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
> > +	if (smmu_domain->has_sys_cache)
> > +		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
> >  
> >  	smmu_domain->smmu = smmu;
> >  	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
> > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  	case DOMAIN_ATTR_NESTING:
> >  		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> >  		return 0;
> > +	case DOMAIN_ATTR_USE_SYS_CACHE:
> > +		*((int *)data) = smmu_domain->has_sys_cache;
> > +		return 0;
> 
> I really don't like exposing this to clients directly like this,
> particularly as there aren't any in-tree users. I would prefer that we
> provide a way for the io-pgtable code to have its MAIR values overridden
> so that all non-coherent DMA ends up using the system cache.

FWIW here is a future in-tree user for LLC:

https://patchwork.freedesktop.org/series/40545/

Specifically:

https://patchwork.freedesktop.org/patch/212400/

Jordan
Vivek Gautam June 19, 2018, 8:34 a.m. UTC | #3
Hi Will,


On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> Hi Vivek,
>
> On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
>> Qualcomm SoCs have an additional level of cache called as
>> System cache or Last level cache[1]. This cache sits right
>> before the DDR, and is tightly coupled with the memory
>> controller.
>> The cache is available to all the clients present in the
>> SoC system. The clients request their slices from this system
>> cache, make it active, and can then start using it. For these
>> clients with smmu, to start using the system cache for
>> dma buffers and related page tables [2], few of the memory
>> attributes need to be set accordingly.
>> This change makes the related memory Outer-Shareable, and
>> updates the MAIR with necessary protection.
>>
>> The MAIR attribute requirements are:
>>     Inner Cacheablity = 0
>>     Outer Cacheablity = 1, Write-Back Write Allocate
>>     Outer Shareablity = 1
>
> Hmm, so is this cache coherent with the CPU or not?

Thanks for reviewing.
Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
The different masters such as GPU as able to allocated and activate a slice
in this Last Level Cache.

> Why don't normal
> non-cacheable mappings allocated in the LLC by default?

Sorry, I couldn't fully understand your question here.
Few of the masters on qcom socs are not io-coherent, so for them
the IC has to be marked as 0.
But they are able to use the LLC with OC marked as 1.

Handling the IO-coherency is possibly a separate change to address?

>
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index f7a96bcf94a6..8058e7205034 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -249,6 +249,7 @@ struct arm_smmu_domain {
>>       struct mutex                    init_mutex; /* Protects smmu pointer */
>>       spinlock_t                      cb_lock; /* Serialises ATS1* ops and TLB syncs */
>>       struct iommu_domain             domain;
>> +     bool                            has_sys_cache;
>>  };
>>
>>  struct arm_smmu_option_prop {
>> @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
>>
>>       if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
>>               pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
>> +     if (smmu_domain->has_sys_cache)
>> +             pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
>>
>>       smmu_domain->smmu = smmu;
>>       pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
>> @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>>       case DOMAIN_ATTR_NESTING:
>>               *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
>>               return 0;
>> +     case DOMAIN_ATTR_USE_SYS_CACHE:
>> +             *((int *)data) = smmu_domain->has_sys_cache;
>> +             return 0;
>
> I really don't like exposing this to clients directly like this,
> particularly as there aren't any in-tree users. I would prefer that we
> provide a way for the io-pgtable code to have its MAIR values overridden
> so that all non-coherent DMA ends up using the system cache.

From the way it looks from the users of LLC (as also pointed to by Jordan),
the masters have to request and activate their slices in the cache, and then
they can start using it. Before that the transaction don't go through LLC.

But I will try to find out more on this.

Thanks & Regards
Vivek

>
> Will
> --
> To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon June 27, 2018, 4:37 p.m. UTC | #4
Hi Vivek,

On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> >> Qualcomm SoCs have an additional level of cache called as
> >> System cache or Last level cache[1]. This cache sits right
> >> before the DDR, and is tightly coupled with the memory
> >> controller.
> >> The cache is available to all the clients present in the
> >> SoC system. The clients request their slices from this system
> >> cache, make it active, and can then start using it. For these
> >> clients with smmu, to start using the system cache for
> >> dma buffers and related page tables [2], few of the memory
> >> attributes need to be set accordingly.
> >> This change makes the related memory Outer-Shareable, and
> >> updates the MAIR with necessary protection.
> >>
> >> The MAIR attribute requirements are:
> >>     Inner Cacheablity = 0
> >>     Outer Cacheablity = 1, Write-Back Write Allocate
> >>     Outer Shareablity = 1
> >
> > Hmm, so is this cache coherent with the CPU or not?
> 
> Thanks for reviewing.
> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> The different masters such as GPU as able to allocated and activate a slice
> in this Last Level Cache.

What I mean is, for example, if the CPU writes some data using Normal, Inner
Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
Read/Write-allocate and a device reads that data using your MAIR encoding
above, is the device guaranteed to see the CPU writes after the CPU has
executed a DSB instruction?

I don't think so, because the ARM ARM would say that there's a mismatch on
the Inner Cacheability attribute.

> > Why don't normal
> > non-cacheable mappings allocated in the LLC by default?
> 
> Sorry, I couldn't fully understand your question here.
> Few of the masters on qcom socs are not io-coherent, so for them
> the IC has to be marked as 0.

By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
so I don't understand the problem. What goes wrong if non-coherent devices
use your MAIR encoding for their DMA buffers?

> But they are able to use the LLC with OC marked as 1.

The issue here is that whatever attributes we put in the SMMU need to align
with the attributes used by the CPU in order to avoid introducing mismatched
aliases. Currently, we support three types of mapping in the SMMU:

1. DMA non-coherent (e.g. "dma-coherent" is not set on the device)
	Normal, Inner Shareable, Inner/Outer Non-Cacheable

2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE]
	Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer
	Write-back, Non-transient Read/Write-allocate

3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO]
	Device-nGnRE (Outer Shareable)

So either you override one of these types (I was suggesting (1)) or you need
to create a new memory type, along with the infrastructure for it to be
recognised on a per-device basis and used by the DMA API so that we don't
get mismatched aliases on the CPU.

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vivek Gautam July 24, 2018, 9:43 a.m. UTC | #5
Hi Will,


On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> Hi Vivek,
>
> On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
>> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
>> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
>> >> Qualcomm SoCs have an additional level of cache called as
>> >> System cache or Last level cache[1]. This cache sits right
>> >> before the DDR, and is tightly coupled with the memory
>> >> controller.
>> >> The cache is available to all the clients present in the
>> >> SoC system. The clients request their slices from this system
>> >> cache, make it active, and can then start using it. For these
>> >> clients with smmu, to start using the system cache for
>> >> dma buffers and related page tables [2], few of the memory
>> >> attributes need to be set accordingly.
>> >> This change makes the related memory Outer-Shareable, and
>> >> updates the MAIR with necessary protection.
>> >>
>> >> The MAIR attribute requirements are:
>> >>     Inner Cacheablity = 0
>> >>     Outer Cacheablity = 1, Write-Back Write Allocate
>> >>     Outer Shareablity = 1
>> >
>> > Hmm, so is this cache coherent with the CPU or not?
>>
>> Thanks for reviewing.
>> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
>> The different masters such as GPU as able to allocated and activate a slice
>> in this Last Level Cache.
>
> What I mean is, for example, if the CPU writes some data using Normal, Inner
> Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> Read/Write-allocate and a device reads that data using your MAIR encoding
> above, is the device guaranteed to see the CPU writes after the CPU has
> executed a DSB instruction?
>
> I don't think so, because the ARM ARM would say that there's a mismatch on
> the Inner Cacheability attribute.
>
>> > Why don't normal
>> > non-cacheable mappings allocated in the LLC by default?
>>
>> Sorry, I couldn't fully understand your question here.
>> Few of the masters on qcom socs are not io-coherent, so for them
>> the IC has to be marked as 0.
>
> By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> so I don't understand the problem. What goes wrong if non-coherent devices
> use your MAIR encoding for their DMA buffers?
>
>> But they are able to use the LLC with OC marked as 1.
>
> The issue here is that whatever attributes we put in the SMMU need to align
> with the attributes used by the CPU in order to avoid introducing mismatched
> aliases. Currently, we support three types of mapping in the SMMU:
>
> 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device)
>         Normal, Inner Shareable, Inner/Outer Non-Cacheable
>
> 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE]
>         Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer
>         Write-back, Non-transient Read/Write-allocate
>
> 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO]
>         Device-nGnRE (Outer Shareable)
>
> So either you override one of these types (I was suggesting (1)) or you need
> to create a new memory type, along with the infrastructure for it to be
> recognised on a per-device basis and used by the DMA API so that we don't
> get mismatched aliases on the CPU.

My apologies for delay in responding to this thread.
I have been digging and getting in touch with internal tech teams
to get more information on this. I will update as soon as I have enough
details.
Thanks.

Best regards
Vivek

>
> Will
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Jordan Crouse Sept. 19, 2018, 7:35 p.m. UTC | #6
On Tue, Jul 24, 2018 at 03:13:37PM +0530, Vivek Gautam wrote:
> Hi Will,
> 
> 
> On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> > Hi Vivek,
> >
> > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> >> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> >> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> >> >> Qualcomm SoCs have an additional level of cache called as
> >> >> System cache or Last level cache[1]. This cache sits right
> >> >> before the DDR, and is tightly coupled with the memory
> >> >> controller.
> >> >> The cache is available to all the clients present in the
> >> >> SoC system. The clients request their slices from this system
> >> >> cache, make it active, and can then start using it. For these
> >> >> clients with smmu, to start using the system cache for
> >> >> dma buffers and related page tables [2], few of the memory
> >> >> attributes need to be set accordingly.
> >> >> This change makes the related memory Outer-Shareable, and
> >> >> updates the MAIR with necessary protection.
> >> >>
> >> >> The MAIR attribute requirements are:
> >> >>     Inner Cacheablity = 0
> >> >>     Outer Cacheablity = 1, Write-Back Write Allocate
> >> >>     Outer Shareablity = 1
> >> >
> >> > Hmm, so is this cache coherent with the CPU or not?
> >>
> >> Thanks for reviewing.
> >> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> >> The different masters such as GPU as able to allocated and activate a slice
> >> in this Last Level Cache.
> >
> > What I mean is, for example, if the CPU writes some data using Normal, Inner
> > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> > Read/Write-allocate and a device reads that data using your MAIR encoding
> > above, is the device guaranteed to see the CPU writes after the CPU has
> > executed a DSB instruction?
> >
> > I don't think so, because the ARM ARM would say that there's a mismatch on
> > the Inner Cacheability attribute.
> >
> >> > Why don't normal
> >> > non-cacheable mappings allocated in the LLC by default?
> >>
> >> Sorry, I couldn't fully understand your question here.
> >> Few of the masters on qcom socs are not io-coherent, so for them
> >> the IC has to be marked as 0.
> >
> > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> > so I don't understand the problem. What goes wrong if non-coherent devices
> > use your MAIR encoding for their DMA buffers?
> >
> >> But they are able to use the LLC with OC marked as 1.
> >
> > The issue here is that whatever attributes we put in the SMMU need to align
> > with the attributes used by the CPU in order to avoid introducing mismatched
> > aliases. Currently, we support three types of mapping in the SMMU:
> >
> > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device)
> >         Normal, Inner Shareable, Inner/Outer Non-Cacheable
> >
> > 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE]
> >         Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer
> >         Write-back, Non-transient Read/Write-allocate
> >
> > 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO]
> >         Device-nGnRE (Outer Shareable)
> >
> > So either you override one of these types (I was suggesting (1)) or you need
> > to create a new memory type, along with the infrastructure for it to be
> > recognised on a per-device basis and used by the DMA API so that we don't
> > get mismatched aliases on the CPU.
> 
> My apologies for delay in responding to this thread.
> I have been digging and getting in touch with internal tech teams
> to get more information on this. I will update as soon as I have enough
> details.
> Thanks.

Hi Vivek.  I want to revive this discussion. I believe that Andy has pulled
in the base LLCC support so this the remaining dependency we need to implement
the LLCC in the GPU driver. 

Thanks,
Jordan
Vivek Gautam Sept. 20, 2018, 10:25 a.m. UTC | #7
On Thu, Sep 20, 2018 at 1:05 AM Jordan Crouse <jcrouse@codeaurora.org> wrote:
>
> On Tue, Jul 24, 2018 at 03:13:37PM +0530, Vivek Gautam wrote:
> > Hi Will,
> >
> >
> > On Wed, Jun 27, 2018 at 10:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> > > Hi Vivek,
> > >
> > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> > >> On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> > >> > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> > >> >> Qualcomm SoCs have an additional level of cache called as
> > >> >> System cache or Last level cache[1]. This cache sits right
> > >> >> before the DDR, and is tightly coupled with the memory
> > >> >> controller.
> > >> >> The cache is available to all the clients present in the
> > >> >> SoC system. The clients request their slices from this system
> > >> >> cache, make it active, and can then start using it. For these
> > >> >> clients with smmu, to start using the system cache for
> > >> >> dma buffers and related page tables [2], few of the memory
> > >> >> attributes need to be set accordingly.
> > >> >> This change makes the related memory Outer-Shareable, and
> > >> >> updates the MAIR with necessary protection.
> > >> >>
> > >> >> The MAIR attribute requirements are:
> > >> >>     Inner Cacheablity = 0
> > >> >>     Outer Cacheablity = 1, Write-Back Write Allocate
> > >> >>     Outer Shareablity = 1
> > >> >
> > >> > Hmm, so is this cache coherent with the CPU or not?
> > >>
> > >> Thanks for reviewing.
> > >> Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> > >> The different masters such as GPU as able to allocated and activate a slice
> > >> in this Last Level Cache.
> > >
> > > What I mean is, for example, if the CPU writes some data using Normal, Inner
> > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> > > Read/Write-allocate and a device reads that data using your MAIR encoding
> > > above, is the device guaranteed to see the CPU writes after the CPU has
> > > executed a DSB instruction?
> > >
> > > I don't think so, because the ARM ARM would say that there's a mismatch on
> > > the Inner Cacheability attribute.
> > >
> > >> > Why don't normal
> > >> > non-cacheable mappings allocated in the LLC by default?
> > >>
> > >> Sorry, I couldn't fully understand your question here.
> > >> Few of the masters on qcom socs are not io-coherent, so for them
> > >> the IC has to be marked as 0.
> > >
> > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> > > so I don't understand the problem. What goes wrong if non-coherent devices
> > > use your MAIR encoding for their DMA buffers?
> > >
> > >> But they are able to use the LLC with OC marked as 1.
> > >
> > > The issue here is that whatever attributes we put in the SMMU need to align
> > > with the attributes used by the CPU in order to avoid introducing mismatched
> > > aliases. Currently, we support three types of mapping in the SMMU:
> > >
> > > 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device)
> > >         Normal, Inner Shareable, Inner/Outer Non-Cacheable
> > >
> > > 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE]
> > >         Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer
> > >         Write-back, Non-transient Read/Write-allocate
> > >
> > > 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO]
> > >         Device-nGnRE (Outer Shareable)
> > >
> > > So either you override one of these types (I was suggesting (1)) or you need
> > > to create a new memory type, along with the infrastructure for it to be
> > > recognised on a per-device basis and used by the DMA API so that we don't
> > > get mismatched aliases on the CPU.
> >
> > My apologies for delay in responding to this thread.
> > I have been digging and getting in touch with internal tech teams
> > to get more information on this. I will update as soon as I have enough
> > details.
> > Thanks.
>
> Hi Vivek.  I want to revive this discussion. I believe that Andy has pulled
> in the base LLCC support so this the remaining dependency we need to implement
> the LLCC in the GPU driver.

Hi Jordan, yes I was in process of gathering information about the system cache
usage and the attributes configurations required when devices use system cache.

Let me respond to Will's questions now.

Thanks
Vivek
>
> Thanks,
> Jordan
>
> --
> The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> a Linux Foundation Collaborative Project
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu


--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
Vivek Gautam Sept. 20, 2018, 11:41 a.m. UTC | #8
Hi Will,

On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon@arm.com> wrote:
>
> Hi Vivek,
>
> On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> > >> Qualcomm SoCs have an additional level of cache called as
> > >> System cache or Last level cache[1]. This cache sits right
> > >> before the DDR, and is tightly coupled with the memory
> > >> controller.
> > >> The cache is available to all the clients present in the
> > >> SoC system. The clients request their slices from this system
> > >> cache, make it active, and can then start using it. For these
> > >> clients with smmu, to start using the system cache for
> > >> dma buffers and related page tables [2], few of the memory
> > >> attributes need to be set accordingly.
> > >> This change makes the related memory Outer-Shareable, and
> > >> updates the MAIR with necessary protection.
> > >>
> > >> The MAIR attribute requirements are:
> > >>     Inner Cacheablity = 0
> > >>     Outer Cacheablity = 1, Write-Back Write Allocate
> > >>     Outer Shareablity = 1
> > >
> > > Hmm, so is this cache coherent with the CPU or not?
> >
> > Thanks for reviewing.
> > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> > The different masters such as GPU as able to allocated and activate a slice
> > in this Last Level Cache.
>
> What I mean is, for example, if the CPU writes some data using Normal, Inner
> Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> Read/Write-allocate and a device reads that data using your MAIR encoding
> above, is the device guaranteed to see the CPU writes after the CPU has
> executed a DSB instruction?

No, these MAIR configurations don't guarantee that devices will have
coherent view
of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent
devices can).
So a normal cached memory configuration in CPU MMU tables, and SMMU page tables
is valid only for few devices that are IO-coherent.

Moreover, CPU can lookup in system cache, and so do all devices;
allocation will depend on h/w configurations and memory attributes.
So anything that CPU caches in system cache will be coherently visible
to devices.

>
> I don't think so, because the ARM ARM would say that there's a mismatch on
> the Inner Cacheability attribute.
>
> > > Why don't normal
> > > non-cacheable mappings allocated in the LLC by default?
> >
> > Sorry, I couldn't fully understand your question here.
> > Few of the masters on qcom socs are not io-coherent, so for them
> > the IC has to be marked as 0.
>
> By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> so I don't understand the problem. What goes wrong if non-coherent devices
> use your MAIR encoding for their DMA buffers?
>
> > But they are able to use the LLC with OC marked as 1.
>
> The issue here is that whatever attributes we put in the SMMU need to align
> with the attributes used by the CPU in order to avoid introducing mismatched
> aliases.

Not really, right?
Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate
into the system cache (as these devices don't want to allocate in
their inner caches),
and the CPU will have a coherent view of these buffers/page-tables.
This should be
a normal cached non-IO-Coherent memory.

But anything that CPU writes using Normal, Inner Shareable,
Inner/Outer Cacheable,
Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible
to the device.

Also added Jordan, and Pratik to this thread.

Thanks & Regards
Vivek

> Currently, we support three types of mapping in the SMMU:
>
> 1. DMA non-coherent (e.g. "dma-coherent" is not set on the device)
>         Normal, Inner Shareable, Inner/Outer Non-Cacheable
>
> 2. DMA coherent (e.g. "dma-coherent" is set on the device) [IOMMU_CACHE]
>         Normal, Inner Shareable, Inner/Outer Cacheable, Inner/Outer
>         Write-back, Non-transient Read/Write-allocate
>
> 3. MMIO (e.g. MSI doorbell) [IOMMU_MMIO]
>         Device-nGnRE (Outer Shareable)
>
> So either you override one of these types (I was suggesting (1)) or you need
> to create a new memory type, along with the infrastructure for it to be
> recognised on a per-device basis and used by the DMA API so that we don't
> get mismatched aliases on the CPU.
>
> Will
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Will Deacon Sept. 28, 2018, 1:19 p.m. UTC | #9
Hi Vivek,

On Thu, Sep 20, 2018 at 05:11:53PM +0530, Vivek Gautam wrote:
> On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon@arm.com> wrote:
> > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> > > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> > > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> > > >> Qualcomm SoCs have an additional level of cache called as
> > > >> System cache or Last level cache[1]. This cache sits right
> > > >> before the DDR, and is tightly coupled with the memory
> > > >> controller.
> > > >> The cache is available to all the clients present in the
> > > >> SoC system. The clients request their slices from this system
> > > >> cache, make it active, and can then start using it. For these
> > > >> clients with smmu, to start using the system cache for
> > > >> dma buffers and related page tables [2], few of the memory
> > > >> attributes need to be set accordingly.
> > > >> This change makes the related memory Outer-Shareable, and
> > > >> updates the MAIR with necessary protection.
> > > >>
> > > >> The MAIR attribute requirements are:
> > > >>     Inner Cacheablity = 0
> > > >>     Outer Cacheablity = 1, Write-Back Write Allocate
> > > >>     Outer Shareablity = 1
> > > >
> > > > Hmm, so is this cache coherent with the CPU or not?
> > >
> > > Thanks for reviewing.
> > > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> > > The different masters such as GPU as able to allocated and activate a slice
> > > in this Last Level Cache.
> >
> > What I mean is, for example, if the CPU writes some data using Normal, Inner
> > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> > Read/Write-allocate and a device reads that data using your MAIR encoding
> > above, is the device guaranteed to see the CPU writes after the CPU has
> > executed a DSB instruction?
> 
> No, these MAIR configurations don't guarantee that devices will have
> coherent view
> of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent
> devices can).
> So a normal cached memory configuration in CPU MMU tables, and SMMU page tables
> is valid only for few devices that are IO-coherent.
> 
> Moreover, CPU can lookup in system cache, and so do all devices;
> allocation will depend on h/w configurations and memory attributes.
> So anything that CPU caches in system cache will be coherently visible
> to devices.
> 
> >
> > I don't think so, because the ARM ARM would say that there's a mismatch on
> > the Inner Cacheability attribute.
> >
> > > > Why don't normal
> > > > non-cacheable mappings allocated in the LLC by default?
> > >
> > > Sorry, I couldn't fully understand your question here.
> > > Few of the masters on qcom socs are not io-coherent, so for them
> > > the IC has to be marked as 0.
> >
> > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> > so I don't understand the problem. What goes wrong if non-coherent devices
> > use your MAIR encoding for their DMA buffers?
> >
> > > But they are able to use the LLC with OC marked as 1.
> >
> > The issue here is that whatever attributes we put in the SMMU need to align
> > with the attributes used by the CPU in order to avoid introducing mismatched
> > aliases.
> 
> Not really, right?
> Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate
> into the system cache (as these devices don't want to allocate in
> their inner caches),
> and the CPU will have a coherent view of these buffers/page-tables.
> This should be
> a normal cached non-IO-Coherent memory.
> 
> But anything that CPU writes using Normal, Inner Shareable,
> Inner/Outer Cacheable,
> Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible
> to the device.
> 
> Also added Jordan, and Pratik to this thread.

Sorry, but I'm still completely confused.

If you only end up with mismatched memory attributes in the non-coherent
case, then why can't you just follow my suggestion to override the
attributes for non-coherent mappings on your SoC?

Will
Vivek Gautam Oct. 5, 2018, 5:25 a.m. UTC | #10
Hi Will,

On Fri, Sep 28, 2018 at 6:49 PM Will Deacon <will.deacon@arm.com> wrote:
>
> Hi Vivek,
>
> On Thu, Sep 20, 2018 at 05:11:53PM +0530, Vivek Gautam wrote:
> > On Wed, Jun 27, 2018 at 10:07 PM Will Deacon <will.deacon@arm.com> wrote:
> > > On Tue, Jun 19, 2018 at 02:04:44PM +0530, Vivek Gautam wrote:
> > > > On Fri, Jun 15, 2018 at 10:22 PM, Will Deacon <will.deacon@arm.com> wrote:
> > > > > On Fri, Jun 15, 2018 at 04:23:29PM +0530, Vivek Gautam wrote:
> > > > >> Qualcomm SoCs have an additional level of cache called as
> > > > >> System cache or Last level cache[1]. This cache sits right
> > > > >> before the DDR, and is tightly coupled with the memory
> > > > >> controller.
> > > > >> The cache is available to all the clients present in the
> > > > >> SoC system. The clients request their slices from this system
> > > > >> cache, make it active, and can then start using it. For these
> > > > >> clients with smmu, to start using the system cache for
> > > > >> dma buffers and related page tables [2], few of the memory
> > > > >> attributes need to be set accordingly.
> > > > >> This change makes the related memory Outer-Shareable, and
> > > > >> updates the MAIR with necessary protection.
> > > > >>
> > > > >> The MAIR attribute requirements are:
> > > > >>     Inner Cacheablity = 0
> > > > >>     Outer Cacheablity = 1, Write-Back Write Allocate
> > > > >>     Outer Shareablity = 1
> > > > >
> > > > > Hmm, so is this cache coherent with the CPU or not?
> > > >
> > > > Thanks for reviewing.
> > > > Yes, this LLC is cache coherent with CPU, so we mark for Outer-cacheable.
> > > > The different masters such as GPU as able to allocated and activate a slice
> > > > in this Last Level Cache.
> > >
> > > What I mean is, for example, if the CPU writes some data using Normal, Inner
> > > Shareable, Inner/Outer Cacheable, Inner/Outer Write-back, Non-transient
> > > Read/Write-allocate and a device reads that data using your MAIR encoding
> > > above, is the device guaranteed to see the CPU writes after the CPU has
> > > executed a DSB instruction?
> >
> > No, these MAIR configurations don't guarantee that devices will have
> > coherent view
> > of what CPU writes. Not all devices can snoop into CPU caches (only IO-Coherent
> > devices can).
> > So a normal cached memory configuration in CPU MMU tables, and SMMU page tables
> > is valid only for few devices that are IO-coherent.
> >
> > Moreover, CPU can lookup in system cache, and so do all devices;
> > allocation will depend on h/w configurations and memory attributes.
> > So anything that CPU caches in system cache will be coherently visible
> > to devices.
> >
> > >
> > > I don't think so, because the ARM ARM would say that there's a mismatch on
> > > the Inner Cacheability attribute.
> > >
> > > > > Why don't normal
> > > > > non-cacheable mappings allocated in the LLC by default?
> > > >
> > > > Sorry, I couldn't fully understand your question here.
> > > > Few of the masters on qcom socs are not io-coherent, so for them
> > > > the IC has to be marked as 0.
> > >
> > > By IC you mean Inner Cacheability? In your MAIR encoding above, it is zero
> > > so I don't understand the problem. What goes wrong if non-coherent devices
> > > use your MAIR encoding for their DMA buffers?
> > >
> > > > But they are able to use the LLC with OC marked as 1.
> > >
> > > The issue here is that whatever attributes we put in the SMMU need to align
> > > with the attributes used by the CPU in order to avoid introducing mismatched
> > > aliases.
> >
> > Not really, right?
> > Devices can use Inner non-Cacheable, Outer-cacheable (IC=0, OC=1) to allocate
> > into the system cache (as these devices don't want to allocate in
> > their inner caches),
> > and the CPU will have a coherent view of these buffers/page-tables.
> > This should be
> > a normal cached non-IO-Coherent memory.
> >
> > But anything that CPU writes using Normal, Inner Shareable,
> > Inner/Outer Cacheable,
> > Inner/Outer Write-back, Non-transient Read/Write-allocate, may not be visible
> > to the device.
> >
> > Also added Jordan, and Pratik to this thread.
>
> Sorry, but I'm still completely confused.
>
> If you only end up with mismatched memory attributes in the non-coherent
> case, then why can't you just follow my suggestion to override the
> attributes for non-coherent mappings on your SoC?

As seen in downstream kernels there are few non-coherent devices which
would not want to allocate in system cache, and therefore would want
Inner/Outer non-cached memory.
So, we may want to either override the attributes per-device, or as
you suggested
we may want to introduce another memory type 'sys-cached' that can be
added with its separate infra.
Thanks.

[...]

Best regards
Vivek


--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
Tomasz Figa Oct. 23, 2018, 4:15 a.m. UTC | #11
Hi Vivek,

On Fri, Jun 15, 2018 at 7:53 PM Vivek Gautam
<vivek.gautam@codeaurora.org> wrote:
>
> Qualcomm SoCs have an additional level of cache called as
> System cache or Last level cache[1]. This cache sits right
> before the DDR, and is tightly coupled with the memory
> controller.
> The cache is available to all the clients present in the
> SoC system. The clients request their slices from this system
> cache, make it active, and can then start using it. For these
> clients with smmu, to start using the system cache for
> dma buffers and related page tables [2], few of the memory
> attributes need to be set accordingly.
> This change makes the related memory Outer-Shareable, and
> updates the MAIR with necessary protection.
>
> The MAIR attribute requirements are:
>     Inner Cacheablity = 0
>     Outer Cacheablity = 1, Write-Back Write Allocate
>     Outer Shareablity = 1
>
> This change is a realisation of following changes
> from downstream msm-4.9:
> iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT
> iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT

Would you be able to provide links to those 2 downstream changes?

>
> [1] https://patchwork.kernel.org/patch/10422531/
> [2] https://patchwork.kernel.org/patch/10302791/
>
> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
> ---
>  drivers/iommu/arm-smmu.c       | 14 ++++++++++++++
>  drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++-----
>  drivers/iommu/io-pgtable.h     |  4 ++++
>  include/linux/iommu.h          |  4 ++++
>  4 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index f7a96bcf94a6..8058e7205034 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -249,6 +249,7 @@ struct arm_smmu_domain {
>         struct mutex                    init_mutex; /* Protects smmu pointer */
>         spinlock_t                      cb_lock; /* Serialises ATS1* ops and TLB syncs */
>         struct iommu_domain             domain;
> +       bool                            has_sys_cache;
>  };
>
>  struct arm_smmu_option_prop {
> @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
>
>         if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
>                 pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
> +       if (smmu_domain->has_sys_cache)
> +               pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
>
>         smmu_domain->smmu = smmu;
>         pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
> @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>         case DOMAIN_ATTR_NESTING:
>                 *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
>                 return 0;
> +       case DOMAIN_ATTR_USE_SYS_CACHE:
> +               *((int *)data) = smmu_domain->has_sys_cache;
> +               return 0;
>         default:
>                 return -ENODEV;
>         }
> @@ -1506,6 +1512,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
>                         smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
>
>                 break;
> +       case DOMAIN_ATTR_USE_SYS_CACHE:
> +               if (smmu_domain->smmu) {
> +                       ret = -EPERM;
> +                       goto out_unlock;
> +               }
> +               if (*((int *)data))
> +                       smmu_domain->has_sys_cache = true;
> +               break;
>         default:
>                 ret = -ENODEV;
>         }
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 010a254305dd..b2aee1828524 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -169,9 +169,11 @@
>  #define ARM_LPAE_MAIR_ATTR_DEVICE      0x04
>  #define ARM_LPAE_MAIR_ATTR_NC          0x44
>  #define ARM_LPAE_MAIR_ATTR_WBRWA       0xff
> +#define ARM_LPAE_MAIR_ATTR_SYS_CACHE   0xf4
>  #define ARM_LPAE_MAIR_ATTR_IDX_NC      0
>  #define ARM_LPAE_MAIR_ATTR_IDX_CACHE   1
>  #define ARM_LPAE_MAIR_ATTR_IDX_DEV     2
> +#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE       3
>
>  /* IOPTE accessors */
>  #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
> @@ -442,6 +444,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
>                 else if (prot & IOMMU_CACHE)
>                         pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
>                                 << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> +               else if (prot & IOMMU_SYS_CACHE)
> +                       pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE
> +                               << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> +

Okay, so we favor the full caching (IC WBRWA, OC WBRWA, OS) first if
requested or otherwise try to use system cache (IC NC, OC WBWA?, OS)?
Sounds fine.

nit: Unnecessary blank line.

>         } else {
>                 pte = ARM_LPAE_PTE_HAP_FAULT;
>                 if (prot & IOMMU_READ)
> @@ -771,7 +777,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
>         u64 reg;
>         struct arm_lpae_io_pgtable *data;
>
> -       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA))
> +       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
> +                           IO_PGTABLE_QUIRK_SYS_CACHE))
>                 return NULL;
>
>         data = arm_lpae_alloc_pgtable(cfg);
> @@ -779,9 +786,14 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
>                 return NULL;
>
>         /* TCR */
> -       reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
> -             (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
> -             (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
> +       if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
> +               reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) |
> +                     (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT);

Contrary to the earlier code which favored IC/IS if possible, here we
seem to disable IC/IS if the SYS_CACHE quirk is requested, regardless
of whether it could still be desirable to use IC/IS. Perhaps rather
than
IO_PGTABLE_QUIRK_SYS_CACHE, we need something like
IO_PGTABLE_QUIRK_NO_INNER_CACHE?

> +       } else {
> +               reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
> +                     (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT);
> +       }
> +       reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
>

[keeping the context]

Best regards,
Tomasz
Vivek Gautam Oct. 24, 2018, 5:48 p.m. UTC | #12
Hi Tomasz,

On Tue, Oct 23, 2018 at 9:45 AM Tomasz Figa <tfiga@chromium.org> wrote:
>
> Hi Vivek,
>
> On Fri, Jun 15, 2018 at 7:53 PM Vivek Gautam
> <vivek.gautam@codeaurora.org> wrote:
> >
> > Qualcomm SoCs have an additional level of cache called as
> > System cache or Last level cache[1]. This cache sits right
> > before the DDR, and is tightly coupled with the memory
> > controller.
> > The cache is available to all the clients present in the
> > SoC system. The clients request their slices from this system
> > cache, make it active, and can then start using it. For these
> > clients with smmu, to start using the system cache for
> > dma buffers and related page tables [2], few of the memory
> > attributes need to be set accordingly.
> > This change makes the related memory Outer-Shareable, and
> > updates the MAIR with necessary protection.
> >
> > The MAIR attribute requirements are:
> >     Inner Cacheablity = 0
> >     Outer Cacheablity = 1, Write-Back Write Allocate
> >     Outer Shareablity = 1
> >
> > This change is a realisation of following changes
> > from downstream msm-4.9:
> > iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT
> > iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT
>
> Would you be able to provide links to those 2 downstream changes?

Thanks for the review.
Here are the links for the changes:
[1] -- iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT
[2] -- iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT

[1] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=bf762276796e79ca90014992f4d9da5593fa7d51
[2] https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9&id=d4c72c413ea27c43f60825193d4de9cb8ffd9602

>
> >
> > [1] https://patchwork.kernel.org/patch/10422531/
> > [2] https://patchwork.kernel.org/patch/10302791/
> >
> > Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
> > ---
> >  drivers/iommu/arm-smmu.c       | 14 ++++++++++++++
> >  drivers/iommu/io-pgtable-arm.c | 24 +++++++++++++++++++-----
> >  drivers/iommu/io-pgtable.h     |  4 ++++
> >  include/linux/iommu.h          |  4 ++++
> >  4 files changed, 41 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> > index f7a96bcf94a6..8058e7205034 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -249,6 +249,7 @@ struct arm_smmu_domain {
> >         struct mutex                    init_mutex; /* Protects smmu pointer */
> >         spinlock_t                      cb_lock; /* Serialises ATS1* ops and TLB syncs */
> >         struct iommu_domain             domain;
> > +       bool                            has_sys_cache;
> >  };
> >
> >  struct arm_smmu_option_prop {
> > @@ -862,6 +863,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
> >
> >         if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
> >                 pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
> > +       if (smmu_domain->has_sys_cache)
> > +               pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
> >
> >         smmu_domain->smmu = smmu;
> >         pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
> > @@ -1477,6 +1480,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >         case DOMAIN_ATTR_NESTING:
> >                 *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> >                 return 0;
> > +       case DOMAIN_ATTR_USE_SYS_CACHE:
> > +               *((int *)data) = smmu_domain->has_sys_cache;
> > +               return 0;
> >         default:
> >                 return -ENODEV;
> >         }
> > @@ -1506,6 +1512,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
> >                         smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
> >
> >                 break;
> > +       case DOMAIN_ATTR_USE_SYS_CACHE:
> > +               if (smmu_domain->smmu) {
> > +                       ret = -EPERM;
> > +                       goto out_unlock;
> > +               }
> > +               if (*((int *)data))
> > +                       smmu_domain->has_sys_cache = true;
> > +               break;
> >         default:
> >                 ret = -ENODEV;
> >         }
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 010a254305dd..b2aee1828524 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -169,9 +169,11 @@
> >  #define ARM_LPAE_MAIR_ATTR_DEVICE      0x04
> >  #define ARM_LPAE_MAIR_ATTR_NC          0x44
> >  #define ARM_LPAE_MAIR_ATTR_WBRWA       0xff
> > +#define ARM_LPAE_MAIR_ATTR_SYS_CACHE   0xf4
> >  #define ARM_LPAE_MAIR_ATTR_IDX_NC      0
> >  #define ARM_LPAE_MAIR_ATTR_IDX_CACHE   1
> >  #define ARM_LPAE_MAIR_ATTR_IDX_DEV     2
> > +#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE       3
> >
> >  /* IOPTE accessors */
> >  #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
> > @@ -442,6 +444,10 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
> >                 else if (prot & IOMMU_CACHE)
> >                         pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
> >                                 << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> > +               else if (prot & IOMMU_SYS_CACHE)
> > +                       pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE
> > +                               << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> > +
>
> Okay, so we favor the full caching (IC WBRWA, OC WBRWA, OS) first if
> requested or otherwise try to use system cache (IC NC, OC WBWA?, OS)?
> Sounds fine.

That's right. When devices can use full caching, the system cache will
also be used. Otherwise when devices can't allocate/use inner caches,
system cache (Outer WBRWA, and IC NC) is the option.

>
> nit: Unnecessary blank line.

will remove it.

>
> >         } else {
> >                 pte = ARM_LPAE_PTE_HAP_FAULT;
> >                 if (prot & IOMMU_READ)
> > @@ -771,7 +777,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
> >         u64 reg;
> >         struct arm_lpae_io_pgtable *data;
> >
> > -       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA))
> > +       if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
> > +                           IO_PGTABLE_QUIRK_SYS_CACHE))
> >                 return NULL;
> >
> >         data = arm_lpae_alloc_pgtable(cfg);
> > @@ -779,9 +786,14 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
> >                 return NULL;
> >
> >         /* TCR */
> > -       reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
> > -             (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
> > -             (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
> > +       if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
> > +               reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) |
> > +                     (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT);
>
> Contrary to the earlier code which favored IC/IS if possible, here we
> seem to disable IC/IS if the SYS_CACHE quirk is requested, regardless
> of whether it could still be desirable to use IC/IS. Perhaps rather
> than
> IO_PGTABLE_QUIRK_SYS_CACHE, we need something like
> IO_PGTABLE_QUIRK_NO_INNER_CACHE?

IIUC, with QUIRK_NO_INNER_CACHE, we would explicitly handle the case
of non I/O-coherent devices that can't use inner caches.
Other devices will have coherent view of inner as well as outer caches
(including system cache).
Do I understand you correctly?

Best regards
Vivek
>
> > +       } else {
> > +               reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
> > +                     (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT);
> > +       }
> > +       reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
> >
>
> [keeping the context]
>
> Best regards,
> Tomasz
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
diff mbox

Patch

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index f7a96bcf94a6..8058e7205034 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -249,6 +249,7 @@  struct arm_smmu_domain {
 	struct mutex			init_mutex; /* Protects smmu pointer */
 	spinlock_t			cb_lock; /* Serialises ATS1* ops and TLB syncs */
 	struct iommu_domain		domain;
+	bool				has_sys_cache;
 };
 
 struct arm_smmu_option_prop {
@@ -862,6 +863,8 @@  static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 
 	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK)
 		pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_NO_DMA;
+	if (smmu_domain->has_sys_cache)
+		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
 
 	smmu_domain->smmu = smmu;
 	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -1477,6 +1480,9 @@  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	case DOMAIN_ATTR_NESTING:
 		*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
 		return 0;
+	case DOMAIN_ATTR_USE_SYS_CACHE:
+		*((int *)data) = smmu_domain->has_sys_cache;
+		return 0;
 	default:
 		return -ENODEV;
 	}
@@ -1506,6 +1512,14 @@  static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 			smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
 
 		break;
+	case DOMAIN_ATTR_USE_SYS_CACHE:
+		if (smmu_domain->smmu) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+		if (*((int *)data))
+			smmu_domain->has_sys_cache = true;
+		break;
 	default:
 		ret = -ENODEV;
 	}
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 010a254305dd..b2aee1828524 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -169,9 +169,11 @@ 
 #define ARM_LPAE_MAIR_ATTR_DEVICE	0x04
 #define ARM_LPAE_MAIR_ATTR_NC		0x44
 #define ARM_LPAE_MAIR_ATTR_WBRWA	0xff
+#define ARM_LPAE_MAIR_ATTR_SYS_CACHE	0xf4
 #define ARM_LPAE_MAIR_ATTR_IDX_NC	0
 #define ARM_LPAE_MAIR_ATTR_IDX_CACHE	1
 #define ARM_LPAE_MAIR_ATTR_IDX_DEV	2
+#define ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE	3
 
 /* IOPTE accessors */
 #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
@@ -442,6 +444,10 @@  static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 		else if (prot & IOMMU_CACHE)
 			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
 				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+		else if (prot & IOMMU_SYS_CACHE)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+
 	} else {
 		pte = ARM_LPAE_PTE_HAP_FAULT;
 		if (prot & IOMMU_READ)
@@ -771,7 +777,8 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	u64 reg;
 	struct arm_lpae_io_pgtable *data;
 
-	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA))
+	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA |
+			    IO_PGTABLE_QUIRK_SYS_CACHE))
 		return NULL;
 
 	data = arm_lpae_alloc_pgtable(cfg);
@@ -779,9 +786,14 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 		return NULL;
 
 	/* TCR */
-	reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
-	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
-	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
+	if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+		reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH0_SHIFT) |
+		      (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN0_SHIFT);
+	} else {
+		reg = (ARM_LPAE_TCR_SH_IS << ARM_LPAE_TCR_SH0_SHIFT) |
+		      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT);
+	}
+	reg |= (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
 
 	switch (ARM_LPAE_GRANULE(data)) {
 	case SZ_4K:
@@ -833,7 +845,9 @@  arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	      (ARM_LPAE_MAIR_ATTR_WBRWA
 	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
 	      (ARM_LPAE_MAIR_ATTR_DEVICE
-	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+	      (ARM_LPAE_MAIR_ATTR_SYS_CACHE
+	       << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_SYS_CACHE));
 
 	cfg->arm_lpae_s1_cfg.mair[0] = reg;
 	cfg->arm_lpae_s1_cfg.mair[1] = 0;
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 2df79093cad9..b5a398380e9f 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -71,12 +71,16 @@  struct io_pgtable_cfg {
 	 *	be accessed by a fully cache-coherent IOMMU or CPU (e.g. for a
 	 *	software-emulated IOMMU), such that pagetable updates need not
 	 *	be treated as explicit DMA data.
+	 *
+	 * IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for
+	 *	the page table walker when using system cache.
 	 */
 	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
 	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
 	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
 	#define IO_PGTABLE_QUIRK_ARM_MTK_4GB	BIT(3)
 	#define IO_PGTABLE_QUIRK_NO_DMA		BIT(4)
+	#define IO_PGTABLE_QUIRK_SYS_CACHE	BIT(5)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 19938ee6eb31..dacb9648e9b3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -41,6 +41,9 @@ 
  * if the IOMMU page table format is equivalent.
  */
 #define IOMMU_PRIV	(1 << 5)
+/* Use last level cache available with few architectures */
+#define IOMMU_SYS_CACHE	(1 << 6)
+
 
 struct iommu_ops;
 struct iommu_group;
@@ -124,6 +127,7 @@  enum iommu_attr {
 	DOMAIN_ATTR_FSL_PAMU_ENABLE,
 	DOMAIN_ATTR_FSL_PAMUV1,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
+	DOMAIN_ATTR_USE_SYS_CACHE,
 	DOMAIN_ATTR_MAX,
 };