diff mbox series

[v2,7/7] drm/msm/a6xx: Add support for using system cache(LLC)

Message ID 1538744915-25490-8-git-send-email-smasetty@codeaurora.org (mailing list archive)
State New, archived
Headers show
Series drm/msm/a6xx: System Cache Support | expand

Commit Message

Sharat Masetty Oct. 5, 2018, 1:08 p.m. UTC
The last level system cache can be partitioned to 32 different slices
of which GPU has two slices preallocated. One slice is used for caching GPU
buffers and the other slice is used for caching the GPU SMMU pagetables.
This patch talks to the core system cache driver to acquire the slice handles,
configure the SCID's to those slices and activates and deactivates the slices
upon GPU power collapse and restore.

Some support from the IOMMU driver is also needed to make use of the
system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables
caching GPU data buffers in the system cache with memory attributes such
as outer cacheable, read-allocate, write-allocate for buffers. The GPU
then has the ability to override a few cacheability parameters which it
does to override write-allocate to write-no-allocate as the GPU hardware
does not benefit much from it.

Similarly DOMAIN_ATTR_USE_SYS_CACHE is another domain level attribute
used by the IOMMU driver to set the right attributes to cache the hardware
pagetables into the system cache.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |   9 ++
 drivers/gpu/drm/msm/msm_iommu.c       |  13 +++
 drivers/gpu/drm/msm/msm_mmu.h         |   3 +
 4 files changed, 183 insertions(+), 1 deletion(-)

Comments

Jordan Crouse Oct. 5, 2018, 3:07 p.m. UTC | #1
On Fri, Oct 05, 2018 at 06:38:35PM +0530, Sharat Masetty wrote:
> The last level system cache can be partitioned to 32 different slices
> of which GPU has two slices preallocated. One slice is used for caching GPU
> buffers and the other slice is used for caching the GPU SMMU pagetables.
> This patch talks to the core system cache driver to acquire the slice handles,
> configure the SCID's to those slices and activates and deactivates the slices
> upon GPU power collapse and restore.
> 
> Some support from the IOMMU driver is also needed to make use of the
> system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables
> caching GPU data buffers in the system cache with memory attributes such
> as outer cacheable, read-allocate, write-allocate for buffers. The GPU
> then has the ability to override a few cacheability parameters which it
> does to override write-allocate to write-no-allocate as the GPU hardware
> does not benefit much from it.
> 
> Similarly DOMAIN_ATTR_USE_SYS_CACHE is another domain level attribute
> used by the IOMMU driver to set the right attributes to cache the hardware
> pagetables into the system cache.
> 
> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.h |   9 ++
>  drivers/gpu/drm/msm/msm_iommu.c       |  13 +++
>  drivers/gpu/drm/msm/msm_mmu.h         |   3 +
>  4 files changed, 183 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 177dbfc..1790dde 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -8,6 +8,7 @@
>  #include "a6xx_gmu.xml.h"
>  
>  #include <linux/devfreq.h>
> +#include <linux/soc/qcom/llcc-qcom.h>
>  
>  static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
>  {
> @@ -674,6 +675,151 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>  	~0
>  };
>  
> +#define A6XX_LLC_NUM_GPU_SCIDS		5
> +#define A6XX_GPU_LLC_SCID_NUM_BITS	5
> +
> +#define A6XX_GPU_LLC_SCID_MASK \
> +	((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
> +
> +#define A6XX_GPUHTW_LLC_SCID_SHIFT	25
> +#define A6XX_GPUHTW_LLC_SCID_MASK \
> +	(((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
> +
> +static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
> +	u32 reg, u32 mask, u32 or)
> +{
> +	msm_rmw(llc->mmio + (reg << 2), mask, or);
> +}
> +
> +static void a6xx_llc_deactivate(struct msm_gpu *gpu)
> +{
> +	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> +	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> +	struct a6xx_llc *llc = &a6xx_gpu->llc;
> +
> +	llcc_slice_deactivate(llc->gpu_llc_slice);
> +	llcc_slice_deactivate(llc->gpuhtw_llc_slice);
> +}
> +
> +static void a6xx_llc_activate(struct msm_gpu *gpu)
> +{
> +	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> +	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> +	struct a6xx_llc *llc = &a6xx_gpu->llc;
> +
> +	if (!llc->mmio)
> +		return;
> +
> +	/*
> +	 * If the LLCC_GPU slice activated, program the sub-cache ID for all
> +	 * GPU blocks
> +	 */
> +	if (!llcc_slice_activate(llc->gpu_llc_slice))
> +		a6xx_gpu_cx_rmw(llc,
> +				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> +				A6XX_GPU_LLC_SCID_MASK,
> +				(llc->cntl1_regval &
> +				 A6XX_GPU_LLC_SCID_MASK));
> +
> +	/*
> +	 * If the LLCC_GPUHTW slice activated, program the sub-cache ID for the
> +	 * GPU pagetables
> +	 */
> +	if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
> +		a6xx_gpu_cx_rmw(llc,
> +				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> +				A6XX_GPUHTW_LLC_SCID_MASK,
> +				(llc->cntl1_regval &
> +				 A6XX_GPUHTW_LLC_SCID_MASK));
> +
> +	/* Program cacheability overrides */
> +	a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
> +		llc->cntl0_regval);
> +}
> +
> +void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
> +{
> +	if (llc->mmio) {
> +		iounmap(llc->mmio);
> +		llc->mmio = NULL;
> +	}
> +
> +	llcc_slice_putd(llc->gpu_llc_slice);
> +	llc->gpu_llc_slice = NULL;

I don't think these need to be put back to NULL - we shouldn't touch them again
after this point.

> +
> +	llcc_slice_putd(llc->gpuhtw_llc_slice);
> +	llc->gpuhtw_llc_slice = NULL;
> +}
> +
> +static int a6xx_llc_slices_init(struct platform_device *pdev,
> +		struct a6xx_llc *llc)
> +{
> +	int i;
> +
> +	/* Map registers */
> +	llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
> +	if (IS_ERR(llc->mmio)) {
> +		llc->mmio = NULL;
> +		return -1;

Return a valid error code here even if we don't care what it is.  -ENODEV maybe.
And in fact, if we don't care what it is (LLCC is very optional) then just don't
return anything at all.

> +	}
> +
> +	/* Get the system cache slice descriptor for GPU and GPUHTWs */
> +	llc->gpu_llc_slice = llcc_slice_getd(LLCC_GPU);
> +	llc->gpuhtw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
> +	if (IS_ERR(llc->gpu_llc_slice) && IS_ERR(llc->gpuhtw_llc_slice))
> +		return -1;
> +
> +	/*
> +	 * Setup GPU system cache CNTL0 and CNTL1 register values.
> +	 * These values will be programmed everytime GPU comes out
> +	 * of power collapse as these are non-retention registers.
> +	 */
> +
> +	/*
> +	 * CNTL0 provides options to override the settings for the
> +	 * read and write allocation policies for the LLC. These
> +	 * overrides are global for all memory transactions from
> +	 * the GPU.
> +	 *
> +	 * 0x3: read-no-alloc-overridden = 0
> +	 *      read-no-alloc = 0 - Allocate lines on read miss
> +	 *      write-no-alloc-overridden = 1
> +	 *      write-no-alloc = 1 - Do not allocates lines on write miss
> +	 */
> +	llc->cntl0_regval = 0x03;
> +
> +	/*
> +	 * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC
> +	 * FLAG cache) GPU blocks. This value will be passed along with
> +	 * the address for any memory transaction from GPU to identify
> +	 * the sub-cache for that transaction.
> +	 *
> +	 * Currently there is only one SCID allocated for all GPU blocks
> +	 * Hence set same SCID for all the blocks.

This last sentence is not needed

> +	 */
> +
> +	if (!IS_ERR(llc->gpu_llc_slice)) {
> +		u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice);
> +
> +		for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++)
> +			llc->cntl1_regval |=
> +				gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i);
> +	}
> +
> +	/*
> +	 * Set SCID for GPU IOMMU. This will be used to access
> +	 * page tables that are cached in LLC.
> +	 */
> +	if (!IS_ERR(llc->gpuhtw_llc_slice)) {
> +		u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice);
> +
> +		llc->cntl1_regval |=
> +			gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT;
> +	}
> +
> +	return 0;
> +}
> +
>  static int a6xx_pm_resume(struct msm_gpu *gpu)
>  {
>  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> @@ -686,6 +832,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
>  
>  	msm_gpu_resume_devfreq(gpu);
>  
> +	/* Activate LLC slices */
> +	a6xx_llc_activate(gpu);
> +
>  	return ret;
>  }
>  
> @@ -694,6 +843,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
>  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>  	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>  
> +	/* Deactivate LLC slices */
> +	a6xx_llc_deactivate(gpu);
> +
>  	devfreq_suspend_device(gpu->devfreq.devfreq);
>  
>  	/*
> @@ -753,6 +905,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
>  		drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo);
>  	}
>  
> +	a6xx_llc_slices_destroy(&a6xx_gpu->llc);
> +
>  	a6xx_gmu_remove(a6xx_gpu);
>  
>  	adreno_gpu_cleanup(adreno_gpu);
> @@ -819,7 +973,10 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
>  	adreno_gpu->registers = a6xx_registers;
>  	adreno_gpu->reg_offsets = a6xx_register_offsets;
>  
> -	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, 0);
> +	ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);

Yep - there is no reason to take a ret and not deal with it.

> +
> +	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1,
> +			ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE);
>  	if (ret) {
>  		a6xx_destroy(&(a6xx_gpu->base.base));
>  		return ERR_PTR(ret);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> index 4127dce..86353e8 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
> @@ -12,6 +12,14 @@
>  
>  extern bool hang_debug;
>  
> +struct a6xx_llc {
> +	void __iomem *mmio;
> +	void *gpu_llc_slice;
> +	void *gpuhtw_llc_slice;
> +	u32 cntl0_regval;
> +	u32 cntl1_regval;
> +};
> +
>  struct a6xx_gpu {
>  	struct adreno_gpu base;
>  
> @@ -21,6 +29,7 @@ struct a6xx_gpu {
>  	struct msm_ringbuffer *cur_ring;
>  
>  	struct a6xx_gmu gmu;
> +	struct a6xx_llc llc;
>  };
>  
>  #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
> index e80c79b..66612c4 100644
> --- a/drivers/gpu/drm/msm/msm_iommu.c
> +++ b/drivers/gpu/drm/msm/msm_iommu.c
> @@ -38,6 +38,16 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char * const *names,
>  			    int cnt)
>  {
>  	struct msm_iommu *iommu = to_msm_iommu(mmu);
> +	int gpu_htw_llc = 1;
> +
> +	/*
> +	 * This allows GPU to set the bus attributes required
> +	 * to use system cache on behalf of the iommu page table
> +	 * walker.
> +	 */
> +	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
> +		iommu_domain_set_attr(iommu->domain,
> +				DOMAIN_ATTR_USE_SYS_CACHE, &gpu_htw_llc);
>  
>  	return iommu_attach_device(iommu->domain, mmu->dev);
>  }
> @@ -56,6 +66,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
>  	struct msm_iommu *iommu = to_msm_iommu(mmu);
>  	size_t ret;
>  
> +	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
> +		prot |= IOMMU_SYS_CACHE;
> +
>  	ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot);
>  	WARN_ON(ret < 0);
>  
> diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
> index 9b9f43f..524790b 100644
> --- a/drivers/gpu/drm/msm/msm_mmu.h
> +++ b/drivers/gpu/drm/msm/msm_mmu.h
> @@ -49,6 +49,9 @@ struct msm_mmu_funcs {
>  	bool (*is_domain_secure)(struct msm_mmu *mmu);
>  };
>  
> +/* MMU features */
> +#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0)
> +
>  struct msm_mmu {
>  	const struct msm_mmu_funcs *funcs;
>  	struct device *dev;
> -- 
> 1.9.1
>
Sharat Masetty Oct. 8, 2018, 1:59 p.m. UTC | #2
On 10/5/2018 8:37 PM, Jordan Crouse wrote:
> On Fri, Oct 05, 2018 at 06:38:35PM +0530, Sharat Masetty wrote:
>> The last level system cache can be partitioned to 32 different slices
>> of which GPU has two slices preallocated. One slice is used for caching GPU
>> buffers and the other slice is used for caching the GPU SMMU pagetables.
>> This patch talks to the core system cache driver to acquire the slice handles,
>> configure the SCID's to those slices and activates and deactivates the slices
>> upon GPU power collapse and restore.
>>
>> Some support from the IOMMU driver is also needed to make use of the
>> system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables
>> caching GPU data buffers in the system cache with memory attributes such
>> as outer cacheable, read-allocate, write-allocate for buffers. The GPU
>> then has the ability to override a few cacheability parameters which it
>> does to override write-allocate to write-no-allocate as the GPU hardware
>> does not benefit much from it.
>>
>> Similarly DOMAIN_ATTR_USE_SYS_CACHE is another domain level attribute
>> used by the IOMMU driver to set the right attributes to cache the hardware
>> pagetables into the system cache.
>>
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/msm/adreno/a6xx_gpu.h |   9 ++
>>   drivers/gpu/drm/msm/msm_iommu.c       |  13 +++
>>   drivers/gpu/drm/msm/msm_mmu.h         |   3 +
>>   4 files changed, 183 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> index 177dbfc..1790dde 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> @@ -8,6 +8,7 @@
>>   #include "a6xx_gmu.xml.h"
>>   
>>   #include <linux/devfreq.h>
>> +#include <linux/soc/qcom/llcc-qcom.h>
>>   
>>   static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
>>   {
>> @@ -674,6 +675,151 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>>   	~0
>>   };
>>   
>> +#define A6XX_LLC_NUM_GPU_SCIDS		5
>> +#define A6XX_GPU_LLC_SCID_NUM_BITS	5
>> +
>> +#define A6XX_GPU_LLC_SCID_MASK \
>> +	((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
>> +
>> +#define A6XX_GPUHTW_LLC_SCID_SHIFT	25
>> +#define A6XX_GPUHTW_LLC_SCID_MASK \
>> +	(((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
>> +
>> +static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
>> +	u32 reg, u32 mask, u32 or)
>> +{
>> +	msm_rmw(llc->mmio + (reg << 2), mask, or);
>> +}
>> +
>> +static void a6xx_llc_deactivate(struct msm_gpu *gpu)
>> +{
>> +	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> +	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> +	struct a6xx_llc *llc = &a6xx_gpu->llc;
>> +
>> +	llcc_slice_deactivate(llc->gpu_llc_slice);
>> +	llcc_slice_deactivate(llc->gpuhtw_llc_slice);
>> +}
>> +
>> +static void a6xx_llc_activate(struct msm_gpu *gpu)
>> +{
>> +	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> +	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> +	struct a6xx_llc *llc = &a6xx_gpu->llc;
>> +
>> +	if (!llc->mmio)
>> +		return;
>> +
>> +	/*
>> +	 * If the LLCC_GPU slice activated, program the sub-cache ID for all
>> +	 * GPU blocks
>> +	 */
>> +	if (!llcc_slice_activate(llc->gpu_llc_slice))
>> +		a6xx_gpu_cx_rmw(llc,
>> +				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
>> +				A6XX_GPU_LLC_SCID_MASK,
>> +				(llc->cntl1_regval &
>> +				 A6XX_GPU_LLC_SCID_MASK));
>> +
>> +	/*
>> +	 * If the LLCC_GPUHTW slice activated, program the sub-cache ID for the
>> +	 * GPU pagetables
>> +	 */
>> +	if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
>> +		a6xx_gpu_cx_rmw(llc,
>> +				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
>> +				A6XX_GPUHTW_LLC_SCID_MASK,
>> +				(llc->cntl1_regval &
>> +				 A6XX_GPUHTW_LLC_SCID_MASK));
>> +
>> +	/* Program cacheability overrides */
>> +	a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
>> +		llc->cntl0_regval);
>> +}
>> +
>> +void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
>> +{
>> +	if (llc->mmio) {
>> +		iounmap(llc->mmio);
>> +		llc->mmio = NULL;
>> +	}
>> +
>> +	llcc_slice_putd(llc->gpu_llc_slice);
>> +	llc->gpu_llc_slice = NULL;
> 
> I don't think these need to be put back to NULL - we shouldn't touch them again
> after this point.
> 
>> +
>> +	llcc_slice_putd(llc->gpuhtw_llc_slice);
>> +	llc->gpuhtw_llc_slice = NULL;
>> +}
>> +
>> +static int a6xx_llc_slices_init(struct platform_device *pdev,
>> +		struct a6xx_llc *llc)
>> +{
>> +	int i;
>> +
>> +	/* Map registers */
>> +	llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
>> +	if (IS_ERR(llc->mmio)) {
>> +		llc->mmio = NULL;
>> +		return -1;
> 
> Return a valid error code here even if we don't care what it is.  -ENODEV maybe.
> And in fact, if we don't care what it is (LLCC is very optional) then just don't
> return anything at all.
Hi Jordan,

We do need the error code, as we have to let iommu layer know, so that 
it can set additional properties for the buffers and the page tables.
> 
>> +	}
>> +
>> +	/* Get the system cache slice descriptor for GPU and GPUHTWs */
>> +	llc->gpu_llc_slice = llcc_slice_getd(LLCC_GPU);
>> +	llc->gpuhtw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
>> +	if (IS_ERR(llc->gpu_llc_slice) && IS_ERR(llc->gpuhtw_llc_slice))
>> +		return -1;
>> +
>> +	/*
>> +	 * Setup GPU system cache CNTL0 and CNTL1 register values.
>> +	 * These values will be programmed everytime GPU comes out
>> +	 * of power collapse as these are non-retention registers.
>> +	 */
>> +
>> +	/*
>> +	 * CNTL0 provides options to override the settings for the
>> +	 * read and write allocation policies for the LLC. These
>> +	 * overrides are global for all memory transactions from
>> +	 * the GPU.
>> +	 *
>> +	 * 0x3: read-no-alloc-overridden = 0
>> +	 *      read-no-alloc = 0 - Allocate lines on read miss
>> +	 *      write-no-alloc-overridden = 1
>> +	 *      write-no-alloc = 1 - Do not allocates lines on write miss
>> +	 */
>> +	llc->cntl0_regval = 0x03;
>> +
>> +	/*
>> +	 * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC
>> +	 * FLAG cache) GPU blocks. This value will be passed along with
>> +	 * the address for any memory transaction from GPU to identify
>> +	 * the sub-cache for that transaction.
>> +	 *
>> +	 * Currently there is only one SCID allocated for all GPU blocks
>> +	 * Hence set same SCID for all the blocks.
> 
> This last sentence is not needed
> 
>> +	 */
>> +
>> +	if (!IS_ERR(llc->gpu_llc_slice)) {
>> +		u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice);
>> +
>> +		for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++)
>> +			llc->cntl1_regval |=
>> +				gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i);
>> +	}
>> +
>> +	/*
>> +	 * Set SCID for GPU IOMMU. This will be used to access
>> +	 * page tables that are cached in LLC.
>> +	 */
>> +	if (!IS_ERR(llc->gpuhtw_llc_slice)) {
>> +		u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice);
>> +
>> +		llc->cntl1_regval |=
>> +			gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static int a6xx_pm_resume(struct msm_gpu *gpu)
>>   {
>>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> @@ -686,6 +832,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
>>   
>>   	msm_gpu_resume_devfreq(gpu);
>>   
>> +	/* Activate LLC slices */
>> +	a6xx_llc_activate(gpu);
>> +
>>   	return ret;
>>   }
>>   
>> @@ -694,6 +843,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
>>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>   	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>>   
>> +	/* Deactivate LLC slices */
>> +	a6xx_llc_deactivate(gpu);
>> +
>>   	devfreq_suspend_device(gpu->devfreq.devfreq);
>>   
>>   	/*
>> @@ -753,6 +905,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
>>   		drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo);
>>   	}
>>   
>> +	a6xx_llc_slices_destroy(&a6xx_gpu->llc);
>> +
>>   	a6xx_gmu_remove(a6xx_gpu);
>>   
>>   	adreno_gpu_cleanup(adreno_gpu);
>> @@ -819,7 +973,10 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
>>   	adreno_gpu->registers = a6xx_registers;
>>   	adreno_gpu->reg_offsets = a6xx_register_offsets;
>>   
>> -	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, 0);
>> +	ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);
> 
> Yep - there is no reason to take a ret and not deal with it.
> 
>> +
>> +	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1,
>> +			ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE);
>>   	if (ret) {
>>   		a6xx_destroy(&(a6xx_gpu->base.base));
>>   		return ERR_PTR(ret);
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> index 4127dce..86353e8 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> @@ -12,6 +12,14 @@
>>   
>>   extern bool hang_debug;
>>   
>> +struct a6xx_llc {
>> +	void __iomem *mmio;
>> +	void *gpu_llc_slice;
>> +	void *gpuhtw_llc_slice;
>> +	u32 cntl0_regval;
>> +	u32 cntl1_regval;
>> +};
>> +
>>   struct a6xx_gpu {
>>   	struct adreno_gpu base;
>>   
>> @@ -21,6 +29,7 @@ struct a6xx_gpu {
>>   	struct msm_ringbuffer *cur_ring;
>>   
>>   	struct a6xx_gmu gmu;
>> +	struct a6xx_llc llc;
>>   };
>>   
>>   #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
>> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
>> index e80c79b..66612c4 100644
>> --- a/drivers/gpu/drm/msm/msm_iommu.c
>> +++ b/drivers/gpu/drm/msm/msm_iommu.c
>> @@ -38,6 +38,16 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char * const *names,
>>   			    int cnt)
>>   {
>>   	struct msm_iommu *iommu = to_msm_iommu(mmu);
>> +	int gpu_htw_llc = 1;
>> +
>> +	/*
>> +	 * This allows GPU to set the bus attributes required
>> +	 * to use system cache on behalf of the iommu page table
>> +	 * walker.
>> +	 */
>> +	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
>> +		iommu_domain_set_attr(iommu->domain,
>> +				DOMAIN_ATTR_USE_SYS_CACHE, &gpu_htw_llc);
>>   
>>   	return iommu_attach_device(iommu->domain, mmu->dev);
>>   }
>> @@ -56,6 +66,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
>>   	struct msm_iommu *iommu = to_msm_iommu(mmu);
>>   	size_t ret;
>>   
>> +	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
>> +		prot |= IOMMU_SYS_CACHE;
>> +
>>   	ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot);
>>   	WARN_ON(ret < 0);
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
>> index 9b9f43f..524790b 100644
>> --- a/drivers/gpu/drm/msm/msm_mmu.h
>> +++ b/drivers/gpu/drm/msm/msm_mmu.h
>> @@ -49,6 +49,9 @@ struct msm_mmu_funcs {
>>   	bool (*is_domain_secure)(struct msm_mmu *mmu);
>>   };
>>   
>> +/* MMU features */
>> +#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0)
>> +
>>   struct msm_mmu {
>>   	const struct msm_mmu_funcs *funcs;
>>   	struct device *dev;
>> -- 
>> 1.9.1
>>
>
Jordan Crouse Oct. 8, 2018, 2:18 p.m. UTC | #3
On Mon, Oct 08, 2018 at 07:29:03PM +0530, Sharat Masetty wrote:
> 
> 
> On 10/5/2018 8:37 PM, Jordan Crouse wrote:
> >On Fri, Oct 05, 2018 at 06:38:35PM +0530, Sharat Masetty wrote:
> >>The last level system cache can be partitioned to 32 different slices
> >>of which GPU has two slices preallocated. One slice is used for caching GPU
> >>buffers and the other slice is used for caching the GPU SMMU pagetables.
> >>This patch talks to the core system cache driver to acquire the slice handles,
> >>configure the SCID's to those slices and activates and deactivates the slices
> >>upon GPU power collapse and restore.
> >>
> >>Some support from the IOMMU driver is also needed to make use of the
> >>system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables
> >>caching GPU data buffers in the system cache with memory attributes such
> >>as outer cacheable, read-allocate, write-allocate for buffers. The GPU
> >>then has the ability to override a few cacheability parameters which it
> >>does to override write-allocate to write-no-allocate as the GPU hardware
> >>does not benefit much from it.
> >>
> >>Similarly DOMAIN_ATTR_USE_SYS_CACHE is another domain level attribute
> >>used by the IOMMU driver to set the right attributes to cache the hardware
> >>pagetables into the system cache.
> >>
> >>Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> >>---
> >>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++-
> >>  drivers/gpu/drm/msm/adreno/a6xx_gpu.h |   9 ++
> >>  drivers/gpu/drm/msm/msm_iommu.c       |  13 +++
> >>  drivers/gpu/drm/msm/msm_mmu.h         |   3 +
> >>  4 files changed, 183 insertions(+), 1 deletion(-)
> >>
> >>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> >>index 177dbfc..1790dde 100644
> >>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> >>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> >>@@ -8,6 +8,7 @@
> >>  #include "a6xx_gmu.xml.h"
> >>  #include <linux/devfreq.h>
> >>+#include <linux/soc/qcom/llcc-qcom.h>
> >>  static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
> >>  {
> >>@@ -674,6 +675,151 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
> >>  	~0
> >>  };
> >>+#define A6XX_LLC_NUM_GPU_SCIDS		5
> >>+#define A6XX_GPU_LLC_SCID_NUM_BITS	5
> >>+
> >>+#define A6XX_GPU_LLC_SCID_MASK \
> >>+	((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
> >>+
> >>+#define A6XX_GPUHTW_LLC_SCID_SHIFT	25
> >>+#define A6XX_GPUHTW_LLC_SCID_MASK \
> >>+	(((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
> >>+
> >>+static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
> >>+	u32 reg, u32 mask, u32 or)
> >>+{
> >>+	msm_rmw(llc->mmio + (reg << 2), mask, or);
> >>+}
> >>+
> >>+static void a6xx_llc_deactivate(struct msm_gpu *gpu)
> >>+{
> >>+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> >>+	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> >>+	struct a6xx_llc *llc = &a6xx_gpu->llc;
> >>+
> >>+	llcc_slice_deactivate(llc->gpu_llc_slice);
> >>+	llcc_slice_deactivate(llc->gpuhtw_llc_slice);
> >>+}
> >>+
> >>+static void a6xx_llc_activate(struct msm_gpu *gpu)
> >>+{
> >>+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
> >>+	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> >>+	struct a6xx_llc *llc = &a6xx_gpu->llc;
> >>+
> >>+	if (!llc->mmio)
> >>+		return;
> >>+
> >>+	/*
> >>+	 * If the LLCC_GPU slice activated, program the sub-cache ID for all
> >>+	 * GPU blocks
> >>+	 */
> >>+	if (!llcc_slice_activate(llc->gpu_llc_slice))
> >>+		a6xx_gpu_cx_rmw(llc,
> >>+				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> >>+				A6XX_GPU_LLC_SCID_MASK,
> >>+				(llc->cntl1_regval &
> >>+				 A6XX_GPU_LLC_SCID_MASK));
> >>+
> >>+	/*
> >>+	 * If the LLCC_GPUHTW slice activated, program the sub-cache ID for the
> >>+	 * GPU pagetables
> >>+	 */
> >>+	if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
> >>+		a6xx_gpu_cx_rmw(llc,
> >>+				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
> >>+				A6XX_GPUHTW_LLC_SCID_MASK,
> >>+				(llc->cntl1_regval &
> >>+				 A6XX_GPUHTW_LLC_SCID_MASK));
> >>+
> >>+	/* Program cacheability overrides */
> >>+	a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
> >>+		llc->cntl0_regval);
> >>+}
> >>+
> >>+void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
> >>+{
> >>+	if (llc->mmio) {
> >>+		iounmap(llc->mmio);
> >>+		llc->mmio = NULL;
> >>+	}
> >>+
> >>+	llcc_slice_putd(llc->gpu_llc_slice);
> >>+	llc->gpu_llc_slice = NULL;
> >
> >I don't think these need to be put back to NULL - we shouldn't touch them again
> >after this point.
> >
> >>+
> >>+	llcc_slice_putd(llc->gpuhtw_llc_slice);
> >>+	llc->gpuhtw_llc_slice = NULL;
> >>+}
> >>+
> >>+static int a6xx_llc_slices_init(struct platform_device *pdev,
> >>+		struct a6xx_llc *llc)
> >>+{
> >>+	int i;
> >>+
> >>+	/* Map registers */
> >>+	llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
> >>+	if (IS_ERR(llc->mmio)) {
> >>+		llc->mmio = NULL;
> >>+		return -1;
> >
> >Return a valid error code here even if we don't care what it is.  -ENODEV maybe.
> >And in fact, if we don't care what it is (LLCC is very optional) then just don't
> >return anything at all.
> Hi Jordan,
> 
> We do need the error code, as we have to let iommu layer know, so
> that it can set additional properties for the buffers and the page
> tables.

Oh, I see what you did (you should set the flag outside of the function call
so it is clearer what is going on).

But why is the MMU flag conditional? If we don't end up activating our slice
does it hurt to set the UPSTREAM_HINT in the pagetable?

Jordan
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 177dbfc..1790dde 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,6 +8,7 @@ 
 #include "a6xx_gmu.xml.h"
 
 #include <linux/devfreq.h>
+#include <linux/soc/qcom/llcc-qcom.h>
 
 static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
 {
@@ -674,6 +675,151 @@  static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
 	~0
 };
 
+#define A6XX_LLC_NUM_GPU_SCIDS		5
+#define A6XX_GPU_LLC_SCID_NUM_BITS	5
+
+#define A6XX_GPU_LLC_SCID_MASK \
+	((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
+
+#define A6XX_GPUHTW_LLC_SCID_SHIFT	25
+#define A6XX_GPUHTW_LLC_SCID_MASK \
+	(((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
+
+static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
+	u32 reg, u32 mask, u32 or)
+{
+	msm_rmw(llc->mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_deactivate(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+	struct a6xx_llc *llc = &a6xx_gpu->llc;
+
+	llcc_slice_deactivate(llc->gpu_llc_slice);
+	llcc_slice_deactivate(llc->gpuhtw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+	struct a6xx_llc *llc = &a6xx_gpu->llc;
+
+	if (!llc->mmio)
+		return;
+
+	/*
+	 * If the LLCC_GPU slice activated, program the sub-cache ID for all
+	 * GPU blocks
+	 */
+	if (!llcc_slice_activate(llc->gpu_llc_slice))
+		a6xx_gpu_cx_rmw(llc,
+				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
+				A6XX_GPU_LLC_SCID_MASK,
+				(llc->cntl1_regval &
+				 A6XX_GPU_LLC_SCID_MASK));
+
+	/*
+	 * If the LLCC_GPUHTW slice activated, program the sub-cache ID for the
+	 * GPU pagetables
+	 */
+	if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
+		a6xx_gpu_cx_rmw(llc,
+				REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
+				A6XX_GPUHTW_LLC_SCID_MASK,
+				(llc->cntl1_regval &
+				 A6XX_GPUHTW_LLC_SCID_MASK));
+
+	/* Program cacheability overrides */
+	a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
+		llc->cntl0_regval);
+}
+
+void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
+{
+	if (llc->mmio) {
+		iounmap(llc->mmio);
+		llc->mmio = NULL;
+	}
+
+	llcc_slice_putd(llc->gpu_llc_slice);
+	llc->gpu_llc_slice = NULL;
+
+	llcc_slice_putd(llc->gpuhtw_llc_slice);
+	llc->gpuhtw_llc_slice = NULL;
+}
+
+static int a6xx_llc_slices_init(struct platform_device *pdev,
+		struct a6xx_llc *llc)
+{
+	int i;
+
+	/* Map registers */
+	llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+	if (IS_ERR(llc->mmio)) {
+		llc->mmio = NULL;
+		return -1;
+	}
+
+	/* Get the system cache slice descriptor for GPU and GPUHTWs */
+	llc->gpu_llc_slice = llcc_slice_getd(LLCC_GPU);
+	llc->gpuhtw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+	if (IS_ERR(llc->gpu_llc_slice) && IS_ERR(llc->gpuhtw_llc_slice))
+		return -1;
+
+	/*
+	 * Setup GPU system cache CNTL0 and CNTL1 register values.
+	 * These values will be programmed everytime GPU comes out
+	 * of power collapse as these are non-retention registers.
+	 */
+
+	/*
+	 * CNTL0 provides options to override the settings for the
+	 * read and write allocation policies for the LLC. These
+	 * overrides are global for all memory transactions from
+	 * the GPU.
+	 *
+	 * 0x3: read-no-alloc-overridden = 0
+	 *      read-no-alloc = 0 - Allocate lines on read miss
+	 *      write-no-alloc-overridden = 1
+	 *      write-no-alloc = 1 - Do not allocates lines on write miss
+	 */
+	llc->cntl0_regval = 0x03;
+
+	/*
+	 * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC
+	 * FLAG cache) GPU blocks. This value will be passed along with
+	 * the address for any memory transaction from GPU to identify
+	 * the sub-cache for that transaction.
+	 *
+	 * Currently there is only one SCID allocated for all GPU blocks
+	 * Hence set same SCID for all the blocks.
+	 */
+
+	if (!IS_ERR(llc->gpu_llc_slice)) {
+		u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice);
+
+		for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++)
+			llc->cntl1_regval |=
+				gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i);
+	}
+
+	/*
+	 * Set SCID for GPU IOMMU. This will be used to access
+	 * page tables that are cached in LLC.
+	 */
+	if (!IS_ERR(llc->gpuhtw_llc_slice)) {
+		u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice);
+
+		llc->cntl1_regval |=
+			gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT;
+	}
+
+	return 0;
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -686,6 +832,9 @@  static int a6xx_pm_resume(struct msm_gpu *gpu)
 
 	msm_gpu_resume_devfreq(gpu);
 
+	/* Activate LLC slices */
+	a6xx_llc_activate(gpu);
+
 	return ret;
 }
 
@@ -694,6 +843,9 @@  static int a6xx_pm_suspend(struct msm_gpu *gpu)
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 
+	/* Deactivate LLC slices */
+	a6xx_llc_deactivate(gpu);
+
 	devfreq_suspend_device(gpu->devfreq.devfreq);
 
 	/*
@@ -753,6 +905,8 @@  static void a6xx_destroy(struct msm_gpu *gpu)
 		drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo);
 	}
 
+	a6xx_llc_slices_destroy(&a6xx_gpu->llc);
+
 	a6xx_gmu_remove(a6xx_gpu);
 
 	adreno_gpu_cleanup(adreno_gpu);
@@ -819,7 +973,10 @@  struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
 	adreno_gpu->registers = a6xx_registers;
 	adreno_gpu->reg_offsets = a6xx_register_offsets;
 
-	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, 0);
+	ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);
+
+	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1,
+			ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE);
 	if (ret) {
 		a6xx_destroy(&(a6xx_gpu->base.base));
 		return ERR_PTR(ret);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 4127dce..86353e8 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -12,6 +12,14 @@ 
 
 extern bool hang_debug;
 
+struct a6xx_llc {
+	void __iomem *mmio;
+	void *gpu_llc_slice;
+	void *gpuhtw_llc_slice;
+	u32 cntl0_regval;
+	u32 cntl1_regval;
+};
+
 struct a6xx_gpu {
 	struct adreno_gpu base;
 
@@ -21,6 +29,7 @@  struct a6xx_gpu {
 	struct msm_ringbuffer *cur_ring;
 
 	struct a6xx_gmu gmu;
+	struct a6xx_llc llc;
 };
 
 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index e80c79b..66612c4 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -38,6 +38,16 @@  static int msm_iommu_attach(struct msm_mmu *mmu, const char * const *names,
 			    int cnt)
 {
 	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	int gpu_htw_llc = 1;
+
+	/*
+	 * This allows GPU to set the bus attributes required
+	 * to use system cache on behalf of the iommu page table
+	 * walker.
+	 */
+	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
+		iommu_domain_set_attr(iommu->domain,
+				DOMAIN_ATTR_USE_SYS_CACHE, &gpu_htw_llc);
 
 	return iommu_attach_device(iommu->domain, mmu->dev);
 }
@@ -56,6 +66,9 @@  static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
 	struct msm_iommu *iommu = to_msm_iommu(mmu);
 	size_t ret;
 
+	if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
+		prot |= IOMMU_SYS_CACHE;
+
 	ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot);
 	WARN_ON(ret < 0);
 
diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
index 9b9f43f..524790b 100644
--- a/drivers/gpu/drm/msm/msm_mmu.h
+++ b/drivers/gpu/drm/msm/msm_mmu.h
@@ -49,6 +49,9 @@  struct msm_mmu_funcs {
 	bool (*is_domain_secure)(struct msm_mmu *mmu);
 };
 
+/* MMU features */
+#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0)
+
 struct msm_mmu {
 	const struct msm_mmu_funcs *funcs;
 	struct device *dev;