diff mbox series

[v2,6/9] drm/msm/a6xx: A640/A650 GMU firmware path

Message ID 20200421234127.27965-7-jonathan@marek.ca (mailing list archive)
State New, archived
Headers show
Series [v2,1/9] drm/msm: add msm_gem_get_and_pin_iova_range | expand

Commit Message

Jonathan Marek April 21, 2020, 11:41 p.m. UTC
Newer GPUs have different GMU firmware path.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.c     | 135 +++++++++++++++++++---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h     |  11 ++
 drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h |   6 +
 3 files changed, 136 insertions(+), 16 deletions(-)

Comments

Jordan Crouse April 23, 2020, 4:20 p.m. UTC | #1
On Tue, Apr 21, 2020 at 07:41:24PM -0400, Jonathan Marek wrote:
> Newer GPUs have different GMU firmware path.
> 
> Signed-off-by: Jonathan Marek <jonathan@marek.ca>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.c     | 135 +++++++++++++++++++---
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.h     |  11 ++
>  drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h |   6 +
>  3 files changed, 136 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> index b22a69e2f4b0..4aef5fe985d6 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> @@ -571,6 +571,8 @@ static void a6xx_gmu_power_config(struct a6xx_gmu *gmu)
>  {
>  	/* Disable GMU WB/RB buffer */
>  	gmu_write(gmu, REG_A6XX_GMU_SYS_BUS_CONFIG, 0x1);
> +	gmu_write(gmu, REG_A6XX_GMU_ICACHE_CONFIG, 0x1);
> +	gmu_write(gmu, REG_A6XX_GMU_DCACHE_CONFIG, 0x1);
>  
>  	gmu_write(gmu, REG_A6XX_GMU_PWR_COL_INTER_FRAME_CTRL, 0x9c40400);
>  
> @@ -600,14 +602,91 @@ static void a6xx_gmu_power_config(struct a6xx_gmu *gmu)
>  		A6XX_GMU_RPMH_CTRL_GFX_VOTE_ENABLE);
>  }
>  
> +static int in_range(u32 addr, u32 start, u32 size)
> +{
> +	return addr >= start && addr < start + size;
> +}

I can't believe they don't have a clever macro in include/kernel.h for range
checking.

> +static int a6xx_gmu_fw_load(struct a6xx_gmu *gmu)
> +{
> +	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
> +	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> +	const struct firmware *fw_image = adreno_gpu->fw[ADRENO_FW_GMU];
> +	const struct block_header {
> +		u32 addr;
> +		u32 size;
> +		u32 type;
> +		u32 value;
> +		u32 data[];
> +	} *blk;
> +	struct a6xx_gmu_bo *mem;
> +	u32 reg;
> +
> +	u32 itcm_base = 0x00000000;
> +	u32 dtcm_base = 0x00040000;
> +
> +	if (adreno_is_a650(adreno_gpu))
> +		dtcm_base = 0x10004000;
> +

Yet another one of those spots that might grow an ugly conditional.

> +	if (gmu->legacy) {
> +		/* Sanity check the size of the firmware that was loaded */
> +		if (fw_image->size > 0x8000) {
> +			DRM_DEV_ERROR(gmu->dev,
> +				"GMU firmware is bigger than the available region\n");
> +			return -EINVAL;
> +		}
> +
> +		gmu_write_bulk(gmu, REG_A6XX_GMU_CM3_ITCM_START,
> +			       (u32*) fw_image->data, fw_image->size >> 2);
> +		return 0;
> +	}
> +
> +
> +	for (blk = (const struct block_header *) fw_image->data;
> +	     (const u8*) blk < fw_image->data + fw_image->size;
> +	     blk = (const struct block_header *) &blk->data[blk->size >> 2]) {
> +		if (blk->size == 0)
> +			continue;
> +
> +		reg = 0;
> +		mem = NULL;
> +
> +		if (in_range(blk->addr, itcm_base, SZ_16K))
> +			reg = REG_A6XX_GMU_CM3_ITCM_START + ((blk->addr - itcm_base) >> 2);

Personally I would just gmu_write_bulk() here just to avoid the extra registers.

> +		else if (in_range(blk->addr, dtcm_base, SZ_16K))
> +			reg = REG_A6XX_GMU_CM3_DTCM_START + ((blk->addr - dtcm_base) >> 2);

> +		else if (in_range(blk->addr, gmu->icache.iova, gmu->icache.size))
> +			mem = &gmu->icache;

I would mempcpy here. 

> +		else if (in_range(blk->addr, gmu->dcache.iova, gmu->dcache.size))

> +			mem = &gmu->dcache;
> +		else if (in_range(blk->addr, gmu->dummy.iova, gmu->dummy.size))
> +			mem = &gmu->dummy;
> +		else {
> +			DRM_DEV_ERROR(gmu->dev,
> +				"failed to match fw block (addr=%.8x size=%d data[0]=%.8x)\n",
> +				blk->addr, blk->size, blk->data[0]);
> +		}
> +
> +		if (reg)
> +			gmu_write_bulk(gmu, reg, blk->data, blk->size >> 2);
> +
> +		if (mem)
> +			memcpy(mem->virt + blk->addr - mem->iova, blk->data, blk->size);
> +	}
> +
> +	return 0;
> +}
> +
>  static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
>  {
>  	static bool rpmh_init;
>  	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>  	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> -	int i, ret;
> +	int ret;
>  	u32 chipid;
> -	u32 *image;
> +
> +	if (adreno_is_a650(adreno_gpu))
> +		gmu_write(gmu, REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF, 1);
>  
>  	if (state == GMU_WARM_BOOT) {
>  		ret = a6xx_rpmh_start(gmu);
> @@ -618,13 +697,6 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
>  			"GMU firmware is not loaded\n"))
>  			return -ENOENT;
>  
> -		/* Sanity check the size of the firmware that was loaded */
> -		if (adreno_gpu->fw[ADRENO_FW_GMU]->size > 0x8000) {
> -			DRM_DEV_ERROR(gmu->dev,
> -				"GMU firmware is bigger than the available region\n");
> -			return -EINVAL;
> -		}
> -
>  		/* Turn on register retention */
>  		gmu_write(gmu, REG_A6XX_GMU_GENERAL_7, 1);
>  
> @@ -638,11 +710,9 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
>  				return ret;
>  		}
>  
> -		image = (u32 *) adreno_gpu->fw[ADRENO_FW_GMU]->data;
> -
> -		for (i = 0; i < adreno_gpu->fw[ADRENO_FW_GMU]->size >> 2; i++)
> -			gmu_write(gmu, REG_A6XX_GMU_CM3_ITCM_START + i,
> -				image[i]);
> +		ret = a6xx_gmu_fw_load(gmu);
> +		if (ret)
> +			return ret;
>  	}
>  
>  	gmu_write(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, 0);
> @@ -769,12 +839,20 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu)
>  	/* Enable the GMU interrupt */
>  	gmu_write(gmu, REG_A6XX_GMU_AO_HOST_INTERRUPT_CLR, ~0);
>  	gmu_write(gmu, REG_A6XX_GMU_AO_HOST_INTERRUPT_MASK, ~A6XX_GMU_IRQ_MASK);
> +

Extra whitespace change.

>  	enable_irq(gmu->gmu_irq);
>  
>  	/* Check to see if we are doing a cold or warm boot */
>  	status = gmu_read(gmu, REG_A6XX_GMU_GENERAL_7) == 1 ?
>  		GMU_WARM_BOOT : GMU_COLD_BOOT;
>  
> +	/*
> +	 * Warm boot path does not work on newer GPUs
> +	 * Presumably this is because icache/dcache regions must be restored
> +	 */
> +	if (!gmu->legacy)
> +		status = GMU_COLD_BOOT;
> +

To be honest, I'm not 100% sure WARM_BOOT even makes sense on a630. Downstream
4.19 has no paths that actually use WARM_BOOT. It might be worth a shot to
try removing it in a future patch.

>  	ret = a6xx_gmu_fw_start(gmu, status);
>  	if (ret)
>  		goto out;
> @@ -957,6 +1035,9 @@ static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu)
>  {
>  	msm_gem_kernel_put(gmu->hfi.obj, gmu->aspace, false);
>  	msm_gem_kernel_put(gmu->debug.obj, gmu->aspace, false);
> +	msm_gem_kernel_put(gmu->icache.obj, gmu->aspace, false);
> +	msm_gem_kernel_put(gmu->dcache.obj, gmu->aspace, false);
> +	msm_gem_kernel_put(gmu->dummy.obj, gmu->aspace, false);
>  
>  	gmu->aspace->mmu->funcs->detach(gmu->aspace->mmu);
>  	msm_gem_address_space_put(gmu->aspace);
> @@ -974,12 +1055,14 @@ static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo,
>  	size = PAGE_ALIGN(size);
>  	if (!iova) {
>  		/* no fixed address - use GMU's uncached range */
> -		range_start = 0x60000000;
> +		range_start = 0x60000000 + PAGE_SIZE; /* skip dummy page */
>  		range_end = 0x80000000;
>  	} else {
>  		/* range for fixed address */
>  		range_start = iova;
>  		range_end = iova + size;
> +		/* use IOMMU_PRIV for icache/dcache */
> +		flags |= MSM_BO_MAP_PRIV;
>  	}
>  
>  	bo->obj = msm_gem_new(dev, size, flags);
> @@ -1320,7 +1403,27 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node)
>  	if (ret)
>  		goto err_put_device;
>  
> -	if (!adreno_is_a640(adreno_gpu) && !adreno_is_a650(adreno_gpu)) {
> +	/* Allocate memory for the GMU dummy page */
> +	ret = a6xx_gmu_memory_alloc(gmu, &gmu->dummy, SZ_4K, 0x60000000);
> +	if (ret)
> +		goto err_memory;
> +
> +	if (adreno_is_a650(adreno_gpu)) {
> +		ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache,
> +			SZ_16M - SZ_16K, 0x04000);
> +		if (ret)
> +			goto err_memory;
> +	} else if (adreno_is_a640(adreno_gpu)) {
> +		ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache,
> +			SZ_256K - SZ_16K, 0x04000);
> +		if (ret)
> +			goto err_memory;
> +
> +		ret = a6xx_gmu_memory_alloc(gmu, &gmu->dcache,
> +			SZ_256K - SZ_16K, 0x44000);
> +		if (ret)
> +			goto err_memory;
> +	} else {
>  		/* HFI v1, has sptprac */
>  		gmu->legacy = true;
>  
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> index 463e2d5f2bb9..526911010cb1 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
> @@ -57,6 +57,9 @@ struct a6xx_gmu {
>  
>  	struct a6xx_gmu_bo hfi;
>  	struct a6xx_gmu_bo debug;
> +	struct a6xx_gmu_bo icache;
> +	struct a6xx_gmu_bo dcache;
> +	struct a6xx_gmu_bo dummy;
>  
>  	int nr_clocks;
>  	struct clk_bulk_data *clocks;
> @@ -92,6 +95,14 @@ static inline void gmu_write(struct a6xx_gmu *gmu, u32 offset, u32 value)
>  	return msm_writel(value, gmu->mmio + (offset << 2));
>  }
>  
> +static inline void
> +gmu_write_bulk(struct a6xx_gmu *gmu, u32 offset, const u32 *data, u32 size)
> +{
> +	int i;
> +	for (i = 0; i < size; i++)
> +		gmu_write(gmu, offset + i, data[i]);
> +}

mempcy_toio() might be helpful here instead.

> +
>  static inline void gmu_rmw(struct a6xx_gmu *gmu, u32 reg, u32 mask, u32 or)
>  {
>  	u32 val = gmu_read(gmu, reg);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
> index 1cc1c135236b..eb2cd41dae6e 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
> @@ -101,6 +101,10 @@ static inline uint32_t A6XX_HFI_IRQ_OOB_MASK(uint32_t val)
>  
>  #define REG_A6XX_GMU_DCVS_RETURN				0x000023ff
>  
> +#define REG_A6XX_GMU_ICACHE_CONFIG				0x00004c00
> +
> +#define REG_A6XX_GMU_DCACHE_CONFIG				0x00004c01
> +
>  #define REG_A6XX_GMU_SYS_BUS_CONFIG				0x00004c0f
>  
>  #define REG_A6XX_GMU_CM3_SYSRESET				0x00005000
> @@ -199,6 +203,8 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val)
>  
>  #define REG_A6XX_GPU_GMU_CX_GMU_RPMH_POWER_STATE		0x000050ec
>  
> +#define REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF			0x000050f0
> +
>  #define REG_A6XX_GMU_BOOT_KMD_LM_HANDSHAKE			0x000051f0
>  
>  #define REG_A6XX_GMU_LLM_GLM_SLEEP_CTRL				0x00005157
> -- 
> 2.26.1
>
Ruhl, Michael J April 23, 2020, 5:36 p.m. UTC | #2
>-----Original Message-----
>From: Jonathan Marek <jonathan@marek.ca>
>Sent: Tuesday, April 21, 2020 7:41 PM
>To: freedreno@lists.freedesktop.org
>Cc: Rob Clark <robdclark@gmail.com>; Sean Paul <sean@poorly.run>; David
>Airlie <airlied@linux.ie>; Daniel Vetter <daniel@ffwll.ch>; Jordan Crouse
><jcrouse@codeaurora.org>; Sharat Masetty <smasetty@codeaurora.org>;
>Ruhl, Michael J <michael.j.ruhl@intel.com>; open list:DRM DRIVER FOR MSM
>ADRENO GPU <linux-arm-msm@vger.kernel.org>; open list:DRM DRIVER FOR
>MSM ADRENO GPU <dri-devel@lists.freedesktop.org>; open list <linux-
>kernel@vger.kernel.org>
>Subject: [PATCH v2 6/9] drm/msm/a6xx: A640/A650 GMU firmware path
>
>Newer GPUs have different GMU firmware path.
>
>Signed-off-by: Jonathan Marek <jonathan@marek.ca>
>---
> drivers/gpu/drm/msm/adreno/a6xx_gmu.c     | 135 +++++++++++++++++++-
>--
> drivers/gpu/drm/msm/adreno/a6xx_gmu.h     |  11 ++
> drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h |   6 +
> 3 files changed, 136 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>index b22a69e2f4b0..4aef5fe985d6 100644
>--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>@@ -571,6 +571,8 @@ static void a6xx_gmu_power_config(struct a6xx_gmu
>*gmu)
> {
> 	/* Disable GMU WB/RB buffer */
> 	gmu_write(gmu, REG_A6XX_GMU_SYS_BUS_CONFIG, 0x1);
>+	gmu_write(gmu, REG_A6XX_GMU_ICACHE_CONFIG, 0x1);
>+	gmu_write(gmu, REG_A6XX_GMU_DCACHE_CONFIG, 0x1);
>
> 	gmu_write(gmu, REG_A6XX_GMU_PWR_COL_INTER_FRAME_CTRL,
>0x9c40400);
>
>@@ -600,14 +602,91 @@ static void a6xx_gmu_power_config(struct
>a6xx_gmu *gmu)
> 		A6XX_GMU_RPMH_CTRL_GFX_VOTE_ENABLE);
> }
>
>+static int in_range(u32 addr, u32 start, u32 size)
>+{
>+	return addr >= start && addr < start + size;
>+}

Minor nit:

should this return a bool?

M
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index b22a69e2f4b0..4aef5fe985d6 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -571,6 +571,8 @@  static void a6xx_gmu_power_config(struct a6xx_gmu *gmu)
 {
 	/* Disable GMU WB/RB buffer */
 	gmu_write(gmu, REG_A6XX_GMU_SYS_BUS_CONFIG, 0x1);
+	gmu_write(gmu, REG_A6XX_GMU_ICACHE_CONFIG, 0x1);
+	gmu_write(gmu, REG_A6XX_GMU_DCACHE_CONFIG, 0x1);
 
 	gmu_write(gmu, REG_A6XX_GMU_PWR_COL_INTER_FRAME_CTRL, 0x9c40400);
 
@@ -600,14 +602,91 @@  static void a6xx_gmu_power_config(struct a6xx_gmu *gmu)
 		A6XX_GMU_RPMH_CTRL_GFX_VOTE_ENABLE);
 }
 
+static int in_range(u32 addr, u32 start, u32 size)
+{
+	return addr >= start && addr < start + size;
+}
+
+static int a6xx_gmu_fw_load(struct a6xx_gmu *gmu)
+{
+	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
+	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+	const struct firmware *fw_image = adreno_gpu->fw[ADRENO_FW_GMU];
+	const struct block_header {
+		u32 addr;
+		u32 size;
+		u32 type;
+		u32 value;
+		u32 data[];
+	} *blk;
+	struct a6xx_gmu_bo *mem;
+	u32 reg;
+
+	u32 itcm_base = 0x00000000;
+	u32 dtcm_base = 0x00040000;
+
+	if (adreno_is_a650(adreno_gpu))
+		dtcm_base = 0x10004000;
+
+	if (gmu->legacy) {
+		/* Sanity check the size of the firmware that was loaded */
+		if (fw_image->size > 0x8000) {
+			DRM_DEV_ERROR(gmu->dev,
+				"GMU firmware is bigger than the available region\n");
+			return -EINVAL;
+		}
+
+		gmu_write_bulk(gmu, REG_A6XX_GMU_CM3_ITCM_START,
+			       (u32*) fw_image->data, fw_image->size >> 2);
+		return 0;
+	}
+
+
+	for (blk = (const struct block_header *) fw_image->data;
+	     (const u8*) blk < fw_image->data + fw_image->size;
+	     blk = (const struct block_header *) &blk->data[blk->size >> 2]) {
+		if (blk->size == 0)
+			continue;
+
+		reg = 0;
+		mem = NULL;
+
+		if (in_range(blk->addr, itcm_base, SZ_16K))
+			reg = REG_A6XX_GMU_CM3_ITCM_START + ((blk->addr - itcm_base) >> 2);
+		else if (in_range(blk->addr, dtcm_base, SZ_16K))
+			reg = REG_A6XX_GMU_CM3_DTCM_START + ((blk->addr - dtcm_base) >> 2);
+		else if (in_range(blk->addr, gmu->icache.iova, gmu->icache.size))
+			mem = &gmu->icache;
+		else if (in_range(blk->addr, gmu->dcache.iova, gmu->dcache.size))
+			mem = &gmu->dcache;
+		else if (in_range(blk->addr, gmu->dummy.iova, gmu->dummy.size))
+			mem = &gmu->dummy;
+		else {
+			DRM_DEV_ERROR(gmu->dev,
+				"failed to match fw block (addr=%.8x size=%d data[0]=%.8x)\n",
+				blk->addr, blk->size, blk->data[0]);
+		}
+
+		if (reg)
+			gmu_write_bulk(gmu, reg, blk->data, blk->size >> 2);
+
+		if (mem)
+			memcpy(mem->virt + blk->addr - mem->iova, blk->data, blk->size);
+	}
+
+	return 0;
+}
+
 static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
 {
 	static bool rpmh_init;
 	struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
 	struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
-	int i, ret;
+	int ret;
 	u32 chipid;
-	u32 *image;
+
+	if (adreno_is_a650(adreno_gpu))
+		gmu_write(gmu, REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF, 1);
 
 	if (state == GMU_WARM_BOOT) {
 		ret = a6xx_rpmh_start(gmu);
@@ -618,13 +697,6 @@  static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
 			"GMU firmware is not loaded\n"))
 			return -ENOENT;
 
-		/* Sanity check the size of the firmware that was loaded */
-		if (adreno_gpu->fw[ADRENO_FW_GMU]->size > 0x8000) {
-			DRM_DEV_ERROR(gmu->dev,
-				"GMU firmware is bigger than the available region\n");
-			return -EINVAL;
-		}
-
 		/* Turn on register retention */
 		gmu_write(gmu, REG_A6XX_GMU_GENERAL_7, 1);
 
@@ -638,11 +710,9 @@  static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state)
 				return ret;
 		}
 
-		image = (u32 *) adreno_gpu->fw[ADRENO_FW_GMU]->data;
-
-		for (i = 0; i < adreno_gpu->fw[ADRENO_FW_GMU]->size >> 2; i++)
-			gmu_write(gmu, REG_A6XX_GMU_CM3_ITCM_START + i,
-				image[i]);
+		ret = a6xx_gmu_fw_load(gmu);
+		if (ret)
+			return ret;
 	}
 
 	gmu_write(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, 0);
@@ -769,12 +839,20 @@  int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu)
 	/* Enable the GMU interrupt */
 	gmu_write(gmu, REG_A6XX_GMU_AO_HOST_INTERRUPT_CLR, ~0);
 	gmu_write(gmu, REG_A6XX_GMU_AO_HOST_INTERRUPT_MASK, ~A6XX_GMU_IRQ_MASK);
+
 	enable_irq(gmu->gmu_irq);
 
 	/* Check to see if we are doing a cold or warm boot */
 	status = gmu_read(gmu, REG_A6XX_GMU_GENERAL_7) == 1 ?
 		GMU_WARM_BOOT : GMU_COLD_BOOT;
 
+	/*
+	 * Warm boot path does not work on newer GPUs
+	 * Presumably this is because icache/dcache regions must be restored
+	 */
+	if (!gmu->legacy)
+		status = GMU_COLD_BOOT;
+
 	ret = a6xx_gmu_fw_start(gmu, status);
 	if (ret)
 		goto out;
@@ -957,6 +1035,9 @@  static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu)
 {
 	msm_gem_kernel_put(gmu->hfi.obj, gmu->aspace, false);
 	msm_gem_kernel_put(gmu->debug.obj, gmu->aspace, false);
+	msm_gem_kernel_put(gmu->icache.obj, gmu->aspace, false);
+	msm_gem_kernel_put(gmu->dcache.obj, gmu->aspace, false);
+	msm_gem_kernel_put(gmu->dummy.obj, gmu->aspace, false);
 
 	gmu->aspace->mmu->funcs->detach(gmu->aspace->mmu);
 	msm_gem_address_space_put(gmu->aspace);
@@ -974,12 +1055,14 @@  static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo,
 	size = PAGE_ALIGN(size);
 	if (!iova) {
 		/* no fixed address - use GMU's uncached range */
-		range_start = 0x60000000;
+		range_start = 0x60000000 + PAGE_SIZE; /* skip dummy page */
 		range_end = 0x80000000;
 	} else {
 		/* range for fixed address */
 		range_start = iova;
 		range_end = iova + size;
+		/* use IOMMU_PRIV for icache/dcache */
+		flags |= MSM_BO_MAP_PRIV;
 	}
 
 	bo->obj = msm_gem_new(dev, size, flags);
@@ -1320,7 +1403,27 @@  int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node)
 	if (ret)
 		goto err_put_device;
 
-	if (!adreno_is_a640(adreno_gpu) && !adreno_is_a650(adreno_gpu)) {
+	/* Allocate memory for the GMU dummy page */
+	ret = a6xx_gmu_memory_alloc(gmu, &gmu->dummy, SZ_4K, 0x60000000);
+	if (ret)
+		goto err_memory;
+
+	if (adreno_is_a650(adreno_gpu)) {
+		ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache,
+			SZ_16M - SZ_16K, 0x04000);
+		if (ret)
+			goto err_memory;
+	} else if (adreno_is_a640(adreno_gpu)) {
+		ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache,
+			SZ_256K - SZ_16K, 0x04000);
+		if (ret)
+			goto err_memory;
+
+		ret = a6xx_gmu_memory_alloc(gmu, &gmu->dcache,
+			SZ_256K - SZ_16K, 0x44000);
+		if (ret)
+			goto err_memory;
+	} else {
 		/* HFI v1, has sptprac */
 		gmu->legacy = true;
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
index 463e2d5f2bb9..526911010cb1 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
@@ -57,6 +57,9 @@  struct a6xx_gmu {
 
 	struct a6xx_gmu_bo hfi;
 	struct a6xx_gmu_bo debug;
+	struct a6xx_gmu_bo icache;
+	struct a6xx_gmu_bo dcache;
+	struct a6xx_gmu_bo dummy;
 
 	int nr_clocks;
 	struct clk_bulk_data *clocks;
@@ -92,6 +95,14 @@  static inline void gmu_write(struct a6xx_gmu *gmu, u32 offset, u32 value)
 	return msm_writel(value, gmu->mmio + (offset << 2));
 }
 
+static inline void
+gmu_write_bulk(struct a6xx_gmu *gmu, u32 offset, const u32 *data, u32 size)
+{
+	int i;
+	for (i = 0; i < size; i++)
+		gmu_write(gmu, offset + i, data[i]);
+}
+
 static inline void gmu_rmw(struct a6xx_gmu *gmu, u32 reg, u32 mask, u32 or)
 {
 	u32 val = gmu_read(gmu, reg);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
index 1cc1c135236b..eb2cd41dae6e 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h
@@ -101,6 +101,10 @@  static inline uint32_t A6XX_HFI_IRQ_OOB_MASK(uint32_t val)
 
 #define REG_A6XX_GMU_DCVS_RETURN				0x000023ff
 
+#define REG_A6XX_GMU_ICACHE_CONFIG				0x00004c00
+
+#define REG_A6XX_GMU_DCACHE_CONFIG				0x00004c01
+
 #define REG_A6XX_GMU_SYS_BUS_CONFIG				0x00004c0f
 
 #define REG_A6XX_GMU_CM3_SYSRESET				0x00005000
@@ -199,6 +203,8 @@  static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val)
 
 #define REG_A6XX_GPU_GMU_CX_GMU_RPMH_POWER_STATE		0x000050ec
 
+#define REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF			0x000050f0
+
 #define REG_A6XX_GMU_BOOT_KMD_LM_HANDSHAKE			0x000051f0
 
 #define REG_A6XX_GMU_LLM_GLM_SLEEP_CTRL				0x00005157