[v2,8/8] drm/etnaviv: implement per-process address spaces on MMUv2

Message ID 20190705171727.27501-8-l.stach@pengutronix.de (mailing list archive)
State New, archived
Series [v2,1/8] drm/etnaviv: simplify unbind checks

Commit Message

Lucas Stach July 5, 2019, 5:17 p.m. UTC
This builds on top of the MMU contexts introduced earlier. Instead of having
one context per GPU core, each GPU client receives its own context.

On MMUv1 this still means a single shared pagetable set is used by all
clients, but on MMUv2 there is now a distinct set of pagetables for each
client. As the command fetch is also translated via the MMU on MMUv2, the
kernel command ringbuffer is mapped into each of the client pagetables.
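
In the patch this happens in etnaviv_iommu_context_init(): every newly
created context immediately maps the shared cmdbuf suballocator, so the
kernel ring is reachable from every set of pagetables. Condensed from the
etnaviv_mmu.c hunk below (error handling trimmed):

	if (global->version == ETNAVIV_IOMMU_V1)
		ctx = etnaviv_iommuv1_context_alloc(global);
	else
		ctx = etnaviv_iommuv2_context_alloc(global);
	if (!ctx)
		return NULL;

	/* map the shared kernel ringbuffer suballocator into this context */
	ret = etnaviv_cmdbuf_suballoc_map(suballoc, ctx, &ctx->cmdbuf_mapping,
					  global->memory_base);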

As the MMU context switch is a fairly heavy operation, due to the required
cache and TLB flushing, this patch implements lazy switching of the MMU
context. The kernel does not have its own MMU context, but reuses the last
client context for all of its operations. This has some visible impact, as
the GPU can now only be started once a client has submitted some work and
the client MMU context has been assigned. Also, the MMU context has a
different lifetime from the general client context, as the GPU might still
execute the kernel command buffer in the context of a client even after the
client has completed all GPU work and has been terminated. Only when the GPU
is runtime suspended or switches to another client's MMU context is the old
context freed up.
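
Condensed from the etnaviv_gpu.c and etnaviv_buffer.c hunks below, the lazy
switch amounts to the following (locking, event setup and the MMUv2
flush/PTA load commands elided):

	/* etnaviv_gpu_submit(): adopt the submitting client's context */
	if (!gpu->mmu) {
		/* first submit after init/suspend: adopt context, start FE */
		etnaviv_iommu_context_get(submit->mmu);
		gpu->mmu = submit->mmu;
		etnaviv_gpu_start_fe_idleloop(gpu);
	} else {
		/* keep the old context alive until the switch happened */
		etnaviv_iommu_context_get(gpu->mmu);
		submit->prev_mmu = gpu->mmu;
	}

	/* etnaviv_buffer_queue(): only flush and swap once the jump target
	 * into the old context has been calculated */
	if (gpu->mmu != mmu) {
		struct etnaviv_iommu_context *mmu_old = gpu->mmu;

		etnaviv_iommu_context_get(mmu);
		gpu->mmu = mmu;
		etnaviv_iommu_context_put(mmu_old);
	}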

Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/etnaviv/etnaviv_buffer.c     |  59 ++++++++---
 drivers/gpu/drm/etnaviv/etnaviv_drv.c        |  38 ++++++-
 drivers/gpu/drm/etnaviv/etnaviv_drv.h        |   6 +-
 drivers/gpu/drm/etnaviv/etnaviv_dump.c       |   4 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c        |   5 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.h        |   4 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c |  10 +-
 drivers/gpu/drm/etnaviv/etnaviv_gpu.c        | 100 ++++++++-----------
 drivers/gpu/drm/etnaviv/etnaviv_gpu.h        |   4 -
 drivers/gpu/drm/etnaviv/etnaviv_iommu.c      |  10 +-
 drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c   |  17 +++-
 drivers/gpu/drm/etnaviv/etnaviv_mmu.c        |  42 ++++++--
 drivers/gpu/drm/etnaviv/etnaviv_mmu.h        |  11 +-
 13 files changed, 199 insertions(+), 111 deletions(-)

Comments

Philipp Zabel July 30, 2019, 9:44 a.m. UTC | #1
On Fri, 2019-07-05 at 19:17 +0200, Lucas Stach wrote:
> This builds on top of the MMU contexts introduced earlier. Instead of having
> one context per GPU core, each GPU client receives its own context.
> 
> On MMUv1 this still means a single shared pagetable set is used by all
> clients, but on MMUv2 there is now a distinct set of pagetables for each
> client. As the command fetch is also translated via the MMU on MMUv2, the
> kernel command ringbuffer is mapped into each of the client pagetables.
> 
> As the MMU context switch is a fairly heavy operation, due to the required
> cache and TLB flushing, this patch implements lazy switching of the MMU
> context. The kernel does not have its own MMU context, but reuses the last
> client context for all of its operations. This has some visible impact, as
> the GPU can now only be started once a client has submitted some work and
> the client MMU context has been assigned. Also, the MMU context has a
> different lifetime from the general client context, as the GPU might still
> execute the kernel command buffer in the context of a client even after the
> client has completed all GPU work and has been terminated. Only when the GPU
> is runtime suspended or switches to another client's MMU context is the old
> context freed up.
> 
> Signed-off-by: Lucas Stach <l.stach@pengutronix.de>

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>

I just have two nitpicks below:

> ---
>  drivers/gpu/drm/etnaviv/etnaviv_buffer.c     |  59 ++++++++---
>  drivers/gpu/drm/etnaviv/etnaviv_drv.c        |  38 ++++++-
>  drivers/gpu/drm/etnaviv/etnaviv_drv.h        |   6 +-
>  drivers/gpu/drm/etnaviv/etnaviv_dump.c       |   4 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gem.c        |   5 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gem.h        |   4 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c |  10 +-
>  drivers/gpu/drm/etnaviv/etnaviv_gpu.c        | 100 ++++++++-----------
>  drivers/gpu/drm/etnaviv/etnaviv_gpu.h        |   4 -
>  drivers/gpu/drm/etnaviv/etnaviv_iommu.c      |  10 +-
>  drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c   |  17 +++-
>  drivers/gpu/drm/etnaviv/etnaviv_mmu.c        |  42 ++++++--
>  drivers/gpu/drm/etnaviv/etnaviv_mmu.h        |  11 +-
>  13 files changed, 199 insertions(+), 111 deletions(-)
> 
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
> index 022134238184..9bdebe045a31 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
[...]
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> index cf49f0e2e1cb..99c20094295c 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> @@ -290,6 +290,8 @@ static void etnaviv_iommu_context_free(struct kref *kref)
>  	struct etnaviv_iommu_context *context =
>  		container_of(kref, struct etnaviv_iommu_context, refcount);
>  
> +	etnaviv_cmdbuf_suballoc_unmap(context, &context->cmdbuf_mapping);
> +
>  	context->global->ops->free(context);
>  }
>  void etnaviv_iommu_context_put(struct etnaviv_iommu_context *context)
> @@ -298,12 +300,28 @@ void etnaviv_iommu_context_put(struct etnaviv_iommu_context *context)
>  }
>  
>  struct etnaviv_iommu_context *
> -etnaviv_iommu_context_init(struct etnaviv_iommu_global *global)
> +etnaviv_iommu_context_init(struct etnaviv_iommu_global *global,
> +			   struct etnaviv_cmdbuf_suballoc *suballoc)
>  {
> +	struct etnaviv_iommu_context *ctx;
> +	int ret;
> +
>  	if (global->version == ETNAVIV_IOMMU_V1)
> -		return etnaviv_iommuv1_context_alloc(global);
> +		ctx = etnaviv_iommuv1_context_alloc(global);
>  	else
> -		return etnaviv_iommuv2_context_alloc(global);
> +		ctx = etnaviv_iommuv2_context_alloc(global);
> +
> +	if (!ctx)
> +		return NULL;
> +
> +	ret = etnaviv_cmdbuf_suballoc_map(suballoc, ctx, &ctx->cmdbuf_mapping,
> +					  global->memory_base);
> +	if (ret) {
> +		etnaviv_iommu_context_put(ctx);

This will call etnaviv_cmdbuf_suballoc_unmap
in etnaviv_iommu_context_free above, even though
etnaviv_cmdbuf_suballoc_map didn't succeed. See below.

> +		return NULL;
> +	}
> +
> +	return ctx;
>  }
>  
>  void etnaviv_iommu_restore(struct etnaviv_gpu *gpu,
> @@ -319,6 +337,12 @@ int etnaviv_iommu_get_suballoc_va(struct etnaviv_iommu_context *context,
>  {
>  	mutex_lock(&context->lock);
>  
> +	if (mapping->use > 0) {
> +		mapping->use++;
> +		mutex_unlock(&context->lock);
> +		return 0;
> +	}
> +
>  	/*
>  	 * For MMUv1 we don't add the suballoc region to the pagetables, as
>  	 * those GPUs can only work with cmdbufs accessed through the linear
> @@ -341,7 +365,6 @@ int etnaviv_iommu_get_suballoc_va(struct etnaviv_iommu_context *context,
>  		mapping->iova = node->start;
>  		ret = etnaviv_context_map(context, node->start, paddr, size,
>  					  ETNAVIV_PROT_READ);
> -

Maybe squash this into "drm/etnaviv: split out cmdbuf mapping into
address space" instead.

>  		if (ret < 0) {
>  			drm_mm_remove_node(node);
>  			mutex_unlock(&context->lock);
> @@ -364,15 +387,14 @@ void etnaviv_iommu_put_suballoc_va(struct etnaviv_iommu_context *context,
>  {
>  	struct drm_mm_node *node = &mapping->vram_node;
>  
> -	if (!mapping->use)
> -		return;
> -
> -	mapping->use = 0;
> +	mutex_lock(&context->lock);
> +	mapping->use--;

See above, when called from the etnaviv_iommu_context_init error path,
mapping->use wraps from 0 to UINT_MAX ...

> -	if (context->global->version == ETNAVIV_IOMMU_V1)
> +	if (mapping->use > 0 || context->global->version == ETNAVIV_IOMMU_V1) {
> +		mutex_unlock(&context->lock);

... which is > 0, so we return here.

This works out, but it does look a bit weird.
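
For illustration, that error path boils down to the following (mapping->use
is an unsigned int, hence the wrap to UINT_MAX):

	unsigned int use = 0;	/* suballoc_map() failed, nothing mapped */

	use--;			/* wraps to UINT_MAX */
	if (use > 0)		/* true, so the early return is taken and */
		return;		/* the never-mapped region is not unmapped */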

regards
Philipp

Patch

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
index 022134238184..9bdebe045a31 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c
@@ -118,7 +118,7 @@  static void etnaviv_buffer_dump(struct etnaviv_gpu *gpu,
 	u32 *ptr = buf->vaddr + off;
 
 	dev_info(gpu->dev, "virt %p phys 0x%08x free 0x%08x\n",
-			ptr, etnaviv_cmdbuf_get_va(buf, &gpu->cmdbuf_mapping) +
+			ptr, etnaviv_cmdbuf_get_va(buf, &gpu->mmu->cmdbuf_mapping) +
 			off, size - len * 4 - off);
 
 	print_hex_dump(KERN_INFO, "cmd ", DUMP_PREFIX_OFFSET, 16, 4,
@@ -152,7 +152,7 @@  static u32 etnaviv_buffer_reserve(struct etnaviv_gpu *gpu,
 	if (buffer->user_size + cmd_dwords * sizeof(u64) > buffer->size)
 		buffer->user_size = 0;
 
-	return etnaviv_cmdbuf_get_va(buffer, &gpu->cmdbuf_mapping) +
+	return etnaviv_cmdbuf_get_va(buffer, &gpu->mmu->cmdbuf_mapping) +
 	       buffer->user_size;
 }
 
@@ -166,7 +166,7 @@  u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu)
 	buffer->user_size = 0;
 
 	CMD_WAIT(buffer);
-	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->cmdbuf_mapping)
+	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->mmu->cmdbuf_mapping)
 		 + buffer->user_size - 4);
 
 	return buffer->user_size / 8;
@@ -293,7 +293,7 @@  void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event)
 
 	/* Append waitlink */
 	CMD_WAIT(buffer);
-	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->cmdbuf_mapping)
+	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->mmu->cmdbuf_mapping)
 		 + buffer->user_size - 4);
 
 	/*
@@ -308,25 +308,27 @@  void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event)
 
 /* Append a command buffer to the ring buffer. */
 void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
-	unsigned int event, struct etnaviv_cmdbuf *cmdbuf)
+	struct etnaviv_iommu_context *mmu, unsigned int event, struct etnaviv_cmdbuf *cmdbuf)
 {
 	struct etnaviv_cmdbuf *buffer = &gpu->buffer;
 	unsigned int waitlink_offset = buffer->user_size - 16;
 	u32 return_target, return_dwords;
 	u32 link_target, link_dwords;
 	bool switch_context = gpu->exec_state != exec_state;
-	bool need_flush = gpu->flush_seq != gpu->mmu->flush_seq;
+	bool switch_mmu_context = gpu->mmu != mmu;
+	bool need_flush = switch_mmu_context ||
+			  gpu->flush_seq != gpu->mmu->flush_seq;
 
 	lockdep_assert_held(&gpu->lock);
 
 	if (drm_debug & DRM_UT_DRIVER)
 		etnaviv_buffer_dump(gpu, buffer, 0, 0x50);
 
-	link_target = etnaviv_cmdbuf_get_va(cmdbuf, &gpu->cmdbuf_mapping);
+	link_target = etnaviv_cmdbuf_get_va(cmdbuf, &gpu->mmu->cmdbuf_mapping);
 	link_dwords = cmdbuf->size / 8;
 
 	/*
-	 * If we need maintanence prior to submitting this buffer, we will
+	 * If we need maintenance prior to submitting this buffer, we will
 	 * need to append a mmu flush load state, followed by a new
 	 * link to this buffer - a total of four additional words.
 	 */
@@ -348,7 +350,24 @@  void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
 		if (switch_context)
 			extra_dwords += 4;
 
+		/* PTA load command */
+		if (switch_mmu_context && gpu->sec_mode == ETNA_SEC_KERNEL)
+			extra_dwords += 1;
+
 		target = etnaviv_buffer_reserve(gpu, buffer, extra_dwords);
+		/*
+		 * Switch MMU context if necessary. Must be done after the
+		 * link target has been calculated, as the jump forward in the
+		 * kernel ring still uses the last active MMU context before
+		 * the switch.
+		 */
+		if (switch_mmu_context) {
+			struct etnaviv_iommu_context *mmu_old = gpu->mmu;
+
+			etnaviv_iommu_context_get(mmu);
+			gpu->mmu = mmu;
+			etnaviv_iommu_context_put(mmu_old);
+		}
 
 		if (need_flush) {
 			/* Add the MMU flush */
@@ -360,10 +379,23 @@  void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
 					       VIVS_GL_FLUSH_MMU_FLUSH_PEMMU |
 					       VIVS_GL_FLUSH_MMU_FLUSH_UNK4);
 			} else {
+				u32 flush = VIVS_MMUv2_CONFIGURATION_MODE_MASK |
+					    VIVS_MMUv2_CONFIGURATION_FLUSH_FLUSH;
+
+				if (switch_mmu_context &&
+				    gpu->sec_mode == ETNA_SEC_KERNEL) {
+					unsigned short id =
+						etnaviv_iommuv2_get_pta_id(gpu->mmu);
+					CMD_LOAD_STATE(buffer,
+						VIVS_MMUv2_PTA_CONFIG,
+						VIVS_MMUv2_PTA_CONFIG_INDEX(id));
+				}
+
+				if (gpu->sec_mode == ETNA_SEC_NONE)
+					flush |= etnaviv_iommuv2_get_mtlb_addr(gpu->mmu);
+
 				CMD_LOAD_STATE(buffer, VIVS_MMUv2_CONFIGURATION,
-					VIVS_MMUv2_CONFIGURATION_MODE_MASK |
-					VIVS_MMUv2_CONFIGURATION_ADDRESS_MASK |
-					VIVS_MMUv2_CONFIGURATION_FLUSH_FLUSH);
+					       flush);
 				CMD_SEM(buffer, SYNC_RECIPIENT_FE,
 					SYNC_RECIPIENT_PE);
 				CMD_STALL(buffer, SYNC_RECIPIENT_FE,
@@ -379,6 +411,7 @@  void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
 		}
 
 		/* And the link to the submitted buffer */
+		link_target = etnaviv_cmdbuf_get_va(cmdbuf, &gpu->mmu->cmdbuf_mapping);
 		CMD_LINK(buffer, link_dwords, link_target);
 
 		/* Update the link target to point to above instructions */
@@ -415,13 +448,13 @@  void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
 	CMD_LOAD_STATE(buffer, VIVS_GL_EVENT, VIVS_GL_EVENT_EVENT_ID(event) |
 		       VIVS_GL_EVENT_FROM_PE);
 	CMD_WAIT(buffer);
-	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->cmdbuf_mapping)
+	CMD_LINK(buffer, 2, etnaviv_cmdbuf_get_va(buffer, &gpu->mmu->cmdbuf_mapping)
 		 + buffer->user_size - 4);
 
 	if (drm_debug & DRM_UT_DRIVER)
 		pr_info("stream link to 0x%08x @ 0x%08x %p\n",
 			return_target,
-			etnaviv_cmdbuf_get_va(cmdbuf, &gpu->cmdbuf_mapping),
+			etnaviv_cmdbuf_get_va(cmdbuf, &gpu->mmu->cmdbuf_mapping),
 			cmdbuf->vaddr);
 
 	if (drm_debug & DRM_UT_DRIVER) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
index 5fa3aa7bdbc5..8bcbd3fb02c6 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c
@@ -50,12 +50,19 @@  static int etnaviv_open(struct drm_device *dev, struct drm_file *file)
 {
 	struct etnaviv_drm_private *priv = dev->dev_private;
 	struct etnaviv_file_private *ctx;
-	int i;
+	int ret, i;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
 
+	ctx->mmu = etnaviv_iommu_context_init(priv->mmu_global,
+					      priv->cmdbuf_suballoc);
+	if (!ctx->mmu) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
 	for (i = 0; i < ETNA_MAX_PIPES; i++) {
 		struct etnaviv_gpu *gpu = priv->gpu[i];
 		struct drm_sched_rq *rq;
@@ -70,6 +77,10 @@  static int etnaviv_open(struct drm_device *dev, struct drm_file *file)
 	file->driver_priv = ctx;
 
 	return 0;
+
+out_free:
+	kfree(ctx);
+	return ret;
 }
 
 static void etnaviv_postclose(struct drm_device *dev, struct drm_file *file)
@@ -85,6 +96,8 @@  static void etnaviv_postclose(struct drm_device *dev, struct drm_file *file)
 			drm_sched_entity_destroy(&ctx->sched_entity[i]);
 	}
 
+	etnaviv_iommu_context_put(ctx->mmu);
+
 	kfree(ctx);
 }
 
@@ -116,12 +129,29 @@  static int etnaviv_mm_show(struct drm_device *dev, struct seq_file *m)
 static int etnaviv_mmu_show(struct etnaviv_gpu *gpu, struct seq_file *m)
 {
 	struct drm_printer p = drm_seq_file_printer(m);
+	struct etnaviv_iommu_context *mmu;
 
 	seq_printf(m, "Active Objects (%s):\n", dev_name(gpu->dev));
 
-	mutex_lock(&gpu->mmu->lock);
-	drm_mm_print(&gpu->mmu->mm, &p);
-	mutex_unlock(&gpu->mmu->lock);
+	/*
+	 * Lock the GPU to avoid a MMU context switch just now and elevate
+	 * the refcount of the current context to avoid it disappearing from
+	 * under our feet.
+	 */
+	mutex_lock(&gpu->lock);
+	mmu = gpu->mmu;
+	if (mmu)
+		etnaviv_iommu_context_get(mmu);
+	mutex_unlock(&gpu->lock);
+
+	if (!mmu)
+		return 0;
+
+	mutex_lock(&mmu->lock);
+	drm_mm_print(&mmu->mm, &p);
+	mutex_unlock(&mmu->lock);
+
+	etnaviv_iommu_context_put(mmu);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.h b/drivers/gpu/drm/etnaviv/etnaviv_drv.h
index 5f8db08f1c17..a488cfdb6bbf 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_drv.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.h
@@ -25,10 +25,7 @@  struct etnaviv_gem_submit;
 struct etnaviv_iommu_global;
 
 struct etnaviv_file_private {
-	/*
-	 * When per-context address spaces are supported we'd keep track of
-	 * the context's page-tables here.
-	 */
+	struct etnaviv_iommu_context	*mmu;
 	struct drm_sched_entity		sched_entity[ETNA_MAX_PIPES];
 };
 
@@ -75,6 +72,7 @@  u16 etnaviv_buffer_config_pta(struct etnaviv_gpu *gpu, unsigned short id);
 void etnaviv_buffer_end(struct etnaviv_gpu *gpu);
 void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event);
 void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state,
+	struct etnaviv_iommu_context *mmu,
 	unsigned int event, struct etnaviv_cmdbuf *cmdbuf);
 void etnaviv_validate_init(void);
 bool etnaviv_cmd_validate_one(struct etnaviv_gpu *gpu,
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 2c164480bcfc..8554e930a71a 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -174,12 +174,12 @@  void etnaviv_core_dump(struct etnaviv_gem_submit *submit)
 	etnaviv_core_dump_mem(&iter, ETDUMP_BUF_RING, gpu->buffer.vaddr,
 			      gpu->buffer.size,
 			      etnaviv_cmdbuf_get_va(&gpu->buffer,
-						    &gpu->cmdbuf_mapping));
+						    &gpu->mmu->cmdbuf_mapping));
 
 	etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
 			      submit->cmdbuf.vaddr, submit->cmdbuf.size,
 			      etnaviv_cmdbuf_get_va(&submit->cmdbuf,
-						    &gpu->cmdbuf_mapping));
+						    &gpu->mmu->cmdbuf_mapping));
 
 	/* Reserve space for the bomap */
 	if (n_bomap_pages) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 5fee0bb145c8..74680c0254b6 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -248,8 +248,7 @@  void etnaviv_gem_mapping_unreference(struct etnaviv_vram_mapping *mapping)
 }
 
 struct etnaviv_vram_mapping *etnaviv_gem_mapping_get(
-	struct drm_gem_object *obj, struct etnaviv_gpu *gpu,
-	struct etnaviv_iommu_context *mmu)
+	struct drm_gem_object *obj, struct etnaviv_iommu_context *mmu)
 {
 	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 	struct etnaviv_vram_mapping *mapping;
@@ -308,7 +307,7 @@  struct etnaviv_vram_mapping *etnaviv_gem_mapping_get(
 	mapping->context = mmu;
 	mapping->use = 1;
 
-	ret = etnaviv_iommu_map_gem(mmu, etnaviv_obj, gpu->memory_base,
+	ret = etnaviv_iommu_map_gem(mmu, etnaviv_obj, mmu->global->memory_base,
 				    mapping);
 	if (ret < 0)
 		kfree(mapping);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
index 36486254a3d3..175e6128c4bc 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h
@@ -93,6 +93,7 @@  struct etnaviv_gem_submit {
 	struct kref refcount;
 	struct etnaviv_file_private *ctx;
 	struct etnaviv_gpu *gpu;
+	struct etnaviv_iommu_context *mmu, *prev_mmu;
 	struct dma_fence *out_fence, *in_fence;
 	int out_fence_id;
 	struct list_head node; /* GPU active submit list */
@@ -119,8 +120,7 @@  struct page **etnaviv_gem_get_pages(struct etnaviv_gem_object *obj);
 void etnaviv_gem_put_pages(struct etnaviv_gem_object *obj);
 
 struct etnaviv_vram_mapping *etnaviv_gem_mapping_get(
-	struct drm_gem_object *obj, struct etnaviv_gpu *gpu,
-	struct etnaviv_iommu_context *mmu);
+	struct drm_gem_object *obj, struct etnaviv_iommu_context *mmu);
 void etnaviv_gem_mapping_unreference(struct etnaviv_vram_mapping *mapping);
 
 #endif /* __ETNAVIV_GEM_H__ */
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
index 267ff5863e5d..27a14a270a55 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
@@ -224,7 +224,7 @@  static int submit_pin_objects(struct etnaviv_gem_submit *submit)
 		struct etnaviv_vram_mapping *mapping;
 
 		mapping = etnaviv_gem_mapping_get(&etnaviv_obj->base,
-						  submit->gpu, submit->gpu->mmu);
+						  submit->mmu);
 		if (IS_ERR(mapping)) {
 			ret = PTR_ERR(mapping);
 			break;
@@ -361,6 +361,12 @@  static void submit_cleanup(struct kref *kref)
 	if (submit->cmdbuf.suballoc)
 		etnaviv_cmdbuf_free(&submit->cmdbuf);
 
+	if (submit->mmu)
+		etnaviv_iommu_context_put(submit->mmu);
+
+	if (submit->prev_mmu)
+		etnaviv_iommu_context_put(submit->prev_mmu);
+
 	for (i = 0; i < submit->nr_bos; i++) {
 		struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj;
 
@@ -502,6 +508,8 @@  int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 		goto err_submit_objects;
 
 	submit->ctx = file->driver_priv;
+	etnaviv_iommu_context_get(submit->ctx->mmu);
+	submit->mmu = submit->ctx->mmu;
 	submit->exec_state = args->exec_state;
 	submit->flags = args->flags;
 
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index b46d8207f6e6..a3d0e7adc8bf 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -602,7 +602,7 @@  void etnaviv_gpu_start_fe(struct etnaviv_gpu *gpu, u32 address, u16 prefetch)
 
 static void etnaviv_gpu_start_fe_idleloop(struct etnaviv_gpu *gpu)
 {
-	u32 address = etnaviv_cmdbuf_get_va(&gpu->buffer, &gpu->cmdbuf_mapping);
+	u32 address = etnaviv_cmdbuf_get_va(&gpu->buffer, &gpu->mmu->cmdbuf_mapping);
 	u16 prefetch;
 
 	/* setup the MMU */
@@ -693,8 +693,6 @@  static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu)
 	etnaviv_gpu_setup_pulse_eater(gpu);
 
 	gpu_write(gpu, VIVS_HI_INTR_ENBL, ~0U);
-
-	etnaviv_gpu_start_fe_idleloop(gpu);
 }
 
 int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
@@ -724,28 +722,6 @@  int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 		goto fail;
 	}
 
-	/*
-	 * Set the GPU linear window to be at the end of the DMA window, where
-	 * the CMA area is likely to reside. This ensures that we are able to
-	 * map the command buffers while having the linear window overlap as
-	 * much RAM as possible, so we can optimize mappings for other buffers.
-	 *
-	 * For 3D cores only do this if MC2.0 is present, as with MC1.0 it leads
-	 * to different views of the memory on the individual engines.
-	 */
-	if (!(gpu->identity.features & chipFeatures_PIPE_3D) ||
-	    (gpu->identity.minor_features0 & chipMinorFeatures0_MC20)) {
-		u32 dma_mask = (u32)dma_get_required_mask(gpu->dev);
-		if (dma_mask < PHYS_OFFSET + SZ_2G)
-			gpu->memory_base = PHYS_OFFSET;
-		else
-			gpu->memory_base = dma_mask - SZ_2G + 1;
-	} else if (PHYS_OFFSET >= SZ_2G) {
-		dev_info(gpu->dev, "Need to move linear window on MC1.0, disabling TS\n");
-		gpu->memory_base = PHYS_OFFSET;
-		gpu->identity.features &= ~chipFeatures_FAST_CLEAR;
-	}
-
 	/*
 	 * On cores with security features supported, we claim control over the
 	 * security states.
@@ -764,19 +740,26 @@  int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 	if (ret)
 		goto fail;
 
-	gpu->mmu = etnaviv_iommu_context_init(priv->mmu_global);
-	if (IS_ERR(gpu->mmu)) {
-		dev_err(gpu->dev, "Failed to instantiate GPU IOMMU\n");
-		ret = PTR_ERR(gpu->mmu);
-		goto iommu_global_fini;
-	}
-
-	ret = etnaviv_cmdbuf_suballoc_map(priv->cmdbuf_suballoc, gpu->mmu,
-					  &gpu->cmdbuf_mapping,
-					  gpu->memory_base);
-	if (ret) {
-		dev_err(gpu->dev, "failed to map cmdbuf suballoc\n");
-		goto destroy_iommu;
+	/*
+	 * Set the GPU linear window to be at the end of the DMA window, where
+	 * the CMA area is likely to reside. This ensures that we are able to
+	 * map the command buffers while having the linear window overlap as
+	 * much RAM as possible, so we can optimize mappings for other buffers.
+	 *
+	 * For 3D cores only do this if MC2.0 is present, as with MC1.0 it leads
+	 * to different views of the memory on the individual engines.
+	 */
+	if (!(gpu->identity.features & chipFeatures_PIPE_3D) ||
+	    (gpu->identity.minor_features0 & chipMinorFeatures0_MC20)) {
+		u32 dma_mask = (u32)dma_get_required_mask(gpu->dev);
+		if (dma_mask < PHYS_OFFSET + SZ_2G)
+			priv->mmu_global->memory_base = PHYS_OFFSET;
+		else
+			priv->mmu_global->memory_base = dma_mask - SZ_2G + 1;
+	} else if (PHYS_OFFSET >= SZ_2G) {
+		dev_info(gpu->dev, "Need to move linear window on MC1.0, disabling TS\n");
+		priv->mmu_global->memory_base = PHYS_OFFSET;
+		gpu->identity.features &= ~chipFeatures_FAST_CLEAR;
 	}
 
 	/* Create buffer: */
@@ -784,15 +767,7 @@  int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 				  PAGE_SIZE);
 	if (ret) {
 		dev_err(gpu->dev, "could not create command buffer\n");
-		goto unmap_suballoc;
-	}
-
-	if (!(gpu->identity.minor_features1 & chipMinorFeatures1_MMU_VERSION) &&
-	    etnaviv_cmdbuf_get_va(&gpu->buffer, &gpu->cmdbuf_mapping) > 0x80000000) {
-		ret = -EINVAL;
-		dev_err(gpu->dev,
-			"command buffer outside valid memory window\n");
-		goto free_buffer;
+		goto fail;
 	}
 
 	/* Setup event management */
@@ -815,14 +790,6 @@  int etnaviv_gpu_init(struct etnaviv_gpu *gpu)
 
 	return 0;
 
-free_buffer:
-	etnaviv_cmdbuf_free(&gpu->buffer);
-unmap_suballoc:
-	etnaviv_cmdbuf_suballoc_unmap(gpu->mmu, &gpu->cmdbuf_mapping);
-destroy_iommu:
-	etnaviv_iommu_context_put(gpu->mmu);
-iommu_global_fini:
-	etnaviv_iommu_global_fini(gpu);
 fail:
 	pm_runtime_mark_last_busy(gpu->dev);
 	pm_runtime_put_autosuspend(gpu->dev);
@@ -1016,6 +983,7 @@  void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
 
 	etnaviv_gpu_hw_init(gpu);
 	gpu->exec_state = -1;
+	gpu->mmu = NULL;
 
 	mutex_unlock(&gpu->lock);
 	pm_runtime_mark_last_busy(gpu->dev);
@@ -1322,6 +1290,15 @@  struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
 		goto out_unlock;
 	}
 
+	if (!gpu->mmu) {
+		etnaviv_iommu_context_get(submit->mmu);
+		gpu->mmu = submit->mmu;
+		etnaviv_gpu_start_fe_idleloop(gpu);
+	} else {
+		etnaviv_iommu_context_get(gpu->mmu);
+		submit->prev_mmu = gpu->mmu;
+	}
+
 	if (submit->nr_pmrs) {
 		gpu->event[event[1]].sync_point = &sync_point_perfmon_sample_pre;
 		kref_get(&submit->refcount);
@@ -1331,7 +1308,7 @@  struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
 
 	gpu->event[event[0]].fence = gpu_fence;
 	submit->cmdbuf.user_size = submit->cmdbuf.size - 8;
-	etnaviv_buffer_queue(gpu, submit->exec_state, event[0],
+	etnaviv_buffer_queue(gpu, submit->exec_state, submit->mmu, event[0],
 			     &submit->cmdbuf);
 
 	if (submit->nr_pmrs) {
@@ -1534,7 +1511,7 @@  int etnaviv_gpu_wait_idle(struct etnaviv_gpu *gpu, unsigned int timeout_ms)
 
 static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu)
 {
-	if (gpu->initialized) {
+	if (gpu->initialized && gpu->mmu) {
 		/* Replace the last WAIT with END */
 		mutex_lock(&gpu->lock);
 		etnaviv_buffer_end(gpu);
@@ -1546,8 +1523,13 @@  static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu)
 		 * we fail, just warn and continue.
 		 */
 		etnaviv_gpu_wait_idle(gpu, 100);
+
+		etnaviv_iommu_context_put(gpu->mmu);
+		gpu->mmu = NULL;
 	}
 
+	gpu->exec_state = -1;
+
 	return etnaviv_gpu_clk_disable(gpu);
 }
 
@@ -1563,8 +1545,6 @@  static int etnaviv_gpu_hw_resume(struct etnaviv_gpu *gpu)
 	etnaviv_gpu_update_clock(gpu);
 	etnaviv_gpu_hw_init(gpu);
 
-	gpu->exec_state = -1;
-
 	mutex_unlock(&gpu->lock);
 
 	return 0;
@@ -1695,8 +1675,6 @@  static void etnaviv_gpu_unbind(struct device *dev, struct device *master,
 
 	if (gpu->initialized) {
 		etnaviv_cmdbuf_free(&gpu->buffer);
-		etnaviv_cmdbuf_suballoc_unmap(gpu->mmu, &gpu->cmdbuf_mapping);
-		etnaviv_iommu_context_put(gpu->mmu);
 		etnaviv_iommu_global_fini(gpu);
 		gpu->initialized = false;
 	}
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
index 83e9256865a4..40e11b12df71 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h
@@ -103,13 +103,9 @@  struct etnaviv_gpu {
 	bool initialized;
 
 	/* 'ring'-buffer: */
-	struct etnaviv_vram_mapping cmdbuf_mapping;
 	struct etnaviv_cmdbuf buffer;
 	int exec_state;
 
-	/* bus base address of memory  */
-	u32 memory_base;
-
 	/* event management: */
 	DECLARE_BITMAP(event_bitmap, ETNA_NR_EVENTS);
 	struct etnaviv_event event[ETNA_NR_EVENTS];
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
index a2f1ff151822..aac8dbf3ea56 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
@@ -93,11 +93,11 @@  static void etnaviv_iommuv1_restore(struct etnaviv_gpu *gpu,
 	u32 pgtable;
 
 	/* set base addresses */
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_RA, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_FE, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_TX, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PEZ, gpu->memory_base);
-	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PE, gpu->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_RA, context->global->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_FE, context->global->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_TX, context->global->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PEZ, context->global->memory_base);
+	gpu_write(gpu, VIVS_MC_MEMORY_BASE_ADDR_PE, context->global->memory_base);
 
 	/* set page table address in MC */
 	pgtable = (u32)v1_context->pgtable_dma;
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
index 5ca2077c148d..043111a1d60c 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
@@ -206,7 +206,7 @@  static void etnaviv_iommuv2_restore_sec(struct etnaviv_gpu *gpu,
 		  VIVS_MMUv2_SAFE_ADDRESS_CONFIG_SEC_SAFE_ADDR_HIGH(
 		  upper_32_bits(context->global->bad_page_dma)));
 
-	context->global->v2.pta_cpu[0] = v2_context->mtlb_dma |
+	context->global->v2.pta_cpu[v2_context->id] = v2_context->mtlb_dma |
 				 	 VIVS_MMUv2_CONFIGURATION_MODE_MODE4_K;
 
 	/* trigger a PTA load through the FE */
@@ -218,6 +218,19 @@  static void etnaviv_iommuv2_restore_sec(struct etnaviv_gpu *gpu,
 	gpu_write(gpu, VIVS_MMUv2_SEC_CONTROL, VIVS_MMUv2_SEC_CONTROL_ENABLE);
 }
 
+u32 etnaviv_iommuv2_get_mtlb_addr(struct etnaviv_iommu_context *context)
+{
+	struct etnaviv_iommuv2_context *v2_context = to_v2_context(context);
+
+	return v2_context->mtlb_dma;
+}
+
+unsigned short etnaviv_iommuv2_get_pta_id(struct etnaviv_iommu_context *context)
+{
+	struct etnaviv_iommuv2_context *v2_context = to_v2_context(context);
+
+	return v2_context->id;
+}
 static void etnaviv_iommuv2_restore(struct etnaviv_gpu *gpu,
 				    struct etnaviv_iommu_context *context)
 {
@@ -272,6 +285,8 @@  etnaviv_iommuv2_context_alloc(struct etnaviv_iommu_global *global)
 	memset32(v2_context->mtlb_cpu, MMUv2_PTE_EXCEPTION,
 		 MMUv2_MAX_STLB_ENTRIES);
 
+	global->v2.pta_cpu[v2_context->id] = v2_context->mtlb_dma;
+
 	context = &v2_context->base;
 	context->global = global;
 	kref_init(&context->refcount);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
index cf49f0e2e1cb..99c20094295c 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
@@ -290,6 +290,8 @@  static void etnaviv_iommu_context_free(struct kref *kref)
 	struct etnaviv_iommu_context *context =
 		container_of(kref, struct etnaviv_iommu_context, refcount);
 
+	etnaviv_cmdbuf_suballoc_unmap(context, &context->cmdbuf_mapping);
+
 	context->global->ops->free(context);
 }
 void etnaviv_iommu_context_put(struct etnaviv_iommu_context *context)
@@ -298,12 +300,28 @@  void etnaviv_iommu_context_put(struct etnaviv_iommu_context *context)
 }
 
 struct etnaviv_iommu_context *
-etnaviv_iommu_context_init(struct etnaviv_iommu_global *global)
+etnaviv_iommu_context_init(struct etnaviv_iommu_global *global,
+			   struct etnaviv_cmdbuf_suballoc *suballoc)
 {
+	struct etnaviv_iommu_context *ctx;
+	int ret;
+
 	if (global->version == ETNAVIV_IOMMU_V1)
-		return etnaviv_iommuv1_context_alloc(global);
+		ctx = etnaviv_iommuv1_context_alloc(global);
 	else
-		return etnaviv_iommuv2_context_alloc(global);
+		ctx = etnaviv_iommuv2_context_alloc(global);
+
+	if (!ctx)
+		return NULL;
+
+	ret = etnaviv_cmdbuf_suballoc_map(suballoc, ctx, &ctx->cmdbuf_mapping,
+					  global->memory_base);
+	if (ret) {
+		etnaviv_iommu_context_put(ctx);
+		return NULL;
+	}
+
+	return ctx;
 }
 
 void etnaviv_iommu_restore(struct etnaviv_gpu *gpu,
@@ -319,6 +337,12 @@  int etnaviv_iommu_get_suballoc_va(struct etnaviv_iommu_context *context,
 {
 	mutex_lock(&context->lock);
 
+	if (mapping->use > 0) {
+		mapping->use++;
+		mutex_unlock(&context->lock);
+		return 0;
+	}
+
 	/*
 	 * For MMUv1 we don't add the suballoc region to the pagetables, as
 	 * those GPUs can only work with cmdbufs accessed through the linear
@@ -341,7 +365,6 @@  int etnaviv_iommu_get_suballoc_va(struct etnaviv_iommu_context *context,
 		mapping->iova = node->start;
 		ret = etnaviv_context_map(context, node->start, paddr, size,
 					  ETNAVIV_PROT_READ);
-
 		if (ret < 0) {
 			drm_mm_remove_node(node);
 			mutex_unlock(&context->lock);
@@ -364,15 +387,14 @@  void etnaviv_iommu_put_suballoc_va(struct etnaviv_iommu_context *context,
 {
 	struct drm_mm_node *node = &mapping->vram_node;
 
-	if (!mapping->use)
-		return;
-
-	mapping->use = 0;
+	mutex_lock(&context->lock);
+	mapping->use--;
 
-	if (context->global->version == ETNAVIV_IOMMU_V1)
+	if (mapping->use > 0 || context->global->version == ETNAVIV_IOMMU_V1) {
+		mutex_unlock(&context->lock);
 		return;
+	}
 
-	mutex_lock(&context->lock);
 	etnaviv_context_unmap(context, node->start, node->size);
 	drm_mm_remove_node(node);
 	mutex_unlock(&context->lock);
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h
index 4f1bebcd342c..3c219d306eab 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h
+++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h
@@ -44,6 +44,8 @@  struct etnaviv_iommu_global {
 	void *bad_page_cpu;
 	dma_addr_t bad_page_dma;
 
+	u32 memory_base;
+
 	/*
 	 * This union holds members needed by either MMUv1 or MMUv2, which
 	 * can not exist at the same time.
@@ -71,6 +73,9 @@  struct etnaviv_iommu_context {
 	struct list_head mappings;
 	struct drm_mm mm;
 	unsigned int flush_seq;
+
+	/* Not part of the context, but needs to have the same lifetime */
+	struct etnaviv_vram_mapping cmdbuf_mapping;
 };
 
 int etnaviv_iommu_global_init(struct etnaviv_gpu *gpu);
@@ -95,7 +100,8 @@  size_t etnaviv_iommu_dump_size(struct etnaviv_iommu_context *ctx);
 void etnaviv_iommu_dump(struct etnaviv_iommu_context *ctx, void *buf);
 
 struct etnaviv_iommu_context *
-etnaviv_iommu_context_init(struct etnaviv_iommu_global *global);
+etnaviv_iommu_context_init(struct etnaviv_iommu_global *global,
+			   struct etnaviv_cmdbuf_suballoc *suballoc);
 static inline void etnaviv_iommu_context_get(struct etnaviv_iommu_context *ctx)
 {
 	kref_get(&ctx->refcount);
@@ -109,4 +115,7 @@  etnaviv_iommuv1_context_alloc(struct etnaviv_iommu_global *global);
 struct etnaviv_iommu_context *
 etnaviv_iommuv2_context_alloc(struct etnaviv_iommu_global *global);
 
+u32 etnaviv_iommuv2_get_mtlb_addr(struct etnaviv_iommu_context *context);
+unsigned short etnaviv_iommuv2_get_pta_id(struct etnaviv_iommu_context *context);
+
 #endif /* __ETNAVIV_MMU_H__ */