diff mbox series

[v1,4/4] staging: media: tegra-vde: Defer dmabuf's unmapping

Message ID 20190602213712.26857-8-digetx@gmail.com (mailing list archive)
State New, archived
Headers show
Series NVIDIA Tegra Video Decoder driver improvements | expand

Commit Message

Dmitry Osipenko June 2, 2019, 9:37 p.m. UTC
Frequent IOMMU remappings take about 50% of CPU usage because there is
quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to
mitigate the mapping overhead which goes away completely and driver works
as fast as in a case of a disabled IOMMU. The case of a disabled IOMMU
should also benefit a tad from the caching since CPU cache maintenance
that happens on dmabuf's attaching takes some resources.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/staging/media/tegra-vde/Makefile      |   2 +-
 .../staging/media/tegra-vde/dmabuf-cache.c    | 223 ++++++++++++++++++
 drivers/staging/media/tegra-vde/iommu.c       |   2 -
 drivers/staging/media/tegra-vde/vde.c         | 143 +++--------
 drivers/staging/media/tegra-vde/vde.h         |  18 +-
 5 files changed, 273 insertions(+), 115 deletions(-)
 create mode 100644 drivers/staging/media/tegra-vde/dmabuf-cache.c

Comments

Hans Verkuil June 17, 2019, 1:33 p.m. UTC | #1
On 6/2/19 11:37 PM, Dmitry Osipenko wrote:
> Frequent IOMMU remappings take about 50% of CPU usage because there is
> quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to
> mitigate the mapping overhead which goes away completely and driver works
> as fast as in a case of a disabled IOMMU. The case of a disabled IOMMU
> should also benefit a tad from the caching since CPU cache maintenance
> that happens on dmabuf's attaching takes some resources.
> 
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>  drivers/staging/media/tegra-vde/Makefile      |   2 +-
>  .../staging/media/tegra-vde/dmabuf-cache.c    | 223 ++++++++++++++++++
>  drivers/staging/media/tegra-vde/iommu.c       |   2 -
>  drivers/staging/media/tegra-vde/vde.c         | 143 +++--------
>  drivers/staging/media/tegra-vde/vde.h         |  18 +-
>  5 files changed, 273 insertions(+), 115 deletions(-)
>  create mode 100644 drivers/staging/media/tegra-vde/dmabuf-cache.c
> 
> diff --git a/drivers/staging/media/tegra-vde/Makefile b/drivers/staging/media/tegra-vde/Makefile
> index c11867e28233..2827f7601de8 100644
> --- a/drivers/staging/media/tegra-vde/Makefile
> +++ b/drivers/staging/media/tegra-vde/Makefile
> @@ -1,3 +1,3 @@
>  # SPDX-License-Identifier: GPL-2.0
> -tegra-vde-y := vde.o iommu.o
> +tegra-vde-y := vde.o iommu.o dmabuf-cache.o
>  obj-$(CONFIG_TEGRA_VDE)	+= tegra-vde.o
> diff --git a/drivers/staging/media/tegra-vde/dmabuf-cache.c b/drivers/staging/media/tegra-vde/dmabuf-cache.c
> new file mode 100644
> index 000000000000..fcde8d1c37e7
> --- /dev/null
> +++ b/drivers/staging/media/tegra-vde/dmabuf-cache.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * NVIDIA Tegra Video decoder driver
> + *
> + * Copyright (C) 2016-2019 GRATE-DRIVER project
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/iova.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +
> +#include "vde.h"
> +
> +struct tegra_vde_cache_entry {
> +	enum dma_data_direction dma_dir;
> +	struct dma_buf_attachment *a;
> +	struct delayed_work dwork;
> +	struct tegra_vde *vde;
> +	struct list_head list;
> +	struct sg_table *sgt;
> +	struct iova *iova;
> +	unsigned int refcnt;
> +};
> +
> +static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry)
> +{
> +	struct dma_buf *dmabuf = entry->a->dmabuf;
> +
> +	WARN_ON_ONCE(entry->refcnt);
> +
> +	if (entry->vde->domain)
> +		tegra_vde_iommu_unmap(entry->vde, entry->iova);
> +
> +	dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir);
> +	dma_buf_detach(dmabuf, entry->a);
> +	dma_buf_put(dmabuf);
> +
> +	list_del(&entry->list);
> +	kfree(entry);
> +}
> +
> +static void tegra_vde_delayed_unmap(struct work_struct *work)
> +{
> +	struct tegra_vde_cache_entry *entry;
> +
> +	entry = container_of(work, struct tegra_vde_cache_entry,
> +			     dwork.work);
> +
> +	mutex_lock(&entry->vde->map_lock);
> +	tegra_vde_release_entry(entry);
> +	mutex_unlock(&entry->vde->map_lock);

From smatch:

drivers/staging/media/tegra-vde/dmabuf-cache.c:55 tegra_vde_delayed_unmap() error: dereferencing freed memory 'entry'

> +}
> +
> +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
> +			       struct dma_buf *dmabuf,
> +			       enum dma_data_direction dma_dir,
> +			       struct dma_buf_attachment **ap,
> +			       dma_addr_t *addrp)
> +{
> +	struct device *dev = vde->miscdev.parent;
> +	struct tegra_vde_cache_entry *entry;
> +	struct dma_buf_attachment *attachment;
> +	struct sg_table *sgt;
> +	struct iova *iova;
> +	int err;
> +
> +	mutex_lock(&vde->map_lock);
> +
> +	list_for_each_entry(entry, &vde->map_list, list) {
> +		if (entry->a->dmabuf != dmabuf)
> +			continue;
> +
> +		if (!cancel_delayed_work(&entry->dwork))
> +			continue;
> +
> +		if (entry->dma_dir != dma_dir)
> +			entry->dma_dir = DMA_BIDIRECTIONAL;
> +
> +		dma_buf_put(dmabuf);
> +
> +		if (vde->domain)
> +			*addrp = iova_dma_addr(&vde->iova, entry->iova);
> +		else
> +			*addrp = sg_dma_address(entry->sgt->sgl);
> +
> +		goto ref;
> +	}
> +
> +	attachment = dma_buf_attach(dmabuf, dev);
> +	if (IS_ERR(attachment)) {
> +		dev_err(dev, "Failed to attach dmabuf\n");
> +		err = PTR_ERR(attachment);
> +		goto err_unlock;
> +	}
> +
> +	sgt = dma_buf_map_attachment(attachment, dma_dir);
> +	if (IS_ERR(sgt)) {
> +		dev_err(dev, "Failed to get dmabufs sg_table\n");
> +		err = PTR_ERR(sgt);
> +		goto err_detach;
> +	}
> +
> +	if (!vde->domain && sgt->nents > 1) {
> +		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
> +		err = -EINVAL;
> +		goto err_unmap;
> +	}
> +
> +	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +	if (!entry) {
> +		err = -ENOMEM;
> +		goto err_unmap;
> +	}
> +
> +	if (vde->domain) {
> +		err = tegra_vde_iommu_map(vde, sgt, &iova, dmabuf->size);
> +		if (err)
> +			goto err_free;
> +
> +		*addrp = iova_dma_addr(&vde->iova, iova);
> +	} else {
> +		*addrp = sg_dma_address(sgt->sgl);
> +	}
> +
> +	INIT_DELAYED_WORK(&entry->dwork, tegra_vde_delayed_unmap);
> +	list_add(&entry->list, &vde->map_list);
> +
> +	entry->dma_dir = dma_dir;
> +	entry->iova = iova;

From smatch:

drivers/staging/media/tegra-vde/dmabuf-cache.c:133 tegra_vde_dmabuf_cache_map() error: uninitialized symbol 'iova'.

Regards,

	Hans

> +	entry->vde = vde;
> +	entry->sgt = sgt;
> +	entry->a = attachment;
> +ref:
> +	entry->refcnt++;
> +
> +	*ap = entry->a;
> +
> +	mutex_unlock(&vde->map_lock);
> +
> +	return 0;
> +
> +err_free:
> +	kfree(entry);
> +err_unmap:
> +	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
> +err_detach:
> +	dma_buf_detach(dmabuf, attachment);
> +err_unlock:
> +	mutex_unlock(&vde->map_lock);
> +
> +	return err;
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
> +				  struct dma_buf_attachment *a,
> +				  bool release)
> +{
> +	struct tegra_vde_cache_entry *entry;
> +
> +	mutex_lock(&vde->map_lock);
> +
> +	list_for_each_entry(entry, &vde->map_list, list) {
> +		if (entry->a != a)
> +			continue;
> +
> +		WARN_ON_ONCE(!entry->refcnt);
> +
> +		if (--entry->refcnt == 0) {
> +			if (release)
> +				tegra_vde_release_entry(entry);
> +			else
> +				schedule_delayed_work(&entry->dwork, 5 * HZ);
> +		}
> +		break;
> +	}
> +
> +	mutex_unlock(&vde->map_lock);
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde)
> +{
> +	struct tegra_vde_cache_entry *entry, *tmp;
> +
> +	mutex_lock(&vde->map_lock);
> +
> +	list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
> +		if (entry->refcnt)
> +			continue;
> +
> +		if (!cancel_delayed_work(&entry->dwork))
> +			continue;
> +
> +		tegra_vde_release_entry(entry);
> +	}
> +
> +	mutex_unlock(&vde->map_lock);
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde)
> +{
> +	struct tegra_vde_cache_entry *entry, *tmp;
> +
> +	mutex_lock(&vde->map_lock);
> +
> +	do {
> +		list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
> +			if (!cancel_delayed_work(&entry->dwork))
> +				continue;
> +
> +			tegra_vde_release_entry(entry);
> +		}
> +
> +		mutex_unlock(&vde->map_lock);
> +		schedule();
> +		mutex_lock(&vde->map_lock);
> +	} while (!list_empty(&vde->map_list));
> +
> +	mutex_unlock(&vde->map_lock);
> +}
> diff --git a/drivers/staging/media/tegra-vde/iommu.c b/drivers/staging/media/tegra-vde/iommu.c
> index 295c3d7cccd3..6d635332e0ec 100644
> --- a/drivers/staging/media/tegra-vde/iommu.c
> +++ b/drivers/staging/media/tegra-vde/iommu.c
> @@ -19,7 +19,6 @@
>  int tegra_vde_iommu_map(struct tegra_vde *vde,
>  			struct sg_table *sgt,
>  			struct iova **iovap,
> -			dma_addr_t *addrp,
>  			size_t size)
>  {
>  	struct iova *iova;
> @@ -45,7 +44,6 @@ int tegra_vde_iommu_map(struct tegra_vde *vde,
>  	}
>  
>  	*iovap = iova;
> -	*addrp = addr;
>  
>  	return 0;
>  }
> diff --git a/drivers/staging/media/tegra-vde/vde.c b/drivers/staging/media/tegra-vde/vde.c
> index cbcdbfef072d..3466daddf663 100644
> --- a/drivers/staging/media/tegra-vde/vde.c
> +++ b/drivers/staging/media/tegra-vde/vde.c
> @@ -11,6 +11,7 @@
>  #include <linux/genalloc.h>
>  #include <linux/interrupt.h>
>  #include <linux/iopoll.h>
> +#include <linux/list.h>
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
>  #include <linux/of_device.h>
> @@ -37,18 +38,10 @@
>  #define BSE_DMA_BUSY		BIT(23)
>  
>  struct video_frame {
> -	struct iova *y_iova;
> -	struct iova *cb_iova;
> -	struct iova *cr_iova;
> -	struct iova *aux_iova;
>  	struct dma_buf_attachment *y_dmabuf_attachment;
>  	struct dma_buf_attachment *cb_dmabuf_attachment;
>  	struct dma_buf_attachment *cr_dmabuf_attachment;
>  	struct dma_buf_attachment *aux_dmabuf_attachment;
> -	struct sg_table *y_sgt;
> -	struct sg_table *cb_sgt;
> -	struct sg_table *cr_sgt;
> -	struct sg_table *aux_sgt;
>  	dma_addr_t y_addr;
>  	dma_addr_t cb_addr;
>  	dma_addr_t cr_addr;
> @@ -494,22 +487,6 @@ static void tegra_vde_decode_frame(struct tegra_vde *vde,
>  			 vde->sxe, 0x00);
>  }
>  
> -static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde,
> -					    enum dma_data_direction dma_dir,
> -					    struct dma_buf_attachment *a,
> -					    struct sg_table *sgt,
> -					    struct iova *iova)
> -{
> -	struct dma_buf *dmabuf = a->dmabuf;
> -
> -	if (vde->domain)
> -		tegra_vde_iommu_unmap(vde, iova);
> -
> -	dma_buf_unmap_attachment(a, sgt, dma_dir);
> -	dma_buf_detach(dmabuf, a);
> -	dma_buf_put(dmabuf);
> -}
> -
>  static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
>  				   int fd,
>  				   unsigned long offset,
> @@ -517,15 +494,11 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
>  				   size_t align_size,
>  				   struct dma_buf_attachment **a,
>  				   dma_addr_t *addrp,
> -				   struct sg_table **s,
> -				   struct iova **iovap,
>  				   size_t *size,
>  				   enum dma_data_direction dma_dir)
>  {
>  	struct device *dev = vde->miscdev.parent;
> -	struct dma_buf_attachment *attachment;
>  	struct dma_buf *dmabuf;
> -	struct sg_table *sgt;
>  	int err;
>  
>  	dmabuf = dma_buf_get(fd);
> @@ -546,49 +519,17 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
>  		return -EINVAL;
>  	}
>  
> -	attachment = dma_buf_attach(dmabuf, dev);
> -	if (IS_ERR(attachment)) {
> -		dev_err(dev, "Failed to attach dmabuf\n");
> -		err = PTR_ERR(attachment);
> +	err = tegra_vde_dmabuf_cache_map(vde, dmabuf, dma_dir, a, addrp);
> +	if (err)
>  		goto err_put;
> -	}
> -
> -	sgt = dma_buf_map_attachment(attachment, dma_dir);
> -	if (IS_ERR(sgt)) {
> -		dev_err(dev, "Failed to get dmabufs sg_table\n");
> -		err = PTR_ERR(sgt);
> -		goto err_detach;
> -	}
> -
> -	if (!vde->domain && sgt->nents > 1) {
> -		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
> -		err = -EINVAL;
> -		goto err_unmap;
> -	}
> -
> -	if (vde->domain) {
> -		err = tegra_vde_iommu_map(vde, sgt, iovap, addrp, dmabuf->size);
> -		if (err) {
> -			dev_err(dev, "IOMMU mapping failed: %d\n", err);
> -			goto err_unmap;
> -		}
> -	} else {
> -		*addrp = sg_dma_address(sgt->sgl);
> -	}
>  
>  	*addrp = *addrp + offset;
> -	*a = attachment;
> -	*s = sgt;
>  
>  	if (size)
>  		*size = dmabuf->size - offset;
>  
>  	return 0;
>  
> -err_unmap:
> -	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
> -err_detach:
> -	dma_buf_detach(dmabuf, attachment);
>  err_put:
>  	dma_buf_put(dmabuf);
>  
> @@ -608,8 +549,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  				      src->y_offset, lsize, SZ_256,
>  				      &frame->y_dmabuf_attachment,
>  				      &frame->y_addr,
> -				      &frame->y_sgt,
> -				      &frame->y_iova,
>  				      NULL, dma_dir);
>  	if (err)
>  		return err;
> @@ -618,8 +557,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  				      src->cb_offset, csize, SZ_256,
>  				      &frame->cb_dmabuf_attachment,
>  				      &frame->cb_addr,
> -				      &frame->cb_sgt,
> -				      &frame->cb_iova,
>  				      NULL, dma_dir);
>  	if (err)
>  		goto err_release_y;
> @@ -628,8 +565,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  				      src->cr_offset, csize, SZ_256,
>  				      &frame->cr_dmabuf_attachment,
>  				      &frame->cr_addr,
> -				      &frame->cr_sgt,
> -				      &frame->cr_iova,
>  				      NULL, dma_dir);
>  	if (err)
>  		goto err_release_cb;
> @@ -643,8 +578,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  				      src->aux_offset, csize, SZ_256,
>  				      &frame->aux_dmabuf_attachment,
>  				      &frame->aux_addr,
> -				      &frame->aux_sgt,
> -				      &frame->aux_iova,
>  				      NULL, dma_dir);
>  	if (err)
>  		goto err_release_cr;
> @@ -652,20 +585,11 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  	return 0;
>  
>  err_release_cr:
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->cr_dmabuf_attachment,
> -					frame->cr_sgt,
> -					frame->cr_iova);
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, true);
>  err_release_cb:
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->cb_dmabuf_attachment,
> -					frame->cb_sgt,
> -					frame->cb_iova);
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, true);
>  err_release_y:
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->y_dmabuf_attachment,
> -					frame->y_sgt,
> -					frame->y_iova);
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, true);
>  
>  	return err;
>  }
> @@ -673,28 +597,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
>  static void tegra_vde_release_frame_dmabufs(struct tegra_vde *vde,
>  					    struct video_frame *frame,
>  					    enum dma_data_direction dma_dir,
> -					    bool baseline_profile)
> +					    bool baseline_profile,
> +					    bool release)
>  {
>  	if (!baseline_profile)
> -		tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -						frame->aux_dmabuf_attachment,
> -						frame->aux_sgt,
> -						frame->aux_iova);
> -
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->cr_dmabuf_attachment,
> -					frame->cr_sgt,
> -					frame->cr_iova);
> -
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->cb_dmabuf_attachment,
> -					frame->cb_sgt,
> -					frame->cb_iova);
> -
> -	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> -					frame->y_dmabuf_attachment,
> -					frame->y_sgt,
> -					frame->y_iova);
> +		tegra_vde_dmabuf_cache_unmap(vde, frame->aux_dmabuf_attachment,
> +					     release);
> +
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, release);
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, release);
> +	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, release);
>  }
>  
>  static int tegra_vde_validate_frame(struct device *dev,
> @@ -786,8 +698,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
>  	struct tegra_vde_h264_frame __user *frames_user;
>  	struct video_frame *dpb_frames;
>  	struct dma_buf_attachment *bitstream_data_dmabuf_attachment;
> -	struct sg_table *bitstream_sgt;
> -	struct iova *bitstream_iova;
>  	enum dma_data_direction dma_dir;
>  	dma_addr_t bitstream_data_addr;
>  	dma_addr_t bsev_ptr;
> @@ -812,8 +722,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
>  				      SZ_16K, SZ_16K,
>  				      &bitstream_data_dmabuf_attachment,
>  				      &bitstream_data_addr,
> -				      &bitstream_sgt,
> -				      &bitstream_iova,
>  				      &bitstream_data_size,
>  				      DMA_TO_DEVICE);
>  	if (ret)
> @@ -944,7 +852,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
>  		dma_dir = (i == 0) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>  
>  		tegra_vde_release_frame_dmabufs(vde, &dpb_frames[i], dma_dir,
> -						ctx.baseline_profile);
> +						ctx.baseline_profile, ret != 0);
>  	}
>  
>  free_dpb_frames:
> @@ -954,10 +862,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
>  	kfree(frames);
>  
>  release_bitstream_dmabuf:
> -	tegra_vde_detach_and_put_dmabuf(vde, DMA_TO_DEVICE,
> -					bitstream_data_dmabuf_attachment,
> -					bitstream_sgt,
> -					bitstream_iova);
> +	tegra_vde_dmabuf_cache_unmap(vde, bitstream_data_dmabuf_attachment,
> +				     ret != 0);
>  
>  	return ret;
>  }
> @@ -979,9 +885,21 @@ static long tegra_vde_unlocked_ioctl(struct file *filp,
>  	return -ENOTTY;
>  }
>  
> +static int tegra_vde_release_file(struct inode *inode, struct file *filp)
> +{
> +	struct miscdevice *miscdev = filp->private_data;
> +	struct tegra_vde *vde = container_of(miscdev, struct tegra_vde,
> +					     miscdev);
> +
> +	tegra_vde_dmabuf_cache_unmap_sync(vde);
> +
> +	return 0;
> +}
> +
>  static const struct file_operations tegra_vde_fops = {
>  	.owner		= THIS_MODULE,
>  	.unlocked_ioctl	= tegra_vde_unlocked_ioctl,
> +	.release	= tegra_vde_release_file,
>  };
>  
>  static irqreturn_t tegra_vde_isr(int irq, void *data)
> @@ -1159,6 +1077,8 @@ static int tegra_vde_probe(struct platform_device *pdev)
>  		return -ENOMEM;
>  	}
>  
> +	INIT_LIST_HEAD(&vde->map_list);
> +	mutex_init(&vde->map_lock);
>  	mutex_init(&vde->lock);
>  	init_completion(&vde->decode_completion);
>  
> @@ -1221,6 +1141,7 @@ static int tegra_vde_remove(struct platform_device *pdev)
>  
>  	misc_deregister(&vde->miscdev);
>  
> +	tegra_vde_dmabuf_cache_unmap_all(vde);
>  	tegra_vde_iommu_deinit(vde);
>  
>  	gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
> diff --git a/drivers/staging/media/tegra-vde/vde.h b/drivers/staging/media/tegra-vde/vde.h
> index 37414b7fdee1..e138878e8e14 100644
> --- a/drivers/staging/media/tegra-vde/vde.h
> +++ b/drivers/staging/media/tegra-vde/vde.h
> @@ -9,16 +9,20 @@
>  #define TEGRA_VDE_H
>  
>  #include <linux/completion.h>
> +#include <linux/dma-direction.h>
> +#include <linux/list.h>
>  #include <linux/miscdevice.h>
>  #include <linux/mutex.h>
>  #include <linux/types.h>
>  #include <linux/iova.h>
>  
>  struct clk;
> +struct dma_buf;
>  struct gen_pool;
>  struct iommu_group;
>  struct iommu_domain;
>  struct reset_control;
> +struct dma_buf_attachment;
>  
>  struct tegra_vde {
>  	void __iomem *sxe;
> @@ -31,6 +35,8 @@ struct tegra_vde {
>  	void __iomem *vdma;
>  	void __iomem *frameid;
>  	struct mutex lock;
> +	struct mutex map_lock;
> +	struct list_head map_list;
>  	struct miscdevice miscdev;
>  	struct reset_control *rst;
>  	struct reset_control *rst_mc;
> @@ -49,10 +55,20 @@ void tegra_vde_iommu_deinit(struct tegra_vde *vde);
>  int tegra_vde_iommu_map(struct tegra_vde *vde,
>  			struct sg_table *sgt,
>  			struct iova **iovap,
> -			dma_addr_t *addrp,
>  			size_t size);
>  void tegra_vde_iommu_unmap(struct tegra_vde *vde, struct iova *iova);
>  
> +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
> +			       struct dma_buf *dmabuf,
> +			       enum dma_data_direction dma_dir,
> +			       struct dma_buf_attachment **ap,
> +			       dma_addr_t *addrp);
> +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
> +				  struct dma_buf_attachment *a,
> +				  bool release);
> +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde);
> +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde);
> +
>  static __maybe_unused char const *
>  tegra_vde_reg_base_name(struct tegra_vde *vde, void __iomem *base)
>  {
>
Dmitry Osipenko June 17, 2019, 1:44 p.m. UTC | #2
17.06.2019 16:33, Hans Verkuil пишет:
> On 6/2/19 11:37 PM, Dmitry Osipenko wrote:
>> Frequent IOMMU remappings take about 50% of CPU usage because there is
>> quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to
>> mitigate the mapping overhead which goes away completely and driver works
>> as fast as in a case of a disabled IOMMU. The case of a disabled IOMMU
>> should also benefit a tad from the caching since CPU cache maintenance
>> that happens on dmabuf's attaching takes some resources.
>>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>  drivers/staging/media/tegra-vde/Makefile      |   2 +-
>>  .../staging/media/tegra-vde/dmabuf-cache.c    | 223 ++++++++++++++++++
>>  drivers/staging/media/tegra-vde/iommu.c       |   2 -
>>  drivers/staging/media/tegra-vde/vde.c         | 143 +++--------
>>  drivers/staging/media/tegra-vde/vde.h         |  18 +-
>>  5 files changed, 273 insertions(+), 115 deletions(-)
>>  create mode 100644 drivers/staging/media/tegra-vde/dmabuf-cache.c
>>
>> diff --git a/drivers/staging/media/tegra-vde/Makefile b/drivers/staging/media/tegra-vde/Makefile
>> index c11867e28233..2827f7601de8 100644
>> --- a/drivers/staging/media/tegra-vde/Makefile
>> +++ b/drivers/staging/media/tegra-vde/Makefile
>> @@ -1,3 +1,3 @@
>>  # SPDX-License-Identifier: GPL-2.0
>> -tegra-vde-y := vde.o iommu.o
>> +tegra-vde-y := vde.o iommu.o dmabuf-cache.o
>>  obj-$(CONFIG_TEGRA_VDE)	+= tegra-vde.o
>> diff --git a/drivers/staging/media/tegra-vde/dmabuf-cache.c b/drivers/staging/media/tegra-vde/dmabuf-cache.c
>> new file mode 100644
>> index 000000000000..fcde8d1c37e7
>> --- /dev/null
>> +++ b/drivers/staging/media/tegra-vde/dmabuf-cache.c
>> @@ -0,0 +1,223 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +/*
>> + * NVIDIA Tegra Video decoder driver
>> + *
>> + * Copyright (C) 2016-2019 GRATE-DRIVER project
>> + */
>> +
>> +#include <linux/dma-buf.h>
>> +#include <linux/iova.h>
>> +#include <linux/kernel.h>
>> +#include <linux/list.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/workqueue.h>
>> +
>> +#include "vde.h"
>> +
>> +struct tegra_vde_cache_entry {
>> +	enum dma_data_direction dma_dir;
>> +	struct dma_buf_attachment *a;
>> +	struct delayed_work dwork;
>> +	struct tegra_vde *vde;
>> +	struct list_head list;
>> +	struct sg_table *sgt;
>> +	struct iova *iova;
>> +	unsigned int refcnt;
>> +};
>> +
>> +static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry)
>> +{
>> +	struct dma_buf *dmabuf = entry->a->dmabuf;
>> +
>> +	WARN_ON_ONCE(entry->refcnt);
>> +
>> +	if (entry->vde->domain)
>> +		tegra_vde_iommu_unmap(entry->vde, entry->iova);
>> +
>> +	dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir);
>> +	dma_buf_detach(dmabuf, entry->a);
>> +	dma_buf_put(dmabuf);
>> +
>> +	list_del(&entry->list);
>> +	kfree(entry);
>> +}
>> +
>> +static void tegra_vde_delayed_unmap(struct work_struct *work)
>> +{
>> +	struct tegra_vde_cache_entry *entry;
>> +
>> +	entry = container_of(work, struct tegra_vde_cache_entry,
>> +			     dwork.work);
>> +
>> +	mutex_lock(&entry->vde->map_lock);
>> +	tegra_vde_release_entry(entry);
>> +	mutex_unlock(&entry->vde->map_lock);
> 
> From smatch:
> 
> drivers/staging/media/tegra-vde/dmabuf-cache.c:55 tegra_vde_delayed_unmap() error: dereferencing freed memory 'entry'

That's a very good catch, thanks you very much! I'm keep forgetting about smatch, it's
a useful tool. And unfortunately I can't KASAN the driver because ARM32 doesn't
support KASAN in upstream and Xorg hangs with the unofficial patch that adds support
for the KASAN.

[snip]

>> +	entry->dma_dir = dma_dir;
>> +	entry->iova = iova;
> 
> From smatch:
> 
> drivers/staging/media/tegra-vde/dmabuf-cache.c:133 tegra_vde_dmabuf_cache_map() error: uninitialized symbol 'iova'.

This is fine, but indeed won't hurt to explicitly initialize to NULL.

Thanks again!
diff mbox series

Patch

diff --git a/drivers/staging/media/tegra-vde/Makefile b/drivers/staging/media/tegra-vde/Makefile
index c11867e28233..2827f7601de8 100644
--- a/drivers/staging/media/tegra-vde/Makefile
+++ b/drivers/staging/media/tegra-vde/Makefile
@@ -1,3 +1,3 @@ 
 # SPDX-License-Identifier: GPL-2.0
-tegra-vde-y := vde.o iommu.o
+tegra-vde-y := vde.o iommu.o dmabuf-cache.o
 obj-$(CONFIG_TEGRA_VDE)	+= tegra-vde.o
diff --git a/drivers/staging/media/tegra-vde/dmabuf-cache.c b/drivers/staging/media/tegra-vde/dmabuf-cache.c
new file mode 100644
index 000000000000..fcde8d1c37e7
--- /dev/null
+++ b/drivers/staging/media/tegra-vde/dmabuf-cache.c
@@ -0,0 +1,223 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * NVIDIA Tegra Video decoder driver
+ *
+ * Copyright (C) 2016-2019 GRATE-DRIVER project
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/iova.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "vde.h"
+
+struct tegra_vde_cache_entry {
+	enum dma_data_direction dma_dir;
+	struct dma_buf_attachment *a;
+	struct delayed_work dwork;
+	struct tegra_vde *vde;
+	struct list_head list;
+	struct sg_table *sgt;
+	struct iova *iova;
+	unsigned int refcnt;
+};
+
+static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry)
+{
+	struct dma_buf *dmabuf = entry->a->dmabuf;
+
+	WARN_ON_ONCE(entry->refcnt);
+
+	if (entry->vde->domain)
+		tegra_vde_iommu_unmap(entry->vde, entry->iova);
+
+	dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir);
+	dma_buf_detach(dmabuf, entry->a);
+	dma_buf_put(dmabuf);
+
+	list_del(&entry->list);
+	kfree(entry);
+}
+
+static void tegra_vde_delayed_unmap(struct work_struct *work)
+{
+	struct tegra_vde_cache_entry *entry;
+
+	entry = container_of(work, struct tegra_vde_cache_entry,
+			     dwork.work);
+
+	mutex_lock(&entry->vde->map_lock);
+	tegra_vde_release_entry(entry);
+	mutex_unlock(&entry->vde->map_lock);
+}
+
+int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
+			       struct dma_buf *dmabuf,
+			       enum dma_data_direction dma_dir,
+			       struct dma_buf_attachment **ap,
+			       dma_addr_t *addrp)
+{
+	struct device *dev = vde->miscdev.parent;
+	struct tegra_vde_cache_entry *entry;
+	struct dma_buf_attachment *attachment;
+	struct sg_table *sgt;
+	struct iova *iova;
+	int err;
+
+	mutex_lock(&vde->map_lock);
+
+	list_for_each_entry(entry, &vde->map_list, list) {
+		if (entry->a->dmabuf != dmabuf)
+			continue;
+
+		if (!cancel_delayed_work(&entry->dwork))
+			continue;
+
+		if (entry->dma_dir != dma_dir)
+			entry->dma_dir = DMA_BIDIRECTIONAL;
+
+		dma_buf_put(dmabuf);
+
+		if (vde->domain)
+			*addrp = iova_dma_addr(&vde->iova, entry->iova);
+		else
+			*addrp = sg_dma_address(entry->sgt->sgl);
+
+		goto ref;
+	}
+
+	attachment = dma_buf_attach(dmabuf, dev);
+	if (IS_ERR(attachment)) {
+		dev_err(dev, "Failed to attach dmabuf\n");
+		err = PTR_ERR(attachment);
+		goto err_unlock;
+	}
+
+	sgt = dma_buf_map_attachment(attachment, dma_dir);
+	if (IS_ERR(sgt)) {
+		dev_err(dev, "Failed to get dmabufs sg_table\n");
+		err = PTR_ERR(sgt);
+		goto err_detach;
+	}
+
+	if (!vde->domain && sgt->nents > 1) {
+		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
+		err = -EINVAL;
+		goto err_unmap;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		err = -ENOMEM;
+		goto err_unmap;
+	}
+
+	if (vde->domain) {
+		err = tegra_vde_iommu_map(vde, sgt, &iova, dmabuf->size);
+		if (err)
+			goto err_free;
+
+		*addrp = iova_dma_addr(&vde->iova, iova);
+	} else {
+		*addrp = sg_dma_address(sgt->sgl);
+	}
+
+	INIT_DELAYED_WORK(&entry->dwork, tegra_vde_delayed_unmap);
+	list_add(&entry->list, &vde->map_list);
+
+	entry->dma_dir = dma_dir;
+	entry->iova = iova;
+	entry->vde = vde;
+	entry->sgt = sgt;
+	entry->a = attachment;
+ref:
+	entry->refcnt++;
+
+	*ap = entry->a;
+
+	mutex_unlock(&vde->map_lock);
+
+	return 0;
+
+err_free:
+	kfree(entry);
+err_unmap:
+	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
+err_detach:
+	dma_buf_detach(dmabuf, attachment);
+err_unlock:
+	mutex_unlock(&vde->map_lock);
+
+	return err;
+}
+
+void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
+				  struct dma_buf_attachment *a,
+				  bool release)
+{
+	struct tegra_vde_cache_entry *entry;
+
+	mutex_lock(&vde->map_lock);
+
+	list_for_each_entry(entry, &vde->map_list, list) {
+		if (entry->a != a)
+			continue;
+
+		WARN_ON_ONCE(!entry->refcnt);
+
+		if (--entry->refcnt == 0) {
+			if (release)
+				tegra_vde_release_entry(entry);
+			else
+				schedule_delayed_work(&entry->dwork, 5 * HZ);
+		}
+		break;
+	}
+
+	mutex_unlock(&vde->map_lock);
+}
+
+void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde)
+{
+	struct tegra_vde_cache_entry *entry, *tmp;
+
+	mutex_lock(&vde->map_lock);
+
+	list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
+		if (entry->refcnt)
+			continue;
+
+		if (!cancel_delayed_work(&entry->dwork))
+			continue;
+
+		tegra_vde_release_entry(entry);
+	}
+
+	mutex_unlock(&vde->map_lock);
+}
+
+void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde)
+{
+	struct tegra_vde_cache_entry *entry, *tmp;
+
+	mutex_lock(&vde->map_lock);
+
+	do {
+		list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
+			if (!cancel_delayed_work(&entry->dwork))
+				continue;
+
+			tegra_vde_release_entry(entry);
+		}
+
+		mutex_unlock(&vde->map_lock);
+		schedule();
+		mutex_lock(&vde->map_lock);
+	} while (!list_empty(&vde->map_list));
+
+	mutex_unlock(&vde->map_lock);
+}
diff --git a/drivers/staging/media/tegra-vde/iommu.c b/drivers/staging/media/tegra-vde/iommu.c
index 295c3d7cccd3..6d635332e0ec 100644
--- a/drivers/staging/media/tegra-vde/iommu.c
+++ b/drivers/staging/media/tegra-vde/iommu.c
@@ -19,7 +19,6 @@ 
 int tegra_vde_iommu_map(struct tegra_vde *vde,
 			struct sg_table *sgt,
 			struct iova **iovap,
-			dma_addr_t *addrp,
 			size_t size)
 {
 	struct iova *iova;
@@ -45,7 +44,6 @@  int tegra_vde_iommu_map(struct tegra_vde *vde,
 	}
 
 	*iovap = iova;
-	*addrp = addr;
 
 	return 0;
 }
diff --git a/drivers/staging/media/tegra-vde/vde.c b/drivers/staging/media/tegra-vde/vde.c
index cbcdbfef072d..3466daddf663 100644
--- a/drivers/staging/media/tegra-vde/vde.c
+++ b/drivers/staging/media/tegra-vde/vde.c
@@ -11,6 +11,7 @@ 
 #include <linux/genalloc.h>
 #include <linux/interrupt.h>
 #include <linux/iopoll.h>
+#include <linux/list.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
@@ -37,18 +38,10 @@ 
 #define BSE_DMA_BUSY		BIT(23)
 
 struct video_frame {
-	struct iova *y_iova;
-	struct iova *cb_iova;
-	struct iova *cr_iova;
-	struct iova *aux_iova;
 	struct dma_buf_attachment *y_dmabuf_attachment;
 	struct dma_buf_attachment *cb_dmabuf_attachment;
 	struct dma_buf_attachment *cr_dmabuf_attachment;
 	struct dma_buf_attachment *aux_dmabuf_attachment;
-	struct sg_table *y_sgt;
-	struct sg_table *cb_sgt;
-	struct sg_table *cr_sgt;
-	struct sg_table *aux_sgt;
 	dma_addr_t y_addr;
 	dma_addr_t cb_addr;
 	dma_addr_t cr_addr;
@@ -494,22 +487,6 @@  static void tegra_vde_decode_frame(struct tegra_vde *vde,
 			 vde->sxe, 0x00);
 }
 
-static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde,
-					    enum dma_data_direction dma_dir,
-					    struct dma_buf_attachment *a,
-					    struct sg_table *sgt,
-					    struct iova *iova)
-{
-	struct dma_buf *dmabuf = a->dmabuf;
-
-	if (vde->domain)
-		tegra_vde_iommu_unmap(vde, iova);
-
-	dma_buf_unmap_attachment(a, sgt, dma_dir);
-	dma_buf_detach(dmabuf, a);
-	dma_buf_put(dmabuf);
-}
-
 static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
 				   int fd,
 				   unsigned long offset,
@@ -517,15 +494,11 @@  static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
 				   size_t align_size,
 				   struct dma_buf_attachment **a,
 				   dma_addr_t *addrp,
-				   struct sg_table **s,
-				   struct iova **iovap,
 				   size_t *size,
 				   enum dma_data_direction dma_dir)
 {
 	struct device *dev = vde->miscdev.parent;
-	struct dma_buf_attachment *attachment;
 	struct dma_buf *dmabuf;
-	struct sg_table *sgt;
 	int err;
 
 	dmabuf = dma_buf_get(fd);
@@ -546,49 +519,17 @@  static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
 		return -EINVAL;
 	}
 
-	attachment = dma_buf_attach(dmabuf, dev);
-	if (IS_ERR(attachment)) {
-		dev_err(dev, "Failed to attach dmabuf\n");
-		err = PTR_ERR(attachment);
+	err = tegra_vde_dmabuf_cache_map(vde, dmabuf, dma_dir, a, addrp);
+	if (err)
 		goto err_put;
-	}
-
-	sgt = dma_buf_map_attachment(attachment, dma_dir);
-	if (IS_ERR(sgt)) {
-		dev_err(dev, "Failed to get dmabufs sg_table\n");
-		err = PTR_ERR(sgt);
-		goto err_detach;
-	}
-
-	if (!vde->domain && sgt->nents > 1) {
-		dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
-		err = -EINVAL;
-		goto err_unmap;
-	}
-
-	if (vde->domain) {
-		err = tegra_vde_iommu_map(vde, sgt, iovap, addrp, dmabuf->size);
-		if (err) {
-			dev_err(dev, "IOMMU mapping failed: %d\n", err);
-			goto err_unmap;
-		}
-	} else {
-		*addrp = sg_dma_address(sgt->sgl);
-	}
 
 	*addrp = *addrp + offset;
-	*a = attachment;
-	*s = sgt;
 
 	if (size)
 		*size = dmabuf->size - offset;
 
 	return 0;
 
-err_unmap:
-	dma_buf_unmap_attachment(attachment, sgt, dma_dir);
-err_detach:
-	dma_buf_detach(dmabuf, attachment);
 err_put:
 	dma_buf_put(dmabuf);
 
@@ -608,8 +549,6 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 				      src->y_offset, lsize, SZ_256,
 				      &frame->y_dmabuf_attachment,
 				      &frame->y_addr,
-				      &frame->y_sgt,
-				      &frame->y_iova,
 				      NULL, dma_dir);
 	if (err)
 		return err;
@@ -618,8 +557,6 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 				      src->cb_offset, csize, SZ_256,
 				      &frame->cb_dmabuf_attachment,
 				      &frame->cb_addr,
-				      &frame->cb_sgt,
-				      &frame->cb_iova,
 				      NULL, dma_dir);
 	if (err)
 		goto err_release_y;
@@ -628,8 +565,6 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 				      src->cr_offset, csize, SZ_256,
 				      &frame->cr_dmabuf_attachment,
 				      &frame->cr_addr,
-				      &frame->cr_sgt,
-				      &frame->cr_iova,
 				      NULL, dma_dir);
 	if (err)
 		goto err_release_cb;
@@ -643,8 +578,6 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 				      src->aux_offset, csize, SZ_256,
 				      &frame->aux_dmabuf_attachment,
 				      &frame->aux_addr,
-				      &frame->aux_sgt,
-				      &frame->aux_iova,
 				      NULL, dma_dir);
 	if (err)
 		goto err_release_cr;
@@ -652,20 +585,11 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 	return 0;
 
 err_release_cr:
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->cr_dmabuf_attachment,
-					frame->cr_sgt,
-					frame->cr_iova);
+	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, true);
 err_release_cb:
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->cb_dmabuf_attachment,
-					frame->cb_sgt,
-					frame->cb_iova);
+	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, true);
 err_release_y:
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->y_dmabuf_attachment,
-					frame->y_sgt,
-					frame->y_iova);
+	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, true);
 
 	return err;
 }
@@ -673,28 +597,16 @@  static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
 static void tegra_vde_release_frame_dmabufs(struct tegra_vde *vde,
 					    struct video_frame *frame,
 					    enum dma_data_direction dma_dir,
-					    bool baseline_profile)
+					    bool baseline_profile,
+					    bool release)
 {
 	if (!baseline_profile)
-		tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-						frame->aux_dmabuf_attachment,
-						frame->aux_sgt,
-						frame->aux_iova);
-
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->cr_dmabuf_attachment,
-					frame->cr_sgt,
-					frame->cr_iova);
-
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->cb_dmabuf_attachment,
-					frame->cb_sgt,
-					frame->cb_iova);
-
-	tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
-					frame->y_dmabuf_attachment,
-					frame->y_sgt,
-					frame->y_iova);
+		tegra_vde_dmabuf_cache_unmap(vde, frame->aux_dmabuf_attachment,
+					     release);
+
+	tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, release);
+	tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, release);
+	tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, release);
 }
 
 static int tegra_vde_validate_frame(struct device *dev,
@@ -786,8 +698,6 @@  static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
 	struct tegra_vde_h264_frame __user *frames_user;
 	struct video_frame *dpb_frames;
 	struct dma_buf_attachment *bitstream_data_dmabuf_attachment;
-	struct sg_table *bitstream_sgt;
-	struct iova *bitstream_iova;
 	enum dma_data_direction dma_dir;
 	dma_addr_t bitstream_data_addr;
 	dma_addr_t bsev_ptr;
@@ -812,8 +722,6 @@  static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
 				      SZ_16K, SZ_16K,
 				      &bitstream_data_dmabuf_attachment,
 				      &bitstream_data_addr,
-				      &bitstream_sgt,
-				      &bitstream_iova,
 				      &bitstream_data_size,
 				      DMA_TO_DEVICE);
 	if (ret)
@@ -944,7 +852,7 @@  static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
 		dma_dir = (i == 0) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 
 		tegra_vde_release_frame_dmabufs(vde, &dpb_frames[i], dma_dir,
-						ctx.baseline_profile);
+						ctx.baseline_profile, ret != 0);
 	}
 
 free_dpb_frames:
@@ -954,10 +862,8 @@  static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
 	kfree(frames);
 
 release_bitstream_dmabuf:
-	tegra_vde_detach_and_put_dmabuf(vde, DMA_TO_DEVICE,
-					bitstream_data_dmabuf_attachment,
-					bitstream_sgt,
-					bitstream_iova);
+	tegra_vde_dmabuf_cache_unmap(vde, bitstream_data_dmabuf_attachment,
+				     ret != 0);
 
 	return ret;
 }
@@ -979,9 +885,21 @@  static long tegra_vde_unlocked_ioctl(struct file *filp,
 	return -ENOTTY;
 }
 
+static int tegra_vde_release_file(struct inode *inode, struct file *filp)
+{
+	struct miscdevice *miscdev = filp->private_data;
+	struct tegra_vde *vde = container_of(miscdev, struct tegra_vde,
+					     miscdev);
+
+	tegra_vde_dmabuf_cache_unmap_sync(vde);
+
+	return 0;
+}
+
 static const struct file_operations tegra_vde_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= tegra_vde_unlocked_ioctl,
+	.release	= tegra_vde_release_file,
 };
 
 static irqreturn_t tegra_vde_isr(int irq, void *data)
@@ -1159,6 +1077,8 @@  static int tegra_vde_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
+	INIT_LIST_HEAD(&vde->map_list);
+	mutex_init(&vde->map_lock);
 	mutex_init(&vde->lock);
 	init_completion(&vde->decode_completion);
 
@@ -1221,6 +1141,7 @@  static int tegra_vde_remove(struct platform_device *pdev)
 
 	misc_deregister(&vde->miscdev);
 
+	tegra_vde_dmabuf_cache_unmap_all(vde);
 	tegra_vde_iommu_deinit(vde);
 
 	gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
diff --git a/drivers/staging/media/tegra-vde/vde.h b/drivers/staging/media/tegra-vde/vde.h
index 37414b7fdee1..e138878e8e14 100644
--- a/drivers/staging/media/tegra-vde/vde.h
+++ b/drivers/staging/media/tegra-vde/vde.h
@@ -9,16 +9,20 @@ 
 #define TEGRA_VDE_H
 
 #include <linux/completion.h>
+#include <linux/dma-direction.h>
+#include <linux/list.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
 #include <linux/iova.h>
 
 struct clk;
+struct dma_buf;
 struct gen_pool;
 struct iommu_group;
 struct iommu_domain;
 struct reset_control;
+struct dma_buf_attachment;
 
 struct tegra_vde {
 	void __iomem *sxe;
@@ -31,6 +35,8 @@  struct tegra_vde {
 	void __iomem *vdma;
 	void __iomem *frameid;
 	struct mutex lock;
+	struct mutex map_lock;
+	struct list_head map_list;
 	struct miscdevice miscdev;
 	struct reset_control *rst;
 	struct reset_control *rst_mc;
@@ -49,10 +55,20 @@  void tegra_vde_iommu_deinit(struct tegra_vde *vde);
 int tegra_vde_iommu_map(struct tegra_vde *vde,
 			struct sg_table *sgt,
 			struct iova **iovap,
-			dma_addr_t *addrp,
 			size_t size);
 void tegra_vde_iommu_unmap(struct tegra_vde *vde, struct iova *iova);
 
+int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
+			       struct dma_buf *dmabuf,
+			       enum dma_data_direction dma_dir,
+			       struct dma_buf_attachment **ap,
+			       dma_addr_t *addrp);
+void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
+				  struct dma_buf_attachment *a,
+				  bool release);
+void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde);
+void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde);
+
 static __maybe_unused char const *
 tegra_vde_reg_base_name(struct tegra_vde *vde, void __iomem *base)
 {