
[v4] dma-buf: Add ioctls to allow userspace to flush

Message ID 1440547375-15046-1-git-send-email-tiago.vignatti@intel.com (mailing list archive)
State New, archived

Commit Message

Tiago Vignatti Aug. 26, 2015, 12:02 a.m. UTC
From: Daniel Vetter <daniel.vetter@ffwll.ch>

Userspace might need some sort of cache coherency management, e.g. when CPU
and GPU domains are being accessed through dma-buf at the same time. To
circumvent this problem there are begin/end coherency markers that forward
directly to the existing dma-buf device drivers' vfunc hooks. Userspace can
make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
would be used as follows:

  - mmap dma-buf fd
  - for each drawing/upload cycle in CPU
    1. SYNC_START ioctl
    2. read/write to mmap area or a 2d sub-region of it
    3. SYNC_END ioctl.
  - munmap once you don't need the buffer any more

v2 (Tiago): Fix header file type names (u64 -> __u64)
v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags in the begin/end
dma-buf functions. Check for overflows in start/length.
v4 (Tiago): use 2d regions for sync.

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Signed-off-by: Tiago Vignatti <tiago.vignatti@intel.com>
---

I'm unable to test the 2d sync properly, because the begin/end access hooks
in i915 don't track the mapped range at all.

 Documentation/dma-buf-sharing.txt      | 13 ++++++
 drivers/dma-buf/dma-buf.c              | 77 ++++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_gem_dmabuf.c |  6 ++-
 include/linux/dma-buf.h                | 20 +++++----
 include/uapi/linux/dma-buf.h           | 57 +++++++++++++++++++++++++
 5 files changed, 150 insertions(+), 23 deletions(-)
 create mode 100644 include/uapi/linux/dma-buf.h
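
For reference, the sequence above would look roughly like this from userspace.
This is only a sketch against the uapi header added by this patch; the buffer
size, stride and region values are made up for illustration, error handling is
trimmed, and (as the review below notes) the posted kernel side does not yet
copy the trailing regions:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/dma-buf.h>	/* header added by this patch */

static void draw_subregion(int dmabuf_fd, size_t size)
{
	struct dma_buf_sync *sync;
	void *map;

	map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   dmabuf_fd, 0);

	/* room for one trailing region; regions[] is a flexible array */
	sync = calloc(1, sizeof(*sync) + sizeof(sync->regions[0]));
	sync->stride_bytes = 1024 * 4;	/* e.g. 1024 px at 32 bpp */
	sync->bytes_per_pixel = 4;
	sync->num_regions = 1;
	sync->regions[0] = (struct dma_buf_sync_region)
		{ .x = 0, .y = 0, .width = 64, .height = 64 };

	sync->flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
	ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, sync);

	/* ... write pixels into the mapped 64x64 sub-region ... */

	sync->flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
	ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, sync);

	free(sync);
	munmap(map, size);
}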

Comments

Thomas Hellstrom Aug. 26, 2015, 6:49 a.m. UTC | #1
Hi, Tiago.

On 08/26/2015 02:02 AM, Tiago Vignatti wrote:
> From: Daniel Vetter <daniel.vetter@ffwll.ch>
>
> Userspace might need some sort of cache coherency management, e.g. when CPU
> and GPU domains are being accessed through dma-buf at the same time. To
> circumvent this problem there are begin/end coherency markers that forward
> directly to the existing dma-buf device drivers' vfunc hooks. Userspace can
> make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
> would be used as follows:
>
>   - mmap dma-buf fd
>   - for each drawing/upload cycle in CPU
>     1. SYNC_START ioctl
>     2. read/write to mmap area or a 2d sub-region of it
>     3. SYNC_END ioctl.
>   - munmap once you don't need the buffer any more
>
> v2 (Tiago): Fix header file type names (u64 -> __u64)
> v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags to the begin/end
> dma-buf functions. Check for overflows in start/length.
> v4 (Tiago): use 2d regions for sync.

Daniel V had issues with the sync argument proposed by Daniel S. I'm
fine with that argument or an argument containing only a single sync
rect. I'm not sure whether Daniel V will find it easier to accept only a
single sync rect...

Also please see below.

>
> Cc: Sumit Semwal <sumit.semwal@linaro.org>
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> Signed-off-by: Tiago Vignatti <tiago.vignatti@intel.com>
> ---
>
> I'm unable to test the 2d sync properly, because the begin/end access hooks
> in i915 don't track the mapped range at all.
>
>  Documentation/dma-buf-sharing.txt      | 13 ++++++
>  drivers/dma-buf/dma-buf.c              | 77 ++++++++++++++++++++++++++++------
>  drivers/gpu/drm/i915/i915_gem_dmabuf.c |  6 ++-
>  include/linux/dma-buf.h                | 20 +++++----
>  include/uapi/linux/dma-buf.h           | 57 +++++++++++++++++++++++++
>  5 files changed, 150 insertions(+), 23 deletions(-)
>  create mode 100644 include/uapi/linux/dma-buf.h
>
> diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
> index 480c8de..8061ac0 100644
> --- a/Documentation/dma-buf-sharing.txt
> +++ b/Documentation/dma-buf-sharing.txt
> @@ -355,6 +355,19 @@ Being able to mmap an export dma-buf buffer object has 2 main use-cases:
>  
>     No special interfaces, userspace simply calls mmap on the dma-buf fd.
>  
> +   Also, userspace might need some sort of cache coherency management, e.g.
> +   when CPU and GPU domains are being accessed through dma-buf at the same
> +   time. To circumvent this problem there are begin/end coherency markers that
> +   forward directly to the existing dma-buf device drivers' vfunc hooks.
> +   Userspace can make use of those markers through the DMA_BUF_IOCTL_SYNC
> +   ioctl. The sequence would be used as follows:
> +     - mmap dma-buf fd
> +     - for each drawing/upload cycle in CPU
> +       1. SYNC_START ioctl
> +       2. read/write to mmap area or a 2d sub-region of it
> +       3. SYNC_END ioctl.
> +     - munmap once you don't need the buffer any more
> +
>  2. Supporting existing mmap interfaces in importers
>  
>     Similar to the motivation for kernel cpu access it is again important that
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index 155c146..b6a4a06 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -251,11 +251,55 @@ out:
>  	return events;
>  }
>  
> +static long dma_buf_ioctl(struct file *file,
> +			  unsigned int cmd, unsigned long arg)
> +{
> +	struct dma_buf *dmabuf;
> +	struct dma_buf_sync sync;
> +	enum dma_data_direction direction;
> +
> +	dmabuf = file->private_data;
> +
> +	if (!is_dma_buf_file(file))
> +		return -EINVAL;
> +
> +	if (cmd != DMA_BUF_IOCTL_SYNC)
> +		return -ENOTTY;
> +
> +	if (copy_from_user(&sync, (void __user *) arg, sizeof(sync)))
> +		return -EFAULT;

Do we actually copy all sync regions here?
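
Apparently not: sizeof(sync) covers only the fixed header, so the flexible
regions[] array is never copied from userspace. Pulling the regions in would
need a second copy, sketched below with a made-up sanity bound on num_regions:

	struct dma_buf_sync_region *regions;

	if (sync.num_regions > 4096)	/* arbitrary sanity bound */
		return -EINVAL;

	regions = memdup_user((void __user *)arg + sizeof(sync),
			      sync.num_regions * sizeof(*regions));
	if (IS_ERR(regions))
		return PTR_ERR(regions);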

> +
> +	if (sync.flags & DMA_BUF_SYNC_RW)
> +		direction = DMA_BIDIRECTIONAL;
> +	else if (sync.flags & DMA_BUF_SYNC_READ)
> +		direction = DMA_FROM_DEVICE;
> +	else if (sync.flags & DMA_BUF_SYNC_WRITE)
> +		direction = DMA_TO_DEVICE;
> +	else
> +		return -EINVAL;
> +
> +	if (sync.flags & ~DMA_BUF_SYNC_VALID_FLAGS_MASK)
> +		return -EINVAL;
> +
> +	/* TODO: check for overflowing the buffer's size - how so, checking region by
> +	 * region here? Maybe need to check for the other parameters as well. */
> +
> +	if (sync.flags & DMA_BUF_SYNC_END)
> +		dma_buf_end_cpu_access(dmabuf, sync.stride_bytes, sync.bytes_per_pixel,
> +				sync.num_regions, sync.regions, direction);
> +	else
> +		dma_buf_begin_cpu_access(dmabuf, sync.stride_bytes, sync.bytes_per_pixel,
> +				sync.num_regions, sync.regions, direction);
> +
> +	return 0;
> +}
> +
>  static const struct file_operations dma_buf_fops = {
>  	.release	= dma_buf_release,
>  	.mmap		= dma_buf_mmap_internal,
>  	.llseek		= dma_buf_llseek,
>  	.poll		= dma_buf_poll,
> +	.unlocked_ioctl	= dma_buf_ioctl,
>  };
>  
>  /*
> @@ -539,14 +583,17 @@ EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
>   * preparations. Coherency is only guaranteed in the specified range for the
>   * specified access direction.
>   * @dmabuf:	[in]	buffer to prepare cpu access for.
> - * @start:	[in]	start of range for cpu access.
> - * @len:	[in]	length of range for cpu access.
> - * @direction:	[in]	length of range for cpu access.
> + * @stride_bytes:	[in]	stride in bytes for cpu access.
> + * @bytes_per_pixel:	[in]	bytes per pixel of the region for cpu access.
> + * @num_regions:   [in]  number of regions.
> + * @regions:   [in] vector of 2-dimensional regions for cpu access.
> + * @direction:	[in]	direction of range for cpu access.
>   *
>   * Can return negative error values, returns 0 on success.
>   */
> -int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
> -			     enum dma_data_direction direction)
> +int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t stride_bytes,
> +	size_t bytes_per_pixel, size_t num_regions, struct dma_buf_sync_region regions[],
> +	enum dma_data_direction direction)
>  {
>  	int ret = 0;
>  
> @@ -554,8 +601,8 @@ int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
>  		return -EINVAL;
>  
>  	if (dmabuf->ops->begin_cpu_access)
> -		ret = dmabuf->ops->begin_cpu_access(dmabuf, start,
> -							len, direction);
> +		ret = dmabuf->ops->begin_cpu_access(dmabuf, stride_bytes, bytes_per_pixel,
> +							num_regions, regions, direction);
>  
>  	return ret;
>  }
> @@ -567,19 +614,23 @@ EXPORT_SYMBOL_GPL(dma_buf_begin_cpu_access);
>   * actions. Coherency is only guaranteed in the specified range for the
>   * specified access direction.
>   * @dmabuf:	[in]	buffer to complete cpu access for.
> - * @start:	[in]	start of range for cpu access.
> - * @len:	[in]	length of range for cpu access.
> - * @direction:	[in]	length of range for cpu access.
> + * @stride_bytes:	[in]	stride in bytes for cpu access.
> + * @bytes_per_pixel:	[in]	bytes per pixel of the region for cpu access.
> + * @num_regions:   [in]  number of regions.
> + * @regions:   [in]  vector of 2-dimensional regions for cpu access.
> + * @direction:	[in]	direction of range for cpu access.
>   *
>   * This call must always succeed.
>   */
> -void dma_buf_end_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
> -			    enum dma_data_direction direction)
> +void dma_buf_end_cpu_access(struct dma_buf *dmabuf, size_t stride_bytes,
> +	size_t bytes_per_pixel, size_t num_regions, struct dma_buf_sync_region regions[],
> +	enum dma_data_direction direction)
>  {
>  	WARN_ON(!dmabuf);
>  
>  	if (dmabuf->ops->end_cpu_access)
> -		dmabuf->ops->end_cpu_access(dmabuf, start, len, direction);
> +		dmabuf->ops->end_cpu_access(dmabuf, stride_bytes, bytes_per_pixel,
> +			num_regions, regions, direction);
>  }
>  EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
> index 95cbfff..e5bb7a3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
> +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
> @@ -212,7 +212,8 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
>  	return 0;
>  }
>  
> -static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)
> +static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes, size_t bytes_per_pixel,
> +		size_t num_regions, struct dma_buf_sync_region regions[], enum dma_data_direction direction)
>  {
>  	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
>  	struct drm_device *dev = obj->base.dev;
> @@ -228,7 +229,8 @@ static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size
>  	return ret;
>  }
>  
> -static void i915_gem_end_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)
> +static void i915_gem_end_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes, size_t bytes_per_pixel,
> +		size_t num_regions, struct dma_buf_sync_region regions[], enum dma_data_direction direction)
>  {
>  	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
>  	struct drm_device *dev = obj->base.dev;
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index f98bd70..ed457cb 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -33,6 +33,8 @@
>  #include <linux/fence.h>
>  #include <linux/wait.h>
>  
> +#include <uapi/linux/dma-buf.h>
> +
>  struct device;
>  struct dma_buf;
>  struct dma_buf_attachment;
> @@ -93,10 +95,10 @@ struct dma_buf_ops {
>  	/* after final dma_buf_put() */
>  	void (*release)(struct dma_buf *);
>  
> -	int (*begin_cpu_access)(struct dma_buf *, size_t, size_t,
> -				enum dma_data_direction);
> -	void (*end_cpu_access)(struct dma_buf *, size_t, size_t,
> -			       enum dma_data_direction);
> +	int (*begin_cpu_access)(struct dma_buf *, size_t, size_t, size_t,
> +				struct dma_buf_sync_region [], enum dma_data_direction);
> +	void (*end_cpu_access)(struct dma_buf *, size_t, size_t, size_t,
> +			       struct dma_buf_sync_region [], enum dma_data_direction);
>  	void *(*kmap_atomic)(struct dma_buf *, unsigned long);
>  	void (*kunmap_atomic)(struct dma_buf *, unsigned long, void *);
>  	void *(*kmap)(struct dma_buf *, unsigned long);
> @@ -224,10 +226,12 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *,
>  					enum dma_data_direction);
>  void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *,
>  				enum dma_data_direction);
> -int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
> -			     enum dma_data_direction dir);
> -void dma_buf_end_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
> -			    enum dma_data_direction dir);
> +int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes,
> +			     size_t bytes_per_pixel, size_t num_regions,
> +			     struct dma_buf_sync_region regions[], enum dma_data_direction dir);
> +void dma_buf_end_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes,
> +			     size_t bytes_per_pixel, size_t num_regions,
> +			     struct dma_buf_sync_region regions[], enum dma_data_direction dir);
>  void *dma_buf_kmap_atomic(struct dma_buf *, unsigned long);
>  void dma_buf_kunmap_atomic(struct dma_buf *, unsigned long, void *);
>  void *dma_buf_kmap(struct dma_buf *, unsigned long);
> diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
> new file mode 100644
> index 0000000..c63b578
> --- /dev/null
> +++ b/include/uapi/linux/dma-buf.h
> @@ -0,0 +1,57 @@
> +/*
> + * Framework for buffer objects that can be shared across devices/subsystems.
> + *
> + * Copyright(C) 2015 Intel Ltd
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published by
> + * the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef _DMA_BUF_UAPI_H_
> +#define _DMA_BUF_UAPI_H_
> +
> +enum dma_buf_sync_flags {
> +	DMA_BUF_SYNC_READ = (1 << 0),
> +	DMA_BUF_SYNC_WRITE = (2 << 0),
> +	DMA_BUF_SYNC_RW = (3 << 0),
> +	DMA_BUF_SYNC_START = (0 << 2),
> +	DMA_BUF_SYNC_END = (1 << 2),
> +
> +	DMA_BUF_SYNC_VALID_FLAGS_MASK = DMA_BUF_SYNC_RW |
> +		DMA_BUF_SYNC_END
> +};
> +
> +/* 2-dimensional region, used for multi-range flush. This can be used to
> + * synchronize the CPU by batching several sub-regions, smaller than the
> + * mapped dma-buf, all at once. */
> +struct dma_buf_sync_region {
> +	__u64 x;
> +	__u64 y;
> +	__u64 width;
> +	__u64 height;
> +};
> +
> +/* begin/end dma-buf functions used for userspace mmap. */
> +struct dma_buf_sync {
> +	enum dma_buf_sync_flags flags;
> +
> +	__u64 stride_bytes;
> +	__u32 bytes_per_pixel;
> +	__u32 num_regions;
> +
> +	struct dma_buf_sync_region regions[];
> +};
> +
> +#define DMA_BUF_BASE		'b'
> +#define DMA_BUF_IOCTL_SYNC	_IOWR(DMA_BUF_BASE, 0, struct dma_buf_sync)

Should be _IOW(  ?


> +
> +#endif

Thomas
Daniel Vetter Aug. 26, 2015, 12:10 p.m. UTC | #2
On Wed, Aug 26, 2015 at 08:49:00AM +0200, Thomas Hellstrom wrote:
> Hi, Tiago.
> 
> On 08/26/2015 02:02 AM, Tiago Vignatti wrote:
> > From: Daniel Vetter <daniel.vetter@ffwll.ch>
> >
> > Userspace might need some sort of cache coherency management, e.g. when CPU
> > and GPU domains are being accessed through dma-buf at the same time. To
> > circumvent this problem there are begin/end coherency markers that forward
> > directly to the existing dma-buf device drivers' vfunc hooks. Userspace can
> > make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
> > would be used as follows:
> >
> >   - mmap dma-buf fd
> >   - for each drawing/upload cycle in CPU
> >     1. SYNC_START ioctl
> >     2. read/write to mmap area or a 2d sub-region of it
> >     3. SYNC_END ioctl.
> >   - munmap once you don't need the buffer any more
> >
> > v2 (Tiago): Fix header file type names (u64 -> __u64)
> > v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags to the begin/end
> > dma-buf functions. Check for overflows in start/length.
> > v4 (Tiago): use 2d regions for sync.
> 
> Daniel V had issues with the sync argument proposed by Daniel S. I'm
> fine with that argument or an argument containing only a single sync
> rect. I'm not sure whether Daniel V will find it easier to accept only a
> single sync rect...

I'm kinda against all the 2d rect sync proposals ;-) At least for the
current stuff it's all about linear subranges afaik, and even there we
don't bother with flushing them precisely right now.

My expectation would be that if you _really_ want to eke out that last
bit of performance with a list of 2d sync ranges then probably you want to
do the cpu cache flushing in userspace anyway, with 100% machine-specific
trickery.
-Daniel
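
For a concrete idea of that machine-specific trickery: on x86, userspace can
flush the touched lines itself. A minimal sketch, assuming SSE2 and a 64-byte
cache line:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static void flush_range(void *addr, size_t len)
{
	const size_t cl = 64;	/* assumed cache-line size */
	uintptr_t p = (uintptr_t)addr & ~(uintptr_t)(cl - 1);
	uintptr_t end = (uintptr_t)addr + len;

	for (; p < end; p += cl)
		_mm_clflush((const void *)p);
	_mm_mfence();	/* clflush is only ordered by a fence */
}
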
Thomas Hellstrom Aug. 26, 2015, 12:28 p.m. UTC | #3
On 08/26/2015 02:10 PM, Daniel Vetter wrote:
> On Wed, Aug 26, 2015 at 08:49:00AM +0200, Thomas Hellstrom wrote:
>> Hi, Tiago.
>>
>> On 08/26/2015 02:02 AM, Tiago Vignatti wrote:
>>> From: Daniel Vetter <daniel.vetter@ffwll.ch>
>>>
>>> Userspace might need some sort of cache coherency management, e.g. when CPU
>>> and GPU domains are being accessed through dma-buf at the same time. To
>>> circumvent this problem there are begin/end coherency markers that forward
>>> directly to the existing dma-buf device drivers' vfunc hooks. Userspace can
>>> make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
>>> would be used as follows:
>>>
>>>   - mmap dma-buf fd
>>>   - for each drawing/upload cycle in CPU
>>>     1. SYNC_START ioctl
>>>     2. read/write to mmap area or a 2d sub-region of it
>>>     3. SYNC_END ioctl.
>>>   - munmap once you don't need the buffer any more
>>>
>>> v2 (Tiago): Fix header file type names (u64 -> __u64)
>>> v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags to the begin/end
>>> dma-buf functions. Check for overflows in start/length.
>>> v4 (Tiago): use 2d regions for sync.
>> Daniel V had issues with the sync argument proposed by Daniel S. I'm
>> fine with that argument or an argument containing only a single sync
>> rect. I'm not sure whether Daniel V will find it easier to accept only a
>> single sync rect...
> I'm kinda against all the 2d rect sync proposals ;-) At least for the
> current stuff it's all about linear subranges afaik, and even there we
> don't bother with flushing them precisely right now.
>
> My expectation would be that if you _really_ want to eke out that last
> bit of performance with a list of 2d sync ranges then probably you want to
> do the cpu cache flushing in userspace anyway, with 100% machine-specific
> trickery.

Daniel,

I might be misunderstanding things, but isn't this about finally
accepting a dma-buf mmap() generic interface for people who want to use
it for zero-copy applications (like people have been trying to do for
years but never bothered to specify an interface that actually performed
on incoherent hardware)?

If it's only about exposing the kernel 1D sync interface to user-space
for correctness, then why isn't that done transparently to the user?

Thanks,
Thomas


> -Daniel
Daniel Vetter Aug. 26, 2015, 12:58 p.m. UTC | #4
On Wed, Aug 26, 2015 at 02:28:30PM +0200, Thomas Hellstrom wrote:
> On 08/26/2015 02:10 PM, Daniel Vetter wrote:
> > On Wed, Aug 26, 2015 at 08:49:00AM +0200, Thomas Hellstrom wrote:
> >> Hi, Tiago.
> >>
> >> On 08/26/2015 02:02 AM, Tiago Vignatti wrote:
> >>> From: Daniel Vetter <daniel.vetter@ffwll.ch>
> >>>
> >>> Userspace might need some sort of cache coherency management, e.g. when CPU
> >>> and GPU domains are being accessed through dma-buf at the same time. To
> >>> circumvent this problem there are begin/end coherency markers that forward
> >>> directly to the existing dma-buf device drivers' vfunc hooks. Userspace can
> >>> make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
> >>> would be used as follows:
> >>>
> >>>   - mmap dma-buf fd
> >>>   - for each drawing/upload cycle in CPU
> >>>     1. SYNC_START ioctl
> >>>     2. read/write to mmap area or a 2d sub-region of it
> >>>     3. SYNC_END ioctl.
> >>>   - munmap once you don't need the buffer any more
> >>>
> >>> v2 (Tiago): Fix header file type names (u64 -> __u64)
> >>> v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags to the begin/end
> >>> dma-buf functions. Check for overflows in start/length.
> >>> v4 (Tiago): use 2d regions for sync.
> >> Daniel V had issues with the sync argument proposed by Daniel S. I'm
> >> fine with that argument or an argument containing only a single sync
> >> rect. I'm not sure whether Daniel V will find it easier to accept only a
> >> single sync rect...
> > I'm kinda against all the 2d rect sync proposals ;-) At least for the
> > current stuff it's all about linear subranges afaik, and even there we
> > don't bother with flushing them precisely right now.
> >
> > My expectation would be that if you _really_ want to eke out that last
> > bit of performance with a list of 2d sync ranges then probably you want to
> > do the cpu cache flushing in userspace anyway, with 100% machine-specific
> > trickery.
> 
> Daniel,
> 
> I might be misunderstanding things, but isn't this about finally
> accepting a dma-buf mmap() generic interface for people who want to use
> it for zero-copy applications (like people have been trying to do for
> years but never bothered to specify an interface that actually performed
> on incoherent hardware)?
> 
> If it's only about exposing the kernel 1D sync interface to user-space
> for correctness, then why isn't that done transparently to the user?

Mostly pragmatic reasons - we could do the page-fault trickery, but that
means i915 needs another mmap implementation. At least I didn't figure out
how to do faulting in a completely generic way. And we already have 3
other mmap implementations so I prefer not to do that.

The other is that right now there's no user nor implementation in sight
which actually does range-based flush optimizations, so I'm pretty much
expecting we'll get it wrong. Maybe instead we should go one step further
and remove the range from the internal dma-buf interface and also drop it
from the ioctl? With the flags we can always add something later on once
we have a real user with a clear need for it. But afaik cros only wants to
shuffle around entire tiles and has a buffer-per-tile approach.
-Daniel
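
For illustration, dropping the range would shrink the uapi to roughly the
following. This is only a sketch of the suggestion above, not part of this v4;
it also uses _IOW, per Thomas's earlier comment, since the kernel never writes
the struct back:

/* hypothetical range-less sync uapi */
struct dma_buf_sync {
	__u64 flags;	/* DMA_BUF_SYNC_{START,END} | {READ,WRITE,RW} */
};

#define DMA_BUF_BASE		'b'
#define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
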
Tiago Vignatti Aug. 26, 2015, 2:32 p.m. UTC | #5
On 08/26/2015 09:58 AM, Daniel Vetter wrote:
> The other is that right now there's no user nor implementation in sight
> which actually does range-based flush optimizations, so I'm pretty much
> expecting we'll get it wrong. Maybe instead we should go one step further
> and remove the range from the internal dma-buf interface and also drop it
> from the ioctl? With the flags we can always add something later on once
> we have a real user with a clear need for it. But afaik cros only wants to
> shuffle around entire tiles and has a buffer-per-tile approach.

Thomas, I think Daniel has a point here and also, I wouldn't mind 
removing all range control from the dma-buf ioctl either.

In this last patch we can see that it's not complicated to add the 
2d-sync if we eventually decide we want it. But right now there's no 
way we can test it. Therefore in that case I'm all in favour of doing 
this work gradually, adding the basics first.

Tiago
Thomas Hellstrom Aug. 26, 2015, 2:50 p.m. UTC | #6
On 08/26/2015 04:32 PM, Tiago Vignatti wrote:
> On 08/26/2015 09:58 AM, Daniel Vetter wrote:
>> The other is that right now there's no user nor implementation in sight
>> which actually does range-based flush optimizations, so I'm pretty much
>> expecting we'll get it wrong. Maybe instead we should go one step
>> further
>> and remove the range from the internal dma-buf interface and also
>> drop it
>> from the ioctl? With the flags we can always add something later on once
>> we have a real user with a clear need for it. But afaik cros only
>> wants to
>> shuffle around entire tiles and has a buffer-per-tile approach.
>
> Thomas, I think Daniel has a point here and also, I wouldn't mind
> removing all range control from the dma-buf ioctl either.
>
> In this last patch we can see that it's not complicated to add the
> 2d-sync if we eventually decide we want it. But right now there's no
> way we can test it. Therefore in that case I'm all in favour of doing
> this work gradually, adding the basics first.
>

Sure. Unfortunately I wasn't completely clear about the use-case here.
IMO adding a user-space sync functionality should be purely for performance.

/Thomas
Daniel Vetter Aug. 26, 2015, 2:51 p.m. UTC | #7
On Wed, Aug 26, 2015 at 11:32:30AM -0300, Tiago Vignatti wrote:
> On 08/26/2015 09:58 AM, Daniel Vetter wrote:
> >The other is that right now there's no user nor implementation in sight
> >which actually does range-based flush optimizations, so I'm pretty much
> >expecting we'll get it wrong. Maybe instead we should go one step further
> >and remove the range from the internal dma-buf interface and also drop it
> >from the ioctl? With the flags we can always add something later on once
> >we have a real user with a clear need for it. But afaik cros only wants to
> >shuffle around entire tiles and has a buffer-per-tile approach.
> 
> Thomas, I think Daniel has a point here and also, I wouldn't mind removing
> all range control from the dma-buf ioctl either.

if we go with nuking it from the ioctl I'd suggest to also nuke it from
the dma-buf internal interface first too.
-Daniel
Tiago Vignatti Aug. 26, 2015, 2:58 p.m. UTC | #8
On 08/26/2015 11:51 AM, Daniel Vetter wrote:
> On Wed, Aug 26, 2015 at 11:32:30AM -0300, Tiago Vignatti wrote:
>> On 08/26/2015 09:58 AM, Daniel Vetter wrote:
>>> The other is that right now there's no user nor implementation in sight
>>> which actually does range-based flush optimizations, so I'm pretty much
>>> expecting we'll get it wrong. Maybe instead we should go one step further
>>> and remove the range from the internal dma-buf interface and also drop it
>>> from the ioctl? With the flags we can always add something later on once
>>> we have a real user with a clear need for it. But afaik cros only wants to
>>> shuffle around entire tiles and has a buffer-per-tile approach.
>>
>> Thomas, I think Daniel has a point here and also, I wouldn't mind removing
>> all range control from the dma-buf ioctl either.
>
> if we go with nuking it from the ioctl I'd suggest to also nuke it from
> the dma-buf internal interface first too.

yep, I can do it.

Thomas, so we leave 2d sync out now?

Tiago
Thomas Hellstrom Aug. 26, 2015, 3:37 p.m. UTC | #9
> On 26 Aug 2015, at 16:58, Tiago Vignatti <tiago.vignatti@intel.com> wrote:
> 
>> On 08/26/2015 11:51 AM, Daniel Vetter wrote:
>>> On Wed, Aug 26, 2015 at 11:32:30AM -0300, Tiago Vignatti wrote:
>>>> On 08/26/2015 09:58 AM, Daniel Vetter wrote:
>>>> The other is that right now there's no user nor implementation in sight
>>>> which actually does range-based flush optimizations, so I'm pretty much
>>>> expecting we'll get it wrong. Maybe instead we should go one step further
>>>> and remove the range from the internal dma-buf interface and also drop it
>>>> from the ioctl? With the flags we can always add something later on once
>>>> we have a real user with a clear need for it. But afaik cros only wants to
>>>> shuffle around entire tiles and has a buffer-per-tile approach.
>>> 
>>> Thomas, I think Daniel has a point here and also, I wouldn't mind removing
>>> all range control from the dma-buf ioctl either.
>> 
>> if we go with nuking it from the ioctl I'd suggest to also nuke it from
>> the dma-buf internal interface first too.
> 
> yep, I can do it.
> 
> Thomas, so we leave 2d sync out now?
> 
> Tiago
> 
Sure!
Thomas

Patch

diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
index 480c8de..8061ac0 100644
--- a/Documentation/dma-buf-sharing.txt
+++ b/Documentation/dma-buf-sharing.txt
@@ -355,6 +355,19 @@  Being able to mmap an export dma-buf buffer object has 2 main use-cases:
 
    No special interfaces, userspace simply calls mmap on the dma-buf fd.
 
+   Also, userspace might need some sort of cache coherency management, e.g.
+   when CPU and GPU domains are being accessed through dma-buf at the same
+   time. To circumvent this problem there are begin/end coherency markers that
+   forward directly to the existing dma-buf device drivers' vfunc hooks.
+   Userspace can make use of those markers through the DMA_BUF_IOCTL_SYNC
+   ioctl. The sequence would be used as follows:
+     - mmap dma-buf fd
+     - for each drawing/upload cycle in CPU
+       1. SYNC_START ioctl
+       2. read/write to mmap area or a 2d sub-region of it
+       3. SYNC_END ioctl.
+     - munmap once you don't need the buffer any more
+
 2. Supporting existing mmap interfaces in importers
 
    Similar to the motivation for kernel cpu access it is again important that
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 155c146..b6a4a06 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -251,11 +251,55 @@  out:
 	return events;
 }
 
+static long dma_buf_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long arg)
+{
+	struct dma_buf *dmabuf;
+	struct dma_buf_sync sync;
+	enum dma_data_direction direction;
+
+	dmabuf = file->private_data;
+
+	if (!is_dma_buf_file(file))
+		return -EINVAL;
+
+	if (cmd != DMA_BUF_IOCTL_SYNC)
+		return -ENOTTY;
+
+	if (copy_from_user(&sync, (void __user *) arg, sizeof(sync)))
+		return -EFAULT;
+
+	if (sync.flags & DMA_BUF_SYNC_RW)
+		direction = DMA_BIDIRECTIONAL;
+	else if (sync.flags & DMA_BUF_SYNC_READ)
+		direction = DMA_FROM_DEVICE;
+	else if (sync.flags & DMA_BUF_SYNC_WRITE)
+		direction = DMA_TO_DEVICE;
+	else
+		return -EINVAL;
+
+	if (sync.flags & ~DMA_BUF_SYNC_VALID_FLAGS_MASK)
+		return -EINVAL;
+
+	/* TODO: check for overflowing the buffer's size - how so, checking region by
+	 * region here? Maybe need to check for the other parameters as well. */
+
+	if (sync.flags & DMA_BUF_SYNC_END)
+		dma_buf_end_cpu_access(dmabuf, sync.stride_bytes, sync.bytes_per_pixel,
+				sync.num_regions, sync.regions, direction);
+	else
+		dma_buf_begin_cpu_access(dmabuf, sync.stride_bytes, sync.bytes_per_pixel,
+				sync.num_regions, sync.regions, direction);
+
+	return 0;
+}
+
 static const struct file_operations dma_buf_fops = {
 	.release	= dma_buf_release,
 	.mmap		= dma_buf_mmap_internal,
 	.llseek		= dma_buf_llseek,
 	.poll		= dma_buf_poll,
+	.unlocked_ioctl	= dma_buf_ioctl,
 };
 
 /*
@@ -539,14 +583,17 @@  EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment);
  * preparations. Coherency is only guaranteed in the specified range for the
  * specified access direction.
  * @dmabuf:	[in]	buffer to prepare cpu access for.
- * @start:	[in]	start of range for cpu access.
- * @len:	[in]	length of range for cpu access.
- * @direction:	[in]	length of range for cpu access.
+ * @stride_bytes:	[in]	stride in bytes for cpu access.
+ * @bytes_per_pixel:	[in]	bytes per pixel of the region for cpu access.
+ * @num_regions:   [in]  number of regions.
+ * @regions:   [in] vector of 2-dimensional regions for cpu access.
+ * @direction:	[in]	direction of range for cpu access.
  *
  * Can return negative error values, returns 0 on success.
  */
-int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
-			     enum dma_data_direction direction)
+int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t stride_bytes,
+	size_t bytes_per_pixel, size_t num_regions, struct dma_buf_sync_region regions[],
+	enum dma_data_direction direction)
 {
 	int ret = 0;
 
@@ -554,8 +601,8 @@  int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
 		return -EINVAL;
 
 	if (dmabuf->ops->begin_cpu_access)
-		ret = dmabuf->ops->begin_cpu_access(dmabuf, start,
-							len, direction);
+		ret = dmabuf->ops->begin_cpu_access(dmabuf, stride_bytes, bytes_per_pixel,
+							num_regions, regions, direction);
 
 	return ret;
 }
@@ -567,19 +614,23 @@  EXPORT_SYMBOL_GPL(dma_buf_begin_cpu_access);
  * actions. Coherency is only guaranteed in the specified range for the
  * specified access direction.
  * @dmabuf:	[in]	buffer to complete cpu access for.
- * @start:	[in]	start of range for cpu access.
- * @len:	[in]	length of range for cpu access.
- * @direction:	[in]	length of range for cpu access.
+ * @stride_bytes:	[in]	stride in bytes for cpu access.
+ * @bytes_per_pixel:	[in]	bytes per pixel of the region for cpu access.
+ * @num_regions:   [in]  number of regions.
+ * @regions:   [in]  vector of 2-dimensional regions for cpu access.
+ * @direction:	[in]	direction of range for cpu access.
  *
  * This call must always succeed.
  */
-void dma_buf_end_cpu_access(struct dma_buf *dmabuf, size_t start, size_t len,
-			    enum dma_data_direction direction)
+void dma_buf_end_cpu_access(struct dma_buf *dmabuf, size_t stride_bytes,
+	size_t bytes_per_pixel, size_t num_regions, struct dma_buf_sync_region regions[],
+	enum dma_data_direction direction)
 {
 	WARN_ON(!dmabuf);
 
 	if (dmabuf->ops->end_cpu_access)
-		dmabuf->ops->end_cpu_access(dmabuf, start, len, direction);
+		dmabuf->ops->end_cpu_access(dmabuf, stride_bytes, bytes_per_pixel,
+			num_regions, regions, direction);
 }
 EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
index 95cbfff..e5bb7a3 100644
--- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c
@@ -212,7 +212,8 @@  static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
 	return 0;
 }
 
-static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)
+static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes, size_t bytes_per_pixel,
+		size_t num_regions, struct dma_buf_sync_region regions[], enum dma_data_direction direction)
 {
 	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
 	struct drm_device *dev = obj->base.dev;
@@ -228,7 +229,8 @@  static int i915_gem_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size
 	return ret;
 }
 
-static void i915_gem_end_cpu_access(struct dma_buf *dma_buf, size_t start, size_t length, enum dma_data_direction direction)
+static void i915_gem_end_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes, size_t bytes_per_pixel,
+		size_t num_regions, struct dma_buf_sync_region regions[], enum dma_data_direction direction)
 {
 	struct drm_i915_gem_object *obj = dma_buf_to_obj(dma_buf);
 	struct drm_device *dev = obj->base.dev;
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index f98bd70..ed457cb 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -33,6 +33,8 @@ 
 #include <linux/fence.h>
 #include <linux/wait.h>
 
+#include <uapi/linux/dma-buf.h>
+
 struct device;
 struct dma_buf;
 struct dma_buf_attachment;
@@ -93,10 +95,10 @@  struct dma_buf_ops {
 	/* after final dma_buf_put() */
 	void (*release)(struct dma_buf *);
 
-	int (*begin_cpu_access)(struct dma_buf *, size_t, size_t,
-				enum dma_data_direction);
-	void (*end_cpu_access)(struct dma_buf *, size_t, size_t,
-			       enum dma_data_direction);
+	int (*begin_cpu_access)(struct dma_buf *, size_t, size_t, size_t,
+				struct dma_buf_sync_region [], enum dma_data_direction);
+	void (*end_cpu_access)(struct dma_buf *, size_t, size_t, size_t,
+			       struct dma_buf_sync_region [], enum dma_data_direction);
 	void *(*kmap_atomic)(struct dma_buf *, unsigned long);
 	void (*kunmap_atomic)(struct dma_buf *, unsigned long, void *);
 	void *(*kmap)(struct dma_buf *, unsigned long);
@@ -224,10 +226,12 @@  struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *,
 					enum dma_data_direction);
 void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *,
 				enum dma_data_direction);
-int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
-			     enum dma_data_direction dir);
-void dma_buf_end_cpu_access(struct dma_buf *dma_buf, size_t start, size_t len,
-			    enum dma_data_direction dir);
+int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes,
+			     size_t bytes_per_pixel, size_t num_regions,
+			     struct dma_buf_sync_region regions[], enum dma_data_direction dir);
+void dma_buf_end_cpu_access(struct dma_buf *dma_buf, size_t stride_bytes,
+			     size_t bytes_per_pixel, size_t num_regions,
+			     struct dma_buf_sync_region regions[], enum dma_data_direction dir);
 void *dma_buf_kmap_atomic(struct dma_buf *, unsigned long);
 void dma_buf_kunmap_atomic(struct dma_buf *, unsigned long, void *);
 void *dma_buf_kmap(struct dma_buf *, unsigned long);
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
new file mode 100644
index 0000000..c63b578
--- /dev/null
+++ b/include/uapi/linux/dma-buf.h
@@ -0,0 +1,57 @@ 
+/*
+ * Framework for buffer objects that can be shared across devices/subsystems.
+ *
+ * Copyright(C) 2015 Intel Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _DMA_BUF_UAPI_H_
+#define _DMA_BUF_UAPI_H_
+
+enum dma_buf_sync_flags {
+	DMA_BUF_SYNC_READ = (1 << 0),
+	DMA_BUF_SYNC_WRITE = (2 << 0),
+	DMA_BUF_SYNC_RW = (3 << 0),
+	DMA_BUF_SYNC_START = (0 << 2),
+	DMA_BUF_SYNC_END = (1 << 2),
+
+	DMA_BUF_SYNC_VALID_FLAGS_MASK = DMA_BUF_SYNC_RW |
+		DMA_BUF_SYNC_END
+};
+
+/* 2-dimensional region, used for multi-range flush. This can be used to
+ * synchronize the CPU by batching several sub-regions, smaller than the
+ * mapped dma-buf, all at once. */
+struct dma_buf_sync_region {
+	__u64 x;
+	__u64 y;
+	__u64 width;
+	__u64 height;
+};
+
+/* begin/end dma-buf functions used for userspace mmap. */
+struct dma_buf_sync {
+	enum dma_buf_sync_flags flags;
+
+	__u64 stride_bytes;
+	__u32 bytes_per_pixel;
+	__u32 num_regions;
+
+	struct dma_buf_sync_region regions[];
+};
+
+#define DMA_BUF_BASE		'b'
+#define DMA_BUF_IOCTL_SYNC	_IOWR(DMA_BUF_BASE, 0, struct dma_buf_sync)
+
+#endif