
[11/15] iio: buffer-dma: Boost performance using write-combine cache setting

Message ID 20211115141925.60164-12-paul@crapouillou.net (mailing list archive)
State New, archived
Series iio: buffer-dma: write() and new DMABUF based API

Commit Message

Paul Cercueil Nov. 15, 2021, 2:19 p.m. UTC
We can be certain that the input buffers will only be accessed by
userspace for reading, and output buffers will mostly be accessed by
userspace for writing.

Therefore, it makes more sense to use only fully cached input buffers,
and to use the write-combine cache coherency setting for output buffers.

This boosts performance, as the data written to the output buffers does
not have to be sync'd for coherency. It will halve performance if the
userspace application tries to read from the output buffer, but this
should never happen.

Since we don't need to sync the cache when disabling CPU access either
for input buffers or output buffers, the .end_cpu_access() callback can
be dropped completely.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
---
 drivers/iio/buffer/industrialio-buffer-dma.c | 82 +++++++++++++-------
 1 file changed, 54 insertions(+), 28 deletions(-)

Comments

Paul Cercueil Nov. 18, 2021, 11:45 a.m. UTC | #1
Hi,

On Mon, Nov 15 2021 at 14:19:21 +0000, Paul Cercueil 
<paul@crapouillou.net> wrote:
> We can be certain that the input buffers will only be accessed by
> userspace for reading, and output buffers will mostly be accessed by
> userspace for writing.
> 
> Therefore, it makes more sense to use only fully cached input buffers,
> and to use the write-combine cache coherency setting for output 
> buffers.
> 
> This boosts performance, as the data written to the output buffers 
> does
> not have to be sync'd for coherency. It will halve performance if the
> userspace application tries to read from the output buffer, but this
> should never happen.
> 
> Since we don't need to sync the cache when disabling CPU access either
> for input buffers or output buffers, the .end_cpu_access() callback 
> can
> be dropped completely.
> 
> Signed-off-by: Paul Cercueil <paul@crapouillou.net>
> ---
>  drivers/iio/buffer/industrialio-buffer-dma.c | 82 
> +++++++++++++-------
>  1 file changed, 54 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c 
> b/drivers/iio/buffer/industrialio-buffer-dma.c
> index 92356ee02f30..fb39054d8c15 100644
> --- a/drivers/iio/buffer/industrialio-buffer-dma.c
> +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
> @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct 
> dma_buf *dbuf,
>  	if (vma->vm_ops->open)
>  		vma->vm_ops->open(vma);
> 
> -	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> -			      virt_to_page(block->vaddr));
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		/*
> +		 * With an input buffer, userspace will only read the data and
> +		 * never write. We can mmap the buffer fully cached.
> +		 */
> +		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> +				      virt_to_page(block->vaddr));
> +	} else {
> +		/*
> +		 * With an output buffer, userspace will only write the data
> +		 * and should rarely (if never) read from it. It is better to
> +		 * use write-combine in this case.
> +		 */
> +		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
> +				   vma->vm_end - vma->vm_start);
> +	}
> +}
> +
> +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block 
> *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
> +		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
> +	else
> +		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
>  }
> 
>  static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
> @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct 
> dma_buf *dbuf)
> 
>  	mutex_lock(&queue->lock);
> 
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
> -			  block->vaddr, block->phys_addr);
> -
> +	iio_dma_buffer_free_dmamem(block);
>  	kfree(block);
> 
>  	queue->num_blocks--;
> @@ -268,19 +291,6 @@ static int 
> iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
>  	return 0;
>  }
> 
> -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
> -					     enum dma_data_direction dma_dir)
> -{
> -	struct iio_dma_buffer_block *block = dbuf->priv;
> -	struct device *dev = block->queue->dev;
> -
> -	/* We only need to sync the cache for output buffers */
> -	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
> -		dma_sync_single_for_device(dev, block->phys_addr, block->size, 
> dma_dir);
> -
> -	return 0;
> -}
> -
>  static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>  	.attach			= iio_buffer_dma_buf_attach,
>  	.map_dma_buf		= iio_buffer_dma_buf_map,
> @@ -288,9 +298,28 @@ static const struct dma_buf_ops 
> iio_dma_buffer_dmabuf_ops = {
>  	.mmap			= iio_buffer_dma_buf_mmap,
>  	.release		= iio_buffer_dma_buf_release,
>  	.begin_cpu_access	= iio_buffer_dma_buf_begin_cpu_access,
> -	.end_cpu_access		= iio_buffer_dma_buf_end_cpu_access,
>  };
> 
> +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block 
> *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		block->vaddr = dma_alloc_coherent(dev, size,
> +						  &block->phys_addr,
> +						  GFP_KERNEL);

I'm so used to dma_alloc_noncoherent() that I didn't even notice that 
it was dma_alloc_coherent() here. The code I added was meant to work with 
non-coherent memory - hence the dma_sync_* operations and the use of 
dma_mmap_pages().

I'll fix that in V2.
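
A rough sketch of what that V2 change might look like, assuming both 
directions simply switch to dma_alloc_noncoherent() (the choice of DMA 
direction and the exact shape are assumptions, not the final code):

/*
 * Sketch only: allocate non-coherent (cached) memory so that the
 * existing dma_sync_* calls and dma_mmap_pages() actually operate on
 * cacheable pages instead of a coherent allocation.
 */
static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
{
	struct device *dev = block->queue->dev;
	size_t size = PAGE_ALIGN(block->size);
	enum dma_data_direction dir;

	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
		dir = DMA_FROM_DEVICE;
	else
		dir = DMA_TO_DEVICE;

	block->vaddr = dma_alloc_noncoherent(dev, size, &block->phys_addr,
					     dir, GFP_KERNEL);
	if (!block->vaddr)
		return -ENOMEM;

	return 0;
}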

Cheers,
-Paul

> +	} else {
> +		block->vaddr = dma_alloc_wc(dev, size,
> +					    &block->phys_addr,
> +					    GFP_KERNEL);
> +	}
> +	if (!block->vaddr)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
>  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
>  {
> @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block 
> *iio_dma_buffer_alloc_block(
>  	if (!block)
>  		return ERR_PTR(-ENOMEM);
> 
> -	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
> -		&block->phys_addr, GFP_KERNEL);
> -	if (!block->vaddr) {
> -		err = -ENOMEM;
> +	block->size = size;
> +	block->queue = queue;
> +
> +	err = iio_dma_buffer_alloc_dmamem(block);
> +	if (err)
>  		goto err_free_block;
> -	}
> 
>  	einfo.ops = &iio_dma_buffer_dmabuf_ops;
>  	einfo.size = PAGE_ALIGN(size);
> @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block 
> *iio_dma_buffer_alloc_block(
>  	}
> 
>  	block->dmabuf = dmabuf;
> -	block->size = size;
>  	block->bytes_used = size;
>  	block->state = IIO_BLOCK_STATE_DONE;
> -	block->queue = queue;
>  	block->fileio = fileio;
>  	INIT_LIST_HEAD(&block->head);
> 
> @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block 
> *iio_dma_buffer_alloc_block(
>  	return block;
> 
>  err_free_dma:
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
> -			  block->vaddr, block->phys_addr);
> +	iio_dma_buffer_free_dmamem(block);
>  err_free_block:
>  	kfree(block);
>  	return ERR_PTR(err);
> --
> 2.33.0
>
Jonathan Cameron Nov. 21, 2021, 3 p.m. UTC | #2
On Mon, 15 Nov 2021 14:19:21 +0000
Paul Cercueil <paul@crapouillou.net> wrote:

> We can be certain that the input buffers will only be accessed by
> userspace for reading, and output buffers will mostly be accessed by
> userspace for writing.

Mostly?  Perhaps a little more info on why that's not 'only'.

> 
> Therefore, it makes more sense to use only fully cached input buffers,
> and to use the write-combine cache coherency setting for output buffers.
> 
> This boosts performance, as the data written to the output buffers does
> not have to be sync'd for coherency. It will halve performance if the
> userspace application tries to read from the output buffer, but this
> should never happen.
> 
> Since we don't need to sync the cache when disabling CPU access either
> for input buffers or output buffers, the .end_cpu_access() callback can
> be dropped completely.

We have an odd mix of coherent and non coherent DMA in here as you noted,
but are you sure this is safe on all platforms?

> 
> Signed-off-by: Paul Cercueil <paul@crapouillou.net>

Any numbers to support this patch?  The mapping types are performance
optimisations so nice to know how much of a difference they make.


> ---
>  drivers/iio/buffer/industrialio-buffer-dma.c | 82 +++++++++++++-------
>  1 file changed, 54 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
> index 92356ee02f30..fb39054d8c15 100644
> --- a/drivers/iio/buffer/industrialio-buffer-dma.c
> +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
> @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct dma_buf *dbuf,
>  	if (vma->vm_ops->open)
>  		vma->vm_ops->open(vma);
>  
> -	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> -			      virt_to_page(block->vaddr));
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		/*
> +		 * With an input buffer, userspace will only read the data and
> +		 * never write. We can mmap the buffer fully cached.
> +		 */
> +		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> +				      virt_to_page(block->vaddr));
> +	} else {
> +		/*
> +		 * With an output buffer, userspace will only write the data
> +		 * and should rarely (if never) read from it. It is better to
> +		 * use write-combine in this case.
> +		 */
> +		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
> +				   vma->vm_end - vma->vm_start);
> +	}
> +}
> +
> +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
> +		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
> +	else
> +		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
>  }
>  
>  static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
> @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
>  
>  	mutex_lock(&queue->lock);
>  
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
> -			  block->vaddr, block->phys_addr);
> -
> +	iio_dma_buffer_free_dmamem(block);
>  	kfree(block);
>  
>  	queue->num_blocks--;
> @@ -268,19 +291,6 @@ static int iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
>  	return 0;
>  }
>  
> -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
> -					     enum dma_data_direction dma_dir)
> -{
> -	struct iio_dma_buffer_block *block = dbuf->priv;
> -	struct device *dev = block->queue->dev;
> -
> -	/* We only need to sync the cache for output buffers */
> -	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
> -		dma_sync_single_for_device(dev, block->phys_addr, block->size, dma_dir);
> -
> -	return 0;
> -}
> -
>  static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>  	.attach			= iio_buffer_dma_buf_attach,
>  	.map_dma_buf		= iio_buffer_dma_buf_map,
> @@ -288,9 +298,28 @@ static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>  	.mmap			= iio_buffer_dma_buf_mmap,
>  	.release		= iio_buffer_dma_buf_release,
>  	.begin_cpu_access	= iio_buffer_dma_buf_begin_cpu_access,
> -	.end_cpu_access		= iio_buffer_dma_buf_end_cpu_access,
>  };
>  
> +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
> +{
> +	struct device *dev = block->queue->dev;
> +	size_t size = PAGE_ALIGN(block->size);
> +
> +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> +		block->vaddr = dma_alloc_coherent(dev, size,
> +						  &block->phys_addr,
> +						  GFP_KERNEL);
> +	} else {
> +		block->vaddr = dma_alloc_wc(dev, size,
> +					    &block->phys_addr,
> +					    GFP_KERNEL);
> +	}
> +	if (!block->vaddr)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
>  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
>  {
> @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	if (!block)
>  		return ERR_PTR(-ENOMEM);
>  
> -	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
> -		&block->phys_addr, GFP_KERNEL);
> -	if (!block->vaddr) {
> -		err = -ENOMEM;
> +	block->size = size;
> +	block->queue = queue;
> +
> +	err = iio_dma_buffer_alloc_dmamem(block);
> +	if (err)
>  		goto err_free_block;
> -	}
>  
>  	einfo.ops = &iio_dma_buffer_dmabuf_ops;
>  	einfo.size = PAGE_ALIGN(size);
> @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	}
>  
>  	block->dmabuf = dmabuf;
> -	block->size = size;
>  	block->bytes_used = size;
>  	block->state = IIO_BLOCK_STATE_DONE;
> -	block->queue = queue;
>  	block->fileio = fileio;
>  	INIT_LIST_HEAD(&block->head);
>  
> @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>  	return block;
>  
>  err_free_dma:
> -	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
> -			  block->vaddr, block->phys_addr);
> +	iio_dma_buffer_free_dmamem(block);
>  err_free_block:
>  	kfree(block);
>  	return ERR_PTR(err);
Paul Cercueil Nov. 21, 2021, 5:43 p.m. UTC | #3
Hi Jonathan,

On Sun, Nov 21 2021 at 15:00:37 +0000, Jonathan Cameron 
<jic23@kernel.org> wrote:
> On Mon, 15 Nov 2021 14:19:21 +0000
> Paul Cercueil <paul@crapouillou.net> wrote:
> 
>>  We can be certain that the input buffers will only be accessed by
>>  userspace for reading, and output buffers will mostly be accessed by
>>  userspace for writing.
> 
> Mostly?  Perhaps a little more info on why that's not 'only'.

Just like with a framebuffer, it really depends on what the application 
does. Most of the cases it will just read sequentially an input buffer, 
or write sequentially an output buffer. But then you get the exotic 
application that will try to do something like alpha blending, which 
means read+write. Hence "mostly".

>> 
>>  Therefore, it makes more sense to use only fully cached input 
>> buffers,
>>  and to use the write-combine cache coherency setting for output 
>> buffers.
>> 
>>  This boosts performance, as the data written to the output buffers 
>> does
>>  not have to be sync'd for coherency. It will halve performance if 
>> the
>>  userspace application tries to read from the output buffer, but this
>>  should never happen.
>> 
>>  Since we don't need to sync the cache when disabling CPU access 
>> either
>>  for input buffers or output buffers, the .end_cpu_access() callback 
>> can
>>  be dropped completely.
> 
> We have an odd mix of coherent and non coherent DMA in here as you 
> noted,
> but are you sure this is safe on all platforms?

The mix isn't safe, but using only coherent or only non-coherent should 
be safe, yes.

> 
>> 
>>  Signed-off-by: Paul Cercueil <paul@crapouillou.net>
> 
> Any numbers to support this patch?  The mapping types are performance
> optimisations so nice to know how much of a difference they make.

Output buffers are definitely faster in write-combine mode. On a 
ZedBoard with an AD9361 transceiver set to 66 MSPS, and buffer/size set 
to 8192, I would get about 185 MiB/s before, 197 MiB/s after.

Input buffers... early results are mixed. On ARM32 it does look like it 
is slightly faster to read from *uncached* memory than reading from 
cached memory. The cache sync does take a long time.

Other architectures might have a different result, for instance on MIPS 
invalidating the cache is a very fast operation, so using cached 
buffers would be a huge win in performance.

Setups where the DMA operations are coherent also wouldn't require any 
cache sync and this patch would give a huge win in performance.

I'll run some more tests next week to have some fresh numbers.

Cheers,
-Paul

>>  ---
>>   drivers/iio/buffer/industrialio-buffer-dma.c | 82 
>> +++++++++++++-------
>>   1 file changed, 54 insertions(+), 28 deletions(-)
>> 
>>  diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c 
>> b/drivers/iio/buffer/industrialio-buffer-dma.c
>>  index 92356ee02f30..fb39054d8c15 100644
>>  --- a/drivers/iio/buffer/industrialio-buffer-dma.c
>>  +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
>>  @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct 
>> dma_buf *dbuf,
>>   	if (vma->vm_ops->open)
>>   		vma->vm_ops->open(vma);
>> 
>>  -	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
>>  -			      virt_to_page(block->vaddr));
>>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
>>  +		/*
>>  +		 * With an input buffer, userspace will only read the data and
>>  +		 * never write. We can mmap the buffer fully cached.
>>  +		 */
>>  +		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
>>  +				      virt_to_page(block->vaddr));
>>  +	} else {
>>  +		/*
>>  +		 * With an output buffer, userspace will only write the data
>>  +		 * and should rarely (if never) read from it. It is better to
>>  +		 * use write-combine in this case.
>>  +		 */
>>  +		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
>>  +				   vma->vm_end - vma->vm_start);
>>  +	}
>>  +}
>>  +
>>  +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block 
>> *block)
>>  +{
>>  +	struct device *dev = block->queue->dev;
>>  +	size_t size = PAGE_ALIGN(block->size);
>>  +
>>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
>>  +		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
>>  +	else
>>  +		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
>>   }
>> 
>>   static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
>>  @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct 
>> dma_buf *dbuf)
>> 
>>   	mutex_lock(&queue->lock);
>> 
>>  -	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
>>  -			  block->vaddr, block->phys_addr);
>>  -
>>  +	iio_dma_buffer_free_dmamem(block);
>>   	kfree(block);
>> 
>>   	queue->num_blocks--;
>>  @@ -268,19 +291,6 @@ static int 
>> iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
>>   	return 0;
>>   }
>> 
>>  -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
>>  -					     enum dma_data_direction dma_dir)
>>  -{
>>  -	struct iio_dma_buffer_block *block = dbuf->priv;
>>  -	struct device *dev = block->queue->dev;
>>  -
>>  -	/* We only need to sync the cache for output buffers */
>>  -	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
>>  -		dma_sync_single_for_device(dev, block->phys_addr, block->size, 
>> dma_dir);
>>  -
>>  -	return 0;
>>  -}
>>  -
>>   static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
>>   	.attach			= iio_buffer_dma_buf_attach,
>>   	.map_dma_buf		= iio_buffer_dma_buf_map,
>>  @@ -288,9 +298,28 @@ static const struct dma_buf_ops 
>> iio_dma_buffer_dmabuf_ops = {
>>   	.mmap			= iio_buffer_dma_buf_mmap,
>>   	.release		= iio_buffer_dma_buf_release,
>>   	.begin_cpu_access	= iio_buffer_dma_buf_begin_cpu_access,
>>  -	.end_cpu_access		= iio_buffer_dma_buf_end_cpu_access,
>>   };
>> 
>>  +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block 
>> *block)
>>  +{
>>  +	struct device *dev = block->queue->dev;
>>  +	size_t size = PAGE_ALIGN(block->size);
>>  +
>>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
>>  +		block->vaddr = dma_alloc_coherent(dev, size,
>>  +						  &block->phys_addr,
>>  +						  GFP_KERNEL);
>>  +	} else {
>>  +		block->vaddr = dma_alloc_wc(dev, size,
>>  +					    &block->phys_addr,
>>  +					    GFP_KERNEL);
>>  +	}
>>  +	if (!block->vaddr)
>>  +		return -ENOMEM;
>>  +
>>  +	return 0;
>>  +}
>>  +
>>   static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
>>   	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
>>   {
>>  @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block 
>> *iio_dma_buffer_alloc_block(
>>   	if (!block)
>>   		return ERR_PTR(-ENOMEM);
>> 
>>  -	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
>>  -		&block->phys_addr, GFP_KERNEL);
>>  -	if (!block->vaddr) {
>>  -		err = -ENOMEM;
>>  +	block->size = size;
>>  +	block->queue = queue;
>>  +
>>  +	err = iio_dma_buffer_alloc_dmamem(block);
>>  +	if (err)
>>   		goto err_free_block;
>>  -	}
>> 
>>   	einfo.ops = &iio_dma_buffer_dmabuf_ops;
>>   	einfo.size = PAGE_ALIGN(size);
>>  @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block 
>> *iio_dma_buffer_alloc_block(
>>   	}
>> 
>>   	block->dmabuf = dmabuf;
>>  -	block->size = size;
>>   	block->bytes_used = size;
>>   	block->state = IIO_BLOCK_STATE_DONE;
>>  -	block->queue = queue;
>>   	block->fileio = fileio;
>>   	INIT_LIST_HEAD(&block->head);
>> 
>>  @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block 
>> *iio_dma_buffer_alloc_block(
>>   	return block;
>> 
>>   err_free_dma:
>>  -	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
>>  -			  block->vaddr, block->phys_addr);
>>  +	iio_dma_buffer_free_dmamem(block);
>>   err_free_block:
>>   	kfree(block);
>>   	return ERR_PTR(err);
>
Paul Cercueil Nov. 25, 2021, 5:29 p.m. UTC | #4
Hi Jonathan,

On Sun, Nov 21 2021 at 17:43:20 +0000, Paul Cercueil 
<paul@crapouillou.net> wrote:
> Hi Jonathan,
> 
> On Sun, Nov 21 2021 at 15:00:37 +0000, Jonathan Cameron 
> <jic23@kernel.org> wrote:
>> On Mon, 15 Nov 2021 14:19:21 +0000
>> Paul Cercueil <paul@crapouillou.net> wrote:
>> 
>>>  We can be certain that the input buffers will only be accessed by
>>>  userspace for reading, and output buffers will mostly be accessed 
>>> by
>>>  userspace for writing.
>> 
>> Mostly?  Perhaps a little more info on why that's not 'only'.
> 
> Just like with a framebuffer, it really depends on what the 
> application does. Most of the cases it will just read sequentially an 
> input buffer, or write sequentially an output buffer. But then you 
> get the exotic application that will try to do something like alpha 
> blending, which means read+write. Hence "mostly".
> 
>>> 
>>>  Therefore, it makes more sense to use only fully cached input 
>>> buffers,
>>>  and to use the write-combine cache coherency setting for output 
>>> buffers.
>>> 
>>>  This boosts performance, as the data written to the output buffers 
>>> does
>>>  not have to be sync'd for coherency. It will halve performance if 
>>> the
>>>  userspace application tries to read from the output buffer, but 
>>> this
>>>  should never happen.
>>> 
>>>  Since we don't need to sync the cache when disabling CPU access 
>>> either
>>>  for input buffers or output buffers, the .end_cpu_access() 
>>> callback can
>>>  be dropped completely.
>> 
>> We have an odd mix of coherent and non coherent DMA in here as you 
>> noted,
>> but are you sure this is safe on all platforms?
> 
> The mix isn't safe, but using only coherent or only non-coherent 
> should be safe, yes.
> 
>> 
>>> 
>>>  Signed-off-by: Paul Cercueil <paul@crapouillou.net>
>> 
>> Any numbers to support this patch?  The mapping types are performance
>> optimisations so nice to know how much of a difference they make.
> 
> Output buffers are definitely faster in write-combine mode. On a 
> ZedBoard with an AD9361 transceiver set to 66 MSPS, and buffer/size 
> set to 8192, I would get about 185 MiB/s before, 197 MiB/s after.
> 
> Input buffers... early results are mixed. On ARM32 it does look like 
> it is slightly faster to read from *uncached* memory than reading 
> from cached memory. The cache sync does take a long time.
> 
> Other architectures might have a different result, for instance on 
> MIPS invalidating the cache is a very fast operation, so using cached 
> buffers would be a huge win in performance.
> 
> Setups where the DMA operations are coherent also wouldn't require 
> any cache sync and this patch would give a huge win in performance.
> 
> I'll run some more tests next week to have some fresh numbers.

I think I mixed things up before, because I get different results now.

Here are some fresh benchmarks, triple-checked, using libiio's 
iio_readdev and iio_writedev tools, with 64K-sample buffers at 61.44 
MSPS (max. theoretical throughput: 234 MiB/s):
  iio_readdev -b 65536 cf-ad9361-lpc voltage0 voltage1 | pv > /dev/null
  pv /dev/zero | iio_writedev -b 65536 cf-ad9361-dds-core-lpc voltage0 
voltage1

Coherent mapping:
- fileio:
    read:	125 MiB/s
    write:	141 MiB/s
- dmabuf:
    read:	171 MiB/s
    write:	210 MiB/s

Coherent reads + Write-combine writes:
- fileio:
    read:	125 MiB/s
    write:	141 MiB/s
- dmabuf:
    read:	171 MiB/s
    write:	210 MiB/s

Non-coherent mapping:
- fileio:
    read:	119 MiB/s
    write:	124 MiB/s
- dmabuf:
    read:	159 MiB/s
    write:	124 MiB/s

Non-coherent reads + write-combine writes:
- fileio:
    read:	119 MiB/s
    write:	140 MiB/s
- dmabuf:
    read:	159 MiB/s
    write:	210 MiB/s

Non-coherent mapping with no cache sync:
- fileio:
    read:	156 MiB/s
    write:	123 MiB/s
- dmabuf:
    read:	234 MiB/s (capped by sample rate)
    write:	182 MiB/s

Non-coherent reads with no cache sync + write-combine writes:
- fileio:
    read:	156 MiB/s
    write:	140 MiB/s
- dmabuf:
    read:	234 MiB/s (capped by sample rate)
    write:	210 MiB/s


A few things we can deduce from this:

* Write-combine is not available on Zynq/ARM? If it was working, it 
should give a better performance than the coherent mapping, but it 
doesn't seem to do anything at all. At least it doesn't harm 
performance.

* Non-coherent + cache invalidation is definitely a good deal slower 
than using coherent mapping, at least on ARM32. However, when the cache 
sync is disabled (e.g. if the DMA operations are coherent) the reads 
are much faster.

* The new dma-buf based API is a great deal faster than the fileio API.

So in the future we could use coherent reads + write-combine writes, 
unless we know the DMA operations are coherent, and in this case use 
non-coherent reads + write-combine writes.
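
Something along those lines, purely as a sketch of that idea and not 
code from this series (dev_is_dma_coherent() is an internal DMA-API 
helper from dma-map-ops.h, so whether a driver could use it directly is 
an open question; treat it as a placeholder for "we know the DMA is 
coherent"):

static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
{
	struct device *dev = block->queue->dev;
	size_t size = PAGE_ALIGN(block->size);

	if (block->queue->buffer.direction != IIO_BUFFER_DIRECTION_IN) {
		/* Output: write-combine, no cache maintenance needed */
		block->vaddr = dma_alloc_wc(dev, size, &block->phys_addr,
					    GFP_KERNEL);
	} else if (dev_is_dma_coherent(dev)) {
		/* Input + coherent DMA: cached pages, syncs are no-ops */
		block->vaddr = dma_alloc_noncoherent(dev, size,
						     &block->phys_addr,
						     DMA_FROM_DEVICE,
						     GFP_KERNEL);
	} else {
		/* Input + non-coherent DMA: stick to a coherent mapping */
		block->vaddr = dma_alloc_coherent(dev, size,
						  &block->phys_addr,
						  GFP_KERNEL);
	}

	return block->vaddr ? 0 : -ENOMEM;
}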

Regarding this patch, unfortunately I cannot prove that write-combine 
is faster, so I'll just drop this patch for now.

Cheers,
-Paul
Jonathan Cameron Nov. 27, 2021, 3:20 p.m. UTC | #5
On Sun, 21 Nov 2021 17:43:20 +0000
Paul Cercueil <paul@crapouillou.net> wrote:

> Hi Jonathan,
> 
> On Sun, Nov 21 2021 at 15:00:37 +0000, Jonathan Cameron 
> <jic23@kernel.org> wrote:
> > On Mon, 15 Nov 2021 14:19:21 +0000
> > Paul Cercueil <paul@crapouillou.net> wrote:
> >   
> >>  We can be certain that the input buffers will only be accessed by
> >>  userspace for reading, and output buffers will mostly be accessed by
> >>  userspace for writing.  
> > 
> > Mostly?  Perhaps a little more info on why that's not 'only'.  
> 
> Just like with a framebuffer, it really depends on what the application 
> does. Most of the cases it will just read sequentially an input buffer, 
> or write sequentially an output buffer. But then you get the exotic 
> application that will try to do something like alpha blending, which 
> means read+write. Hence "mostly".

Ok. That makes sense, though I hope no one actually does it; we can't
prevent them doing so.


> 
> >> 
> >>  Therefore, it makes more sense to use only fully cached input 
> >> buffers,
> >>  and to use the write-combine cache coherency setting for output 
> >> buffers.
> >> 
> >>  This boosts performance, as the data written to the output buffers 
> >> does
> >>  not have to be sync'd for coherency. It will halve performance if 
> >> the
> >>  userspace application tries to read from the output buffer, but this
> >>  should never happen.
> >> 
> >>  Since we don't need to sync the cache when disabling CPU access 
> >> either
> >>  for input buffers or output buffers, the .end_cpu_access() callback 
> >> can
> >>  be dropped completely.  
> > 
> > We have an odd mix of coherent and non coherent DMA in here as you 
> > noted,
> > but are you sure this is safe on all platforms?  
> 
> The mix isn't safe, but using only coherent or only non-coherent should 
> be safe, yes.

yup

> 
> >   
> >> 
> >>  Signed-off-by: Paul Cercueil <paul@crapouillou.net>  
> > 
> > Any numbers to support this patch?  The mapping types are performance
> > optimisations so nice to know how much of a difference they make.  
> 
> Output buffers are definitely faster in write-combine mode. On a 
> ZedBoard with an AD9361 transceiver set to 66 MSPS, and buffer/size set 
> to 8192, I would get about 185 MiB/s before, 197 MiB/s after.
> 
> Input buffers... early results are mixed. On ARM32 it does look like it 
> is slightly faster to read from *uncached* memory than reading from 
> cached memory. The cache sync does take a long time.
> 
> Other architectures might have a different result, for instance on MIPS 
> invalidating the cache is a very fast operation, so using cached 
> buffers would be a huge win in performance.
> 
> Setups where the DMA operations are coherent also wouldn't require any 
> cache sync and this patch would give a huge win in performance.
> 
> I'll run some more tests next week to have some fresh numbers.

Great.

Thanks,

Jonathan

> 
> Cheers,
> -Paul
> 
> >>  ---
> >>   drivers/iio/buffer/industrialio-buffer-dma.c | 82 
> >> +++++++++++++-------
> >>   1 file changed, 54 insertions(+), 28 deletions(-)
> >> 
> >>  diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c 
> >> b/drivers/iio/buffer/industrialio-buffer-dma.c
> >>  index 92356ee02f30..fb39054d8c15 100644
> >>  --- a/drivers/iio/buffer/industrialio-buffer-dma.c
> >>  +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
> >>  @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct 
> >> dma_buf *dbuf,
> >>   	if (vma->vm_ops->open)
> >>   		vma->vm_ops->open(vma);
> >> 
> >>  -	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> >>  -			      virt_to_page(block->vaddr));
> >>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> >>  +		/*
> >>  +		 * With an input buffer, userspace will only read the data and
> >>  +		 * never write. We can mmap the buffer fully cached.
> >>  +		 */
> >>  +		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> >>  +				      virt_to_page(block->vaddr));
> >>  +	} else {
> >>  +		/*
> >>  +		 * With an output buffer, userspace will only write the data
> >>  +		 * and should rarely (if never) read from it. It is better to
> >>  +		 * use write-combine in this case.
> >>  +		 */
> >>  +		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
> >>  +				   vma->vm_end - vma->vm_start);
> >>  +	}
> >>  +}
> >>  +
> >>  +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block 
> >> *block)
> >>  +{
> >>  +	struct device *dev = block->queue->dev;
> >>  +	size_t size = PAGE_ALIGN(block->size);
> >>  +
> >>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
> >>  +		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
> >>  +	else
> >>  +		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
> >>   }
> >> 
> >>   static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
> >>  @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct 
> >> dma_buf *dbuf)
> >> 
> >>   	mutex_lock(&queue->lock);
> >> 
> >>  -	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
> >>  -			  block->vaddr, block->phys_addr);
> >>  -
> >>  +	iio_dma_buffer_free_dmamem(block);
> >>   	kfree(block);
> >> 
> >>   	queue->num_blocks--;
> >>  @@ -268,19 +291,6 @@ static int 
> >> iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
> >>   	return 0;
> >>   }
> >> 
> >>  -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
> >>  -					     enum dma_data_direction dma_dir)
> >>  -{
> >>  -	struct iio_dma_buffer_block *block = dbuf->priv;
> >>  -	struct device *dev = block->queue->dev;
> >>  -
> >>  -	/* We only need to sync the cache for output buffers */
> >>  -	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
> >>  -		dma_sync_single_for_device(dev, block->phys_addr, block->size, 
> >> dma_dir);
> >>  -
> >>  -	return 0;
> >>  -}
> >>  -
> >>   static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
> >>   	.attach			= iio_buffer_dma_buf_attach,
> >>   	.map_dma_buf		= iio_buffer_dma_buf_map,
> >>  @@ -288,9 +298,28 @@ static const struct dma_buf_ops 
> >> iio_dma_buffer_dmabuf_ops = {
> >>   	.mmap			= iio_buffer_dma_buf_mmap,
> >>   	.release		= iio_buffer_dma_buf_release,
> >>   	.begin_cpu_access	= iio_buffer_dma_buf_begin_cpu_access,
> >>  -	.end_cpu_access		= iio_buffer_dma_buf_end_cpu_access,
> >>   };
> >> 
> >>  +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block 
> >> *block)
> >>  +{
> >>  +	struct device *dev = block->queue->dev;
> >>  +	size_t size = PAGE_ALIGN(block->size);
> >>  +
> >>  +	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> >>  +		block->vaddr = dma_alloc_coherent(dev, size,
> >>  +						  &block->phys_addr,
> >>  +						  GFP_KERNEL);
> >>  +	} else {
> >>  +		block->vaddr = dma_alloc_wc(dev, size,
> >>  +					    &block->phys_addr,
> >>  +					    GFP_KERNEL);
> >>  +	}
> >>  +	if (!block->vaddr)
> >>  +		return -ENOMEM;
> >>  +
> >>  +	return 0;
> >>  +}
> >>  +
> >>   static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
> >>   	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
> >>   {
> >>  @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block 
> >> *iio_dma_buffer_alloc_block(
> >>   	if (!block)
> >>   		return ERR_PTR(-ENOMEM);
> >> 
> >>  -	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
> >>  -		&block->phys_addr, GFP_KERNEL);
> >>  -	if (!block->vaddr) {
> >>  -		err = -ENOMEM;
> >>  +	block->size = size;
> >>  +	block->queue = queue;
> >>  +
> >>  +	err = iio_dma_buffer_alloc_dmamem(block);
> >>  +	if (err)
> >>   		goto err_free_block;
> >>  -	}
> >> 
> >>   	einfo.ops = &iio_dma_buffer_dmabuf_ops;
> >>   	einfo.size = PAGE_ALIGN(size);
> >>  @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block 
> >> *iio_dma_buffer_alloc_block(
> >>   	}
> >> 
> >>   	block->dmabuf = dmabuf;
> >>  -	block->size = size;
> >>   	block->bytes_used = size;
> >>   	block->state = IIO_BLOCK_STATE_DONE;
> >>  -	block->queue = queue;
> >>   	block->fileio = fileio;
> >>   	INIT_LIST_HEAD(&block->head);
> >> 
> >>  @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block 
> >> *iio_dma_buffer_alloc_block(
> >>   	return block;
> >> 
> >>   err_free_dma:
> >>  -	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
> >>  -			  block->vaddr, block->phys_addr);
> >>  +	iio_dma_buffer_free_dmamem(block);
> >>   err_free_block:
> >>   	kfree(block);
> >>   	return ERR_PTR(err);  
> >   
> 
>
Jonathan Cameron Nov. 27, 2021, 4:05 p.m. UTC | #6
On Thu, 25 Nov 2021 17:29:58 +0000
Paul Cercueil <paul@crapouillou.net> wrote:

> Hi Jonathan,
> 
> On Sun, Nov 21 2021 at 17:43:20 +0000, Paul Cercueil 
> <paul@crapouillou.net> wrote:
> > Hi Jonathan,
> > 
> > On Sun, Nov 21 2021 at 15:00:37 +0000, Jonathan Cameron 
> > <jic23@kernel.org> wrote:
> >> On Mon, 15 Nov 2021 14:19:21 +0000
> >> Paul Cercueil <paul@crapouillou.net> wrote:
> >>   
> >>>  We can be certain that the input buffers will only be accessed by
> >>>  userspace for reading, and output buffers will mostly be accessed 
> >>> by
> >>>  userspace for writing.  
> >> 
> >> Mostly?  Perhaps a little more info on why that's not 'only'.  
> > 
> > Just like with a framebuffer, it really depends on what the 
> > application does. Most of the cases it will just read sequentially an 
> > input buffer, or write sequentially an output buffer. But then you 
> > get the exotic application that will try to do something like alpha 
> > blending, which means read+write. Hence "mostly".
> >   
> >>> 
> >>>  Therefore, it makes more sense to use only fully cached input 
> >>> buffers,
> >>>  and to use the write-combine cache coherency setting for output 
> >>> buffers.
> >>> 
> >>>  This boosts performance, as the data written to the output buffers 
> >>> does
> >>>  not have to be sync'd for coherency. It will halve performance if 
> >>> the
> >>>  userspace application tries to read from the output buffer, but 
> >>> this
> >>>  should never happen.
> >>> 
> >>>  Since we don't need to sync the cache when disabling CPU access 
> >>> either
> >>>  for input buffers or output buffers, the .end_cpu_access() 
> >>> callback can
> >>>  be dropped completely.  
> >> 
> >> We have an odd mix of coherent and non coherent DMA in here as you 
> >> noted,
> >> but are you sure this is safe on all platforms?  
> > 
> > The mix isn't safe, but using only coherent or only non-coherent 
> > should be safe, yes.
> >   
> >>   
> >>> 
> >>>  Signed-off-by: Paul Cercueil <paul@crapouillou.net>  
> >> 
> >> Any numbers to support this patch?  The mapping types are performance
> >> optimisations so nice to know how much of a difference they make.  
> > 
> > Output buffers are definitely faster in write-combine mode. On a 
> > ZedBoard with a AD9361 transceiver set to 66 MSPS, and buffer/size 
> > set to 8192, I would get about 185 MiB/s before, 197 MiB/s after.
> > 
> > Input buffers... early results are mixed. On ARM32 it does look like 
> > it is slightly faster to read from *uncached* memory than reading 
> > from cached memory. The cache sync does take a long time.
> > 
> > Other architectures might have a different result, for instance on 
> > MIPS invalidating the cache is a very fast operation, so using cached 
> > buffers would be a huge win in performance.
> > 
> > Setups where the DMA operations are coherent also wouldn't require 
> > any cache sync and this patch would give a huge win in performance.
> > 
> > I'll run some more tests next week to have some fresh numbers.  
> 
> I think I mixed things up before, because I get different results now.
> 
> Here are some fresh benchmarks, triple-checked, using libiio's 
> iio_readdev and iio_writedev tools, with 64K-sample buffers at 61.44 
> MSPS (max. theoretical throughput: 234 MiB/s):
>   iio_readdev -b 65536 cf-ad9361-lpc voltage0 voltage1 | pv > /dev/null
>   pv /dev/zero | iio_writedev -b 65536 cf-ad9361-dds-core-lpc voltage0 
> voltage1

There is a bit of a terminology confusion going on here.  I think
for the mappings you mean cacheable vs non-cacheable but maybe
I'm misunderstanding.  That doesn't necessarily correspond to
coherency.  Non cached memory is always coherent because all caches
miss.

Non-cacheable can be related to coherency of course. Also beware that given
hardware might not implement non-cacheable if it knows all possible
accesses are IO-coherent.  The effect is the same and, if implemented
correctly, it will not hurt performance significantly.

firmware should be letting the OS know if the device does coherent
DMA or not... dma-coherent in dt.  It might be optional for a given
piece of DMA engine but I've not seen that..

I'm not sure I see how you can do a mixture of cacheable for reads
and write combine (which means uncacheable) for writes...

> 
> Coherent mapping:
> - fileio:
>     read:	125 MiB/s
>     write:	141 MiB/s
> - dmabuf:
>     read:	171 MiB/s
>     write:	210 MiB/s
> 
> Coherent reads + Write-combine writes:
> - fileio:
>     read:	125 MiB/s
>     write:	141 MiB/s
> - dmabuf:
>     read:	171 MiB/s
>     write:	210 MiB/s
> 
> Non-coherent mapping:
> - fileio:
>     read:	119 MiB/s
>     write:	124 MiB/s
> - dmabuf:
>     read:	159 MiB/s
>     write:	124 MiB/s
> 
> Non-coherent reads + write-combine writes:
> - fileio:
>     read:	119 MiB/s
>     write:	140 MiB/s
> - dmabuf:
>     read:	159 MiB/s
>     write:	210 MiB/s
> 


> Non-coherent mapping with no cache sync:
> - fileio:
>     read:	156 MiB/s
>     write:	123 MiB/s
> - dmabuf:
>     read:	234 MiB/s (capped by sample rate)
>     write:	182 MiB/s
> 
> Non-coherent reads with no cache sync + write-combine writes:
> - fileio:
>     read:	156 MiB/s
>     write:	140 MiB/s
> - dmabuf:
>     read:	234 MiB/s (capped by sample rate)
>     write:	210 MiB/s
> 
> 
> A few things we can deduce from this:
> 
> * Write-combine is not available on Zynq/ARM? If it was working, it 
> should give a better performance than the coherent mapping, but it 
> doesn't seem to do anything at all. At least it doesn't harm 
> performance.

I'm not sure it's very relevant to this sort of streaming write.
If you write a sequence of addresses then nothing stops them getting combined
into a single write whether or not it is write-combining.

You may be right that the particular path to memory doesn't support it anyway.
Also some cache architectures will rapidly detect streaming writes and
elect not to cache them whether coherent or not.




> 
> * Non-coherent + cache invalidation is definitely a good deal slower 
> than using coherent mapping, at least on ARM32. However, when the cache 
> sync is disabled (e.g. if the DMA operations are coherent) the reads 
> are much faster.

If you are running with cache sync then it had better not be cached;
as such it's coherent in the sense of there being no entries in the cache
in either direction.

> 
> * The new dma-buf based API is a great deal faster than the fileio API.

:)

> 
> So in the future we could use coherent reads + write-combine writes, 
> unless we know the DMA operations are coherent, and in this case use 
> non-coherent reads + write-combine writes.

Not following this argument at all, but anyway we can revisit when it matters.

> 
> Regarding this patch, unfortunately I cannot prove that write-combine 
> is faster, so I'll just drop this patch for now.

Sure, thanks for checking.  It's worth noting that WC usage in the kernel
is vanishingly rare and I suspect that's mostly because it doesn't
do anything on many implementations.

Jonathan

> 
> Cheers,
> -Paul
> 
>
Lars-Peter Clausen Nov. 28, 2021, 1:25 p.m. UTC | #7
On 11/27/21 5:05 PM, Jonathan Cameron wrote:
>> Non-coherent mapping with no cache sync:
>> - fileio:
>>      read:	156 MiB/s
>>      write:	123 MiB/s
>> - dmabuf:
>>      read:	234 MiB/s (capped by sample rate)
>>      write:	182 MiB/s
>>
>> Non-coherent reads with no cache sync + write-combine writes:
>> - fileio:
>>      read:	156 MiB/s
>>      write:	140 MiB/s
>> - dmabuf:
>>      read:	234 MiB/s (capped by sample rate)
>>      write:	210 MiB/s
>>
>>
>> A few things we can deduce from this:
>>
>> * Write-combine is not available on Zynq/ARM? If it was working, it
>> should give a better performance than the coherent mapping, but it
>> doesn't seem to do anything at all. At least it doesn't harm
>> performance.
> I'm not sure it's very relevant to this sort of streaming write.
> If you write a sequence of addresses then nothing stops them getting combined
> into a single write whether or not it is write-combining.

There is a difference at which point they can get combined. With 
write-combine they can be coalesced into a single transaction anywhere 
in the interconnect, as early as the CPU itself. Without write-combine 
the DDR controller might decide to combine them, but not earlier. This 
can make a difference especially if the write is a narrow write, i.e. 
the access size is smaller than the buswidth.

Let's say you do 32-bit writes, but your bus is 64 bits wide. With WC two 
32-bit writes can be combined into a 64-bit write. Without WC that is not 
possible and you are potentially not using the bus to its fullest 
capacity. This is especially true if the memory bus is wider than the 
widest access size of the CPU.
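
As a userspace-side illustration of that point (a sketch only; in 
practice the pointer would come from mmap() on the buffer's dmabuf fd):

#include <stddef.h>
#include <stdint.h>

/*
 * Sequential narrow stores into an mmap()ed output buffer. With a
 * write-combining mapping, adjacent 32-bit stores like these can be
 * merged into wider bus transactions as early as the CPU's write
 * buffer; with a plain uncached mapping each store has to go out on
 * its own, potentially wasting bus width.
 */
static void fill_samples(volatile uint32_t *buf, size_t nsamples)
{
	size_t i;

	for (i = 0; i < nsamples; i++)
		buf[i] = (uint32_t)i;	/* 32-bit write on a possibly wider bus */
}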

Patch

diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index 92356ee02f30..fb39054d8c15 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -229,8 +229,33 @@  static int iio_buffer_dma_buf_mmap(struct dma_buf *dbuf,
 	if (vma->vm_ops->open)
 		vma->vm_ops->open(vma);
 
-	return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
-			      virt_to_page(block->vaddr));
+	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
+		/*
+		 * With an input buffer, userspace will only read the data and
+		 * never write. We can mmap the buffer fully cached.
+		 */
+		return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
+				      virt_to_page(block->vaddr));
+	} else {
+		/*
+		 * With an output buffer, userspace will only write the data
+		 * and should rarely (if never) read from it. It is better to
+		 * use write-combine in this case.
+		 */
+		return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
+				   vma->vm_end - vma->vm_start);
+	}
+}
+
+static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block *block)
+{
+	struct device *dev = block->queue->dev;
+	size_t size = PAGE_ALIGN(block->size);
+
+	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
+		dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
+	else
+		dma_free_wc(dev, size, block->vaddr, block->phys_addr);
 }
 
 static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
@@ -243,9 +268,7 @@  static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
 
 	mutex_lock(&queue->lock);
 
-	dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
-			  block->vaddr, block->phys_addr);
-
+	iio_dma_buffer_free_dmamem(block);
 	kfree(block);
 
 	queue->num_blocks--;
@@ -268,19 +291,6 @@  static int iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
 	return 0;
 }
 
-static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
-					     enum dma_data_direction dma_dir)
-{
-	struct iio_dma_buffer_block *block = dbuf->priv;
-	struct device *dev = block->queue->dev;
-
-	/* We only need to sync the cache for output buffers */
-	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
-		dma_sync_single_for_device(dev, block->phys_addr, block->size, dma_dir);
-
-	return 0;
-}
-
 static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
 	.attach			= iio_buffer_dma_buf_attach,
 	.map_dma_buf		= iio_buffer_dma_buf_map,
@@ -288,9 +298,28 @@  static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
 	.mmap			= iio_buffer_dma_buf_mmap,
 	.release		= iio_buffer_dma_buf_release,
 	.begin_cpu_access	= iio_buffer_dma_buf_begin_cpu_access,
-	.end_cpu_access		= iio_buffer_dma_buf_end_cpu_access,
 };
 
+static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
+{
+	struct device *dev = block->queue->dev;
+	size_t size = PAGE_ALIGN(block->size);
+
+	if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
+		block->vaddr = dma_alloc_coherent(dev, size,
+						  &block->phys_addr,
+						  GFP_KERNEL);
+	} else {
+		block->vaddr = dma_alloc_wc(dev, size,
+					    &block->phys_addr,
+					    GFP_KERNEL);
+	}
+	if (!block->vaddr)
+		return -ENOMEM;
+
+	return 0;
+}
+
 static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 	struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
 {
@@ -303,12 +332,12 @@  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 	if (!block)
 		return ERR_PTR(-ENOMEM);
 
-	block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
-		&block->phys_addr, GFP_KERNEL);
-	if (!block->vaddr) {
-		err = -ENOMEM;
+	block->size = size;
+	block->queue = queue;
+
+	err = iio_dma_buffer_alloc_dmamem(block);
+	if (err)
 		goto err_free_block;
-	}
 
 	einfo.ops = &iio_dma_buffer_dmabuf_ops;
 	einfo.size = PAGE_ALIGN(size);
@@ -322,10 +351,8 @@  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 	}
 
 	block->dmabuf = dmabuf;
-	block->size = size;
 	block->bytes_used = size;
 	block->state = IIO_BLOCK_STATE_DONE;
-	block->queue = queue;
 	block->fileio = fileio;
 	INIT_LIST_HEAD(&block->head);
 
@@ -338,8 +365,7 @@  static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 	return block;
 
 err_free_dma:
-	dma_free_coherent(queue->dev, PAGE_ALIGN(size),
-			  block->vaddr, block->phys_addr);
+	iio_dma_buffer_free_dmamem(block);
 err_free_block:
 	kfree(block);
 	return ERR_PTR(err);