
[v2] dmaengine: qcom-bam: Process multiple pending descriptors

Message ID 1501766513-21349-1-git-send-email-sricharan@codeaurora.org (mailing list archive)
State Superseded

Commit Message

Sricharan Ramabadhran Aug. 3, 2017, 1:21 p.m. UTC
The BAM dmaengine has a circular FIFO to which we
add hw descriptors that describe the transaction.
The FIFO has space for about 4096 hw descriptors.

Currently we add one descriptor, wait for it to
complete with an interrupt, and only then add the next
pending descriptor. This underutilizes the FIFO, since
only one descriptor is processed at a time even though
there is space in the FIFO for the BAM to process more.

Instead, keep adding descriptors to the FIFO until it is
full; that allows the BAM to continue working on the next
descriptor immediately after signalling the completion
interrupt for the previous one.

Also, when the client has not set DMA_PREP_INTERRUPT for
a descriptor, do not configure the BAM to trigger an interrupt
upon completion of that descriptor. This way we get an interrupt
only for the descriptor for which DMA_PREP_INTERRUPT was
requested, and at that point signal completion of all the
previously completed descriptors. So we still do callbacks for
all requested descriptors, just with fewer interrupts.
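
For illustration, a minimal client-side sketch of the intended usage
(the scatterlists sg0/sg1/sg2, their entry counts and the callback
done_cb are hypothetical; only the last prep requests
DMA_PREP_INTERRUPT, so the BAM raises a single interrupt and all three
descriptors are completed from it):

	struct dma_async_tx_descriptor *tx;

	/* first two transfers: no completion interrupt requested */
	tx = dmaengine_prep_slave_sg(chan, sg0, nents0, DMA_MEM_TO_DEV, 0);
	dmaengine_submit(tx);
	tx = dmaengine_prep_slave_sg(chan, sg1, nents1, DMA_MEM_TO_DEV, 0);
	dmaengine_submit(tx);

	/* last transfer: ask for an interrupt and run the callback */
	tx = dmaengine_prep_slave_sg(chan, sg2, nents2, DMA_MEM_TO_DEV,
				     DMA_PREP_INTERRUPT);
	tx->callback = done_cb;
	dmaengine_submit(tx);

	dma_async_issue_pending(chan);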

CURRENT:

            --------    --------   --------------
            |DESC 0|    |DESC 1|   |DESC 2 + INT|
            --------    --------   --------------
               |           |             |
               |           |             |
INTERRUPT:   (INT)       (INT)         (INT)
CALLBACK:     (CB)        (CB)          (CB)

		MTD_SPEEDTEST READ PAGE: 3560 KiB/s
		MTD_SPEEDTEST WRITE PAGE: 2664 KiB/s
		IOZONE READ: 2456 KB/s
		IOZONE WRITE: 1230 KB/s

	bam dma interrupts (after tests): 96508

CHANGE:

        --------    --------   --------------
        |DESC 0|    |DESC 1|   |DESC 2 + INT|
        --------    --------   --------------
                                      |
                                      |
                                    (INT)
                              (CB for 0, 1, 2)

		MTD_SPEEDTEST READ PAGE: 3860 KiB/s
		MTD_SPEEDTEST WRITE PAGE: 2837 KiB/s
		IOZONE READ: 2677 KB/s
		IOZONE WRITE: 1308 KB/s

	bam dma interrupts (after tests): 58806

Signed-off-by: Sricharan R <sricharan@codeaurora.org>
---
 drivers/dma/qcom/bam_dma.c | 202 +++++++++++++++++++++++++++++----------------
 1 file changed, 129 insertions(+), 73 deletions(-)

Comments

Abhishek Sahu Aug. 9, 2017, 2:18 p.m. UTC | #1
On 2017-08-03 18:51, Sricharan R wrote:
> The bam dmaengine has a circular FIFO to which we
> add hw descriptors that describes the transaction.
> The FIFO has space for about 4096 hw descriptors.
> 
> Currently we add one descriptor and wait for it to
> complete with interrupt and then add the next pending
> descriptor. In this way, the FIFO is underutilized
> since only one descriptor is processed at a time, although
> there is space in FIFO for the BAM to process more.
> 
> Instead keep adding descriptors to FIFO till its full,
> that allows BAM to continue to work on the next descriptor
> immediately after signalling completion interrupt for the
> previous descriptor.
> 
> Also when the client has not set the DMA_PREP_INTERRUPT for
> a descriptor, then do not configure BAM to trigger a interrupt
> upon completion of that descriptor. This way we get a interrupt
> only for the descriptor for which DMA_PREP_INTERRUPT was
> requested and there signal completion of all the previous completed
> descriptors. So we still do callbacks for all requested descriptors,
> but just that the number of interrupts are reduced.
> 
> CURRENT:
> 
>             ------      -------   ---------------
>             |DES 0|     |DESC 1|  |DESC 2 + INT |
>             ------      -------   ---------------
>                |           |            |
>                |           |            |
> INTERRUPT:   (INT)       (INT)	      (INT)
> CALLBACK:     (CB)        (CB)         (CB)
> 
> 		MTD_SPEEDTEST READ PAGE: 3560 KiB/s
> 		MTD_SPEEDTEST WRITE PAGE: 2664 KiB/s
> 		IOZONE READ: 2456 KB/s
> 		IOZONE WRITE: 1230 KB/s
> 
> 	bam dma interrupts (after tests): 96508
> 
> CHANGE:
> 
>         ------  -------    -------------
>         |DES 0| |DESC 1   |DESC 2 + INT |
>         ------  -------   --------------
> 				|
> 				|
>           		      (INT)
> 			      (CB for 0, 1, 2)
> 
> 		MTD_SPEEDTEST READ PAGE: 3860 KiB/s
> 		MTD_SPEEDTEST WRITE PAGE: 2837 KiB/s
> 		IOZONE READ: 2677 KB/s
> 		IOZONE WRITE: 1308 KB/s
> 
> 	bam dma interrupts (after tests): 58806
> 
> Signed-off-by: Sricharan R <sricharan@codeaurora.org>

  Thanks Sricharan for your patch to do the descriptor
  clubbing in the BAM DMA driver.

  I verified this patch with my NAND QPIC patches:
  https://www.spinics.net/lists/kernel/msg2573736.html

  I ran the MTD test overnight and observed no
  failures. I also measured a significant improvement in
  NAND speed. The following are the numbers for the
  IPQ4019 DK04 board.

  Test                    Speed in KiB/s
                          Before   After

  eraseblock write speed  4716     5483
  eraseblock read speed   6855     8294
  page write speed        4678     5436
  page read speed         6784     8217

Tested-by: Abhishek Sahu <absahu@codeaurora.org>

  Also, I reviewed the patch, and the following are
  minor comments.

> ---
>  drivers/dma/qcom/bam_dma.c | 202
> +++++++++++++++++++++++++++++----------------
>  1 file changed, 129 insertions(+), 73 deletions(-)
> 
> diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
> index 6d89fb6..9e92d7c 100644
> --- a/drivers/dma/qcom/bam_dma.c
> +++ b/drivers/dma/qcom/bam_dma.c
> @@ -46,6 +46,7 @@
>  #include <linux/of_address.h>
>  #include <linux/of_irq.h>
>  #include <linux/of_dma.h>
> +#include <linux/circ_buf.h>
>  #include <linux/clk.h>
>  #include <linux/dmaengine.h>
>  #include <linux/pm_runtime.h>
> @@ -78,6 +79,8 @@ struct bam_async_desc {
> 
>  	struct bam_desc_hw *curr_desc;
> 
> +	/* list node for the desc in the bam_chan list of descriptors */
> +	struct list_head desc_node;
>  	enum dma_transfer_direction dir;
>  	size_t length;
>  	struct bam_desc_hw desc[0];
> @@ -356,7 +359,7 @@ struct bam_chan {
>  	/* configuration from device tree */
>  	u32 id;
> 
> -	struct bam_async_desc *curr_txd;	/* current running dma */
> +	bool is_busy; /* Set to true if FIFO is full */
> 
>  	/* runtime configuration */
>  	struct dma_slave_config slave;
> @@ -372,6 +375,8 @@ struct bam_chan {
>  	unsigned int initialized;	/* is the channel hw initialized?
> */
>  	unsigned int paused;		/* is the channel paused? */
>  	unsigned int reconfigure;	/* new slave config? */
> +	/* list of descriptors currently processed */
> +	struct list_head desc_list;
> 
>  	struct list_head node;
>  };
> @@ -539,7 +544,7 @@ static void bam_free_chan(struct dma_chan *chan)
> 
>  	vchan_free_chan_resources(to_virt_chan(chan));
> 
> -	if (bchan->curr_txd) {
> +	if (!list_empty(&bchan->desc_list)) {
>  		dev_err(bchan->bdev->dev, "Cannot free busy channel\n");
>  		goto err;
>  	}
> @@ -632,8 +637,6 @@ static struct dma_async_tx_descriptor
> *bam_prep_slave_sg(struct dma_chan *chan,
> 
>  	if (flags & DMA_PREP_INTERRUPT)
>  		async_desc->flags |= DESC_FLAG_EOT;
> -	else
> -		async_desc->flags |= DESC_FLAG_INT;
> 
>  	async_desc->num_desc = num_alloc;
>  	async_desc->curr_desc = async_desc->desc;
> @@ -684,15 +687,14 @@ static struct dma_async_tx_descriptor
> *bam_prep_slave_sg(struct dma_chan *chan,
>  static int bam_dma_terminate_all(struct dma_chan *chan)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	struct bam_async_desc *async_desc;
>  	unsigned long flag;
>  	LIST_HEAD(head);
> 
>  	/* remove all transactions, including active transaction */
>  	spin_lock_irqsave(&bchan->vc.lock, flag);
> -	if (bchan->curr_txd) {
> -		list_add(&bchan->curr_txd->vd.node,
> &bchan->vc.desc_issued);
> -		bchan->curr_txd = NULL;
> -	}
> +	list_for_each_entry(async_desc, &bchan->desc_list, desc_node)
> +		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);

   Should we also delete the entries from this list, since we are adding
   these descriptors back to desc_issued and vchan_dma_desc_free_list
   will free all of these descriptors?

   When the IRQ is triggered, it will then traverse this list and fetch
   async descriptors whose memory has already been freed.
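
   Something like the sketch below (untested; it also needs a second
   "struct bam_async_desc *tmp" cursor declared) would detach each node
   while re-queuing it, so the IRQ path never walks freed memory:

	list_for_each_entry_safe(async_desc, tmp, &bchan->desc_list,
				 desc_node) {
		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);
		/* drop the channel's reference before the vd gets freed */
		list_del(&async_desc->desc_node);
	}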

> 
>  	vchan_get_all_descriptors(&bchan->vc, &head);
>  	spin_unlock_irqrestore(&bchan->vc.lock, flag);
> @@ -763,9 +765,9 @@ static int bam_resume(struct dma_chan *chan)
>   */
>  static u32 process_channel_irqs(struct bam_device *bdev)
>  {
> -	u32 i, srcs, pipe_stts;
> +	u32 i, srcs, pipe_stts, offset, avail;
>  	unsigned long flags;
> -	struct bam_async_desc *async_desc;
> +	struct bam_async_desc *async_desc, *tmp;
> 
>  	srcs = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_SRCS_EE));
> 
> @@ -781,31 +783,51 @@ static u32 process_channel_irqs(struct bam_device
> *bdev)
> 
>  		/* clear pipe irq */
>  		pipe_stts = readl_relaxed(bam_addr(bdev, i,
> BAM_P_IRQ_STTS));
> -
>  		writel_relaxed(pipe_stts, bam_addr(bdev, i,
> BAM_P_IRQ_CLR));
> 
>  		spin_lock_irqsave(&bchan->vc.lock, flags);
> -		async_desc = bchan->curr_txd;
> -
> -		if (async_desc) {
> -			async_desc->num_desc -= async_desc->xfer_len;
> -			async_desc->curr_desc += async_desc->xfer_len;
> -			bchan->curr_txd = NULL;
> -
> -			/* manage FIFO */
> -			bchan->head += async_desc->xfer_len;
> -			bchan->head %= MAX_DESCRIPTORS;
> -
> -			/*
> -			 * if complete, process cookie.  Otherwise
> -			 * push back to front of desc_issued so that
> -			 * it gets restarted by the tasklet
> -			 */
> -			if (!async_desc->num_desc)
> -				vchan_cookie_complete(&async_desc->vd);
> -			else
> -				list_add(&async_desc->vd.node,
> -					&bchan->vc.desc_issued);
> +
> +		offset = readl_relaxed(bam_addr(bdev, i, BAM_P_SW_OFSTS))
> &
> +				       P_SW_OFSTS_MASK;
> +		offset /= sizeof(struct bam_desc_hw);
> +
> +		/* Number of bytes available to read */
> +		avail = CIRC_CNT(offset, bchan->head, MAX_DESCRIPTORS +
> 1);
> +
> +		list_for_each_entry_safe(async_desc, tmp,
> +					 &bchan->desc_list, desc_node) {
> +			if (async_desc) {

  Do we need this check, since async_desc will never be NULL here?

> +				/* Not enough data to read */
> +				if (avail < async_desc->xfer_len)
> +					break;
> +
> +				/* manage FIFO */
> +				bchan->head += async_desc->xfer_len;
> +				bchan->head %= MAX_DESCRIPTORS;
> +
> +				async_desc->num_desc -=
> async_desc->xfer_len;
> +				async_desc->curr_desc +=
> async_desc->xfer_len;
> +				avail -= async_desc->xfer_len;
> +
> +				/*
> +				 * if complete, process cookie. Otherwise
> +				 * push back to front of desc_issued so
> that
> +				 * it gets restarted by the tasklet
> +				 */
> +				if (!async_desc->num_desc) {
> +
> vchan_cookie_complete(&async_desc->vd);
> +				} else {
> +					list_add(&async_desc->vd.node,
> +						 &bchan->vc.desc_issued);
> +				}
> +				list_del(&async_desc->desc_node);
> +
> +				/*
> +				 * one desc has completed, so there is
> space
> +				 * in FIFO
> +				 */
> +				bchan->is_busy = false;
> +			}
>  		}
> 
>  		spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -867,6 +889,7 @@ static enum dma_status bam_tx_status(struct 
> dma_chan
> *chan, dma_cookie_t cookie,
>  		struct dma_tx_state *txstate)
>  {
>  	struct bam_chan *bchan = to_bam_chan(chan);
> +	struct bam_async_desc *async_desc;
>  	struct virt_dma_desc *vd;
>  	int ret;
>  	size_t residue = 0;
> @@ -882,11 +905,17 @@ static enum dma_status bam_tx_status(struct 
> dma_chan
> *chan, dma_cookie_t cookie,
> 
>  	spin_lock_irqsave(&bchan->vc.lock, flags);
>  	vd = vchan_find_desc(&bchan->vc, cookie);
> -	if (vd)
> +	if (vd) {
>  		residue = container_of(vd, struct bam_async_desc,
> vd)->length;
> -	else if (bchan->curr_txd && bchan->curr_txd->vd.tx.cookie ==
> cookie)
> -		for (i = 0; i < bchan->curr_txd->num_desc; i++)
> -			residue += bchan->curr_txd->curr_desc[i].size;
> +	} else {
> +		list_for_each_entry(async_desc, &bchan->desc_list,
> desc_node) {
> +			if (async_desc->vd.tx.cookie != cookie)
> +				continue;
> +
> +			for (i = 0; i < async_desc->num_desc; i++)
> +				residue += async_desc->curr_desc[i].size;
> +		}
> +	}
> 
>  	spin_unlock_irqrestore(&bchan->vc.lock, flags);
> 
> @@ -927,63 +956,89 @@ static void bam_start_dma(struct bam_chan *bchan)
>  {
>  	struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
>  	struct bam_device *bdev = bchan->bdev;
> -	struct bam_async_desc *async_desc;
> +	struct bam_async_desc *async_desc = NULL;
>  	struct bam_desc_hw *desc;
>  	struct bam_desc_hw *fifo = PTR_ALIGN(bchan->fifo_virt,
>  					sizeof(struct bam_desc_hw));
>  	int ret;
> +	unsigned int avail;
> +	struct dmaengine_desc_callback cb;
> 
>  	lockdep_assert_held(&bchan->vc.lock);
> 
>  	if (!vd)
>  		return;
> 
> -	list_del(&vd->node);
> -
> -	async_desc = container_of(vd, struct bam_async_desc, vd);
> -	bchan->curr_txd = async_desc;
> -
>  	ret = pm_runtime_get_sync(bdev->dev);
>  	if (ret < 0)
>  		return;
> 
> -	/* on first use, initialize the channel hardware */
> -	if (!bchan->initialized)
> -		bam_chan_init_hw(bchan, async_desc->dir);
> +	while (vd && !bchan->is_busy) {
> +		list_del(&vd->node);
> 
> -	/* apply new slave config changes, if necessary */
> -	if (bchan->reconfigure)
> -		bam_apply_new_config(bchan, async_desc->dir);
> +		async_desc = container_of(vd, struct bam_async_desc, vd);
> 
> -	desc = bchan->curr_txd->curr_desc;
> +		/* on first use, initialize the channel hardware */
> +		if (!bchan->initialized)
> +			bam_chan_init_hw(bchan, async_desc->dir);
> 
> -	if (async_desc->num_desc > MAX_DESCRIPTORS)
> -		async_desc->xfer_len = MAX_DESCRIPTORS;
> -	else
> -		async_desc->xfer_len = async_desc->num_desc;
> +		/* apply new slave config changes, if necessary */
> +		if (bchan->reconfigure)
> +			bam_apply_new_config(bchan, async_desc->dir);
> 
> -	/* set any special flags on the last descriptor */
> -	if (async_desc->num_desc == async_desc->xfer_len)
> -		desc[async_desc->xfer_len - 1].flags |=
> -					cpu_to_le16(async_desc->flags);
> -	else
> -		desc[async_desc->xfer_len - 1].flags |=
> -					cpu_to_le16(DESC_FLAG_INT);
> +		desc = async_desc->curr_desc;
> +		avail = CIRC_SPACE(bchan->tail, bchan->head,
> +				   MAX_DESCRIPTORS + 1);
> +
> +		if (async_desc->num_desc > avail)
> +			async_desc->xfer_len = avail;
> +		else
> +			async_desc->xfer_len = async_desc->num_desc;
> +
> +		/* set any special flags on the last descriptor */
> +		if (async_desc->num_desc == async_desc->xfer_len)
> +			desc[async_desc->xfer_len - 1].flags |=
> +
> cpu_to_le16(async_desc->flags);
> +
> +		vd = vchan_next_desc(&bchan->vc);
> +
> +		/* FIFO is FULL */
> +		bchan->is_busy = (avail <= async_desc->xfer_len) ? true :
> false;
> 
> -	if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
> -		u32 partial = MAX_DESCRIPTORS - bchan->tail;
> +		dmaengine_desc_get_callback(&async_desc->vd.tx, &cb);
> 
> -		memcpy(&fifo[bchan->tail], desc,
> -				partial * sizeof(struct bam_desc_hw));
> -		memcpy(fifo, &desc[partial], (async_desc->xfer_len -
> partial) *
> +		/*
> +		 * An interrupt is generated at this desc, if
> +		 *  - FIFO is FULL.
> +		 *  - No more descriptors to add.
> +		 *  - If a callback completion was requested for this
> DESC,
> +		 *     In this case, BAM will deliver the completion
> callback
> +		 *     for this desc and continue processing the next
> desc.
> +		 */
> +		if ((bchan->is_busy || !vd ||
> +		     dmaengine_desc_callback_valid(&cb)) &&
> +		    !(async_desc->flags & DESC_FLAG_EOT))
> +			desc[async_desc->xfer_len - 1].flags |=
> +				cpu_to_le16(DESC_FLAG_INT);
> +
> +		if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS)
> {
> +			u32 partial = MAX_DESCRIPTORS - bchan->tail;
> +
> +			memcpy(&fifo[bchan->tail], desc,
> +			       partial * sizeof(struct bam_desc_hw));
> +			memcpy(fifo, &desc[partial],
> +			       (async_desc->xfer_len - partial) *
>  				sizeof(struct bam_desc_hw));
> -	} else {
> -		memcpy(&fifo[bchan->tail], desc,
> -			async_desc->xfer_len * sizeof(struct
> bam_desc_hw));
> -	}
> +		} else {
> +			memcpy(&fifo[bchan->tail], desc,
> +			       async_desc->xfer_len *
> +			       sizeof(struct bam_desc_hw));
> +		}
> 
> -	bchan->tail += async_desc->xfer_len;
> -	bchan->tail %= MAX_DESCRIPTORS;
> +		bchan->tail += async_desc->xfer_len;
> +		bchan->tail %= MAX_DESCRIPTORS;
> +		list_add_tail(&async_desc->desc_node, &bchan->desc_list);
> +	}
> 
>  	/* ensure descriptor writes and dma start not reordered */
>  	wmb();
> @@ -1012,7 +1067,7 @@ static void dma_tasklet(unsigned long data)
>  		bchan = &bdev->channels[i];
>  		spin_lock_irqsave(&bchan->vc.lock, flags);
> 
> -		if (!list_empty(&bchan->vc.desc_issued) &&
> !bchan->curr_txd)
> +		if (!list_empty(&bchan->vc.desc_issued) &&
> !bchan->is_busy)
>  			bam_start_dma(bchan);
>  		spin_unlock_irqrestore(&bchan->vc.lock, flags);
>  	}
> @@ -1033,7 +1088,7 @@ static void bam_issue_pending(struct dma_chan 
> *chan)
>  	spin_lock_irqsave(&bchan->vc.lock, flags);
> 
>  	/* if work pending and idle, start a transaction */
> -	if (vchan_issue_pending(&bchan->vc) && !bchan->curr_txd)
> +	if (vchan_issue_pending(&bchan->vc) && !bchan->is_busy)
>  		bam_start_dma(bchan);

  Can we get rid of bchan->is_busy? It is only used to track whether
  there is space in the actual hardware FIFO, and the same can be
  checked with CIRC_SPACE(bchan->tail, bchan->head, MAX_DESCRIPTORS + 1).
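
  For example (just a sketch; IS_BUSY() is a hypothetical helper, not
  something the patch adds):

	#define IS_BUSY(bchan)	(CIRC_SPACE((bchan)->tail, (bchan)->head, \
					    MAX_DESCRIPTORS + 1) == 0)

	/* if work pending and idle, start a transaction */
	if (vchan_issue_pending(&bchan->vc) && !IS_BUSY(bchan))
		bam_start_dma(bchan);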

  Thanks,
  Abhishek

> 
>  	spin_unlock_irqrestore(&bchan->vc.lock, flags);
> @@ -1133,6 +1188,7 @@ static void bam_channel_init(struct bam_device
> *bdev, struct bam_chan *bchan,
> 
>  	vchan_init(&bchan->vc, &bdev->common);
>  	bchan->vc.desc_free = bam_dma_free_desc;
> +	INIT_LIST_HEAD(&bchan->desc_list);
>  }
> 
>  static const struct of_device_id bam_of_match[] = {
--
To unsubscribe from this list: send the line "unsubscribe dmaengine" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sricharan Ramabadhran Aug. 9, 2017, 2:24 p.m. UTC | #2
Hi Abhishek,

On 8/9/2017 7:48 PM, Abhishek Sahu wrote:
> On 2017-08-03 18:51, Sricharan R wrote:
>> The bam dmaengine has a circular FIFO to which we
>> add hw descriptors that describes the transaction.
>> The FIFO has space for about 4096 hw descriptors.
>>
>> Currently we add one descriptor and wait for it to
>> complete with interrupt and then add the next pending
>> descriptor. In this way, the FIFO is underutilized
>> since only one descriptor is processed at a time, although
>> there is space in FIFO for the BAM to process more.
>>
>> Instead keep adding descriptors to FIFO till its full,
>> that allows BAM to continue to work on the next descriptor
>> immediately after signalling completion interrupt for the
>> previous descriptor.
>>
>> Also when the client has not set the DMA_PREP_INTERRUPT for
>> a descriptor, then do not configure BAM to trigger a interrupt
>> upon completion of that descriptor. This way we get a interrupt
>> only for the descriptor for which DMA_PREP_INTERRUPT was
>> requested and there signal completion of all the previous completed
>> descriptors. So we still do callbacks for all requested descriptors,
>> but just that the number of interrupts are reduced.
>>
>> CURRENT:
>>
>>             ------      -------   ---------------
>>             |DES 0|     |DESC 1|  |DESC 2 + INT |
>>             ------      -------   ---------------
>>                |           |            |
>>                |           |            |
>> INTERRUPT:   (INT)       (INT)          (INT)
>> CALLBACK:     (CB)        (CB)         (CB)
>>
>>         MTD_SPEEDTEST READ PAGE: 3560 KiB/s
>>         MTD_SPEEDTEST WRITE PAGE: 2664 KiB/s
>>         IOZONE READ: 2456 KB/s
>>         IOZONE WRITE: 1230 KB/s
>>
>>     bam dma interrupts (after tests): 96508
>>
>> CHANGE:
>>
>>         ------  -------    -------------
>>         |DES 0| |DESC 1   |DESC 2 + INT |
>>         ------  -------   --------------
>>                 |
>>                 |
>>                         (INT)
>>                   (CB for 0, 1, 2)
>>
>>         MTD_SPEEDTEST READ PAGE: 3860 KiB/s
>>         MTD_SPEEDTEST WRITE PAGE: 2837 KiB/s
>>         IOZONE READ: 2677 KB/s
>>         IOZONE WRITE: 1308 KB/s
>>
>>     bam dma interrupts (after tests): 58806
>>
>> Signed-off-by: Sricharan R <sricharan@codeaurora.org>
> 
>  Thanks Sricharan for your patch to do the descriptor
>  clubbing in BAM DMA driver.
> 
>  Verified this patch with my NAND QPIC patches
>  https://www.spinics.net/lists/kernel/msg2573736.html
> 
>  I run the MTD test overnight and no failure
>  observed. Also, achieved significant improvement in
>  NAND speed. Following are the numbers for IPQ4019
>  DK04 board.
> 
>  Test                    Speed in KiB/s
>                         Before     After
> 
>  eraseblock write speed  4716    5483
>  eraseblock read speed   6855    8294
>  page write speed        4678    5436
>  page read speed         6784    8217
> 
> Tested-by: Abhishek Sahu <absahu@codeaurora.org>
> 

 Thanks for the testing.

>  Also, I reviewed the patch and following are
>  minor comments.

 ok.


>> &bchan->vc.desc_issued);
>> -        bchan->curr_txd = NULL;
>> -    }
>> +    list_for_each_entry(async_desc, &bchan->desc_list, desc_node)
>> +        list_add(&async_desc->vd.node, &bchan->vc.desc_issued);
> 
>   should we free the list also since we are adding these descriptor
>   back to issued and vchan_dma_desc_free_list will free all theses
>   descriptors
> 
>   When the IRQ will be triggered then it will traverse this list
>   and fetch the async descriptor for which already we freed the
>   memory.

 ha ok, should add list_del here.

<snip..>

>> 1);
>> +
>> +        list_for_each_entry_safe(async_desc, tmp,
>> +                     &bchan->desc_list, desc_node) {
>> +            if (async_desc) {
> 
>  Do we need this check since async_desc will be always not NULL.

 not needed, will remove.

<snip..>

>>      /* if work pending and idle, start a transaction */
>> -    if (vchan_issue_pending(&bchan->vc) && !bchan->curr_txd)
>> +    if (vchan_issue_pending(&bchan->vc) && !bchan->is_busy)
>>          bam_start_dma(bchan);
> 
>  can we get rid of these bchan->is_busy since it is being used
>  whether we have space in actual hardware FIFO and same we can
>  check with CIRC_SPACE(bchan->tail, bchan->head, MAX_DESCRIPTORS + 1)
> 
 yeah, this variable can be removed and the code simplified. Will do in V3.

Regards,
 Sricharan

Patch

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 6d89fb6..9e92d7c 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -46,6 +46,7 @@ 
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_dma.h>
+#include <linux/circ_buf.h>
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
 #include <linux/pm_runtime.h>
@@ -78,6 +79,8 @@  struct bam_async_desc {
 
 	struct bam_desc_hw *curr_desc;
 
+	/* list node for the desc in the bam_chan list of descriptors */
+	struct list_head desc_node;
 	enum dma_transfer_direction dir;
 	size_t length;
 	struct bam_desc_hw desc[0];
@@ -356,7 +359,7 @@  struct bam_chan {
 	/* configuration from device tree */
 	u32 id;
 
-	struct bam_async_desc *curr_txd;	/* current running dma */
+	bool is_busy; /* Set to true if FIFO is full */
 
 	/* runtime configuration */
 	struct dma_slave_config slave;
@@ -372,6 +375,8 @@  struct bam_chan {
 	unsigned int initialized;	/* is the channel hw initialized? */
 	unsigned int paused;		/* is the channel paused? */
 	unsigned int reconfigure;	/* new slave config? */
+	/* list of descriptors currently processed */
+	struct list_head desc_list;
 
 	struct list_head node;
 };
@@ -539,7 +544,7 @@  static void bam_free_chan(struct dma_chan *chan)
 
 	vchan_free_chan_resources(to_virt_chan(chan));
 
-	if (bchan->curr_txd) {
+	if (!list_empty(&bchan->desc_list)) {
 		dev_err(bchan->bdev->dev, "Cannot free busy channel\n");
 		goto err;
 	}
@@ -632,8 +637,6 @@  static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 
 	if (flags & DMA_PREP_INTERRUPT)
 		async_desc->flags |= DESC_FLAG_EOT;
-	else
-		async_desc->flags |= DESC_FLAG_INT;
 
 	async_desc->num_desc = num_alloc;
 	async_desc->curr_desc = async_desc->desc;
@@ -684,15 +687,14 @@  static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 static int bam_dma_terminate_all(struct dma_chan *chan)
 {
 	struct bam_chan *bchan = to_bam_chan(chan);
+	struct bam_async_desc *async_desc;
 	unsigned long flag;
 	LIST_HEAD(head);
 
 	/* remove all transactions, including active transaction */
 	spin_lock_irqsave(&bchan->vc.lock, flag);
-	if (bchan->curr_txd) {
-		list_add(&bchan->curr_txd->vd.node, &bchan->vc.desc_issued);
-		bchan->curr_txd = NULL;
-	}
+	list_for_each_entry(async_desc, &bchan->desc_list, desc_node)
+		list_add(&async_desc->vd.node, &bchan->vc.desc_issued);
 
 	vchan_get_all_descriptors(&bchan->vc, &head);
 	spin_unlock_irqrestore(&bchan->vc.lock, flag);
@@ -763,9 +765,9 @@  static int bam_resume(struct dma_chan *chan)
  */
 static u32 process_channel_irqs(struct bam_device *bdev)
 {
-	u32 i, srcs, pipe_stts;
+	u32 i, srcs, pipe_stts, offset, avail;
 	unsigned long flags;
-	struct bam_async_desc *async_desc;
+	struct bam_async_desc *async_desc, *tmp;
 
 	srcs = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_SRCS_EE));
 
@@ -781,31 +783,51 @@  static u32 process_channel_irqs(struct bam_device *bdev)
 
 		/* clear pipe irq */
 		pipe_stts = readl_relaxed(bam_addr(bdev, i, BAM_P_IRQ_STTS));
-
 		writel_relaxed(pipe_stts, bam_addr(bdev, i, BAM_P_IRQ_CLR));
 
 		spin_lock_irqsave(&bchan->vc.lock, flags);
-		async_desc = bchan->curr_txd;
-
-		if (async_desc) {
-			async_desc->num_desc -= async_desc->xfer_len;
-			async_desc->curr_desc += async_desc->xfer_len;
-			bchan->curr_txd = NULL;
-
-			/* manage FIFO */
-			bchan->head += async_desc->xfer_len;
-			bchan->head %= MAX_DESCRIPTORS;
-
-			/*
-			 * if complete, process cookie.  Otherwise
-			 * push back to front of desc_issued so that
-			 * it gets restarted by the tasklet
-			 */
-			if (!async_desc->num_desc)
-				vchan_cookie_complete(&async_desc->vd);
-			else
-				list_add(&async_desc->vd.node,
-					&bchan->vc.desc_issued);
+
+		offset = readl_relaxed(bam_addr(bdev, i, BAM_P_SW_OFSTS)) &
+				       P_SW_OFSTS_MASK;
+		offset /= sizeof(struct bam_desc_hw);
+
+		/* Number of bytes available to read */
+		avail = CIRC_CNT(offset, bchan->head, MAX_DESCRIPTORS + 1);
+
+		list_for_each_entry_safe(async_desc, tmp,
+					 &bchan->desc_list, desc_node) {
+			if (async_desc) {
+				/* Not enough data to read */
+				if (avail < async_desc->xfer_len)
+					break;
+
+				/* manage FIFO */
+				bchan->head += async_desc->xfer_len;
+				bchan->head %= MAX_DESCRIPTORS;
+
+				async_desc->num_desc -= async_desc->xfer_len;
+				async_desc->curr_desc += async_desc->xfer_len;
+				avail -= async_desc->xfer_len;
+
+				/*
+				 * if complete, process cookie. Otherwise
+				 * push back to front of desc_issued so that
+				 * it gets restarted by the tasklet
+				 */
+				if (!async_desc->num_desc) {
+					vchan_cookie_complete(&async_desc->vd);
+				} else {
+					list_add(&async_desc->vd.node,
+						 &bchan->vc.desc_issued);
+				}
+				list_del(&async_desc->desc_node);
+
+				/*
+				 * one desc has completed, so there is space
+				 * in FIFO
+				 */
+				bchan->is_busy = false;
+			}
 		}
 
 		spin_unlock_irqrestore(&bchan->vc.lock, flags);
@@ -867,6 +889,7 @@  static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 		struct dma_tx_state *txstate)
 {
 	struct bam_chan *bchan = to_bam_chan(chan);
+	struct bam_async_desc *async_desc;
 	struct virt_dma_desc *vd;
 	int ret;
 	size_t residue = 0;
@@ -882,11 +905,17 @@  static enum dma_status bam_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 
 	spin_lock_irqsave(&bchan->vc.lock, flags);
 	vd = vchan_find_desc(&bchan->vc, cookie);
-	if (vd)
+	if (vd) {
 		residue = container_of(vd, struct bam_async_desc, vd)->length;
-	else if (bchan->curr_txd && bchan->curr_txd->vd.tx.cookie == cookie)
-		for (i = 0; i < bchan->curr_txd->num_desc; i++)
-			residue += bchan->curr_txd->curr_desc[i].size;
+	} else {
+		list_for_each_entry(async_desc, &bchan->desc_list, desc_node) {
+			if (async_desc->vd.tx.cookie != cookie)
+				continue;
+
+			for (i = 0; i < async_desc->num_desc; i++)
+				residue += async_desc->curr_desc[i].size;
+		}
+	}
 
 	spin_unlock_irqrestore(&bchan->vc.lock, flags);
 
@@ -927,63 +956,89 @@  static void bam_start_dma(struct bam_chan *bchan)
 {
 	struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
 	struct bam_device *bdev = bchan->bdev;
-	struct bam_async_desc *async_desc;
+	struct bam_async_desc *async_desc = NULL;
 	struct bam_desc_hw *desc;
 	struct bam_desc_hw *fifo = PTR_ALIGN(bchan->fifo_virt,
 					sizeof(struct bam_desc_hw));
 	int ret;
+	unsigned int avail;
+	struct dmaengine_desc_callback cb;
 
 	lockdep_assert_held(&bchan->vc.lock);
 
 	if (!vd)
 		return;
 
-	list_del(&vd->node);
-
-	async_desc = container_of(vd, struct bam_async_desc, vd);
-	bchan->curr_txd = async_desc;
-
 	ret = pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return;
 
-	/* on first use, initialize the channel hardware */
-	if (!bchan->initialized)
-		bam_chan_init_hw(bchan, async_desc->dir);
+	while (vd && !bchan->is_busy) {
+		list_del(&vd->node);
 
-	/* apply new slave config changes, if necessary */
-	if (bchan->reconfigure)
-		bam_apply_new_config(bchan, async_desc->dir);
+		async_desc = container_of(vd, struct bam_async_desc, vd);
 
-	desc = bchan->curr_txd->curr_desc;
+		/* on first use, initialize the channel hardware */
+		if (!bchan->initialized)
+			bam_chan_init_hw(bchan, async_desc->dir);
 
-	if (async_desc->num_desc > MAX_DESCRIPTORS)
-		async_desc->xfer_len = MAX_DESCRIPTORS;
-	else
-		async_desc->xfer_len = async_desc->num_desc;
+		/* apply new slave config changes, if necessary */
+		if (bchan->reconfigure)
+			bam_apply_new_config(bchan, async_desc->dir);
 
-	/* set any special flags on the last descriptor */
-	if (async_desc->num_desc == async_desc->xfer_len)
-		desc[async_desc->xfer_len - 1].flags |=
-					cpu_to_le16(async_desc->flags);
-	else
-		desc[async_desc->xfer_len - 1].flags |=
-					cpu_to_le16(DESC_FLAG_INT);
+		desc = async_desc->curr_desc;
+		avail = CIRC_SPACE(bchan->tail, bchan->head,
+				   MAX_DESCRIPTORS + 1);
+
+		if (async_desc->num_desc > avail)
+			async_desc->xfer_len = avail;
+		else
+			async_desc->xfer_len = async_desc->num_desc;
+
+		/* set any special flags on the last descriptor */
+		if (async_desc->num_desc == async_desc->xfer_len)
+			desc[async_desc->xfer_len - 1].flags |=
+						cpu_to_le16(async_desc->flags);
+
+		vd = vchan_next_desc(&bchan->vc);
+
+		/* FIFO is FULL */
+		bchan->is_busy = (avail <= async_desc->xfer_len) ? true : false;
 
-	if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
-		u32 partial = MAX_DESCRIPTORS - bchan->tail;
+		dmaengine_desc_get_callback(&async_desc->vd.tx, &cb);
 
-		memcpy(&fifo[bchan->tail], desc,
-				partial * sizeof(struct bam_desc_hw));
-		memcpy(fifo, &desc[partial], (async_desc->xfer_len - partial) *
+		/*
+		 * An interrupt is generated at this desc, if
+		 *  - FIFO is FULL.
+		 *  - No more descriptors to add.
+		 *  - If a callback completion was requested for this DESC,
+		 *     In this case, BAM will deliver the completion callback
+		 *     for this desc and continue processing the next desc.
+		 */
+		if ((bchan->is_busy || !vd ||
+		     dmaengine_desc_callback_valid(&cb)) &&
+		    !(async_desc->flags & DESC_FLAG_EOT))
+			desc[async_desc->xfer_len - 1].flags |=
+				cpu_to_le16(DESC_FLAG_INT);
+
+		if (bchan->tail + async_desc->xfer_len > MAX_DESCRIPTORS) {
+			u32 partial = MAX_DESCRIPTORS - bchan->tail;
+
+			memcpy(&fifo[bchan->tail], desc,
+			       partial * sizeof(struct bam_desc_hw));
+			memcpy(fifo, &desc[partial],
+			       (async_desc->xfer_len - partial) *
 				sizeof(struct bam_desc_hw));
-	} else {
-		memcpy(&fifo[bchan->tail], desc,
-			async_desc->xfer_len * sizeof(struct bam_desc_hw));
-	}
+		} else {
+			memcpy(&fifo[bchan->tail], desc,
+			       async_desc->xfer_len *
+			       sizeof(struct bam_desc_hw));
+		}
 
-	bchan->tail += async_desc->xfer_len;
-	bchan->tail %= MAX_DESCRIPTORS;
+		bchan->tail += async_desc->xfer_len;
+		bchan->tail %= MAX_DESCRIPTORS;
+		list_add_tail(&async_desc->desc_node, &bchan->desc_list);
+	}
 
 	/* ensure descriptor writes and dma start not reordered */
 	wmb();
@@ -1012,7 +1067,7 @@  static void dma_tasklet(unsigned long data)
 		bchan = &bdev->channels[i];
 		spin_lock_irqsave(&bchan->vc.lock, flags);
 
-		if (!list_empty(&bchan->vc.desc_issued) && !bchan->curr_txd)
+		if (!list_empty(&bchan->vc.desc_issued) && !bchan->is_busy)
 			bam_start_dma(bchan);
 		spin_unlock_irqrestore(&bchan->vc.lock, flags);
 	}
@@ -1033,7 +1088,7 @@  static void bam_issue_pending(struct dma_chan *chan)
 	spin_lock_irqsave(&bchan->vc.lock, flags);
 
 	/* if work pending and idle, start a transaction */
-	if (vchan_issue_pending(&bchan->vc) && !bchan->curr_txd)
+	if (vchan_issue_pending(&bchan->vc) && !bchan->is_busy)
 		bam_start_dma(bchan);
 
 	spin_unlock_irqrestore(&bchan->vc.lock, flags);
@@ -1133,6 +1188,7 @@  static void bam_channel_init(struct bam_device *bdev, struct bam_chan *bchan,
 
 	vchan_init(&bchan->vc, &bdev->common);
 	bchan->vc.desc_free = bam_dma_free_desc;
+	INIT_LIST_HEAD(&bchan->desc_list);
 }
 
 static const struct of_device_id bam_of_match[] = {