
[3/3] dmaengine: ptdma: Utilize the AE4DMA engine's multi-queue functionality

Message ID 20250203162511.911946-4-Basavaraj.Natikar@amd.com (mailing list archive)
State New
Series Fixes to the AE4DMA

Commit Message

Basavaraj Natikar Feb. 3, 2025, 4:25 p.m. UTC
Unlike PTDMA, which has only a single queue, AE4DMA offers multi-channel
functionality. Use this multi-queue capability, via the AE4DMA
workqueue-based mechanism, to achieve higher performance than PTDMA.

Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
---
 drivers/dma/amd/ae4dma/ae4dma.h         |  2 +
 drivers/dma/amd/ptdma/ptdma-dmaengine.c | 90 ++++++++++++++++++++++++-
 2 files changed, 89 insertions(+), 3 deletions(-)

Comments

Vinod Koul Feb. 10, 2025, 2:18 p.m. UTC | #1
On 03-02-25, 21:55, Basavaraj Natikar wrote:
> Unlike PTDMA, which has only a single queue, AE4DMA offers multi-channel
> functionality. Use this multi-queue capability, via the AE4DMA
> workqueue-based mechanism, to achieve higher performance than PTDMA.
> 
> Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")

Why is this a fix, again!

> Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
> ---
>  drivers/dma/amd/ae4dma/ae4dma.h         |  2 +
>  drivers/dma/amd/ptdma/ptdma-dmaengine.c | 90 ++++++++++++++++++++++++-
>  2 files changed, 89 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
> index 265c5d436008..57f6048726bb 100644
> --- a/drivers/dma/amd/ae4dma/ae4dma.h
> +++ b/drivers/dma/amd/ae4dma/ae4dma.h
> @@ -37,6 +37,8 @@
>  #define AE4_DMA_VERSION			4
>  #define CMD_AE4_DESC_DW0_VAL		2
>  
> +#define AE4_TIME_OUT			5000
> +
>  struct ae4_msix {
>  	int msix_count;
>  	struct msix_entry msix_entry[MAX_AE4_HW_QUEUES];
> diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
> index 35c84ec9608b..715ac3ae067b 100644
> --- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
> +++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
> @@ -198,8 +198,10 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>  {
>  	struct dma_async_tx_descriptor *tx_desc;
>  	struct virt_dma_desc *vd;
> +	struct pt_device *pt;
>  	unsigned long flags;
>  
> +	pt = chan->pt;
>  	/* Loop over descriptors until one is found with commands */
>  	do {
>  		if (desc) {
> @@ -217,7 +219,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>  
>  		spin_lock_irqsave(&chan->vc.lock, flags);
>  
> -		if (desc) {
> +		if (pt->ver != AE4_DMA_VERSION && desc) {
>  			if (desc->status != DMA_COMPLETE) {
>  				if (desc->status != DMA_ERROR)
>  					desc->status = DMA_COMPLETE;
> @@ -235,7 +237,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>  
>  		spin_unlock_irqrestore(&chan->vc.lock, flags);
>  
> -		if (tx_desc) {
> +		if (pt->ver != AE4_DMA_VERSION && tx_desc) {

Why should this handling be different for DMA_VERSION?

>  			dmaengine_desc_get_callback_invoke(tx_desc, NULL);
>  			dma_run_dependencies(tx_desc);
>  			vchan_vdesc_fini(vd);
> @@ -245,11 +247,25 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>  	return NULL;
>  }
>  
> +static inline bool ae4_core_queue_full(struct pt_cmd_queue *cmd_q)
> +{
> +	u32 front_wi = readl(cmd_q->reg_control + AE4_WR_IDX_OFF);
> +	u32 rear_ri = readl(cmd_q->reg_control + AE4_RD_IDX_OFF);
> +
> +	if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN)  >= (MAX_CMD_QLEN - 1))
> +		return true;
> +
> +	return false;
> +}
> +
>  static void pt_cmd_callback(void *data, int err)
>  {
>  	struct pt_dma_desc *desc = data;
> +	struct ae4_cmd_queue *ae4cmd_q;
>  	struct dma_chan *dma_chan;
>  	struct pt_dma_chan *chan;
> +	struct ae4_device *ae4;
> +	struct pt_device *pt;
>  	int ret;
>  
>  	if (err == -EINPROGRESS)
> @@ -257,11 +273,32 @@ static void pt_cmd_callback(void *data, int err)
>  
>  	dma_chan = desc->vd.tx.chan;
>  	chan = to_pt_chan(dma_chan);
> +	pt = chan->pt;
>  
>  	if (err)
>  		desc->status = DMA_ERROR;
>  
>  	while (true) {
> +		if (pt->ver == AE4_DMA_VERSION) {
> +			ae4 = container_of(pt, struct ae4_device, pt);
> +			ae4cmd_q = &ae4->ae4cmd_q[chan->id];
> +
> +			if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
> +			    ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
> +				wake_up(&ae4cmd_q->q_w);
> +
> +				if (wait_for_completion_timeout(&ae4cmd_q->cmp,
> +								msecs_to_jiffies(AE4_TIME_OUT))
> +								== 0) {
> +					dev_err(pt->dev, "TIMEOUT %d:\n", ae4cmd_q->id);
> +					break;
> +				}
> +
> +				reinit_completion(&ae4cmd_q->cmp);
> +				continue;
> +			}
> +		}
> +
>  		/* Check for DMA descriptor completion */
>  		desc = pt_handle_active_desc(chan, desc);
>  
> @@ -296,6 +333,49 @@ static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
>  	return desc;
>  } 
>  
> +static void pt_cmd_callback_work(void *data, int err)
> +{
> +	struct dma_async_tx_descriptor *tx_desc;
> +	struct pt_dma_desc *desc = data;
> +	struct dma_chan *dma_chan;
> +	struct virt_dma_desc *vd;
> +	struct pt_dma_chan *chan;
> +	unsigned long flags;
> +
> +	dma_chan = desc->vd.tx.chan;
> +	chan = to_pt_chan(dma_chan);
> +
> +	if (err == -EINPROGRESS)
> +		return;
> +
> +	tx_desc = &desc->vd.tx;
> +	vd = &desc->vd;
> +
> +	if (err)
> +		desc->status = DMA_ERROR;
> +
> +	spin_lock_irqsave(&chan->vc.lock, flags);
> +	if (desc) {
> +		if (desc->status != DMA_COMPLETE) {
> +			if (desc->status != DMA_ERROR)
> +				desc->status = DMA_COMPLETE;
> +
> +			dma_cookie_complete(tx_desc);
> +			dma_descriptor_unmap(tx_desc);
> +		} else {
> +			tx_desc = NULL;
> +		}
> +	}
> +	spin_unlock_irqrestore(&chan->vc.lock, flags);
> +
> +	if (tx_desc) {
> +		dmaengine_desc_get_callback_invoke(tx_desc, NULL);
> +		dma_run_dependencies(tx_desc);
> +		list_del(&desc->vd.node);
> +		vchan_vdesc_fini(vd);
> +	}
> +}

Why do we have callback in driver...?

> +
>  static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
>  					  dma_addr_t dst,
>  					  dma_addr_t src,
> @@ -327,6 +407,7 @@ static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
>  	desc->len = len;
>  
>  	if (pt->ver == AE4_DMA_VERSION) {
> +		pt_cmd->pt_cmd_callback = pt_cmd_callback_work;
>  		ae4 = container_of(pt, struct ae4_device, pt);
>  		ae4cmd_q = &ae4->ae4cmd_q[chan->id];
>  		mutex_lock(&ae4cmd_q->cmd_lock);
> @@ -367,13 +448,16 @@ static void pt_issue_pending(struct dma_chan *dma_chan)
>  {
>  	struct pt_dma_chan *chan = to_pt_chan(dma_chan);
>  	struct pt_dma_desc *desc;
> +	struct pt_device *pt;
>  	unsigned long flags;
>  	bool engine_is_idle = true;
>  
> +	pt = chan->pt;
> +
>  	spin_lock_irqsave(&chan->vc.lock, flags);
>  
>  	desc = pt_next_dma_desc(chan);
> -	if (desc)
> +	if (desc && pt->ver != AE4_DMA_VERSION)
>  		engine_is_idle = false;
>  
>  	vchan_issue_pending(&chan->vc);
> -- 
> 2.25.1
Basavaraj Natikar Feb. 10, 2025, 2:26 p.m. UTC | #2
On 2/10/2025 7:48 PM, Vinod Koul wrote:
> On 03-02-25, 21:55, Basavaraj Natikar wrote:
>> Unlike PTDMA, which has only a single queue, AE4DMA offers multi-channel
>> functionality. Use this multi-queue capability, via the AE4DMA
>> workqueue-based mechanism, to achieve higher performance than PTDMA.
>>
>> Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")
> Why is this a fix, again!

Yes, AE4DMA is much faster with multi-queue. However, multi-queue
processing can sometimes take longer, so this change adds a timeout to the
synchronization between consumers and producers, which avoids long waits
that could eventually end in a hang.
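
Roughly, the pairing looks like this (a minimal sketch; only the producer
side is in this patch, and the consumer side is an assumption that the
AE4DMA queue worker completes ae4cmd_q->cmp after it drains commands):

/* Producer side: pt_cmd_callback's submission retry loop. */
while (true) {
	if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
	    ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
		wake_up(&ae4cmd_q->q_w);	/* kick the queue worker */
		if (!wait_for_completion_timeout(&ae4cmd_q->cmp,
						 msecs_to_jiffies(AE4_TIME_OUT)))
			break;		/* bounded wait instead of an indefinite hang */
		reinit_completion(&ae4cmd_q->cmp);
		continue;		/* re-check occupancy before submitting */
	}
	/* ... submit and complete descriptors as before ... */
	break;
}

/*
 * Consumer side (assumed, in the AE4DMA queue worker): signal any
 * waiting producer once the hardware has consumed descriptors.
 */
complete(&ae4cmd_q->cmp);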

Thanks,
--
Basavaraj

Basavaraj Natikar Feb. 10, 2025, 2:37 p.m. UTC | #3
On 2/10/2025 7:56 PM, Basavaraj Natikar wrote:
>
>
> On 2/10/2025 7:48 PM, Vinod Koul wrote:
>> On 03-02-25, 21:55, Basavaraj Natikar wrote:
>>> Unlike PTDMA, which has only a single queue, AE4DMA offers multi-channel
>>> functionality. Use this multi-queue capability, via the AE4DMA
>>> workqueue-based mechanism, to achieve higher performance than PTDMA.
>>>
>>> Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")
>> Why is this a fix, again!
>
> Yes, AE4DMA is much faster with multi-queue. However, multi-queue
> processing can sometimes take longer, so this change adds a timeout to the
> synchronization between consumers and producers, which avoids long waits
> that could eventually end in a hang.
>
> Thanks,
> -- 
> Basavaraj
>
>>
>>> Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
>>> ---
>>>   drivers/dma/amd/ae4dma/ae4dma.h         |  2 +
>>>   drivers/dma/amd/ptdma/ptdma-dmaengine.c | 90 ++++++++++++++++++++++++-
>>>   2 files changed, 89 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
>>> index 265c5d436008..57f6048726bb 100644
>>> --- a/drivers/dma/amd/ae4dma/ae4dma.h
>>> +++ b/drivers/dma/amd/ae4dma/ae4dma.h
>>> @@ -37,6 +37,8 @@
>>>   #define AE4_DMA_VERSION            4
>>>   #define CMD_AE4_DESC_DW0_VAL        2
>>>   +#define AE4_TIME_OUT            5000
>>> +
>>>   struct ae4_msix {
>>>       int msix_count;
>>>       struct msix_entry msix_entry[MAX_AE4_HW_QUEUES];
>>> diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
>>> index 35c84ec9608b..715ac3ae067b 100644
>>> --- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
>>> +++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
>>> @@ -198,8 +198,10 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>>>   {
>>>       struct dma_async_tx_descriptor *tx_desc;
>>>       struct virt_dma_desc *vd;
>>> +    struct pt_device *pt;
>>>       unsigned long flags;
>>>   +    pt = chan->pt;
>>>       /* Loop over descriptors until one is found with commands */
>>>       do {
>>>           if (desc) {
>>> @@ -217,7 +219,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>>>             spin_lock_irqsave(&chan->vc.lock, flags);
>>>   -        if (desc) {
>>> +        if (pt->ver != AE4_DMA_VERSION && desc) {
>>>               if (desc->status != DMA_COMPLETE) {
>>>                   if (desc->status != DMA_ERROR)
>>>                       desc->status = DMA_COMPLETE;
>>> @@ -235,7 +237,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>>>
>>>             spin_unlock_irqrestore(&chan->vc.lock, flags);
>>>
>>> -        if (tx_desc) {
>>> +        if (pt->ver != AE4_DMA_VERSION && tx_desc) {
>> Why should this handling be different for DMA_VERSION?

PTDMA is single-queue and always raises an interrupt per command, whereas
with AE4DMA we can submit multiple commands at a time across multiple
queues, which is why AE4DMA is much faster with multi-queue. However,
multi-queue processing can sometimes take longer, so this change adds a
timeout to the synchronization between consumers and producers, which
avoids long waits that could eventually end in a hang.
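
As a side note on the queue-full test, ae4_core_queue_full() treats the
hardware write/read indices as a ring. A small sketch of the occupancy
arithmetic (the value 16 for MAX_CMD_QLEN is chosen only for illustration,
not claimed to be the real value):

/* Entries the hardware has not yet consumed, ring-buffer style. */
u32 occupancy = (MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN;

/*
 * Example with an illustrative MAX_CMD_QLEN of 16:
 *   front_wi = 3, rear_ri = 5  ->  (16 + 3 - 5) % 16 = 14
 * One slot is presumably kept unused so that front_wi == rear_ri can
 * only mean "empty", hence the queue is treated as full once the
 * occupancy reaches MAX_CMD_QLEN - 1.
 */
if (occupancy >= MAX_CMD_QLEN - 1)
	return true;	/* back off and let the consumer drain */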

>>
>>> dmaengine_desc_get_callback_invoke(tx_desc, NULL);
>>>               dma_run_dependencies(tx_desc);
>>>               vchan_vdesc_fini(vd);
>>> @@ -245,11 +247,25 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
>>>       return NULL;
>>>   }
>>>   +static inline bool ae4_core_queue_full(struct pt_cmd_queue *cmd_q)
>>> +{
>>> +    u32 front_wi = readl(cmd_q->reg_control + AE4_WR_IDX_OFF);
>>> +    u32 rear_ri = readl(cmd_q->reg_control + AE4_RD_IDX_OFF);
>>> +
>>> +    if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN) >= (MAX_CMD_QLEN - 1))
>>> +        return true;
>>> +
>>> +    return false;
>>> +}
>>> +
>>>   static void pt_cmd_callback(void *data, int err)
>>>   {
>>>       struct pt_dma_desc *desc = data;
>>> +    struct ae4_cmd_queue *ae4cmd_q;
>>>       struct dma_chan *dma_chan;
>>>       struct pt_dma_chan *chan;
>>> +    struct ae4_device *ae4;
>>> +    struct pt_device *pt;
>>>       int ret;
>>>         if (err == -EINPROGRESS)
>>> @@ -257,11 +273,32 @@ static void pt_cmd_callback(void *data, int err)
>>>         dma_chan = desc->vd.tx.chan;
>>>       chan = to_pt_chan(dma_chan);
>>> +    pt = chan->pt;
>>>         if (err)
>>>           desc->status = DMA_ERROR;
>>>         while (true) {
>>> +        if (pt->ver == AE4_DMA_VERSION) {
>>> +            ae4 = container_of(pt, struct ae4_device, pt);
>>> +            ae4cmd_q = &ae4->ae4cmd_q[chan->id];
>>> +
>>> +            if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
>>> +                ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
>>> +                wake_up(&ae4cmd_q->q_w);
>>> +
>>> +                if (wait_for_completion_timeout(&ae4cmd_q->cmp,
>>> +                                msecs_to_jiffies(AE4_TIME_OUT))
>>> +                                == 0) {
>>> +                    dev_err(pt->dev, "TIMEOUT %d:\n", ae4cmd_q->id);
>>> +                    break;
>>> +                }
>>> +
>>> +                reinit_completion(&ae4cmd_q->cmp);
>>> +                continue;
>>> +            }
>>> +        }
>>> +
>>>           /* Check for DMA descriptor completion */
>>>           desc = pt_handle_active_desc(chan, desc);
>>> @@ -296,6 +333,49 @@ static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
>>>       return desc;
>>>   }
>>>   +static void pt_cmd_callback_work(void *data, int err)
>>> +{
>>> +    struct dma_async_tx_descriptor *tx_desc;
>>> +    struct pt_dma_desc *desc = data;
>>> +    struct dma_chan *dma_chan;
>>> +    struct virt_dma_desc *vd;
>>> +    struct pt_dma_chan *chan;
>>> +    unsigned long flags;
>>> +
>>> +    dma_chan = desc->vd.tx.chan;
>>> +    chan = to_pt_chan(dma_chan);
>>> +
>>> +    if (err == -EINPROGRESS)
>>> +        return;
>>> +
>>> +    tx_desc = &desc->vd.tx;
>>> +    vd = &desc->vd;
>>> +
>>> +    if (err)
>>> +        desc->status = DMA_ERROR;
>>> +
>>> +    spin_lock_irqsave(&chan->vc.lock, flags);
>>> +    if (desc) {
>>> +        if (desc->status != DMA_COMPLETE) {
>>> +            if (desc->status != DMA_ERROR)
>>> +                desc->status = DMA_COMPLETE;
>>> +
>>> +            dma_cookie_complete(tx_desc);
>>> +            dma_descriptor_unmap(tx_desc);
>>> +        } else {
>>> +            tx_desc = NULL;
>>> +        }
>>> +    }
>>> +    spin_unlock_irqrestore(&chan->vc.lock, flags);
>>> +
>>> +    if (tx_desc) {
>>> +        dmaengine_desc_get_callback_invoke(tx_desc, NULL);
>>> +        dma_run_dependencies(tx_desc);
>>> +        list_del(&desc->vd.node);
>>> +        vchan_vdesc_fini(vd);
>>> +    }
>>> +}
>> Why do we have callback in driver...?

PTDMA already has a similar callback, pt_cmd_callback, so AE4DMA also has a
callback for the multi-queue command path: once a command has been
processed, it signals the upper layer that processing is done for that
queue.

Thanks,
--
Basavaraj
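
For context, a minimal sketch of the upper-layer (dmaengine client) view of
that signalling; the helper example_memcpy and the callback my_done_cb are
placeholder names, and error handling is trimmed. The client's callback is
what pt_cmd_callback_work ultimately invokes through
dmaengine_desc_get_callback_invoke() once the AE4DMA queue has processed
the command:

#include <linux/completion.h>
#include <linux/dmaengine.h>

static void my_done_cb(void *param)		/* placeholder client callback */
{
	complete(param);			/* e.g. wake the waiting client */
}

static int example_memcpy(struct dma_chan *chan, dma_addr_t dst,
			  dma_addr_t src, size_t len, struct completion *done)
{
	struct dma_async_tx_descriptor *tx;

	tx = dmaengine_prep_dma_memcpy(chan, dst, src, len, DMA_PREP_INTERRUPT);
	if (!tx)
		return -ENOMEM;

	/* Invoked by the driver when this descriptor completes. */
	tx->callback = my_done_cb;
	tx->callback_param = done;

	dmaengine_submit(tx);
	dma_async_issue_pending(chan);
	return 0;
}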


Patch

diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
index 265c5d436008..57f6048726bb 100644
--- a/drivers/dma/amd/ae4dma/ae4dma.h
+++ b/drivers/dma/amd/ae4dma/ae4dma.h
@@ -37,6 +37,8 @@ 
 #define AE4_DMA_VERSION			4
 #define CMD_AE4_DESC_DW0_VAL		2
 
+#define AE4_TIME_OUT			5000
+
 struct ae4_msix {
 	int msix_count;
 	struct msix_entry msix_entry[MAX_AE4_HW_QUEUES];
diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
index 35c84ec9608b..715ac3ae067b 100644
--- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
+++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
@@ -198,8 +198,10 @@  static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 {
 	struct dma_async_tx_descriptor *tx_desc;
 	struct virt_dma_desc *vd;
+	struct pt_device *pt;
 	unsigned long flags;
 
+	pt = chan->pt;
 	/* Loop over descriptors until one is found with commands */
 	do {
 		if (desc) {
@@ -217,7 +219,7 @@  static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
 		spin_lock_irqsave(&chan->vc.lock, flags);
 
-		if (desc) {
+		if (pt->ver != AE4_DMA_VERSION && desc) {
 			if (desc->status != DMA_COMPLETE) {
 				if (desc->status != DMA_ERROR)
 					desc->status = DMA_COMPLETE;
@@ -235,7 +237,7 @@  static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
 		spin_unlock_irqrestore(&chan->vc.lock, flags);
 
-		if (tx_desc) {
+		if (pt->ver != AE4_DMA_VERSION && tx_desc) {
 			dmaengine_desc_get_callback_invoke(tx_desc, NULL);
 			dma_run_dependencies(tx_desc);
 			vchan_vdesc_fini(vd);
@@ -245,11 +247,25 @@  static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 	return NULL;
 }
 
+static inline bool ae4_core_queue_full(struct pt_cmd_queue *cmd_q)
+{
+	u32 front_wi = readl(cmd_q->reg_control + AE4_WR_IDX_OFF);
+	u32 rear_ri = readl(cmd_q->reg_control + AE4_RD_IDX_OFF);
+
+	if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN)  >= (MAX_CMD_QLEN - 1))
+		return true;
+
+	return false;
+}
+
 static void pt_cmd_callback(void *data, int err)
 {
 	struct pt_dma_desc *desc = data;
+	struct ae4_cmd_queue *ae4cmd_q;
 	struct dma_chan *dma_chan;
 	struct pt_dma_chan *chan;
+	struct ae4_device *ae4;
+	struct pt_device *pt;
 	int ret;
 
 	if (err == -EINPROGRESS)
@@ -257,11 +273,32 @@  static void pt_cmd_callback(void *data, int err)
 
 	dma_chan = desc->vd.tx.chan;
 	chan = to_pt_chan(dma_chan);
+	pt = chan->pt;
 
 	if (err)
 		desc->status = DMA_ERROR;
 
 	while (true) {
+		if (pt->ver == AE4_DMA_VERSION) {
+			ae4 = container_of(pt, struct ae4_device, pt);
+			ae4cmd_q = &ae4->ae4cmd_q[chan->id];
+
+			if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
+			    ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
+				wake_up(&ae4cmd_q->q_w);
+
+				if (wait_for_completion_timeout(&ae4cmd_q->cmp,
+								msecs_to_jiffies(AE4_TIME_OUT))
+								== 0) {
+					dev_err(pt->dev, "TIMEOUT %d:\n", ae4cmd_q->id);
+					break;
+				}
+
+				reinit_completion(&ae4cmd_q->cmp);
+				continue;
+			}
+		}
+
 		/* Check for DMA descriptor completion */
 		desc = pt_handle_active_desc(chan, desc);
 
@@ -296,6 +333,49 @@  static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
 	return desc;
 }
 
+static void pt_cmd_callback_work(void *data, int err)
+{
+	struct dma_async_tx_descriptor *tx_desc;
+	struct pt_dma_desc *desc = data;
+	struct dma_chan *dma_chan;
+	struct virt_dma_desc *vd;
+	struct pt_dma_chan *chan;
+	unsigned long flags;
+
+	dma_chan = desc->vd.tx.chan;
+	chan = to_pt_chan(dma_chan);
+
+	if (err == -EINPROGRESS)
+		return;
+
+	tx_desc = &desc->vd.tx;
+	vd = &desc->vd;
+
+	if (err)
+		desc->status = DMA_ERROR;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	if (desc) {
+		if (desc->status != DMA_COMPLETE) {
+			if (desc->status != DMA_ERROR)
+				desc->status = DMA_COMPLETE;
+
+			dma_cookie_complete(tx_desc);
+			dma_descriptor_unmap(tx_desc);
+		} else {
+			tx_desc = NULL;
+		}
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+	if (tx_desc) {
+		dmaengine_desc_get_callback_invoke(tx_desc, NULL);
+		dma_run_dependencies(tx_desc);
+		list_del(&desc->vd.node);
+		vchan_vdesc_fini(vd);
+	}
+}
+
 static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
 					  dma_addr_t dst,
 					  dma_addr_t src,
@@ -327,6 +407,7 @@  static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
 	desc->len = len;
 
 	if (pt->ver == AE4_DMA_VERSION) {
+		pt_cmd->pt_cmd_callback = pt_cmd_callback_work;
 		ae4 = container_of(pt, struct ae4_device, pt);
 		ae4cmd_q = &ae4->ae4cmd_q[chan->id];
 		mutex_lock(&ae4cmd_q->cmd_lock);
@@ -367,13 +448,16 @@  static void pt_issue_pending(struct dma_chan *dma_chan)
 {
 	struct pt_dma_chan *chan = to_pt_chan(dma_chan);
 	struct pt_dma_desc *desc;
+	struct pt_device *pt;
 	unsigned long flags;
 	bool engine_is_idle = true;
 
+	pt = chan->pt;
+
 	spin_lock_irqsave(&chan->vc.lock, flags);
 
 	desc = pt_next_dma_desc(chan);
-	if (desc)
+	if (desc && pt->ver != AE4_DMA_VERSION)
 		engine_is_idle = false;
 
 	vchan_issue_pending(&chan->vc);