diff mbox series

[v3] dmaengine: tegra-apb: Support per-burst residue granularity

Message ID 20190627194728.8948-1-digetx@gmail.com (mailing list archive)
State Superseded
Headers show
Series [v3] dmaengine: tegra-apb: Support per-burst residue granularity | expand

Commit Message

Dmitry Osipenko June 27, 2019, 7:47 p.m. UTC
Tegra's APB DMA engine updates words counter after each transferred burst
of data, hence it can report transfer's residual with more fidelity which
may be required in cases like audio playback. In particular this fixes
audio stuttering during playback in a chromium web browser. The patch is
based on the original work that was made by Ben Dooks and a patch from
downstream kernel. It was tested on Tegra20 and Tegra30 devices.

Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---

Changelog:

v3:  Added workaround for a hardware design shortcoming that results
     in a words counter wraparound before end-of-transfer bit is set
     in a cyclic mode.

v2:  Addressed review comments made by Jon Hunter to v1. We won't try
     to get words count if dma_desc is on free list as it will result
     in a NULL dereference because this case wasn't handled properly.

     The residual value is now updated properly, avoiding potential
     integer overflow by adding the "bytes" to the "bytes_transferred"
     instead of the subtraction.

 drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 7 deletions(-)

Comments

Dmitry Osipenko July 2, 2019, 9:25 a.m. UTC | #1
27.06.2019 22:47, Dmitry Osipenko пишет:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromium web browser. The patch is
> based on the original work that was made by Ben Dooks and a patch from
> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
> 
> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
> 
> Changelog:
> 
> v3:  Added workaround for a hardware design shortcoming that results
>      in a words counter wraparound before end-of-transfer bit is set
>      in a cyclic mode.
> 
> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>      to get words count if dma_desc is on free list as it will result
>      in a NULL dereference because this case wasn't handled properly.
> 
>      The residual value is now updated properly, avoiding potential
>      integer overflow by adding the "bytes" to the "bytes_transferred"
>      instead of the subtraction.

Is there still any chance to get this into 5.3? Will be very nice! Jon / Vinod ?
Jon Hunter July 2, 2019, 11:20 a.m. UTC | #2
On 27/06/2019 20:47, Dmitry Osipenko wrote:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromium web browser. The patch is
> based on the original work that was made by Ben Dooks and a patch from
> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
> 
> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
> 
> Changelog:
> 
> v3:  Added workaround for a hardware design shortcoming that results
>      in a words counter wraparound before end-of-transfer bit is set
>      in a cyclic mode.
> 
> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>      to get words count if dma_desc is on free list as it will result
>      in a NULL dereference because this case wasn't handled properly.
> 
>      The residual value is now updated properly, avoiding potential
>      integer overflow by adding the "bytes" to the "bytes_transferred"
>      instead of the subtraction.
> 
>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>  1 file changed, 62 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 79e9593815f1..71473eda28ee 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>  	bool				last_sg;
>  	struct list_head		node;
>  	struct tegra_dma_desc		*dma_desc;
> +	unsigned int			words_xferred;
>  };
>  
>  /*
> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>  	nsg_req->configured = true;
> +	nsg_req->words_xferred = 0;
>  
>  	tegra_dma_resume(tdc);
>  }
> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>  					typeof(*sg_req), node);
>  	tegra_dma_start(tdc, sg_req);
>  	sg_req->configured = true;
> +	sg_req->words_xferred = 0;
>  	tdc->busy = true;
>  }
>  
> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>  	return 0;
>  }
>  
> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
> +					       struct tegra_dma_sg_req *sg_req)
> +{
> +	unsigned long status, wcount = 0;
> +
> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
> +		return 0;
> +
> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
> +
> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
> +
> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = status;
> +
> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
> +		return sg_req->req_len;
> +
> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
> +
> +	if (!wcount) {
> +		/*
> +		 * If wcount wasn't ever polled for this SG before, then
> +		 * simply assume that transfer hasn't started yet.
> +		 *
> +		 * Otherwise it's the end of the transfer.
> +		 *
> +		 * The alternative would be to poll the status register
> +		 * until EOC bit is set or wcount goes UP. That's so
> +		 * because EOC bit is getting set only after the last
> +		 * burst's completion and counter is less than the actual
> +		 * transfer size by 4 bytes. The counter value wraps around
> +		 * in a cyclic mode before EOC is set(!), so we can't easily
> +		 * distinguish start of transfer from its end.
> +		 */
> +		if (sg_req->words_xferred)
> +			wcount = sg_req->req_len - 4;
> +
> +	} else if (wcount < sg_req->words_xferred) {
> +		/*
> +		 * This case shall not ever happen because EOC bit
> +		 * must be set once next cyclic transfer is started.

I am not sure I follow this and why this condition cannot happen for
cyclic transfers. What about non-cyclic transfers?

Cheers
Jon
Dmitry Osipenko July 2, 2019, 11:37 a.m. UTC | #3
02.07.2019 14:20, Jon Hunter пишет:
> 
> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>> Tegra's APB DMA engine updates words counter after each transferred burst
>> of data, hence it can report transfer's residual with more fidelity which
>> may be required in cases like audio playback. In particular this fixes
>> audio stuttering during playback in a chromium web browser. The patch is
>> based on the original work that was made by Ben Dooks and a patch from
>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>
>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>
>> Changelog:
>>
>> v3:  Added workaround for a hardware design shortcoming that results
>>      in a words counter wraparound before end-of-transfer bit is set
>>      in a cyclic mode.
>>
>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>      to get words count if dma_desc is on free list as it will result
>>      in a NULL dereference because this case wasn't handled properly.
>>
>>      The residual value is now updated properly, avoiding potential
>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>      instead of the subtraction.
>>
>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 79e9593815f1..71473eda28ee 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>  	bool				last_sg;
>>  	struct list_head		node;
>>  	struct tegra_dma_desc		*dma_desc;
>> +	unsigned int			words_xferred;
>>  };
>>  
>>  /*
>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>  	nsg_req->configured = true;
>> +	nsg_req->words_xferred = 0;
>>  
>>  	tegra_dma_resume(tdc);
>>  }
>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>  					typeof(*sg_req), node);
>>  	tegra_dma_start(tdc, sg_req);
>>  	sg_req->configured = true;
>> +	sg_req->words_xferred = 0;
>>  	tdc->busy = true;
>>  }
>>  
>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>  	return 0;
>>  }
>>  
>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>> +					       struct tegra_dma_sg_req *sg_req)
>> +{
>> +	unsigned long status, wcount = 0;
>> +
>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>> +		return 0;
>> +
>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>> +
>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>> +
>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = status;
>> +
>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>> +		return sg_req->req_len;
>> +
>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>> +
>> +	if (!wcount) {
>> +		/*
>> +		 * If wcount wasn't ever polled for this SG before, then
>> +		 * simply assume that transfer hasn't started yet.
>> +		 *
>> +		 * Otherwise it's the end of the transfer.
>> +		 *
>> +		 * The alternative would be to poll the status register
>> +		 * until EOC bit is set or wcount goes UP. That's so
>> +		 * because EOC bit is getting set only after the last
>> +		 * burst's completion and counter is less than the actual
>> +		 * transfer size by 4 bytes. The counter value wraps around
>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>> +		 * distinguish start of transfer from its end.
>> +		 */
>> +		if (sg_req->words_xferred)
>> +			wcount = sg_req->req_len - 4;
>> +
>> +	} else if (wcount < sg_req->words_xferred) {
>> +		/*
>> +		 * This case shall not ever happen because EOC bit
>> +		 * must be set once next cyclic transfer is started.
> 
> I am not sure I follow this and why this condition cannot happen for
> cyclic transfers. What about non-cyclic transfers?

It cannot happen because the EOC bit will be set in that case. The counter wraps
around when the transfer of a last burst happens, EOC bit is guaranteed to be set
after completion of the last burst. That's my observation after a thorough testing,
it will be very odd if EOC setting happened completely asynchronously.

For a non-cyclic transfers it doesn't matter.. because they are not cyclic and thus
counter will be stopped by itself. It will be a disaster if all of sudden a
non-cyclic transfer becomes cyclic, don't you think so? :)
Dmitry Osipenko July 2, 2019, 11:56 a.m. UTC | #4
02.07.2019 14:37, Dmitry Osipenko пишет:
> 02.07.2019 14:20, Jon Hunter пишет:
>>
>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>> of data, hence it can report transfer's residual with more fidelity which
>>> may be required in cases like audio playback. In particular this fixes
>>> audio stuttering during playback in a chromium web browser. The patch is
>>> based on the original work that was made by Ben Dooks and a patch from
>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>
>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>
>>> Changelog:
>>>
>>> v3:  Added workaround for a hardware design shortcoming that results
>>>      in a words counter wraparound before end-of-transfer bit is set
>>>      in a cyclic mode.
>>>
>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>      to get words count if dma_desc is on free list as it will result
>>>      in a NULL dereference because this case wasn't handled properly.
>>>
>>>      The residual value is now updated properly, avoiding potential
>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>      instead of the subtraction.
>>>
>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>> index 79e9593815f1..71473eda28ee 100644
>>> --- a/drivers/dma/tegra20-apb-dma.c
>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>  	bool				last_sg;
>>>  	struct list_head		node;
>>>  	struct tegra_dma_desc		*dma_desc;
>>> +	unsigned int			words_xferred;
>>>  };
>>>  
>>>  /*
>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>  	nsg_req->configured = true;
>>> +	nsg_req->words_xferred = 0;
>>>  
>>>  	tegra_dma_resume(tdc);
>>>  }
>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>  					typeof(*sg_req), node);
>>>  	tegra_dma_start(tdc, sg_req);
>>>  	sg_req->configured = true;
>>> +	sg_req->words_xferred = 0;
>>>  	tdc->busy = true;
>>>  }
>>>  
>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>  	return 0;
>>>  }
>>>  
>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>> +					       struct tegra_dma_sg_req *sg_req)
>>> +{
>>> +	unsigned long status, wcount = 0;
>>> +
>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>> +		return 0;
>>> +
>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>> +
>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>> +
>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = status;
>>> +
>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>> +		return sg_req->req_len;
>>> +
>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>> +
>>> +	if (!wcount) {
>>> +		/*
>>> +		 * If wcount wasn't ever polled for this SG before, then
>>> +		 * simply assume that transfer hasn't started yet.
>>> +		 *
>>> +		 * Otherwise it's the end of the transfer.
>>> +		 *
>>> +		 * The alternative would be to poll the status register
>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>> +		 * because EOC bit is getting set only after the last
>>> +		 * burst's completion and counter is less than the actual
>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>> +		 * distinguish start of transfer from its end.
>>> +		 */
>>> +		if (sg_req->words_xferred)
>>> +			wcount = sg_req->req_len - 4;
>>> +
>>> +	} else if (wcount < sg_req->words_xferred) {
>>> +		/*
>>> +		 * This case shall not ever happen because EOC bit
>>> +		 * must be set once next cyclic transfer is started.
>>
>> I am not sure I follow this and why this condition cannot happen for
>> cyclic transfers. What about non-cyclic transfers?
> 
> It cannot happen because the EOC bit will be set in that case. The counter wraps
> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
> after completion of the last burst. That's my observation after a thorough testing,
> it will be very odd if EOC setting happened completely asynchronously.
> 
> For a non-cyclic transfers it doesn't matter.. because they are not cyclic and thus
> counter will be stopped by itself. It will be a disaster if all of sudden a
> non-cyclic transfer becomes cyclic, don't you think so? :)
> 

Ah, probably I was too focused on audio playback use-case. If it's a free-running
transfer, then that case of a wraparound seems should be legit.. hmm.
Dmitry Osipenko July 2, 2019, 12:04 p.m. UTC | #5
02.07.2019 14:56, Dmitry Osipenko пишет:
> 02.07.2019 14:37, Dmitry Osipenko пишет:
>> 02.07.2019 14:20, Jon Hunter пишет:
>>>
>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>> based on the original work that was made by Ben Dooks and a patch from
>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>
>>>> Changelog:
>>>>
>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>      in a cyclic mode.
>>>>
>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>      to get words count if dma_desc is on free list as it will result
>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>
>>>>      The residual value is now updated properly, avoiding potential
>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>      instead of the subtraction.
>>>>
>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..71473eda28ee 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>  	bool				last_sg;
>>>>  	struct list_head		node;
>>>>  	struct tegra_dma_desc		*dma_desc;
>>>> +	unsigned int			words_xferred;
>>>>  };
>>>>  
>>>>  /*
>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>  	nsg_req->configured = true;
>>>> +	nsg_req->words_xferred = 0;
>>>>  
>>>>  	tegra_dma_resume(tdc);
>>>>  }
>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>  					typeof(*sg_req), node);
>>>>  	tegra_dma_start(tdc, sg_req);
>>>>  	sg_req->configured = true;
>>>> +	sg_req->words_xferred = 0;
>>>>  	tdc->busy = true;
>>>>  }
>>>>  
>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>> +{
>>>> +	unsigned long status, wcount = 0;
>>>> +
>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +		return 0;
>>>> +
>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = status;
>>>> +
>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +		return sg_req->req_len;
>>>> +
>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>> +
>>>> +	if (!wcount) {
>>>> +		/*
>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>> +		 * simply assume that transfer hasn't started yet.
>>>> +		 *
>>>> +		 * Otherwise it's the end of the transfer.
>>>> +		 *
>>>> +		 * The alternative would be to poll the status register
>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>> +		 * because EOC bit is getting set only after the last
>>>> +		 * burst's completion and counter is less than the actual
>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>> +		 * distinguish start of transfer from its end.
>>>> +		 */
>>>> +		if (sg_req->words_xferred)
>>>> +			wcount = sg_req->req_len - 4;
>>>> +
>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>> +		/*
>>>> +		 * This case shall not ever happen because EOC bit
>>>> +		 * must be set once next cyclic transfer is started.
>>>
>>> I am not sure I follow this and why this condition cannot happen for
>>> cyclic transfers. What about non-cyclic transfers?
>>
>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>> after completion of the last burst. That's my observation after a thorough testing,
>> it will be very odd if EOC setting happened completely asynchronously.
>>
>> For a non-cyclic transfers it doesn't matter.. because they are not cyclic and thus
>> counter will be stopped by itself. It will be a disaster if all of sudden a
>> non-cyclic transfer becomes cyclic, don't you think so? :)
>>
> 
> Ah, probably I was too focused on audio playback use-case. If it's a free-running
> transfer, then that case of a wraparound seems should be legit.. hmm.
> 

Looks like that will work:

diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 71473eda28ee..201c693b11f5 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -648,6 +648,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
 		st = handle_continuous_head_request(tdc, sgreq, to_terminate);
 		if (!st)
 			dma_desc->dma_status = DMA_ERROR;
+	} else {
+		sg_req->words_xferred = 0;
 	}
 }
Dmitry Osipenko July 2, 2019, 12:27 p.m. UTC | #6
02.07.2019 15:04, Dmitry Osipenko пишет:
> 02.07.2019 14:56, Dmitry Osipenko пишет:
>> 02.07.2019 14:37, Dmitry Osipenko пишет:
>>> 02.07.2019 14:20, Jon Hunter пишет:
>>>>
>>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>
>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>
>>>>> Changelog:
>>>>>
>>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>>      in a cyclic mode.
>>>>>
>>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>      to get words count if dma_desc is on free list as it will result
>>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>>
>>>>>      The residual value is now updated properly, avoiding potential
>>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>      instead of the subtraction.
>>>>>
>>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..71473eda28ee 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>  	bool				last_sg;
>>>>>  	struct list_head		node;
>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>> +	unsigned int			words_xferred;
>>>>>  };
>>>>>  
>>>>>  /*
>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>  	nsg_req->configured = true;
>>>>> +	nsg_req->words_xferred = 0;
>>>>>  
>>>>>  	tegra_dma_resume(tdc);
>>>>>  }
>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>  					typeof(*sg_req), node);
>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>  	sg_req->configured = true;
>>>>> +	sg_req->words_xferred = 0;
>>>>>  	tdc->busy = true;
>>>>>  }
>>>>>  
>>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>> +{
>>>>> +	unsigned long status, wcount = 0;
>>>>> +
>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +		return 0;
>>>>> +
>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = status;
>>>>> +
>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +		return sg_req->req_len;
>>>>> +
>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +
>>>>> +	if (!wcount) {
>>>>> +		/*
>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>> +		 *
>>>>> +		 * Otherwise it's the end of the transfer.
>>>>> +		 *
>>>>> +		 * The alternative would be to poll the status register
>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>> +		 * because EOC bit is getting set only after the last
>>>>> +		 * burst's completion and counter is less than the actual
>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>> +		 * distinguish start of transfer from its end.
>>>>> +		 */
>>>>> +		if (sg_req->words_xferred)
>>>>> +			wcount = sg_req->req_len - 4;
>>>>> +
>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>> +		/*
>>>>> +		 * This case shall not ever happen because EOC bit
>>>>> +		 * must be set once next cyclic transfer is started.
>>>>
>>>> I am not sure I follow this and why this condition cannot happen for
>>>> cyclic transfers. What about non-cyclic transfers?
>>>
>>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>>> after completion of the last burst. That's my observation after a thorough testing,
>>> it will be very odd if EOC setting happened completely asynchronously.
>>>
>>> For a non-cyclic transfers it doesn't matter.. because they are not cyclic and thus
>>> counter will be stopped by itself. It will be a disaster if all of sudden a
>>> non-cyclic transfer becomes cyclic, don't you think so? :)
>>>
>>
>> Ah, probably I was too focused on audio playback use-case. If it's a free-running
>> transfer, then that case of a wraparound seems should be legit.. hmm.
>>
> 
> Looks like that will work:
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 71473eda28ee..201c693b11f5 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -648,6 +648,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>  		st = handle_continuous_head_request(tdc, sgreq, to_terminate);
>  		if (!st)
>  			dma_desc->dma_status = DMA_ERROR;
> +	} else {
> +		sg_req->words_xferred = 0;
>  	}
>  }
> 

BTW, I don't understand the logic of having multiple SGs for a cyclic transfer. It looks
like driver is doing insane thing by trying to substitute an in-fly SG with another one..
Jon Hunter July 2, 2019, 12:54 p.m. UTC | #7
On 02/07/2019 12:37, Dmitry Osipenko wrote:
> 02.07.2019 14:20, Jon Hunter пишет:
>>
>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>> of data, hence it can report transfer's residual with more fidelity which
>>> may be required in cases like audio playback. In particular this fixes
>>> audio stuttering during playback in a chromium web browser. The patch is
>>> based on the original work that was made by Ben Dooks and a patch from
>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>
>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>
>>> Changelog:
>>>
>>> v3:  Added workaround for a hardware design shortcoming that results
>>>      in a words counter wraparound before end-of-transfer bit is set
>>>      in a cyclic mode.
>>>
>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>      to get words count if dma_desc is on free list as it will result
>>>      in a NULL dereference because this case wasn't handled properly.
>>>
>>>      The residual value is now updated properly, avoiding potential
>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>      instead of the subtraction.
>>>
>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>> index 79e9593815f1..71473eda28ee 100644
>>> --- a/drivers/dma/tegra20-apb-dma.c
>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>  	bool				last_sg;
>>>  	struct list_head		node;
>>>  	struct tegra_dma_desc		*dma_desc;
>>> +	unsigned int			words_xferred;
>>>  };
>>>  
>>>  /*
>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>  	nsg_req->configured = true;
>>> +	nsg_req->words_xferred = 0;
>>>  
>>>  	tegra_dma_resume(tdc);
>>>  }
>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>  					typeof(*sg_req), node);
>>>  	tegra_dma_start(tdc, sg_req);
>>>  	sg_req->configured = true;
>>> +	sg_req->words_xferred = 0;
>>>  	tdc->busy = true;
>>>  }
>>>  
>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>  	return 0;
>>>  }
>>>  
>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>> +					       struct tegra_dma_sg_req *sg_req)
>>> +{
>>> +	unsigned long status, wcount = 0;
>>> +
>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>> +		return 0;
>>> +
>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>> +
>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>> +
>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>> +		wcount = status;
>>> +
>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>> +		return sg_req->req_len;
>>> +
>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>> +
>>> +	if (!wcount) {
>>> +		/*
>>> +		 * If wcount wasn't ever polled for this SG before, then
>>> +		 * simply assume that transfer hasn't started yet.
>>> +		 *
>>> +		 * Otherwise it's the end of the transfer.
>>> +		 *
>>> +		 * The alternative would be to poll the status register
>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>> +		 * because EOC bit is getting set only after the last
>>> +		 * burst's completion and counter is less than the actual
>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>> +		 * distinguish start of transfer from its end.
>>> +		 */
>>> +		if (sg_req->words_xferred)
>>> +			wcount = sg_req->req_len - 4;
>>> +
>>> +	} else if (wcount < sg_req->words_xferred) {
>>> +		/*
>>> +		 * This case shall not ever happen because EOC bit
>>> +		 * must be set once next cyclic transfer is started.
>>
>> I am not sure I follow this and why this condition cannot happen for
>> cyclic transfers. What about non-cyclic transfers?
> 
> It cannot happen because the EOC bit will be set in that case. The counter wraps
> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
> after completion of the last burst. That's my observation after a thorough testing,
> it will be very odd if EOC setting happened completely asynchronously.

I see how you know that the EOC is set. Anyway, you check if the EOC is
set before and if so return sg_req->req_len prior to this test.

Maybe I am missing something, but what happens if we are mid block when
dmaengine_tx_status() is called? That happen asynchronously right?

Jon
Jon Hunter July 2, 2019, 12:56 p.m. UTC | #8
On 02/07/2019 13:27, Dmitry Osipenko wrote:
> 02.07.2019 15:04, Dmitry Osipenko пишет:
>> 02.07.2019 14:56, Dmitry Osipenko пишет:
>>> 02.07.2019 14:37, Dmitry Osipenko пишет:
>>>> 02.07.2019 14:20, Jon Hunter пишет:
>>>>>
>>>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>
>>>>>> Changelog:
>>>>>>
>>>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>>>      in a cyclic mode.
>>>>>>
>>>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>>      to get words count if dma_desc is on free list as it will result
>>>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>>>
>>>>>>      The residual value is now updated properly, avoiding potential
>>>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>>      instead of the subtraction.
>>>>>>
>>>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..71473eda28ee 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>>  	bool				last_sg;
>>>>>>  	struct list_head		node;
>>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>>> +	unsigned int			words_xferred;
>>>>>>  };
>>>>>>  
>>>>>>  /*
>>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>>  	nsg_req->configured = true;
>>>>>> +	nsg_req->words_xferred = 0;
>>>>>>  
>>>>>>  	tegra_dma_resume(tdc);
>>>>>>  }
>>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>>  					typeof(*sg_req), node);
>>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>>  	sg_req->configured = true;
>>>>>> +	sg_req->words_xferred = 0;
>>>>>>  	tdc->busy = true;
>>>>>>  }
>>>>>>  
>>>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>  	return 0;
>>>>>>  }
>>>>>>  
>>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>>> +{
>>>>>> +	unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +		return 0;
>>>>>> +
>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = status;
>>>>>> +
>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +		return sg_req->req_len;
>>>>>> +
>>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +
>>>>>> +	if (!wcount) {
>>>>>> +		/*
>>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>>> +		 *
>>>>>> +		 * Otherwise it's the end of the transfer.
>>>>>> +		 *
>>>>>> +		 * The alternative would be to poll the status register
>>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>>> +		 * because EOC bit is getting set only after the last
>>>>>> +		 * burst's completion and counter is less than the actual
>>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>>> +		 * distinguish start of transfer from its end.
>>>>>> +		 */
>>>>>> +		if (sg_req->words_xferred)
>>>>>> +			wcount = sg_req->req_len - 4;
>>>>>> +
>>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>>> +		/*
>>>>>> +		 * This case shall not ever happen because EOC bit
>>>>>> +		 * must be set once next cyclic transfer is started.
>>>>>
>>>>> I am not sure I follow this and why this condition cannot happen for
>>>>> cyclic transfers. What about non-cyclic transfers?
>>>>
>>>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>>>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>>>> after completion of the last burst. That's my observation after a thorough testing,
>>>> it will be very odd if EOC setting happened completely asynchronously.
>>>>
>>>> For a non-cyclic transfers it doesn't matter.. because they are not cyclic and thus
>>>> counter will be stopped by itself. It will be a disaster if all of sudden a
>>>> non-cyclic transfer becomes cyclic, don't you think so? :)
>>>>
>>>
>>> Ah, probably I was too focused on audio playback use-case. If it's a free-running
>>> transfer, then that case of a wraparound seems should be legit.. hmm.
>>>
>>
>> Looks like that will work:
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 71473eda28ee..201c693b11f5 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -648,6 +648,8 @@ static void handle_cont_sngl_cycle_dma_done(struct tegra_dma_channel *tdc,
>>  		st = handle_continuous_head_request(tdc, sgreq, to_terminate);
>>  		if (!st)
>>  			dma_desc->dma_status = DMA_ERROR;
>> +	} else {
>> +		sg_req->words_xferred = 0;
>>  	}
>>  }
>>
> 
> BTW, I don't understand the logic of having multiple SGs for a cyclic transfer. It looks
> like driver is doing insane thing by trying to substitute an in-fly SG with another one..

I may have mentioned before that I don't like how this driver was
implemented. Ideally it should use the virt-dma helpers and avoid all
this nonsense. However, best to leave sleeping dogs lie.

Jon
Jon Hunter July 2, 2019, 12:57 p.m. UTC | #9
On 02/07/2019 13:54, Jon Hunter wrote:
> 
> On 02/07/2019 12:37, Dmitry Osipenko wrote:
>> 02.07.2019 14:20, Jon Hunter пишет:
>>>
>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>> based on the original work that was made by Ben Dooks and a patch from
>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>
>>>> Changelog:
>>>>
>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>      in a cyclic mode.
>>>>
>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>      to get words count if dma_desc is on free list as it will result
>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>
>>>>      The residual value is now updated properly, avoiding potential
>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>      instead of the subtraction.
>>>>
>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..71473eda28ee 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>  	bool				last_sg;
>>>>  	struct list_head		node;
>>>>  	struct tegra_dma_desc		*dma_desc;
>>>> +	unsigned int			words_xferred;
>>>>  };
>>>>  
>>>>  /*
>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>  	nsg_req->configured = true;
>>>> +	nsg_req->words_xferred = 0;
>>>>  
>>>>  	tegra_dma_resume(tdc);
>>>>  }
>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>  					typeof(*sg_req), node);
>>>>  	tegra_dma_start(tdc, sg_req);
>>>>  	sg_req->configured = true;
>>>> +	sg_req->words_xferred = 0;
>>>>  	tdc->busy = true;
>>>>  }
>>>>  
>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>> +{
>>>> +	unsigned long status, wcount = 0;
>>>> +
>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +		return 0;
>>>> +
>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = status;
>>>> +
>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +		return sg_req->req_len;
>>>> +
>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>> +
>>>> +	if (!wcount) {
>>>> +		/*
>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>> +		 * simply assume that transfer hasn't started yet.
>>>> +		 *
>>>> +		 * Otherwise it's the end of the transfer.
>>>> +		 *
>>>> +		 * The alternative would be to poll the status register
>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>> +		 * because EOC bit is getting set only after the last
>>>> +		 * burst's completion and counter is less than the actual
>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>> +		 * distinguish start of transfer from its end.
>>>> +		 */
>>>> +		if (sg_req->words_xferred)
>>>> +			wcount = sg_req->req_len - 4;
>>>> +
>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>> +		/*
>>>> +		 * This case shall not ever happen because EOC bit
>>>> +		 * must be set once next cyclic transfer is started.
>>>
>>> I am not sure I follow this and why this condition cannot happen for
>>> cyclic transfers. What about non-cyclic transfers?
>>
>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>> after completion of the last burst. That's my observation after a thorough testing,
>> it will be very odd if EOC setting happened completely asynchronously.
> 
> I see how you know that the EOC is set. Anyway, you check if the EOC is
> set before and if so return sg_req->req_len prior to this test.

s/I see/I don't see/

Jon
Dmitry Osipenko July 2, 2019, 1:22 p.m. UTC | #10
02.07.2019 15:54, Jon Hunter пишет:
> 
> On 02/07/2019 12:37, Dmitry Osipenko wrote:
>> 02.07.2019 14:20, Jon Hunter пишет:
>>>
>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>> of data, hence it can report transfer's residual with more fidelity which
>>>> may be required in cases like audio playback. In particular this fixes
>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>> based on the original work that was made by Ben Dooks and a patch from
>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>> ---
>>>>
>>>> Changelog:
>>>>
>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>      in a cyclic mode.
>>>>
>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>      to get words count if dma_desc is on free list as it will result
>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>
>>>>      The residual value is now updated properly, avoiding potential
>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>      instead of the subtraction.
>>>>
>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>> index 79e9593815f1..71473eda28ee 100644
>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>  	bool				last_sg;
>>>>  	struct list_head		node;
>>>>  	struct tegra_dma_desc		*dma_desc;
>>>> +	unsigned int			words_xferred;
>>>>  };
>>>>  
>>>>  /*
>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>  	nsg_req->configured = true;
>>>> +	nsg_req->words_xferred = 0;
>>>>  
>>>>  	tegra_dma_resume(tdc);
>>>>  }
>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>  					typeof(*sg_req), node);
>>>>  	tegra_dma_start(tdc, sg_req);
>>>>  	sg_req->configured = true;
>>>> +	sg_req->words_xferred = 0;
>>>>  	tdc->busy = true;
>>>>  }
>>>>  
>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>> +{
>>>> +	unsigned long status, wcount = 0;
>>>> +
>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>> +		return 0;
>>>> +
>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>> +
>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>> +
>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>> +		wcount = status;
>>>> +
>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>> +		return sg_req->req_len;
>>>> +
>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>> +
>>>> +	if (!wcount) {
>>>> +		/*
>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>> +		 * simply assume that transfer hasn't started yet.
>>>> +		 *
>>>> +		 * Otherwise it's the end of the transfer.
>>>> +		 *
>>>> +		 * The alternative would be to poll the status register
>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>> +		 * because EOC bit is getting set only after the last
>>>> +		 * burst's completion and counter is less than the actual
>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>> +		 * distinguish start of transfer from its end.
>>>> +		 */
>>>> +		if (sg_req->words_xferred)
>>>> +			wcount = sg_req->req_len - 4;
>>>> +
>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>> +		/*
>>>> +		 * This case shall not ever happen because EOC bit
>>>> +		 * must be set once next cyclic transfer is started.
>>>
>>> I am not sure I follow this and why this condition cannot happen for
>>> cyclic transfers. What about non-cyclic transfers?
>>
>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>> after completion of the last burst. That's my observation after a thorough testing,
>> it will be very odd if EOC setting happened completely asynchronously.
> 
> I see how you know that the EOC is set. Anyway, you check if the EOC is
> set before and if so return sg_req->req_len prior to this test.
> 
> Maybe I am missing something, but what happens if we are mid block when
> dmaengine_tx_status() is called? That happen asynchronously right?


Do you mean asynchronously in regards to the ISR? Or something else?

tegra_dma_tx_status() takes the channels spinlock, hence IRQ handling can't happen
simultaneously.
Jon Hunter July 2, 2019, 1:41 p.m. UTC | #11
On 02/07/2019 14:22, Dmitry Osipenko wrote:
> 02.07.2019 15:54, Jon Hunter пишет:
>>
>> On 02/07/2019 12:37, Dmitry Osipenko wrote:
>>> 02.07.2019 14:20, Jon Hunter пишет:
>>>>
>>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>> may be required in cases like audio playback. In particular this fixes
>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>
>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>> ---
>>>>>
>>>>> Changelog:
>>>>>
>>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>>      in a cyclic mode.
>>>>>
>>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>      to get words count if dma_desc is on free list as it will result
>>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>>
>>>>>      The residual value is now updated properly, avoiding potential
>>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>      instead of the subtraction.
>>>>>
>>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>> index 79e9593815f1..71473eda28ee 100644
>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>  	bool				last_sg;
>>>>>  	struct list_head		node;
>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>> +	unsigned int			words_xferred;
>>>>>  };
>>>>>  
>>>>>  /*
>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>  	nsg_req->configured = true;
>>>>> +	nsg_req->words_xferred = 0;
>>>>>  
>>>>>  	tegra_dma_resume(tdc);
>>>>>  }
>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>  					typeof(*sg_req), node);
>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>  	sg_req->configured = true;
>>>>> +	sg_req->words_xferred = 0;
>>>>>  	tdc->busy = true;
>>>>>  }
>>>>>  
>>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>> +{
>>>>> +	unsigned long status, wcount = 0;
>>>>> +
>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>> +		return 0;
>>>>> +
>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>> +
>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>> +
>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>> +		wcount = status;
>>>>> +
>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>> +		return sg_req->req_len;
>>>>> +
>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>> +
>>>>> +	if (!wcount) {
>>>>> +		/*
>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>> +		 *
>>>>> +		 * Otherwise it's the end of the transfer.
>>>>> +		 *
>>>>> +		 * The alternative would be to poll the status register
>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>> +		 * because EOC bit is getting set only after the last
>>>>> +		 * burst's completion and counter is less than the actual
>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>> +		 * distinguish start of transfer from its end.
>>>>> +		 */
>>>>> +		if (sg_req->words_xferred)
>>>>> +			wcount = sg_req->req_len - 4;
>>>>> +
>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>> +		/*
>>>>> +		 * This case shall not ever happen because EOC bit
>>>>> +		 * must be set once next cyclic transfer is started.
>>>>
>>>> I am not sure I follow this and why this condition cannot happen for
>>>> cyclic transfers. What about non-cyclic transfers?
>>>
>>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>>> after completion of the last burst. That's my observation after a thorough testing,
>>> it will be very odd if EOC setting happened completely asynchronously.
>>
>> I see how you know that the EOC is set. Anyway, you check if the EOC is
>> set before and if so return sg_req->req_len prior to this test.
>>
>> Maybe I am missing something, but what happens if we are mid block when
>> dmaengine_tx_status() is called? That happen asynchronously right?
> 
> 
> Do you mean asynchronously in regards to the ISR? Or something else?

In the sense that the client can call dmaengine_tx_status() at anytime
to check the status of a transfer.

Cheers
Jon
Dmitry Osipenko July 2, 2019, 2:41 p.m. UTC | #12
02.07.2019 16:41, Jon Hunter пишет:
> 
> On 02/07/2019 14:22, Dmitry Osipenko wrote:
>> 02.07.2019 15:54, Jon Hunter пишет:
>>>
>>> On 02/07/2019 12:37, Dmitry Osipenko wrote:
>>>> 02.07.2019 14:20, Jon Hunter пишет:
>>>>>
>>>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>>
>>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>> ---
>>>>>>
>>>>>> Changelog:
>>>>>>
>>>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>>>      in a cyclic mode.
>>>>>>
>>>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>>      to get words count if dma_desc is on free list as it will result
>>>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>>>
>>>>>>      The residual value is now updated properly, avoiding potential
>>>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>>      instead of the subtraction.
>>>>>>
>>>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>> index 79e9593815f1..71473eda28ee 100644
>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>>  	bool				last_sg;
>>>>>>  	struct list_head		node;
>>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>>> +	unsigned int			words_xferred;
>>>>>>  };
>>>>>>  
>>>>>>  /*
>>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>>  	nsg_req->configured = true;
>>>>>> +	nsg_req->words_xferred = 0;
>>>>>>  
>>>>>>  	tegra_dma_resume(tdc);
>>>>>>  }
>>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>>  					typeof(*sg_req), node);
>>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>>  	sg_req->configured = true;
>>>>>> +	sg_req->words_xferred = 0;
>>>>>>  	tdc->busy = true;
>>>>>>  }
>>>>>>  
>>>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>  	return 0;
>>>>>>  }
>>>>>>  
>>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>>> +{
>>>>>> +	unsigned long status, wcount = 0;
>>>>>> +
>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>> +		return 0;
>>>>>> +
>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>> +
>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>> +
>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>> +		wcount = status;
>>>>>> +
>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>> +		return sg_req->req_len;
>>>>>> +
>>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>>> +
>>>>>> +	if (!wcount) {
>>>>>> +		/*
>>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>>> +		 *
>>>>>> +		 * Otherwise it's the end of the transfer.
>>>>>> +		 *
>>>>>> +		 * The alternative would be to poll the status register
>>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>>> +		 * because EOC bit is getting set only after the last
>>>>>> +		 * burst's completion and counter is less than the actual
>>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>>> +		 * distinguish start of transfer from its end.
>>>>>> +		 */
>>>>>> +		if (sg_req->words_xferred)
>>>>>> +			wcount = sg_req->req_len - 4;
>>>>>> +
>>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>>> +		/*
>>>>>> +		 * This case shall not ever happen because EOC bit
>>>>>> +		 * must be set once next cyclic transfer is started.
>>>>>
>>>>> I am not sure I follow this and why this condition cannot happen for
>>>>> cyclic transfers. What about non-cyclic transfers?
>>>>
>>>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>>>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>>>> after completion of the last burst. That's my observation after a thorough testing,
>>>> it will be very odd if EOC setting happened completely asynchronously.
>>>
>>> I see how you know that the EOC is set. Anyway, you check if the EOC is
>>> set before and if so return sg_req->req_len prior to this test.
>>>
>>> Maybe I am missing something, but what happens if we are mid block when
>>> dmaengine_tx_status() is called? That happen asynchronously right?
>>
>>
>> Do you mean asynchronously in regards to the ISR? Or something else?
> 
> In the sense that the client can call dmaengine_tx_status() at anytime
> to check the status of a transfer.

Should be alright, I think this patch covers all of possible cases:

1) Start of transfer, when wcount=0.
2) Middle of transfer, when wcount!=0.
3) End of transfer, when wcount=0.
4) End of transfer, when wcount!=0 and EOC is set.
Jon Hunter July 2, 2019, 3:27 p.m. UTC | #13
On 02/07/2019 15:41, Dmitry Osipenko wrote:
> 02.07.2019 16:41, Jon Hunter пишет:
>>
>> On 02/07/2019 14:22, Dmitry Osipenko wrote:
>>> 02.07.2019 15:54, Jon Hunter пишет:
>>>>
>>>> On 02/07/2019 12:37, Dmitry Osipenko wrote:
>>>>> 02.07.2019 14:20, Jon Hunter пишет:
>>>>>>
>>>>>> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>>>>>>> Tegra's APB DMA engine updates words counter after each transferred burst
>>>>>>> of data, hence it can report transfer's residual with more fidelity which
>>>>>>> may be required in cases like audio playback. In particular this fixes
>>>>>>> audio stuttering during playback in a chromium web browser. The patch is
>>>>>>> based on the original work that was made by Ben Dooks and a patch from
>>>>>>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>>>>>>
>>>>>>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>>>>>>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>>>>>>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>>>>>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>>>>>> ---
>>>>>>>
>>>>>>> Changelog:
>>>>>>>
>>>>>>> v3:  Added workaround for a hardware design shortcoming that results
>>>>>>>      in a words counter wraparound before end-of-transfer bit is set
>>>>>>>      in a cyclic mode.
>>>>>>>
>>>>>>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>>>>>>      to get words count if dma_desc is on free list as it will result
>>>>>>>      in a NULL dereference because this case wasn't handled properly.
>>>>>>>
>>>>>>>      The residual value is now updated properly, avoiding potential
>>>>>>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>>>>>>      instead of the subtraction.
>>>>>>>
>>>>>>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>>>>>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>>>>>>> index 79e9593815f1..71473eda28ee 100644
>>>>>>> --- a/drivers/dma/tegra20-apb-dma.c
>>>>>>> +++ b/drivers/dma/tegra20-apb-dma.c
>>>>>>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>>>>>>  	bool				last_sg;
>>>>>>>  	struct list_head		node;
>>>>>>>  	struct tegra_dma_desc		*dma_desc;
>>>>>>> +	unsigned int			words_xferred;
>>>>>>>  };
>>>>>>>  
>>>>>>>  /*
>>>>>>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>>>>>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>>>>>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>>>>>>  	nsg_req->configured = true;
>>>>>>> +	nsg_req->words_xferred = 0;
>>>>>>>  
>>>>>>>  	tegra_dma_resume(tdc);
>>>>>>>  }
>>>>>>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>>>>>>  					typeof(*sg_req), node);
>>>>>>>  	tegra_dma_start(tdc, sg_req);
>>>>>>>  	sg_req->configured = true;
>>>>>>> +	sg_req->words_xferred = 0;
>>>>>>>  	tdc->busy = true;
>>>>>>>  }
>>>>>>>  
>>>>>>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>>>>>>  	return 0;
>>>>>>>  }
>>>>>>>  
>>>>>>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>>>>>>> +					       struct tegra_dma_sg_req *sg_req)
>>>>>>> +{
>>>>>>> +	unsigned long status, wcount = 0;
>>>>>>> +
>>>>>>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>>>>>>> +		return 0;
>>>>>>> +
>>>>>>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>>>>>>> +
>>>>>>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>>>>>>> +
>>>>>>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>>>>>>> +		wcount = status;
>>>>>>> +
>>>>>>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>>>>>>> +		return sg_req->req_len;
>>>>>>> +
>>>>>>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>>>>>>> +
>>>>>>> +	if (!wcount) {
>>>>>>> +		/*
>>>>>>> +		 * If wcount wasn't ever polled for this SG before, then
>>>>>>> +		 * simply assume that transfer hasn't started yet.
>>>>>>> +		 *
>>>>>>> +		 * Otherwise it's the end of the transfer.
>>>>>>> +		 *
>>>>>>> +		 * The alternative would be to poll the status register
>>>>>>> +		 * until EOC bit is set or wcount goes UP. That's so
>>>>>>> +		 * because EOC bit is getting set only after the last
>>>>>>> +		 * burst's completion and counter is less than the actual
>>>>>>> +		 * transfer size by 4 bytes. The counter value wraps around
>>>>>>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>>>>>>> +		 * distinguish start of transfer from its end.
>>>>>>> +		 */
>>>>>>> +		if (sg_req->words_xferred)
>>>>>>> +			wcount = sg_req->req_len - 4;
>>>>>>> +
>>>>>>> +	} else if (wcount < sg_req->words_xferred) {
>>>>>>> +		/*
>>>>>>> +		 * This case shall not ever happen because EOC bit
>>>>>>> +		 * must be set once next cyclic transfer is started.
>>>>>>
>>>>>> I am not sure I follow this and why this condition cannot happen for
>>>>>> cyclic transfers. What about non-cyclic transfers?
>>>>>
>>>>> It cannot happen because the EOC bit will be set in that case. The counter wraps
>>>>> around when the transfer of a last burst happens, EOC bit is guaranteed to be set
>>>>> after completion of the last burst. That's my observation after a thorough testing,
>>>>> it will be very odd if EOC setting happened completely asynchronously.
>>>>
>>>> I see how you know that the EOC is set. Anyway, you check if the EOC is
>>>> set before and if so return sg_req->req_len prior to this test.
>>>>
>>>> Maybe I am missing something, but what happens if we are mid block when
>>>> dmaengine_tx_status() is called? That happen asynchronously right?
>>>
>>>
>>> Do you mean asynchronously in regards to the ISR? Or something else?
>>
>> In the sense that the client can call dmaengine_tx_status() at anytime
>> to check the status of a transfer.
> 
> Should be alright, I think this patch covers all of possible cases:
> 
> 1) Start of transfer, when wcount=0.
> 2) Middle of transfer, when wcount!=0.
> 3) End of transfer, when wcount=0.
> 4) End of transfer, when wcount!=0 and EOC is set.

I think I see my problem I read 'wcount < sg_req->req_len' and NOT
'wcount < sg_req->words_xferred'. So yes this should be fine.

Jon
Jon Hunter July 2, 2019, 3:29 p.m. UTC | #14
On 27/06/2019 20:47, Dmitry Osipenko wrote:
> Tegra's APB DMA engine updates words counter after each transferred burst
> of data, hence it can report transfer's residual with more fidelity which
> may be required in cases like audio playback. In particular this fixes
> audio stuttering during playback in a chromium web browser. The patch is
> based on the original work that was made by Ben Dooks and a patch from
> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
> 
> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
> 
> Changelog:
> 
> v3:  Added workaround for a hardware design shortcoming that results
>      in a words counter wraparound before end-of-transfer bit is set
>      in a cyclic mode.
> 
> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>      to get words count if dma_desc is on free list as it will result
>      in a NULL dereference because this case wasn't handled properly.
> 
>      The residual value is now updated properly, avoiding potential
>      integer overflow by adding the "bytes" to the "bytes_transferred"
>      instead of the subtraction.
> 
>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>  1 file changed, 62 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
> index 79e9593815f1..71473eda28ee 100644
> --- a/drivers/dma/tegra20-apb-dma.c
> +++ b/drivers/dma/tegra20-apb-dma.c
> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>  	bool				last_sg;
>  	struct list_head		node;
>  	struct tegra_dma_desc		*dma_desc;
> +	unsigned int			words_xferred;
>  };
>  
>  /*
> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>  	nsg_req->configured = true;
> +	nsg_req->words_xferred = 0;
>  
>  	tegra_dma_resume(tdc);
>  }
> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>  					typeof(*sg_req), node);
>  	tegra_dma_start(tdc, sg_req);
>  	sg_req->configured = true;
> +	sg_req->words_xferred = 0;
>  	tdc->busy = true;
>  }
>  
> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>  	return 0;
>  }
>  
> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
> +					       struct tegra_dma_sg_req *sg_req)
> +{
> +	unsigned long status, wcount = 0;
> +
> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
> +		return 0;
> +
> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
> +
> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
> +
> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
> +		wcount = status;
> +
> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
> +		return sg_req->req_len;
> +
> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
> +
> +	if (!wcount) {
> +		/*
> +		 * If wcount wasn't ever polled for this SG before, then
> +		 * simply assume that transfer hasn't started yet.
> +		 *
> +		 * Otherwise it's the end of the transfer.
> +		 *
> +		 * The alternative would be to poll the status register
> +		 * until EOC bit is set or wcount goes UP. That's so
> +		 * because EOC bit is getting set only after the last
> +		 * burst's completion and counter is less than the actual
> +		 * transfer size by 4 bytes. The counter value wraps around
> +		 * in a cyclic mode before EOC is set(!), so we can't easily
> +		 * distinguish start of transfer from its end.
> +		 */
> +		if (sg_req->words_xferred)
> +			wcount = sg_req->req_len - 4;
> +
> +	} else if (wcount < sg_req->words_xferred) {

Minor comment, why not ...

	} else if WARN_ON_ONCE(wcount < sg_req->words_xferred) {

Otherwise ...

Reviewed-by: Jon Hunter <jonathanh@nvidia.com>

Cheers
Jon
Dmitry Osipenko July 2, 2019, 3:37 p.m. UTC | #15
02.07.2019 18:29, Jon Hunter пишет:
> 
> On 27/06/2019 20:47, Dmitry Osipenko wrote:
>> Tegra's APB DMA engine updates words counter after each transferred burst
>> of data, hence it can report transfer's residual with more fidelity which
>> may be required in cases like audio playback. In particular this fixes
>> audio stuttering during playback in a chromium web browser. The patch is
>> based on the original work that was made by Ben Dooks and a patch from
>> downstream kernel. It was tested on Tegra20 and Tegra30 devices.
>>
>> Link: https://lore.kernel.org/lkml/20190424162348.23692-1-ben.dooks@codethink.co.uk/
>> Link: https://nv-tegra.nvidia.com/gitweb/?p=linux-4.4.git;a=commit;h=c7bba40c6846fbf3eaad35c4472dcc7d8bbc02e5
>> Inspired-by: Ben Dooks <ben.dooks@codethink.co.uk>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>
>> Changelog:
>>
>> v3:  Added workaround for a hardware design shortcoming that results
>>      in a words counter wraparound before end-of-transfer bit is set
>>      in a cyclic mode.
>>
>> v2:  Addressed review comments made by Jon Hunter to v1. We won't try
>>      to get words count if dma_desc is on free list as it will result
>>      in a NULL dereference because this case wasn't handled properly.
>>
>>      The residual value is now updated properly, avoiding potential
>>      integer overflow by adding the "bytes" to the "bytes_transferred"
>>      instead of the subtraction.
>>
>>  drivers/dma/tegra20-apb-dma.c | 69 +++++++++++++++++++++++++++++++----
>>  1 file changed, 62 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
>> index 79e9593815f1..71473eda28ee 100644
>> --- a/drivers/dma/tegra20-apb-dma.c
>> +++ b/drivers/dma/tegra20-apb-dma.c
>> @@ -152,6 +152,7 @@ struct tegra_dma_sg_req {
>>  	bool				last_sg;
>>  	struct list_head		node;
>>  	struct tegra_dma_desc		*dma_desc;
>> +	unsigned int			words_xferred;
>>  };
>>  
>>  /*
>> @@ -496,6 +497,7 @@ static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
>>  	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
>>  				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
>>  	nsg_req->configured = true;
>> +	nsg_req->words_xferred = 0;
>>  
>>  	tegra_dma_resume(tdc);
>>  }
>> @@ -511,6 +513,7 @@ static void tdc_start_head_req(struct tegra_dma_channel *tdc)
>>  					typeof(*sg_req), node);
>>  	tegra_dma_start(tdc, sg_req);
>>  	sg_req->configured = true;
>> +	sg_req->words_xferred = 0;
>>  	tdc->busy = true;
>>  }
>>  
>> @@ -797,6 +800,61 @@ static int tegra_dma_terminate_all(struct dma_chan *dc)
>>  	return 0;
>>  }
>>  
>> +static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
>> +					       struct tegra_dma_sg_req *sg_req)
>> +{
>> +	unsigned long status, wcount = 0;
>> +
>> +	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
>> +		return 0;
>> +
>> +	if (tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
>> +
>> +	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
>> +
>> +	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
>> +		wcount = status;
>> +
>> +	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
>> +		return sg_req->req_len;
>> +
>> +	wcount = get_current_xferred_count(tdc, sg_req, wcount);
>> +
>> +	if (!wcount) {
>> +		/*
>> +		 * If wcount wasn't ever polled for this SG before, then
>> +		 * simply assume that transfer hasn't started yet.
>> +		 *
>> +		 * Otherwise it's the end of the transfer.
>> +		 *
>> +		 * The alternative would be to poll the status register
>> +		 * until EOC bit is set or wcount goes UP. That's so
>> +		 * because EOC bit is getting set only after the last
>> +		 * burst's completion and counter is less than the actual
>> +		 * transfer size by 4 bytes. The counter value wraps around
>> +		 * in a cyclic mode before EOC is set(!), so we can't easily
>> +		 * distinguish start of transfer from its end.
>> +		 */
>> +		if (sg_req->words_xferred)
>> +			wcount = sg_req->req_len - 4;
>> +
>> +	} else if (wcount < sg_req->words_xferred) {
> 
> Minor comment, why not ...
> 
> 	} else if WARN_ON_ONCE(wcount < sg_req->words_xferred) {

Because there should be parens around WARN_ON_ONCE() and that makes it to look not very
nice, IMHO.

> 
> Otherwise ...
> 
> Reviewed-by: Jon Hunter <jonathanh@nvidia.com>

Thanks! I'll probably make a v4 later today with words_xferred reset made after end of a
transfer's cycle, similar to what I suggested in the other email.
diff mbox series

Patch

diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c
index 79e9593815f1..71473eda28ee 100644
--- a/drivers/dma/tegra20-apb-dma.c
+++ b/drivers/dma/tegra20-apb-dma.c
@@ -152,6 +152,7 @@  struct tegra_dma_sg_req {
 	bool				last_sg;
 	struct list_head		node;
 	struct tegra_dma_desc		*dma_desc;
+	unsigned int			words_xferred;
 };
 
 /*
@@ -496,6 +497,7 @@  static void tegra_dma_configure_for_next(struct tegra_dma_channel *tdc,
 	tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
 				nsg_req->ch_regs.csr | TEGRA_APBDMA_CSR_ENB);
 	nsg_req->configured = true;
+	nsg_req->words_xferred = 0;
 
 	tegra_dma_resume(tdc);
 }
@@ -511,6 +513,7 @@  static void tdc_start_head_req(struct tegra_dma_channel *tdc)
 					typeof(*sg_req), node);
 	tegra_dma_start(tdc, sg_req);
 	sg_req->configured = true;
+	sg_req->words_xferred = 0;
 	tdc->busy = true;
 }
 
@@ -797,6 +800,61 @@  static int tegra_dma_terminate_all(struct dma_chan *dc)
 	return 0;
 }
 
+static unsigned int tegra_dma_sg_bytes_xferred(struct tegra_dma_channel *tdc,
+					       struct tegra_dma_sg_req *sg_req)
+{
+	unsigned long status, wcount = 0;
+
+	if (!list_is_first(&sg_req->node, &tdc->pending_sg_req))
+		return 0;
+
+	if (tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = tdc_read(tdc, TEGRA_APBDMA_CHAN_WORD_TRANSFER);
+
+	status = tdc_read(tdc, TEGRA_APBDMA_CHAN_STATUS);
+
+	if (!tdc->tdma->chip_data->support_separate_wcount_reg)
+		wcount = status;
+
+	if (status & TEGRA_APBDMA_STATUS_ISE_EOC)
+		return sg_req->req_len;
+
+	wcount = get_current_xferred_count(tdc, sg_req, wcount);
+
+	if (!wcount) {
+		/*
+		 * If wcount wasn't ever polled for this SG before, then
+		 * simply assume that transfer hasn't started yet.
+		 *
+		 * Otherwise it's the end of the transfer.
+		 *
+		 * The alternative would be to poll the status register
+		 * until EOC bit is set or wcount goes UP. That's so
+		 * because EOC bit is getting set only after the last
+		 * burst's completion and counter is less than the actual
+		 * transfer size by 4 bytes. The counter value wraps around
+		 * in a cyclic mode before EOC is set(!), so we can't easily
+		 * distinguish start of transfer from its end.
+		 */
+		if (sg_req->words_xferred)
+			wcount = sg_req->req_len - 4;
+
+	} else if (wcount < sg_req->words_xferred) {
+		/*
+		 * This case shall not ever happen because EOC bit
+		 * must be set once next cyclic transfer is started.
+		 * Assume that hardware is malfunctioning or there is
+		 * a software bug.
+		 */
+		wcount = sg_req->req_len - 4;
+		WARN_ON_ONCE(1);
+	} else {
+		sg_req->words_xferred = wcount;
+	}
+
+	return wcount;
+}
+
 static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	dma_cookie_t cookie, struct dma_tx_state *txstate)
 {
@@ -806,6 +864,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	enum dma_status ret;
 	unsigned long flags;
 	unsigned int residual;
+	unsigned int bytes = 0;
 
 	ret = dma_cookie_status(dc, cookie, txstate);
 	if (ret == DMA_COMPLETE)
@@ -825,6 +884,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 	list_for_each_entry(sg_req, &tdc->pending_sg_req, node) {
 		dma_desc = sg_req->dma_desc;
 		if (dma_desc->txd.cookie == cookie) {
+			bytes = tegra_dma_sg_bytes_xferred(tdc, sg_req);
 			ret = dma_desc->dma_status;
 			goto found;
 		}
@@ -836,7 +896,7 @@  static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
 found:
 	if (dma_desc && txstate) {
 		residual = dma_desc->bytes_requested -
-			   (dma_desc->bytes_transferred %
+			   ((dma_desc->bytes_transferred + bytes) %
 			    dma_desc->bytes_requested);
 		dma_set_residue(txstate, residual);
 	}
@@ -1441,12 +1501,7 @@  static int tegra_dma_probe(struct platform_device *pdev)
 		BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
 		BIT(DMA_SLAVE_BUSWIDTH_8_BYTES);
 	tdma->dma_dev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
-	/*
-	 * XXX The hardware appears to support
-	 * DMA_RESIDUE_GRANULARITY_BURST-level reporting, but it's
-	 * only used by this driver during tegra_dma_terminate_all()
-	 */
-	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT;
+	tdma->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
 	tdma->dma_dev.device_config = tegra_dma_slave_config;
 	tdma->dma_dev.device_terminate_all = tegra_dma_terminate_all;
 	tdma->dma_dev.device_tx_status = tegra_dma_tx_status;